[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v3 01/34] x86emul: support AVX512 opmask insns



These are all VEX encoded, so the EVEX decoding logic continues to
remain unused at this point.

The new testcase is deliberately coded in assembly, as a C one would
have become almost unreadable due to the overwhelming amount of
__builtin_...() that would need to be used. After all the compiler has
no underlying type (yet) that could be operated on without builtins,
other than the vector types used for "normal" SIMD insns.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v3: Use distinct temporary file names in testcase.mk. Additions to clean
    target.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,6 +16,8 @@ FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
+OPMASK := avx512f avx512dq avx512bw
+
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
@@ -51,6 +53,10 @@ xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
+avx512f-opmask-vecs := 2
+avx512dq-opmask-vecs := 1
+avx512bw-opmask-vecs := 4 8
+
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
 # use for floating point operations, to avoid mixing MMX and FPU register
@@ -80,9 +86,13 @@ $(1)-cflags := \
           $(foreach flt,$($(1)-flts), \
             "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os 
-DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
 endef
+define opmask-defs
+$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os 
-DSIZE=$(vec)")
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
        rm -f $@.new $*.bin
@@ -100,6 +110,22 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
        )
        mv $@.new $@
 
+$(addsuffix -opmask.h,$(OPMASK)): %.h: opmask.S testcase.mk Makefile
+       rm -f $@.new $*.bin
+       $(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) 
$(XEN_COMPILE_ARCH), \
+           for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
+               $(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) 
$*-cflags="$$cflags" all; \
+               prefix=$(shell echo $(subst -,_,$*) | sed -e 
's,^\([0-9]\),_\1,'); \
+               flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
+               (echo 'static const unsigned int __attribute__((section(".test, 
\"ax\", @progbits #")))' \
+                     "$${prefix}_$(arch)$${flavor}[] = {"; \
+                od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 
's/$$/,/'; \
+                echo "};") >>$@.new; \
+               rm -f $*.bin; \
+           done; \
+       )
+       mv $@.new $@
+
 $(addsuffix .c,$(SIMD)):
        ln -sf simd.c $@
 
@@ -118,7 +144,8 @@ $(TARGET): x86-emulate.o test_x86_emulat
 
 .PHONY: clean
 clean:
-       rm -rf $(TARGET) *.o *~ core $(addsuffix .h,$(TESTCASES)) *.bin 
x86_emulate
+       rm -rf $(TARGET) *.o *~ core *.bin x86_emulate
+       rm -rf $(TARGET) $(addsuffix .h,$(TESTCASES)) $(addsuffix 
-opmask.h,$(OPMASK))
 
 .PHONY: distclean
 distclean: clean
@@ -145,4 +172,4 @@ x86-emulate.o test_x86_emulator.o wrappe
 x86-emulate.o: x86_emulate/x86_emulate.c
 x86-emulate.o: HOSTCFLAGS += -D__XEN_TOOLS__
 
-test_x86_emulator.o: $(addsuffix .h,$(TESTCASES))
+test_x86_emulator.o: $(addsuffix .h,$(TESTCASES)) $(addsuffix 
-opmask.h,$(OPMASK))
--- /dev/null
+++ b/tools/tests/x86_emulator/opmask.S
@@ -0,0 +1,144 @@
+#ifdef __i386__
+# define R(x) e##x
+# define DATA(x) x
+#else
+# if SIZE == 8
+#  define R(x) r##x
+# else
+#  define R(x) e##x
+# endif
+# define DATA(x) x(%rip)
+#endif
+
+#if SIZE == 1
+# define _(x) x##b
+#elif SIZE == 2
+# define _(x) x##w
+# define WIDEN(x) x##bw
+#elif SIZE == 4
+# define _(x) x##d
+# define WIDEN(x) x##wd
+#elif SIZE == 8
+# define _(x) x##q
+# define WIDEN(x) x##dq
+#endif
+
+    .macro check res1:req, res2:req, line:req
+    _(kmov)       %\res1, DATA(out)
+#if SIZE < 8 || !defined(__i386__)
+    _(kmov)       %\res2, %R(dx)
+    cmp           DATA(out), %R(dx)
+#else
+    sub           $8, %esp
+    kmovq         %\res2, (%esp)
+    pop           %ecx
+    pop           %edx
+    cmp           DATA(out), %ecx
+    jne           0f
+    cmp           DATA(out+4), %edx
+0:
+#endif
+    je            1f
+    mov           $\line, %eax
+    ret
+1:
+    .endm
+
+    .text
+    .globl _start
+_start:
+    _(kmov)       DATA(in1), %k1
+#if SIZE < 8 || !defined(__i386__)
+    mov           DATA(in2), %R(ax)
+    _(kmov)       %R(ax), %k2
+#else
+    _(kmov)       DATA(in2), %k2
+#endif
+
+    _(kor)        %k1, %k2, %k3
+    _(kand)       %k1, %k2, %k4
+    _(kandn)      %k3, %k4, %k5
+    _(kxor)       %k1, %k2, %k6
+    check         k5, k6, __LINE__
+
+    _(knot)       %k6, %k3
+    _(kxnor)      %k1, %k2, %k4
+    check         k3, k4, __LINE__
+
+    _(kshiftl)    $1, %k1, %k3
+    _(kshiftl)    $2, %k3, %k4
+    _(kshiftl)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kshiftr)    $1, %k1, %k3
+    _(kshiftr)    $2, %k3, %k4
+    _(kshiftr)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kortest)    %k6, %k6
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#if SIZE > 1
+
+    _(kshiftr)    $SIZE*4, %k3, %k4
+    WIDEN(kunpck) %k4, %k4, %k5
+    check         k3, k5, __LINE__
+
+#endif
+
+#if SIZE != 2 || defined(__AVX512DQ__)
+
+    _(kadd)       %k1, %k1, %k3
+    _(kshiftl)    $1, %k1, %k4
+    check         k3, k4, __LINE__
+
+    _(ktest)      %k2, %k1
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(ktest)      %k0, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k4
+    _(ktest)      %k0, %k4
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#endif
+
+    xor           %eax, %eax
+    ret
+
+    .section .rodata, "a", @progbits
+    .balign 8
+in1: .byte 0b10110011, 0b10001111, 0b00001111, 0b10000011, 0b11110000, 
0b00111111, 0b10000000, 0b11111111
+in2: .byte 0b11111111, 0b00000001, 0b11111100, 0b00001111, 0b11000001, 
0b11110000, 0b11110001, 0b11001101
+
+    .data
+    .balign 8
+out: .quad 0
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -18,6 +18,9 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx2.h"
 #include "avx2-sg.h"
 #include "xop.h"
+#include "avx512f-opmask.h"
+#include "avx512dq-opmask.h"
+#include "avx512bw-opmask.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -78,6 +81,24 @@ static bool simd_check_xop(void)
     return cpu_has_xop;
 }
 
+static bool simd_check_avx512f(void)
+{
+    return cpu_has_avx512f;
+}
+#define simd_check_avx512f_opmask simd_check_avx512f
+
+static bool simd_check_avx512dq(void)
+{
+    return cpu_has_avx512dq;
+}
+#define simd_check_avx512dq_opmask simd_check_avx512dq
+
+static bool simd_check_avx512bw(void)
+{
+    return cpu_has_avx512bw;
+}
+#define simd_check_avx512bw_opmask simd_check_avx512bw
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -223,6 +244,10 @@ static const struct {
     SIMD(XOP i16x16,              xop,      32i2),
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
+    SIMD(OPMASK/w,     avx512f_opmask,         2),
+    SIMD(OPMASK/b,    avx512dq_opmask,         1),
+    SIMD(OPMASK/d,    avx512bw_opmask,         4),
+    SIMD(OPMASK/q,    avx512bw_opmask,         8),
 #undef SIMD_
 #undef SIMD
 };
@@ -3426,8 +3451,8 @@ int main(int argc, char **argv)
             rc = x86_emulate(&ctxt, &emulops);
             if ( rc != X86EMUL_OKAY )
             {
-                printf("failed at %%eip == %08lx (opcode %08x)\n",
-                       (unsigned long)regs.eip, ctxt.opcode);
+                printf("failed (%d) at %%eip == %08lx (opcode %08x)\n",
+                       rc, (unsigned long)regs.eip, ctxt.opcode);
                 return 1;
             }
         }
--- a/tools/tests/x86_emulator/testcase.mk
+++ b/tools/tests/x86_emulator/testcase.mk
@@ -14,3 +14,9 @@ all: $(TESTCASE).bin
        $(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $*.tmp $*.o
        $(OBJCOPY) -O binary $*.tmp $@
        rm -f $*.tmp
+
+%-opmask.bin: opmask.S
+       $(CC) $(filter-out -M% .%,$(CFLAGS)) -c $< -o $(basename $@).o
+       $(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $(basename $@).tmp 
$(basename $@).o
+       $(OBJCOPY) -O binary $(basename $@).tmp $@
+       rm -f $(basename $@).tmp
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -209,6 +209,9 @@ int emul_test_get_fpu(
     case X86EMUL_FPU_ymm:
         if ( cpu_has_avx )
             break;
+    case X86EMUL_FPU_opmask:
+        if ( cpu_has_avx512f )
+            break;
     default:
         return X86EMUL_UNHANDLEABLE;
     }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -236,6 +236,36 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 21)) != 0; \
 })
 
+#define cpu_has_avx512f ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 16)) != 0; \
+})
+
+#define cpu_has_avx512dq ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 17)) != 0; \
+})
+
+#define cpu_has_avx512bw ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 30)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -491,6 +491,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
@@ -1187,6 +1188,11 @@ static int _get_fpu(
             return X86EMUL_UNHANDLEABLE;
         break;
 
+    case X86EMUL_FPU_opmask:
+        if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
     default:
         break;
     }
@@ -1762,12 +1768,15 @@ static bool vcpu_has(
 #define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
 #define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
 #define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
+#define vcpu_has_avx512f()     vcpu_has(         7, EBX, 16, ctxt, ops)
+#define vcpu_has_avx512dq()    vcpu_has(         7, EBX, 17, ctxt, ops)
 #define vcpu_has_rdseed()      vcpu_has(         7, EBX, 18, ctxt, ops)
 #define vcpu_has_adx()         vcpu_has(         7, EBX, 19, ctxt, ops)
 #define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
 #define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
+#define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2396,6 +2405,18 @@ x86_decode_twobyte(
         }
         break;
 
+    case X86EMUL_OPC_VEX(0, 0x90):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x90): /* kmov{b,d} */
+        state->desc = DstReg | SrcMem | Mov;
+        state->simd_size = simd_other;
+        break;
+
+    case X86EMUL_OPC_VEX(0, 0x91):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x91): /* kmov{b,d} */
+        state->desc = DstMem | SrcReg | Mov;
+        state->simd_size = simd_other;
+        break;
+
     case 0xae:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
@@ -6002,6 +6023,60 @@ x86_emulate(
             dst.val = src.val;
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x41):    /* kand{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x41): /* kand{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x42):    /* kandn{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x42): /* kandn{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x45):    /* kor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x45): /* kor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x46):    /* kxnor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x46): /* kxnor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x47):    /* kxor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x47): /* kxor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4a): /* kadd{b,d} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+    opmask_basic:
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_common:
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(!vex.r || (mode_64bit() && !(vex.reg & 8)) ||
+                              ea.type != OP_REG, EXC_UD);
+
+        vex.reg |= 8;
+        d &= ~TwoOp;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
+
+        state->simd_size = simd_other;
+        op_bytes = 1; /* Any non-zero value will do. */
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x44):    /* knot{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x44): /* knot{b,d} k,k */
+        generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+        goto opmask_basic;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x4b):    /* kunpck{w,d}{d,q} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4b): /* kunpckbw k,k,k */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto opmask_common;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x50):     /* movmskp{s,d} xmm,reg */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT(0x0f, 0xd7):      /* pmovmskb {,x}mm,reg */
@@ -6552,6 +6627,154 @@ x86_emulate(
         dst.val = test_cc(b, _regs.eflags);
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x91):    /* kmov{w,q} k,mem */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x91): /* kmov{b,d} k,mem */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x90):    /* kmov{w,q} k/mem,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x90): /* kmov{b,d} k/mem,k */
+        generate_exception_if(vex.l || !vex.r, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            op_bytes = 4 << !vex.pfx;
+        }
+        else if ( vex.pfx )
+        {
+            host_and_vcpu_must_have(avx512dq);
+            op_bytes = 1;
+        }
+        else
+            op_bytes = 2;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            vex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = PFX_BYTES + 2;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x92):    /* kmovw r32,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x92): /* kmovb r32,k */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x92): /* kmov{d,q} reg,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+            host_and_vcpu_must_have(avx512bw);
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        vex.b = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xf8;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        ea.reg = decode_gpr(&_regs, modrm_rm);
+        invoke_stub("", "", "=m" (dummy) : "a" (*ea.reg));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x93):    /* kmovw k,r32 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x93): /* kmovb k,r32 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x93): /* kmov{d,q} k,reg */
+        generate_exception_if(vex.l || vex.reg != 0xf || ea.type != OP_REG,
+                              EXC_UD);
+        dst = ea;
+        dst.reg = decode_gpr(&_regs, modrm_reg);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            dst.bytes = 4 << (mode_64bit() && vex.w);
+        }
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            dst.bytes = 4;
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR destination to %rAX. */
+        vex.r = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xc7;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x99):    /* ktest{w,q} k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x98):    /* kortest{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x98): /* kortest{b,d} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x99): /* ktest{b,d} k,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    [eflags] "+g" (_regs.eflags),
+                    "=a" (dst.val), [tmp] "=&r" (dummy)
+                    : [mask] "i" (EFLAGS_MASK));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
         msr_val = 0;
         fail_if(ops->cpuid == NULL);
@@ -8170,6 +8393,23 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_shift_imm:
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_opmask);
+        op_bytes = 1; /* Any non-zero value will do. */
+        goto simd_0f_imm8;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x31): /* kshiftr{d,q} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x33): /* kshiftl{d,q} $imm8,k,k */
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_shift_imm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x44):     /* pclmulqdq $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq 
$imm8,xmm/m128,xmm,xmm */
         host_and_vcpu_must_have(pclmulqdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -170,6 +170,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_mmx, /* MMX instruction set (%mm0-%mm7) */
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
+    X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -99,9 +99,12 @@
 #define cpu_has_rtm             boot_cpu_has(X86_FEATURE_RTM)
 #define cpu_has_fpu_sel         (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL))
 #define cpu_has_mpx             boot_cpu_has(X86_FEATURE_MPX)
+#define cpu_has_avx512f         boot_cpu_has(X86_FEATURE_AVX512F)
+#define cpu_has_avx512dq        boot_cpu_has(X86_FEATURE_AVX512DQ)
 #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
+#define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.