[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v7 18/49] x86emul: support AVX512{F, BW, _VBMI} permute insns



Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v7: Re-base.
v5: Re-base over changes earlier in the series.
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -178,6 +178,10 @@ static const struct test avx512f_all[] =
     INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
     INSN(permi2,       66, 0f38, 76,    vl,     dq, vl),
     INSN(permi2,       66, 0f38, 77,    vl,     sd, vl),
+    INSN(permilpd,     66, 0f38, 0d,    vl,      q, vl),
+    INSN(permilpd,     66, 0f3a, 05,    vl,      q, vl),
+    INSN(permilps,     66, 0f38, 0c,    vl,      d, vl),
+    INSN(permilps,     66, 0f3a, 04,    vl,      d, vl),
     INSN(permt2,       66, 0f38, 7e,    vl,     dq, vl),
     INSN(permt2,       66, 0f38, 7f,    vl,     sd, vl),
     INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
@@ -278,6 +282,10 @@ static const struct test avx512f_no128[]
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
     INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
     INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
+    INSN(perm,           66, 0f38, 36, vl,   dq, vl),
+    INSN(perm,           66, 0f38, 16, vl,   sd, vl),
+    INSN(permpd,         66, 0f3a, 01, vl,    q, vl),
+    INSN(permq,          66, 0f3a, 00, vl,    q, vl),
     INSN(shuff32x4,      66, 0f3a, 23, vl,    d, vl),
     INSN(shuff64x2,      66, 0f3a, 23, vl,    q, vl),
     INSN(shufi32x4,      66, 0f3a, 43, vl,    d, vl),
@@ -316,6 +324,7 @@ static const struct test avx512bw_all[]
     INSN(pcmpgtb,     66,   0f, 64,    vl,    b, vl),
     INSN(pcmpgtw,     66,   0f, 65,    vl,    w, vl),
     INSN(pcmpu,       66, 0f3a, 3e,    vl,   bw, vl),
+    INSN(permw,       66, 0f38, 8d,    vl,    w, vl),
     INSN(permi2w,     66, 0f38, 75,    vl,    w, vl),
     INSN(permt2w,     66, 0f38, 7d,    vl,    w, vl),
     INSN(pmaddwd,     66,   0f, f5,    vl,    w, vl),
@@ -412,6 +421,7 @@ static const struct test avx512dq_512[]
 };
 
 static const struct test avx512_vbmi_all[] = {
+    INSN(permb,         66, 0f38, 8d, vl, b, vl),
     INSN(permi2b,       66, 0f38, 75, vl, b, vl),
     INSN(permt2b,       66, 0f38, 7d, vl, b, vl),
 };
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -186,6 +186,7 @@ static inline bool _to_bool(byte_vec_t b
 #   define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
 #   define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 #   define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+#   define swap2(x) B_(vpermilps, _mask, x, 0b00011011, undef(), ~0)
 #  else
 #   define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
 #   define insert_pair(x, y, p) \
@@ -200,6 +201,10 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
     B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
 })
+#   define swap2(x) B(vpermilps, _mask, \
+                       B(shuf_f32x4_, _mask, x, x, \
+                         VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
+                       0b00011011, undef(), ~0)
 #  endif
 # elif FLOAT_SIZE == 8
 #  if VEC_SIZE >= 32
@@ -233,6 +238,7 @@ static inline bool _to_bool(byte_vec_t b
 #   define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
 #   define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
 #   define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+#   define swap2(x) B_(vpermilpd, _mask, x, 0b01, undef(), ~0)
 #  else
 #   define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
 #   define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
@@ -240,6 +246,10 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
     B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
 })
+#   define swap2(x) B(vpermilpd, _mask, \
+                       B(shuf_f64x2_, _mask, x, x, \
+                         VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
+                       0b01010101, undef(), ~0)
 #  endif
 # endif
 #elif FLOAT_SIZE == 4 && defined(__SSE__)
@@ -405,6 +415,7 @@ static inline bool _to_bool(byte_vec_t b
                              B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
                                 VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
                              0b00011011, (vsi_t)undef(), ~0))
+#   define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -442,8 +453,17 @@ static inline bool _to_bool(byte_vec_t b
                               (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
                                        VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
                              0b01001110, (vsi_t)undef(), ~0))
+#   define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+#  if VEC_SIZE == 32
+#   define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
+#  elif VEC_SIZE == 64
+#   define swap3(x) ({ \
+    vdi_t t_ = B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0); \
+    B(shuf_i64x2_, _mask, t_, t_, 0b01001110, (vdi_t)undef(), ~0); \
+})
+#  endif
 # endif
 # if INT_SIZE == 4
 #  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
@@ -489,6 +509,9 @@ static inline bool _to_bool(byte_vec_t b
 #  define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
 #  define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
 #  define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
+#  ifdef __AVX512VBMI__
+#   define swap2(x) ((vec_t)B(permvarqi, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
+#  endif
 # elif INT_SIZE == 2 || UINT_SIZE == 2
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -517,6 +540,7 @@ static inline bool _to_bool(byte_vec_t b
                               (0b01010101010101010101010101010101 & ALL_TRUE)))
 #  define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
 #  define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
+#  define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
 # endif
 # if INT_SIZE == 1
 #  define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
@@ -1325,6 +1349,12 @@ int simd_test(void)
     if ( !eq(swap2(src), inv) ) return __LINE__;
 #endif
 
+#ifdef swap3
+    touch(src);
+    if ( !eq(swap3(src), inv) ) return __LINE__;
+    touch(src);
+#endif
+
 #ifdef broadcast
     if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -275,6 +275,8 @@ OVR(movlps);
 OVR_VFP(movnt);
 OVR_VFP(movu);
 OVR_FP(mul);
+OVR_VFP(perm);
+OVR_VFP(permil);
 OVR_VFP(shuf);
 OVR_INT(sll);
 OVR_DQ(sllv);
@@ -331,6 +333,8 @@ OVR(movntdq);
 OVR(movntdqa);
 OVR(movshdup);
 OVR(movsldup);
+OVR(permd);
+OVR(permq);
 OVR(pmovsxbd);
 OVR(pmovsxbq);
 OVR(pmovsxdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -434,7 +434,8 @@ static const struct ext0f38_table {
 } ext0f38_table[256] = {
     [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x01 ... 0x0b] = { .simd_size = simd_packed_int },
-    [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
+    [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x0e ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -477,6 +478,7 @@ static const struct ext0f38_table {
     [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x8c] = { .simd_size = simd_packed_int },
+    [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -522,10 +524,10 @@ static const struct ext0f3a_table {
     uint8_t four_op:1;
     disp8scale_t d8s:4;
 } ext0f3a_table[256] = {
-    [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
+    [0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0x02] = { .simd_size = simd_packed_int },
-    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
@@ -8091,6 +8093,9 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf2): /* vpslld xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.brs, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x0c): /* vpermilps [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x0d): /* vpermilpd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
         if ( b == 0xe2 )
             goto avx512f_no_sae;
@@ -8436,6 +8441,12 @@ x86_emulate(
         generate_exception_if(!vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+        generate_exception_if(!evex.lr, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -8641,6 +8652,7 @@ x86_emulate(
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         if ( !evex.w )
             host_and_vcpu_must_have(avx512_vbmi);
         else
@@ -9066,6 +9078,12 @@ x86_emulate(
         generate_exception_if(!vex.l || !vex.w, EXC_UD);
         goto simd_0f_imm8_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x00): /* vpermq $imm8,{y,z}mm/mem,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x01): /* vpermpd $imm8,{y,z}mm/mem,{y,z}mm{k} */
+        generate_exception_if(!evex.lr || !evex.w, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm8_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */
@@ -9085,6 +9103,12 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x04): /* vpermilps $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.w != (b & 1), EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm8_no_sae;
+
     case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.