x86: also allow REP STOS emulation acceleration While the REP MOVS acceleration appears to have helped qemu-traditional based guests, qemu-upstream (or really the respective video BIOSes) doesn't appear to benefit from that. Instead the acceleration added here provides a visible performance improvement during very early HVM guest boot. Signed-off-by: Jan Beulich --- v2: fix asm() constraints in hvmemul_rep_stos(), as pointed out by Andrew. Add output operand telling the compiler that "buf" is being written. --- unstable.orig/xen/arch/x86/hvm/emulate.c 2015-01-09 11:19:01.000000000 +0100 +++ unstable/xen/arch/x86/hvm/emulate.c 2015-01-09 11:19:36.000000000 +0100 @@ -731,6 +731,17 @@ static int hvmemul_rep_movs_discard( return X86EMUL_OKAY; } +static int hvmemul_rep_stos_discard( + void *p_data, + enum x86_segment seg, + unsigned long offset, + unsigned int bytes_per_rep, + unsigned long *reps, + struct x86_emulate_ctxt *ctxt) +{ + return X86EMUL_OKAY; +} + static int hvmemul_rep_outs_discard( enum x86_segment src_seg, unsigned long src_offset, @@ -982,6 +993,114 @@ static int hvmemul_rep_movs( return X86EMUL_OKAY; } +static int hvmemul_rep_stos( + void *p_data, + enum x86_segment seg, + unsigned long offset, + unsigned int bytes_per_rep, + unsigned long *reps, + struct x86_emulate_ctxt *ctxt) +{ + struct hvm_emulate_ctxt *hvmemul_ctxt = + container_of(ctxt, struct hvm_emulate_ctxt, ctxt); + unsigned long addr; + paddr_t gpa; + p2m_type_t p2mt; + bool_t df = !!(ctxt->regs->eflags & X86_EFLAGS_DF); + int rc = hvmemul_virtual_to_linear(seg, offset, bytes_per_rep, reps, + hvm_access_write, hvmemul_ctxt, &addr); + + if ( rc == X86EMUL_OKAY ) + { + uint32_t pfec = PFEC_page_present | PFEC_write_access; + + if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 ) + pfec |= PFEC_user_mode; + + rc = hvmemul_linear_to_phys( + addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt); + } + if ( rc != X86EMUL_OKAY ) + return rc; + + /* Check for MMIO op */ + (void)get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt); + + switch ( p2mt ) + { + unsigned long bytes; + void *buf; + + default: + /* Allocate temporary buffer. */ + for ( ; ; ) + { + bytes = *reps * bytes_per_rep; + buf = xmalloc_bytes(bytes); + if ( buf || *reps <= 1 ) + break; + *reps >>= 1; + } + + if ( !buf ) + buf = p_data; + else + switch ( bytes_per_rep ) + { + unsigned long dummy; + +#define CASE(bits, suffix) \ + case (bits) / 8: \ + asm ( "rep stos" #suffix \ + : "=m" (*(char (*)[bytes])buf), \ + "=D" (dummy), "=c" (dummy) \ + : "a" (*(const uint##bits##_t *)p_data), \ + "1" (buf), "2" (*reps) \ + : "memory" ); \ + break + CASE(8, b); + CASE(16, w); + CASE(32, l); + CASE(64, q); +#undef CASE + + default: + xfree(buf); + ASSERT(!buf); + return X86EMUL_UNHANDLEABLE; + } + + /* Adjust address for reverse store. */ + if ( df ) + gpa -= bytes - bytes_per_rep; + + rc = hvm_copy_to_guest_phys(gpa, buf, bytes); + + if ( buf != p_data ) + xfree(buf); + + switch ( rc ) + { + case HVMCOPY_gfn_paged_out: + case HVMCOPY_gfn_shared: + return X86EMUL_RETRY; + case HVMCOPY_okay: + return X86EMUL_OKAY; + } + + gdprintk(XENLOG_WARNING, + "Failed REP STOS: gpa=%"PRIpaddr" reps=%lu bytes_per_rep=%u\n", + gpa, *reps, bytes_per_rep); + /* fall through */ + case p2m_mmio_direct: + return X86EMUL_UNHANDLEABLE; + + case p2m_mmio_dm: + return hvmemul_do_mmio(gpa, reps, bytes_per_rep, 0, IOREQ_WRITE, df, + p_data); + } +} + static int hvmemul_read_segment( enum x86_segment seg, struct segment_register *reg, @@ -1239,6 +1358,7 @@ static const struct x86_emulate_ops hvm_ .rep_ins = hvmemul_rep_ins, .rep_outs = hvmemul_rep_outs, .rep_movs = hvmemul_rep_movs, + .rep_stos = hvmemul_rep_stos, .read_segment = hvmemul_read_segment, .write_segment = hvmemul_write_segment, .read_io = hvmemul_read_io, @@ -1264,6 +1384,7 @@ static const struct x86_emulate_ops hvm_ .rep_ins = hvmemul_rep_ins_discard, .rep_outs = hvmemul_rep_outs_discard, .rep_movs = hvmemul_rep_movs_discard, + .rep_stos = hvmemul_rep_stos_discard, .read_segment = hvmemul_read_segment, .write_segment = hvmemul_write_segment, .read_io = hvmemul_read_io_discard, --- unstable.orig/xen/arch/x86/hvm/stdvga.c 2015-01-09 11:19:01.000000000 +0100 +++ unstable/xen/arch/x86/hvm/stdvga.c 2014-11-04 17:42:12.000000000 +0100 @@ -470,11 +470,11 @@ static int mmio_move(struct hvm_hw_stdvg uint64_t addr = p->addr; p2m_type_t p2mt; struct domain *d = current->domain; + int step = p->df ? -p->size : p->size; if ( p->data_is_ptr ) { uint64_t data = p->data, tmp; - int step = p->df ? -p->size : p->size; if ( p->dir == IOREQ_READ ) { @@ -529,13 +529,18 @@ static int mmio_move(struct hvm_hw_stdvg } } } + else if ( p->dir == IOREQ_WRITE ) + { + for ( i = 0; i < p->count; i++ ) + { + stdvga_mem_write(addr, p->data, p->size); + addr += step; + } + } else { ASSERT(p->count == 1); - if ( p->dir == IOREQ_READ ) - p->data = stdvga_mem_read(addr, p->size); - else - stdvga_mem_write(addr, p->data, p->size); + p->data = stdvga_mem_read(addr, p->size); } read_data = p->data; --- unstable.orig/xen/arch/x86/x86_emulate/x86_emulate.c 2015-01-09 11:19:01.000000000 +0100 +++ unstable/xen/arch/x86/x86_emulate/x86_emulate.c 2015-01-09 11:08:19.000000000 +0100 @@ -2568,15 +2568,25 @@ x86_emulate( } case 0xaa ... 0xab: /* stos */ { - /* unsigned long max_reps = */get_rep_prefix(); - dst.type = OP_MEM; + unsigned long nr_reps = get_rep_prefix(); dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.mem.seg = x86_seg_es; dst.mem.off = truncate_ea(_regs.edi); - dst.val = _regs.eax; + if ( (nr_reps == 1) || !ops->rep_stos || + ((rc = ops->rep_stos(&_regs.eax, + dst.mem.seg, dst.mem.off, dst.bytes, + &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) ) + { + dst.val = _regs.eax; + dst.type = OP_MEM; + nr_reps = 1; + } + else if ( rc != X86EMUL_OKAY ) + goto done; register_address_increment( - _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); - put_rep_prefix(1); + _regs.edi, + nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes)); + put_rep_prefix(nr_reps); break; } --- unstable.orig/xen/arch/x86/x86_emulate/x86_emulate.h 2015-01-09 11:19:01.000000000 +0100 +++ unstable/xen/arch/x86/x86_emulate/x86_emulate.h 2014-11-04 17:06:49.000000000 +0100 @@ -241,6 +241,20 @@ struct x86_emulate_ops struct x86_emulate_ctxt *ctxt); /* + * rep_stos: Emulate STOS: <*p_data> -> . + * @bytes_per_rep: [IN ] Bytes transferred per repetition. + * @reps: [IN ] Maximum repetitions to be emulated. + * [OUT] Number of repetitions actually emulated. + */ + int (*rep_stos)( + void *p_data, + enum x86_segment seg, + unsigned long offset, + unsigned int bytes_per_rep, + unsigned long *reps, + struct x86_emulate_ctxt *ctxt); + + /* * read_segment: Emulate a read of full context of a segment register. * @reg: [OUT] Contents of segment register (visible and hidden state). */