
Re: [Xen-devel] [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling



On 06/12/16 11:16, Jan Beulich wrote:
> There's a new emulator return code being added to allow bypassing
> certain operations (see the code comment).
>
> The other small tweak to the emulator is to single iteration handling
> of INS and OUTS: Since we don't want to handle any other memory access
> instructions, we want these to be handled by the rep_ins() / rep_outs()
> hooks here too.
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> ---
> v3: Re-base. Do away with the special case pointer checks on the ->read
>     and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
>     for 32-bit guests (to avoid the emulator's in_longmode() returning
>     a wrong result). Introduce and use ->validate() hook. Make
>     formatting more consistent.

Clearing EFER.LM* is a behavioural change for the guest.  Given that we
hide all other trace of 64bit-ness from 32bit PV guests, it's probably a
fine change to make, but it should be called out as such.
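
For reference, I'd expect the MSR read side to end up something like
this (un-tested sketch, assuming currd is in scope as in your other
hooks):

case MSR_EFER:
    *val = read_efer();
    /* Hide the long-mode bits from 32bit PV guests. */
    if ( is_pv_32bit_domain(currd) )
        *val &= ~(EFER_LME | EFER_LMA);
    break;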

> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
>      pv_inject_event(&event);
>  }
>  
> -static void instruction_done(
> -    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
> +static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
>  {
>      regs->eip = eip;
>      regs->eflags &= ~X86_EFLAGS_RF;
> -    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
> +    if ( regs->eflags & X86_EFLAGS_TF )
>      {
> -        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
> -        if ( regs->eflags & X86_EFLAGS_TF )
> -            current->arch.debugreg[6] |= DR_STEP;
> +        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;

Looking at this, I wonder whether I should have had an explicit
DR_STEP adjustment for the singlestep work I did.

Thinking about it, I checked that the trap was raised properly, but not
that %dr6 was correct.
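
Something along these lines in an XTF #DB handler would close that gap
(rough sketch only; read_dr6() stands in for whatever accessor the
framework has or grows, and DR_STEP is as per Xen's debugreg.h):

/* After the single-stepped instruction, in the #DB handler: */
unsigned long dr6 = read_dr6();

if ( !(dr6 & DR_STEP) )
    xtf_failure("Fail: DR_STEP not set in %%dr6 (got %#lx)\n", dr6);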

>          do_guest_trap(TRAP_debug, regs);
>      }
>  }
> @@ -2023,6 +2020,148 @@ static int read_gate_descriptor(unsigned
>      return 1;
>  }
>  
> +struct priv_op_ctxt {
> +    struct x86_emulate_ctxt ctxt;
> +    struct {
> +        unsigned long base, limit;
> +    } cs;
> +    char *io_emul_stub;
> +    unsigned int bpmatch;
> +    unsigned int tsc;
> +#define TSC_BASE 1
> +#define TSC_AUX 2
> +};
> +
> +static bool priv_op_to_linear(unsigned long base, unsigned long offset,
> +                              unsigned int bytes, unsigned long limit,
> +                              enum x86_segment seg,
> +                              struct x86_emulate_ctxt *ctxt,
> +                              unsigned long *addr)

This is entirely generic.  Could it be called pv_virtual_to_linear() in
case it can be reused elsewhere?

However, I'd recommend returning X86EMUL_* codes, to avoid having the
"return X86EMUL_EXCEPTION;" separate from the x86_emul_hw_exception() call.

> +{
> +    bool okay;
> +
> +    *addr = base + offset;
> +
> +    if ( ctxt->addr_size < 8 )
> +    {
> +        okay = limit >= bytes - 1 && offset <= limit - bytes + 1;
> +        *addr = (uint32_t)*addr;
> +    }
> +    else
> +        okay = __addr_ok(*addr);
> +
> +    if ( unlikely(!okay) )
> +        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
> +                                                : TRAP_stack_error,
> +                              0, ctxt);
> +
> +    return okay;
> +}
> +
> <snip>
> +static int priv_op_read_segment(enum x86_segment seg,
> +                                struct segment_register *reg,
> +                                struct x86_emulate_ctxt *ctxt)
> +{
> +    if ( ctxt->addr_size < 8 )
> +    {
> +        unsigned long limit;
> +        unsigned int sel, ar;
> +
> +        switch ( seg )
> +        {
> +        case x86_seg_cs: sel = ctxt->regs->cs; break;
> +        case x86_seg_ds: sel = read_sreg(ds);  break;
> +        case x86_seg_es: sel = read_sreg(es);  break;
> +        case x86_seg_fs: sel = read_sreg(fs);  break;
> +        case x86_seg_gs: sel = read_sreg(gs);  break;
> +        case x86_seg_ss: sel = ctxt->regs->ss; break;
> +        case x86_seg_tr:
> +            /* Check if this is an attempt to access the I/O bitmap. */
> +            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
> +                return X86EMUL_DONE;

Is this a leftover from before my system-segment work?

I/O bitmap accesses are now through the ->read() hook with x86_seg_tr.
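
For reference, the emulator's port check now looks roughly like this
(paraphrased from ioport_access_check() in x86_emulate.c, from memory,
so don't trust the details):

/* Fetch the I/O bitmap offset from the TSS (offset 0x66). */
if ( (rc = read_ulong(x86_seg_tr, 0x66, &iobmp, 2, ctxt, ops)) != X86EMUL_OKAY )
    return rc;

/* Read the two bitmap bytes covering first_port. */
if ( (rc = read_ulong(x86_seg_tr, iobmp + first_port / 8,
                      &iobmp, 2, ctxt, ops)) != X86EMUL_OKAY )
    return rc;

/* Any set bit in the covered range denies the access. */
generate_exception_if((iobmp >> (first_port & 7)) & ((1 << bytes) - 1),
                      EXC_GP, 0);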

> +            /* fall through */
> +        default: return X86EMUL_UNHANDLEABLE;
> +        }
> +
> +        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
> +            return X86EMUL_UNHANDLEABLE;
> +
> +        reg->limit = limit;
> +        reg->attr.bytes = ar >> 8;
> +    }
> +    else
> +    {
> +        switch ( seg )
> +        {
> +        default:
> +            reg->base = 0;
> +            break;

This is incorrect for system segments.  If we don't intend to service
them, we should ASSERT() their absence.
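
i.e. (sketch, using is_x86_system_segment() or an open-coded
equivalent if that helper isn't available here):

default:
    /* 64bit PV guests have flat user segments; nothing else expected. */
    ASSERT(!is_x86_system_segment(seg));
    reg->base = 0;
    break;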

> +        case x86_seg_fs:
> +            reg->base = rdfsbase();
> +            break;
> +        case x86_seg_gs:
> +            reg->base = rdgsbase();
> +            break;
> +        }
> +
> +        reg->limit = ~0U;
> +
> +        reg->attr.bytes = 0;
> +        reg->attr.fields.type = _SEGMENT_WR >> 8;
> +        if ( seg == x86_seg_cs )
> +            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
> +        reg->attr.fields.s   = 1;
> +        reg->attr.fields.dpl = 3;
> +        reg->attr.fields.p   = 1;
> +        reg->attr.fields.l   = 1;
> +        reg->attr.fields.db  = 1;
> +        reg->attr.fields.g   = 1;

This can't all be correct.  Having %cs with both D and L set is
definitely wrong in 64bit mode.
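
Something along these lines would match reality (sketch):

reg->attr.fields.l  = (seg == x86_seg_cs);
reg->attr.fields.db = (seg != x86_seg_cs); /* D/B must be 0 when L is 1. */
reg->attr.fields.g  = 1;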

> +    }
> +
> +    /*
> +     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
> +     * Also do this for consistency for non-conforming code segments.
> +     */
> +    if ( (seg == x86_seg_ss ||
> +          (seg == x86_seg_cs &&
> +           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
> +         guest_kernel_mode(current, ctxt->regs) )
> +        reg->attr.fields.dpl = 0;

Have you run the XTF pv-iopl tests?  I'm fairly sure this simplification
will regress the behaviour of guests not using my recent
VMASST_TYPE_architectural_iopl extension.

Having said that, the old behaviour was supremely unhelpful for all
parties involved, so there is an argument to be made for altering its
behaviour.

> <snip>
> +
> +static int priv_op_read_io(unsigned int port, unsigned int bytes,
> +                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
> +{
> +    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
> +    struct vcpu *curr = current;
> +    struct domain *currd = current->domain;
> +
> +    /* INS must not come here. */
> +    ASSERT((ctxt->opcode & ~9) == 0xe4);

In this case, I think it would be better to have:

if ( unlikely((ctxt->opcode & ~9) != 0xe4) )
{
    /* INS must not come here.  Only tolerate IN. */
    ASSERT_UNREACHABLE();
    return X86EMUL_UNHANDLEABLE;
}

So we still bail in release builds.

> +
> +    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
> +        return X86EMUL_UNHANDLEABLE;
> +
> +    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
> +
> +    if ( admin_io_okay(port, bytes, currd) )
> +    {
> +        io_emul_stub_t *io_emul =
> +            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
> +
> +        mark_regs_dirty(ctxt->regs);
> +        io_emul(ctxt->regs);
> +        return X86EMUL_DONE;
> +    }
> +
> +    *val = guest_io_read(port, bytes, currd);
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +static int priv_op_write_io(unsigned int port, unsigned int bytes,
> +                            unsigned long val, struct x86_emulate_ctxt *ctxt)
> +{
> +    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
> +    struct vcpu *curr = current;
> +    struct domain *currd = current->domain;
> +
> +    /* OUTS must not come here. */
> +    ASSERT((ctxt->opcode & ~9) == 0xe6);

Similarly here.

>  int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
>                    unsigned int *edx, struct x86_emulate_ctxt *ctxt)
>  {
>      struct cpu_user_regs regs = *ctxt->regs;
>  
> +    /*
> +     * x86_emulate uses this function to query CPU features for its own
> +     * internal use. Make sure we're actually emulating CPUID before checking
> +     * for emulated CPUID faulting.
> +     */
> +    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
> +    {
> +        const struct vcpu *curr = current;
> +
> +        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
> +        if ( curr->arch.cpuid_faulting &&
> +             !guest_kernel_mode(curr, ctxt->regs) )
> +            return X86EMUL_UNHANDLEABLE;

I don't think this is conceptually correct.  The emulator should
explicitly raise #GP[0] here, rather than relying on the calling
behaviour of bouncing the pending exception back to the caller.
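
i.e. (sketch):

/* CPUID faulting active and CPL > 0: raise #GP[0] ourselves. */
if ( curr->arch.cpuid_faulting &&
     !guest_kernel_mode(curr, ctxt->regs) )
{
    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
    return X86EMUL_EXCEPTION;
}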

Also, please double check with the XTF CPUID faulting tests if you
haven't done so already, but it does look like this maintains the
correct guest-observable behaviour.

> <snip>
> +    case 0x6c ... 0x6f: /* ins / outs */
> +    case 0xe4 ... 0xe7: /* in / out (immediate port) */
> +    case 0xec ... 0xef: /* in / out (port in %dx) */
> +    case X86EMUL_OPC(0x0f, 0x06): /* clts */
> +    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
> +    case X86EMUL_OPC(0x0f, 0x20) ...
> +         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
> +    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
> +    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
> +    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
> +    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
> +        return X86EMUL_OKAY;
>  
> -        case 0x6e: /* OUTSB */
> -            op_bytes = 1;
> -        case 0x6f: /* OUTSW/OUTSL */
> -            if ( (data_limit < (op_bytes - 1)) ||
> -                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
> -                  !guest_io_okay(port, op_bytes, v, regs) )
> -                goto fail;
> -            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
> -                                      op_bytes)) != 0 )
> -            {
> -                pv_inject_page_fault(0, data_base + rd_ad(esi)
> -                                     + op_bytes - rc);
> -                return EXCRET_fault_fixed;
> -            }
> -            guest_io_write(port, op_bytes, data, currd);
> -            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
> -                                         ? -op_bytes : op_bytes));
> +    case 0xfa: case 0xfb: /* cli / sti */
> +        if ( !iopl_ok(current, ctxt->regs) )

Given the use of DONE below, this part should also explicitly raise
#GP[0], rather than falling through to UNHANDLEABLE, as we are providing
an emulation of the instruction.
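
i.e. (sketch; the rest of the case elided):

case 0xfa: case 0xfb: /* cli / sti */
    if ( !iopl_ok(current, ctxt->regs) )
    {
        /* We are emulating the instruction, so raise its fault. */
        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
        return X86EMUL_EXCEPTION;
    }
    /* ... existing eflags handling ... */
    return X86EMUL_DONE;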

> <snip>
> +        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
> +             (modrm_rm & 7) != 1 )
>              break;
> -        case 0xd1: /* XSETBV */
> +        switch ( modrm_reg & 7 )
>          {
> -            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
> +        case 2: /* xgetbv */

You appear to have swapped XSETBV for XGETBV here.  XGETBV wasn't
previously emulated, and doesn't have any condition (that we care about
emulating) which suffers #GP.

> <snip>
> +    case X86EMUL_EXCEPTION:
> +        pv_inject_event(&ctxt.ctxt.event);
> +        return EXCRET_fault_fixed;
>      }
>  
> -#undef wr_ad
> -#undef rd_ad
> -
> - done:
> -    instruction_done(regs, eip, bpmatch);
> - skip:
> -    if ( io_emul_stub )
> -        unmap_domain_page(io_emul_stub);
> -    return EXCRET_fault_fixed;
> -
> - fail:
> -    if ( io_emul_stub )
> -        unmap_domain_page(io_emul_stub);
>      return 0;

This return can be dropped.  There are no paths out of the switch
statement, and dropping this return will cause the compiler to notice if
one gets introduced in the future.

>  }
>  
> @@ -3599,7 +3677,7 @@ static void emulate_gate_op(struct cpu_u
>          sel |= (regs->cs & 3);
>  
>      regs->cs = sel;
> -    instruction_done(regs, off, 0);
> +    instruction_done(regs, off);
>  }
>  
>  void do_general_protection(struct cpu_user_regs *regs)
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1182,7 +1182,7 @@ static int ioport_access_check(
>  
>      fail_if(ops->read_segment == NULL);
>      if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
> -        return rc;
> +        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;

Ah, so the real purpose of having read_segment(x86_seg_tr, ...) return
X86EMUL_DONE is to defer the port check into the hooks themselves.

This doesn't actually match your intended meaning of X86EMUL_DONE, or
the documentation you provide below.  I think it would be cleaner in
this case to have an input x86_emulate_ctxt boolean requesting to skip
%tr ioport checks, which causes ioport_access_check() to exit early with
X86EMUL_OKAY.
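
i.e. (sketch; field name invented):

/* New input field in struct x86_emulate_ctxt: */
bool skip_ioport_check; /* Caller vets port accesses in its own hooks. */

/* And at the top of ioport_access_check(): */
if ( ctxt->skip_ioport_check )
    return X86EMUL_OKAY;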

> <snip>
> +struct x86_emulate_state;
> +
>  /*
>   * These operations represent the instruction emulator's interface to memory,
>   * I/O ports, privileged state... pretty much everything other than GPRs.
> @@ -239,6 +249,13 @@ struct x86_emulate_ops
>          struct x86_emulate_ctxt *ctxt);
>  
>      /*
> +     * validate: Post-decode hook to allow caller controlled filtering.

"Post-decode, pre-emulate hook...", to specify exactly where it lives.

~Andrew

> +     */
> +    int (*validate)(
> +        const struct x86_emulate_state *state,
> +        struct x86_emulate_ctxt *ctxt);
> +
> +    /*
>       * rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
>       *  @bytes_per_rep: [IN ] Bytes transferred per repetition.
>       *  @reps:  [IN ] Maximum repetitions to be emulated.