
Re: [Xen-devel] [RFC XTF PATCH] Pagetable Emulation testing



On 13/03/17 15:45, Jan Beulich wrote:
>>>> On 06.03.17 at 17:42, <andrew.cooper3@xxxxxxxxxx> wrote:
>> +struct mapping_info
>> +{
>> +    unsigned int level, order;
>> +    void *va;
>> +    intpte_t *pte, *fe_pte;
> Having got quite a bit into the patch, I think I've finally understood that
> "fe" here is meant to match up with "fep" - a comment might be helpful.

Ah yes. It is a pointer to the PTE which maps the force emulation prefix.

In particular, its _PAGE_USER setting needs to match the current CPL, so we
don't take unrelated instruction fetch faults while setting up an emulated
instruction fetch.
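
I'll add one; something along these lines (sketch only, field names as in
the patch):

    struct mapping_info
    {
        unsigned int level, order;
        void *va;
        intpte_t *pte;
        /*
         * PTE mapping the Force Emulation Prefix, which sits immediately
         * below va.  Its _PAGE_USER bit is kept in sync with the current
         * CPL, so the FEP fetch itself never takes an unexpected #PF.
         */
        intpte_t *fe_pte;
    };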

>
>> +bool ex_check_pf(struct cpu_regs *regs,
>> +                 const struct extable_entry *ex)
>> +{
>> +    if ( regs->entry_vector == X86_EXC_PF )
>> +    {
>> +        unsigned long cr2 = read_cr2();
>> +
>> +        if ( (cr2 != under_test.va) &&
>> +             (cr2 != under_test.va + (LDT_SEL & ~7)) )
>> +            xtf_failure("Bad %%cr2: expected %p, got %p\n",
>> +                        _p(under_test.va), _p(cr2));
>> +
>> +        regs->ax = EXINFO_SYM(PF, regs->error_code);
>> +
>> +        if ( ex->fixup )
>> +            regs->ip = ex->fixup;
>> +        else
>> +            regs->ip = *(unsigned long *)cpu_regs_sp(regs);
> What does this match up with? The single RET you place at the first
> byte of the page subject to testing? Surely this is rather fragile wrt
> #PF happening anywhere unexpected?

stub exec and user exec take #PF at the target of the call instruction,
rather than on the call instruction itself.

The extable entry is anchored at the target of the call (set up by the
asm() statements in prepare_mappings()), but the fixup needs to follow
the return address on the stack to recover.
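
In other words, the recovery for those two stubs is roughly (sketch, not
the exact stub asm):

    /*
     * The exec stubs do, in effect:
     *
     *     call *%reg        ; push a return address, jump to m->va
     *
     * A faulting fetch at m->va reports %eip == m->va, which is where the
     * extable entry is anchored.  With no fixup address registered, the
     * handler unwinds by loading the return address which the CALL pushed:
     */
    regs->ip = *(unsigned long *)cpu_regs_sp(regs);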

>
>> +static void prepare_mappings(struct mapping_info *m, unsigned int level, 
>> bool super, paddr_t paddr)
>> +{
>> +    bool pse36 = CONFIG_PAGING_LEVELS == 2 && paddr != (uint32_t)paddr;
>> +
>> +    memset(m, 0, sizeof(*m));
>> +
>> +#define PAGE_COMMON PF_SYM(AD, U, RW, P)
>> +    /*
>> +     * For 4-level paging, we use l4[1/2].
>> +     */
>> +    if ( CONFIG_PAGING_LEVELS == 4 )
>> +    {
>> +
>> +        pae_l4_identmap[1] = (unsigned long)l3t | PAGE_COMMON;
>> +        pae_l4_identmap[2] = (unsigned long)l3t | PAGE_COMMON;
>> +
>> +        l3t[0]   = (unsigned long)l2t | PAGE_COMMON;
>> +        l3t[511] = (unsigned long)l2t | PAGE_COMMON;
>> +
>> +        l2t[0]   = (unsigned long)l1t | PAGE_COMMON;
>> +        l2t[511] = (unsigned long)l1t | PAGE_COMMON;
>> +
>> +        l1t[0]   = paddr | PAGE_COMMON;
>> +        l1t[511] = ((paddr - 1) & ~0xfff) | PAGE_COMMON;
>> +
>> +        m->va     = _p(2ULL << PAE_L4_PT_SHIFT);
>> +        m->l1e    = &l1t[0];
>> +        m->l2e    = &l2t[0];
>> +        m->l3e    = &l3t[0];
>> +        m->l4e    = _p(&pae_l4_identmap[2]);
>> +        m->fe_pte = &l1t[511];
>> +
>> +        asm(_ASM_EXTABLE_HANDLER(2 << PAE_L4_PT_SHIFT, 0, ex_check_pf));
>> +        under_test.va = (unsigned long)m->va;
>> +    }
>> +    else if ( CONFIG_PAGING_LEVELS == 3 )
>> +    {
>> +        pae32_l3_identmap[1] = (unsigned long)l2t | _PAGE_PRESENT;
>> +        pae32_l3_identmap[2] = (unsigned long)l2t | _PAGE_PRESENT;
>> +
>> +        l2t[0]   = (unsigned long)l1t | PAGE_COMMON;
>> +        l2t[511] = (unsigned long)l1t | PAGE_COMMON;
>> +
>> +        l1t[0]   = paddr | PAGE_COMMON;
>> +        l1t[511] = ((paddr - 1) & ~0xfff) | PAGE_COMMON;
>> +
>> +        m->va     = _p(2ULL << PAE_L3_PT_SHIFT);
>> +        m->l1e    = &l1t[0];
>> +        m->l2e    = &l2t[0];
>> +        m->l3e    = _p(&pae32_l3_identmap[2]);
>> +        m->l4e    = NULL;
>> +        m->fe_pte = &l1t[511];
>> +
>> +        asm(_ASM_EXTABLE_HANDLER(2 << PAE_L3_PT_SHIFT, 0, ex_check_pf));
>> +        under_test.va = (unsigned long)m->va;
>> +    }
>> +    else if ( CONFIG_PAGING_LEVELS == 2 )
>> +    {
>> +        if ( pse36 )
>> +        {
>> +            ASSERT(super);
>> +            ASSERT(IS_ALIGNED(paddr, MB(4)));
>> +
>> +            pse_l2_identmap[511] = fold_pse36((paddr - MB(4)) | PAGE_COMMON 
>> | _PAGE_PSE);
>> +            pse_l2_identmap[512] = fold_pse36(paddr | PAGE_COMMON | 
>> _PAGE_PSE);
>> +        }
>> +        else
>> +        {
>> +            pse_l2_identmap[511] = (unsigned long)l1t | PAGE_COMMON;
>> +            pse_l2_identmap[512] = (unsigned long)l1t | PAGE_COMMON;
>> +
>> +            l1t[0]    = paddr | PAGE_COMMON;
>> +            l1t[1023] = ((paddr - 1) & ~0xfff) | PAGE_COMMON;
>> +        }
>> +
>> +        m->va     = _p(2ULL << PAE_L3_PT_SHIFT);
>> +        m->l1e    = pse36 ? NULL : &l1t[0];
>> +        m->l2e    = _p(&pse_l2_identmap[512]);
>> +        m->l3e    = NULL;
>> +        m->l4e    = NULL;
>> +        m->fe_pte = pse36 ? _p(&pse_l2_identmap[511]) : &l1t[1023];
> The use of PAE constants here (including the literal 511 / 512) is
> mildly confusing here without any comments.

That looks like a copy/paste mistake.  It should be 512 <<
PSE_L2_PT_SHIFT, which should be the same value.

I will extend the l4 comments to the l3 and l2 cases.
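
For reference (assuming the usual values PSE_L2_PT_SHIFT == 22 and
PAE_L3_PT_SHIFT == 30), both spellings name the same 2GB boundary:

    512 << PSE_L2_PT_SHIFT  ==  512 * 4MB  ==  0x80000000
      2 << PAE_L3_PT_SHIFT  ==    2 * 1GB  ==  0x80000000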

>
>> +        asm(_ASM_EXTABLE_HANDLER(2 << PAE_L3_PT_SHIFT, 0, ex_check_pf));
>> +        under_test.va = (unsigned long)m->va;
>> +    }
>> +    else
>> +        panic("%s() PAGING_LEVELS %u not implemented yet\n",
>> +              __func__, CONFIG_PAGING_LEVELS);
>> +
>> +#undef PAGE_COMMON
>> +
>> +    /* Flush the TLB before trying to use the new mappings. */
>> +    flush_tlb(false);
>> +
>> +    /* Put FEP immediately before va, and a ret instruction at va. */
>> +    memcpy(m->va - 5, "\x0f\x0bxen\xc3", 6);
>> +    barrier();
>> +
>> +    /* Read them back, to confirm that RAM is properly in place. */
>> +    if ( memcmp(m->va - 5, "\x0f\x0bxen\xc3", 6) )
>> +        panic("Bad phys or virtual setup\n");
>> +
>> +    /* Construct the LDT at va. */
>> +    user_desc *ldt = m->va;
>> +
>> +    ldt[LDT_SEL >> 3] = (typeof(*ldt))INIT_GDTE_SYM(0, 0xfffff, COMMON, 
>> DATA, DPL3, B, W);
> This dual use of m->va clearly needs a comment, perhaps next to
> the definition of LDT_SEL (which can't be freely set to whatever one
> may like - it namely can't be using LDT slot 0).

va is just the virtual address under test.

It gets read/write testing normally, exec testing by calling at it, and
implicit access testing by layering an LDT over the top and loading a
selector.

The choice of which LDT selector to use doesn't matter, so long as it fits
within the mapping, is otherwise valid, and already has its accessed bit
set.  In particular, the test logic doesn't cope with hardware setting the
accessed bit over a read-only mapping, because it can't distinguish the two
different memory accesses.
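
Concretely, the implicit-access path boils down to something like this
(sketch; lldt()/write_fs() stand in for whichever selector-loading helpers
are available):

    /* System descriptor (type 0x2, LDT) in GDTE_AVAIL0, covering the page at m->va. */
    gdt[GDTE_AVAIL0] = (typeof(*gdt))INIT_GDTE((unsigned long)m->va, PAGE_SIZE, 0x82);

    lldt(GDTE_AVAIL0 << 3);   /* Activate the LDT overlaying m->va. */
    write_fs(LDT_SEL);        /* Implicit read of the descriptor at
                               * m->va + (LDT_SEL & ~7) - the access under test. */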

>
>> +    gdt[GDTE_AVAIL0]  = (typeof(*gdt))INIT_GDTE((unsigned long)m->va, 
>> PAGE_SIZE, 0x82);
>> +#if __x86_64__
> #ifdef ?

Yes, although this variant definitely works.

>
>> +void check(struct mapping_info *m, exinfo_t actual, exinfo_t expected, enum 
>> modifier mod)
>> +{
>> +    /* Check that the actual pagefault matched our expectation. */
>> +    if ( actual != expected )
>> +    {
>> +        const char *user_sup = under_test.user ? "User" : "Supervisor";
>> +        bool ac_fault = !!actual, ex_fault = !!expected;
> Stray !! ?

Yes.  This code predates me sorting out the bool_t/bool confusion.

>
>> +        char ac_ec[16], ex_ec[16];
>> +
>> +        if ( ac_fault )
>> +            x86_exc_decode_ec(ac_ec, ARRAY_SIZE(ac_ec),
>> +                              X86_EXC_PF, (uint16_t)actual);
>> +        if ( ex_fault )
>> +            x86_exc_decode_ec(ex_ec, ARRAY_SIZE(ex_ec),
>> +                              X86_EXC_PF, (uint16_t)expected);
> Why the casts on the last function arguments?

Rebasing mistake.  Up until I re-factored the exec_user() infrastructure
(very recently) to be SMEP/SMAP safe, I had a local copy here which took
void * rather than unsigned long.

>
>> +        if ( ac_fault && !ex_fault )
>> +            fail("    Fail: expected no fault, got #PF[%s] for %s %s\n",
>> +                 ac_ec, user_sup, under_test.desc);
>> +        else if ( !ac_fault && ex_fault )
>> +            fail("    Fail: expected #PF[%s], got no fault for %s %s\n",
>> +                 ex_ec, user_sup, under_test.desc);
>> +        else
>> +            fail("    Fail: expected #PF[%s], got #PF[%s] for %s %s\n",
>> +                 ex_ec, ac_ec,  user_sup, under_test.desc);
> From looking just at this function ex_ec[] and ac_ec[] may be
> used uninitialized here (when !ac_fault && !ex_fault). With the
> function not being static, I'd actually expect compilers to be
> able to spot and warn about this.

In that case, we fail the "if ( actual != expected )" precondition, and
don't end up here.

I think it would probably be better to add %p printk() support and have a
formatter for exinfo_t, which would simplify a whole lot of code looking
like this.
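
i.e. have the three cases above collapse into something like (sketch of the
idea only; the %pe formatter doesn't exist yet):

    fail("    Fail: expected %pe, got %pe for %s %s\n",
         _p(expected), _p(actual), user_sup, under_test.desc);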

>
>> +    }
>> +
>> +    /* Check that A/D bits got updated as expected. */
>> +    unsigned int leaf_level =
>> +        m->order ? ((m->order - PAGE_SHIFT) / PT_ORDER) : 0;
>> +    unsigned int i; /* NB - Levels are 0-indexed. */
> Do you enable C99 mode somewhere? Otherwise I'd expect the
> compiler to warn about mixing declarations and code.

The std in use is gnu99.

>
>> +    if ( amd_fam10_erratum )
>> +    {
>> +        /*
>> +         * AMD Fam10 appears to defer the setting of access bits for 
>> implicit
>> +         * loads.  As a result, the implicit tests (which load %fs) don't
>> +         * necessarily observe the access bits being set on the pagewalk to
>> +         * the LDT.
>> +         *
>> +         * Experimentally, a serialising instruction fixes things, or a loop
>> +         * of 1000 nops, but so does forcing the use of the loaded segment.
>> +         *
>> +         * If this is an implicit load which didn't fault, read through %fs 
>> to
>> +         * force it to be loaded into the segment cache.
> The load into the segment cache certainly has at least started, so
> maybe the better (more precise) wording would be something like
> "... force the load into the segment cache to complete"?

The fact that #GP faults for bad entries appear to work properly would
suggest that the access has happened.

I can't fathom what architectural quirks would lead to observable
behaviour like this.
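
The workaround itself is just a forced read through %fs, something like
(sketch; the guarding condition for "implicit load which didn't fault" is
elided):

    unsigned long tmp;

    /*
     * Read through %fs so that any deferred accessed-bit update on the LDT
     * walk becomes architecturally visible before the A/D checks below.
     */
    asm volatile ("mov %%fs:0, %0" : "=r" (tmp) :: "memory");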

>
>> +exinfo_t calc(struct mapping_info *m, uint64_t new, unsigned int walk, enum 
>> modifier mod)
> To other than you as the author of the code, it would help if the
> name of the function indicated what it wants to calculate.

Fair enough.  I need to do a general documentation sweep across this
patch anyway.

>
>> +{
>> +    bool nx_valid   = CONFIG_PAGING_LEVELS >= 3 && (host_nx_leaked || (mod 
>> & NX));
>> +    bool insn_valid = nx_valid || (mod & SMEP);
>> +    uint64_t rsvd = ((1ULL << 52) - 1) & ~((1ULL << maxphysaddr) - 1);
>> +
>> +    /* Accumulate additional bits which are reserved. */
>> +    if ( !nx_valid )
>> +        rsvd |= _PAGE_NX;
>> +
>> +    if ( m->level == 4 )
>> +        rsvd |= _PAGE_PSE | (vendor_is_amd ? _PAGE_GLOBAL : 0);
>> +    else if ( m->level == 3 && !cpu_has_page1gb )
>> +        rsvd |= _PAGE_PSE;
> Shouldn't this second conditional include CONFIG_PAGING_LEVELS > 3?
> Certainly that bit is reserved there too, but there are quite a few
> more.

I don't test 3-level L3 entries.  They don't behave like other bits of
pagetable infrastructure, as they take effect at the point of mov to
%cr3, and get re-read under a number of TLB flush conditions.

They are also broken and cause vmentry failures, as Xen is left to
manage the PDPTRs itself, and does a rather poor job.
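
For comparison, a 32bit PAE L3 change only takes effect via something like
(sketch; read_cr3()/write_cr3() as in the framework, new_l3e being a
placeholder):

    /* The four PDPTRs are internal registers, loaded at mov to %cr3 time. */
    pae32_l3_identmap[2] = new_l3e;
    write_cr3(read_cr3());    /* invlpg alone won't re-read the PDPTEs. */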

>
>> +void test_pte(const struct stubs *stubs, struct mapping_info *m, uint64_t  
>> overlay)
>> +{
>> +    uint64_t new = m->paddr | overlay;
>> +    bool user = false;
>> +
>> +    under_test.pteval = *m->pte = new;
>> +    under_test.pte_printed = false;
>> +    clear_ad(m);
>> +
>> +    under_test.active = true;
>> +
>> +    for ( ; ; user = true )
>> +    {
>> +        unsigned int base = user ? PFEC(U) : 0;
>> +
>> +#define CALL(fn, va)                                                    \
>> +        (user ? exec_user_param(fn ## _user, (unsigned long)(va))       \
>> +              : fn((unsigned long)(va)))
>> +
>> +        under_test.user = user;
>> +
>> +        /* Map the exec FEP stub with suitable permissions. */
>> +        if ( stubs == &force_stubs )
>> +        {
>> +            *m->fe_pte &= ~_PAGE_USER;
>> +            if ( user )
>> +                *m->fe_pte |= _PAGE_USER;
>> +            invlpg(m->va - 5);
> Why do you need to do the for the FEP prefix, but not for the
> rest of the stub?

The FEP is always mapped by a different PTE from the one mapping m->va.
This was quite awkward to set up properly for testing instruction fetches.

The FEP needs to start 5 bytes before the linear address under test, as it
doesn't redirect control flow.  The linear address under test needs to be
able to have _PAGE_USER mismatched against the current CPL, whereas the
instruction fetch for the FEP itself must never be mismatched against
_PAGE_USER, so the two need to be controllable separately.  Therefore,
they need to be mapped by different PTEs.

A side effect of needing different PTEs but adjacent linear addresses is
that the address under test must be aligned to the largest superpage
expressible in the current paging mode.
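
For the 4-level case, for example, the split ends up roughly as (sketch,
matching prepare_mappings() above):

    /*
     *  va - 5 .. va - 1 : 0f 0b 'x' 'e' 'n'   FEP, via l4[1]->l3t[511]->
     *                                         l2t[511]->l1t[511] (m->fe_pte)
     *  --------------- va, aligned to the largest superpage ---------------
     *  va               : c3 (ret) / LDT      via l4[2]->l3t[0]->l2t[0]->
     *                                         l1t[0] (the walk under test)
     */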

>
>> --- /dev/null
>> +++ b/tests/pagetable-emulation/stubs.h
>> @@ -0,0 +1,25 @@
>> +#ifndef _LOWLEVEL_H_
>> +#define _LOWLEVEL_H_
>> +
>> +unsigned long stub_read(unsigned long va);
>> +unsigned long stub_implicit(unsigned long sel); /* unsigned long sel */
> What are these comments intended to tell the reader?

Stale.  These prototypes used to take a void *va parameter.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 

