Hi,
This is looking better - I think the TLB flushing is almost right.
A few more detailed comments below.
Content-Description: xen_nh13_haphap.diff
> # HG changeset patch
> # User cegger
> # Date 1283345895 -7200
> Implement Nested-on-Nested.
> This allows the guest to run nested guest with hap enabled.
>
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/hvm/hvm.c
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1042,12 +1042,56 @@ void hvm_inject_exception(unsigned int t
> hvm_funcs.inject_exception(trapnr, errcode, cr2);
> }
>
> -bool_t hvm_hap_nested_page_fault(unsigned long gfn)
> +bool_t hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs)
> {
> p2m_type_t p2mt;
> mfn_t mfn;
> struct vcpu *v = current;
> struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
> + unsigned long gfn = gpa >> PAGE_SHIFT;
> + int rv;
> +
> + /* On Nested Virtualization, walk the guest page table.
> + * If this succeeds, all is fine.
> + * If this fails, inject a nested page fault into the guest.
> + */
> + if ( nestedhvm_enabled(v->domain)
> + && nestedhvm_vcpu_in_guestmode(v)
> + && nestedhvm_paging_mode_hap(v) )
> + {
> + enum nestedhvm_vmexits nsret;
> + struct nestedhvm *hvm = &vcpu_nestedhvm(v);
> +
> + /* nested guest gpa == guest gva */
This comment is confusing to me. The nested guest gpa isn't a virtual
address, and certainly not the same as an L1-guest VA. Can you reword
it, maybe just to say what the call to nestedhvm_hap_nested_page_fault()
is going to do?
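Something along these lines, maybe (wording is only a suggestion):

    /* The vcpu is in L2 guest mode with nested paging enabled, so this
     * is an L2 gpa.  Try to handle the fault by translating it through
     * the L1 guest's nested page tables and fixing up the nested p2m;
     * if the L1's tables don't map it, fall through and inject a
     * nested page fault (#VMEXIT(NPF)) into the L1 guest. */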
> + rv = nestedhvm_hap_nested_page_fault(v, gpa);
> + switch (rv) {
> + case NESTEDHVM_PAGEFAULT_DONE:
> + return 1;
> + case NESTEDHVM_PAGEFAULT_ERROR:
> + return 0;
> + case NESTEDHVM_PAGEFAULT_INJECT:
> + break;
> + }
> +
> + /* inject #VMEXIT(NPF) into guest. */
> + hvm->nh_forcevmexit.exitcode = NESTEDHVM_INTERCEPT_NPF;
> + hvm->nh_forcevmexit.exitinfo1 = regs->error_code;
> + hvm->nh_forcevmexit.exitinfo2 = gpa;
> + hvm->nh_hostflags.fields.forcevmexit = 1;
> + nsret = nestedhvm_vcpu_vmexit(v, regs, NESTEDHVM_INTERCEPT_NPF);
> + hvm->nh_hostflags.fields.forcevmexit = 0;
> + switch (nsret) {
> + case NESTEDHVM_VMEXIT_DONE:
> + case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */
> + return 1;
> + case NESTEDHVM_VMEXIT_HOST:
> + case NESTEDHVM_VMEXIT_CONTINUE:
> + case NESTEDHVM_VMEXIT_FATALERROR:
> + default:
> + gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret);
> + return 0;
> + }
> + }
>
> mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
>
> @@ -1843,7 +1908,7 @@ static enum hvm_copy_result __hvm_copy(
>
> if ( flags & HVMCOPY_virt )
> {
> - gfn = paging_gva_to_gfn(curr, addr, &pfec);
> + gfn = paging_p2m_ga_to_gfn(curr, p2m, mode, cr3, addr, &pfec);
Do other callers of paging_gva_to_gfn need to handle nested-npt mode as
well? If so, would it be better to update paging_gva_to_gfn() itself?
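I'm thinking of something like the sketch below.  paging_get_nestedmode()
and paging_get_hostmode() are my guesses at the accessor names, and the
p2m/mode/cr3 selection is a guess at what __hvm_copy() now does before
the quoted hunk, so treat this as shape rather than substance:

    unsigned long paging_gva_to_gfn(struct vcpu *v, unsigned long va,
                                    uint32_t *pfec)
    {
        if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v)
             && nestedhvm_paging_mode_hap(v) )
        {
            /* Running an L2 guest: translate through the L1's
             * paging structures instead. */
            struct p2m_domain *p2m =
                p2m_get_nestedp2m(v, vcpu_nestedhvm(v).nh_vm_hostcr3);
            const struct paging_mode *mode = paging_get_nestedmode(v);

            return paging_p2m_ga_to_gfn(v, p2m, mode,
                                        v->arch.hvm_vcpu.guest_cr[3],
                                        va, pfec);
        }

        /* Existing behaviour for the non-nested case. */
        return paging_get_hostmode(v)->gva_to_gfn(v, va, pfec);
    }

That way __hvm_copy() and the other callers wouldn't need to know about
nested mode at all.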
> if ( gfn == INVALID_GFN )
> {
> if ( pfec == PFEC_page_paged )
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/hvm/nestedhvm.c
> --- a/xen/arch/x86/hvm/nestedhvm.c
> +++ b/xen/arch/x86/hvm/nestedhvm.c
> @@ -20,6 +20,7 @@
> #include <asm/msr.h>
> #include <asm/hvm/support.h> /* for HVM_DELIVER_NO_ERROR_CODE */
> #include <asm/hvm/hvm.h>
> +#include <asm/p2m.h> /* for struct p2m_domain */
> #include <asm/hvm/nestedhvm.h>
> #include <asm/event.h> /* for local_event_delivery_(en|dis)able */
> #include <asm/paging.h> /* for paging_mode_hap() */
> @@ -477,6 +478,7 @@ nestedhvm_vcpu_vmexit(struct vcpu *v, st
> enum hvm_copy_result hvm_rc;
>
> hvm->nh_hostflags.fields.vmentry = 1;
> + paging_update_nestedmode(v);
> if (nestedhvm_vcpu_in_guestmode(v)) {
> ret = nestedhvm_vmexit(v, regs, exitcode);
> switch (ret) {
> @@ -529,14 +531,30 @@ nestedhvm_vcpu_vmexit(struct vcpu *v, st
> void
> nestedhvm_vcpu_enter_guestmode(struct vcpu *v)
> {
> + struct p2m_domain *p2m;
> vcpu_nestedhvm(v).nh_guestmode = 1;
> +
> + p2m = vcpu_nestedhvm(v).nh_p2m;
> + if (p2m == NULL)
> + /* p2m has either been invalidated or not yet assigned. */
> + return;
> +
> + cpu_set(v->processor, p2m->p2m_dirty_cpumask);
Is this arch-specific code?  (Likewise the cpu_clear() that follows.)
Also, in the case where p2m is NULL here, you'll need to set the bit in
p2m->p2m_dirty_cpumask later, at the point where the p2m is actually
assigned.
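That is, wherever nh_p2m gets (re)assigned while the vcpu may already be
in guest mode -- p2m_get_nestedp2m(), by the look of it -- something
along these lines (sketch):

    /* ...after hvm->nh_p2m = p2m; p2m->cr3 = cr3; ... */
    if ( nestedhvm_vcpu_in_guestmode(v) )
        cpu_set(v->processor, p2m->p2m_dirty_cpumask);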
> }
>
> void
> nestedhvm_vcpu_exit_guestmode(struct vcpu *v)
> {
> + struct p2m_domain *p2m;
> vcpu_nestedhvm(v).nh_guestmode = 0;
> -}
> +
> + p2m = vcpu_nestedhvm(v).nh_p2m;
> + if (p2m == NULL)
> + /* p2m has either been invalidated or not yet assigned. */
> + return;
> +
> + cpu_clear(v->processor, p2m->p2m_dirty_cpumask);
> +}
>
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/hap/hap.c
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -40,6 +40,7 @@
> #include <asm/p2m.h>
> #include <asm/domain.h>
> #include <xen/numa.h>
> +#include <asm/hvm/nestedhvm.h>
>
> #include "private.h"
>
> @@ -340,6 +341,30 @@ static void hap_free_p2m_page(struct p2m
> hap_unlock(d);
> }
>
> +#define nestedp2m_alloc_p2m_page hap_alloc_p2m_page
> +
> +/* We must use hap_free() or flushing the nested p2m tables fails
> + * with "freeing ptp fails due to insufficient pages.
That's quite sensible -- other code that frees p2m pages directly to Xen
is basically wrong. It's a relic of the shadow allocator, where freeing
individual p2m pages to the shadow pool would have been a bit tricky and
I knew at the time that p2m pages were only ever freed on destruction.
I really ought to update the shadow p2m-free path now that the shadow
pool is simpler. It just needs a little care around the order things
happen in teardown.
> + * XXX This triggers a bug in p2m that causes a crash in
> + * xen/common/page_alloc.c:1201 on L1 guest shutdown/destroy.
> + */
> +static void
> +nestedp2m_free_p2m_page(struct p2m_domain *p2m, struct page_info *pg)
> +{
> + struct domain *d = p2m->domain;
> + hap_lock(d);
> + ASSERT(page_get_owner(pg) == d);
> + /* Should have just the one ref we gave it in alloc_p2m_page() */
> + BUG_ON((pg->count_info & PGC_count_mask) != 1);
> + pg->count_info = 0;
> + page_set_owner(pg, NULL);
> + hap_free(d, page_to_mfn(pg));
> + d->arch.paging.hap.total_pages++;
> + d->arch.paging.hap.p2m_pages--;
> + ASSERT(d->arch.paging.hap.p2m_pages >= 0);
> + hap_unlock(d);
> +}
> +
> /* Return the size of the pool, rounded up to the nearest MB */
> static unsigned int
> hap_get_allocation(struct domain *d)
> +static int
> +nestedp2m_next_level(struct p2m_domain *p2m, struct page_info **table_pg,
> + void **table, unsigned long *gfn_remainder,
> + unsigned long gfn, uint32_t shift, uint32_t max,
> + unsigned long type)
> +{
> + l1_pgentry_t *l1_entry;
> + l1_pgentry_t *p2m_entry;
> + l1_pgentry_t new_entry;
> + void *next;
> + int i;
> +
> + ASSERT(p2m);
> + ASSERT(p2m->alloc_page);
> +
> +    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, shift, max)) )
> + return 0;
> +
> + if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
> + {
> + struct page_info *pg;
> +
> + pg = p2m_alloc_ptp(p2m, type);
> + if ( pg == NULL )
> + return 0;
> +
> + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> + __PAGE_HYPERVISOR | _PAGE_USER);
> +
> + switch ( type ) {
> + case PGT_l3_page_table:
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + break;
> + case PGT_l2_page_table:
> +#if CONFIG_PAGING_LEVELS == 3
> + /* for PAE mode, PDPE only has PCD/PWT/P bits available */
> + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
> +#endif
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + break;
> + case PGT_l1_page_table:
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + break;
> + default:
> + BUG();
> + break;
> + }
> + }
> +
> + ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
> +
> + /* split single large page into 4KB page in P2M table */
> +    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
> + {
> + unsigned long flags, pfn;
> + struct page_info *pg;
> +
> + pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
> + if ( pg == NULL )
> + return 0;
> +
> + /* New splintered mappings inherit the flags of the old superpage,
> + * with a little reorganisation for the _PAGE_PSE_PAT bit. */
> + flags = l1e_get_flags(*p2m_entry);
> + pfn = l1e_get_pfn(*p2m_entry);
> + if ( pfn & 1 ) /* ==> _PAGE_PSE_PAT was set */
> + pfn -= 1; /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
> + else
> + flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
> +
> + l1_entry = __map_domain_page(pg);
> + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
> + {
> + new_entry = l1e_from_pfn(pfn + i, flags);
> + nested_write_p2m_entry(p2m, l1_entry+i, new_entry);
> + }
> + unmap_domain_page(l1_entry);
> +
> + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> + __PAGE_HYPERVISOR|_PAGE_USER);
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + }
> +
> + *table_pg = l1e_get_page(*p2m_entry);
> + next = __map_domain_page(*table_pg);
> + unmap_domain_page(*table);
> + *table = next;
> +
> + return 1;
> +}
This function looks like it duplicates a lot of logic from an old
version of the normal p2m next-level function. Would it be easy to
merge them?
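One way to merge them might be to make the entry writer a per-p2m hook,
so a single next-level walker serves both.  Rough sketch only, and the
hook name is my invention:

    /* In struct p2m_domain: */
    void (*write_p2m_entry)(struct p2m_domain *p2m, l1_pgentry_t *p,
                            l1_pgentry_t new);

    /* The host p2m installs its normal writer and the nested p2ms
     * install nested_write_p2m_entry() at init time; then every
     *
     *     nested_write_p2m_entry(p2m, p2m_entry, new_entry);
     *
     * in the walker above becomes
     *
     *     p2m->write_p2m_entry(p2m, p2m_entry, new_entry);
     *
     * and nestedp2m_next_level() can collapse into the existing
     * next-level code in p2m.c. */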
> +int
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> + unsigned int page_order, p2m_type_t p2mt);
> +
> +int
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> + unsigned int page_order, p2m_type_t p2mt)
> +{
> + struct page_info *table_pg;
> + void *table;
> + unsigned long gfn_remainder = gfn;
> + l1_pgentry_t *p2m_entry;
> + l1_pgentry_t entry_content;
> + l2_pgentry_t l2e_content;
> + int rv = 0;
> +
> + ASSERT(p2m);
> + ASSERT(p2m->alloc_page);
> +
> + /* address of nested paging table */
> + table_pg = pagetable_get_page(p2m_get_pagetable(p2m));
> + table = __map_domain_page(table_pg);
> +
> +#if CONFIG_PAGING_LEVELS >= 4
> + if ( !nestedp2m_next_level(p2m, &table_pg, &table,
> + &gfn_remainder, gfn,
> + L4_PAGETABLE_SHIFT - PAGE_SHIFT,
> + L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
> + goto out;
> +#endif
> +
> + if ( !nestedp2m_next_level(p2m, &table_pg, &table, &gfn_remainder,
> + gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT,
> + ((CONFIG_PAGING_LEVELS == 3)
> + ? (paging_mode_hap(p2m->domain) ? 4 : 8)
> + : L3_PAGETABLE_ENTRIES),
> + PGT_l2_page_table) )
> + goto out;
> +
> + if ( page_order == 0 )
> + {
> + if ( !nestedp2m_next_level(p2m, &table_pg, &table,
> + &gfn_remainder, gfn,
> + L2_PAGETABLE_SHIFT - PAGE_SHIFT,
> + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
> + goto out;
> +
> + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
> + 0, L1_PAGETABLE_ENTRIES);
> + ASSERT(p2m_entry);
> +
> + if ( mfn_valid(mfn) ) {
> + entry_content = l1e_from_pfn(mfn_x(mfn),
> + p2m_type_to_flags(p2mt));
> + } else {
> + entry_content = l1e_empty();
> + }
> +
> + /* level 1 entry */
> + nested_write_p2m_entry(p2m, p2m_entry, entry_content);
> + }
> + else
> + {
> + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
> + L2_PAGETABLE_SHIFT - PAGE_SHIFT,
> + L2_PAGETABLE_ENTRIES);
> + ASSERT(p2m_entry);
> +
> + /* FIXME: Deal with 4k replaced by 2MB pages */
> + if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
> + !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
> + {
> + domain_crash(p2m->domain);
> + goto out;
> + }
> +
> + if ( mfn_valid(mfn) )
> + l2e_content = l2e_from_pfn(mfn_x(mfn),
> + p2m_type_to_flags(p2mt) | _PAGE_PSE);
> + else {
> + l2e_content = l2e_empty();
> + }
> +
> + entry_content.l1 = l2e_content.l2;
> + nested_write_p2m_entry(p2m, p2m_entry, entry_content);
> + }
> +
> + /* Track the highest gfn for which we have ever had a valid mapping */
> + if ( mfn_valid(mfn)
> + && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
> + p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
> +
> + /* Success */
> + rv = 1;
> +
> +out:
> + unmap_domain_page(table);
> + return rv;
> +}
Same for this: looks like it duplicates existing code.
> +int
> +nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
> +{
> + int rv;
> + paddr_t L1_gpa, L0_gpa;
> + struct domain *d = v->domain;
> + struct p2m_domain *p2m, *nested_p2m;
> +
> + p2m = p2m_get_hostp2m(d); /* L0 p2m */
> + nested_p2m = p2m_get_nestedp2m(v, vcpu_nestedhvm(v).nh_vm_hostcr3);
> +
> + /* walk the L1 P2M table, note we have to pass p2m
> + * and not nested_p2m here or we fail the walk forever,
> + * otherwise. */
Can you explain that more fully? It looks like you're walking the L0
table to translate an L2 gfn into an L1 gfn, which surely can't be right.
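(If the host p2m is only needed so the walker can *read* the L1's
nested page-table entries from guest memory -- with the walk itself
rooted at nh_vm_hostcr3 -- then a comment saying so would clear this up,
e.g.:

    /* Walk the L1 guest's nested page tables, rooted at nh_vm_hostcr3,
     * to translate L2_gpa into an L1 gpa.  The host p2m is passed only
     * so the walker can fetch the L1's table entries from memory. */

If the host p2m really is the table being walked, that still looks
wrong to me.)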
> + rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
> +
> + /* let caller to handle these two cases */
> + switch (rv) {
> + case NESTEDHVM_PAGEFAULT_INJECT:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_ERROR:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_DONE:
> + break;
> + default:
> + BUG();
> + break;
> + }
> +
> + /* ==> we have to walk L0 P2M */
> + rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
> +
> + /* let upper level caller to handle these two cases */
> + switch (rv) {
> + case NESTEDHVM_PAGEFAULT_INJECT:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_ERROR:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_DONE:
> + break;
> + default:
> + BUG();
> + break;
> + }
> +
> + /* fix p2m_get_pagetable(nested_p2m) */
> + nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa);
> +
> + return NESTEDHVM_PAGEFAULT_DONE;
> +}
> +
> +/********************************************/
> +/* NESTED VIRT INITIALIZATION FUNCS */
> +/********************************************/
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/hap/private.h
> --- a/xen/arch/x86/mm/hap/private.h
> +++ b/xen/arch/x86/mm/hap/private.h
> @@ -30,4 +30,14 @@ unsigned long hap_gva_to_gfn_3_levels(st
> unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
> uint32_t *pfec);
>
> +unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
> + struct p2m_domain *p2m, unsigned long cr3,
> + paddr_t ga, uint32_t *pfec);
> +unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
> + struct p2m_domain *p2m, unsigned long cr3,
> + paddr_t ga, uint32_t *pfec);
> +unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
> + struct p2m_domain *p2m, unsigned long cr3,
> + paddr_t ga, uint32_t *pfec);
> +
> #endif /* __HAP_PRIVATE_H__ */
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/p2m.c
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -34,6 +34,7 @@
> #include <public/mem_event.h>
> #include <asm/mem_sharing.h>
> #include <xen/event.h>
> +#include <asm/hvm/nestedhvm.h>
>
> /* Debugging and auditing of the P2M code? */
> #define P2M_AUDIT 0
> @@ -72,7 +73,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
> #define SUPERPAGE_PAGES (1UL << 9)
> #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
>
> -static unsigned long p2m_type_to_flags(p2m_type_t t)
> +unsigned long p2m_type_to_flags(p2m_type_t t)
> {
> unsigned long flags;
> #ifdef __x86_64__
> @@ -116,9 +117,9 @@ static void audit_p2m(struct p2m_domain
> // Find the next level's P2M entry, checking for out-of-range gfn's...
> // Returns NULL on error.
> //
> -static l1_pgentry_t *
> +l1_pgentry_t *
> p2m_find_entry(void *table, unsigned long *gfn_remainder,
> - unsigned long gfn, u32 shift, u32 max)
> + unsigned long gfn, uint32_t shift, uint32_t max)
> {
> u32 index;
>
> @@ -1719,10 +1720,12 @@ static void p2m_initialise(struct domain
> INIT_PAGE_LIST_HEAD(&p2m->pod.single);
>
> p2m->domain = d;
> + p2m->cr3 = 0;
> p2m->set_entry = p2m_set_entry;
> p2m->get_entry = p2m_gfn_to_mfn;
> p2m->get_entry_current = p2m_gfn_to_mfn_current;
> p2m->change_entry_type_global = p2m_change_type_global;
> + cpus_clear(p2m->p2m_dirty_cpumask);
>
> if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
> ept_p2m_init(d);
> @@ -1730,6 +1733,28 @@ static void p2m_initialise(struct domain
> return;
> }
>
> +extern int
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> + unsigned int page_order, p2m_type_t p2mt);
Please declare this in a header file rather than here.
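e.g. something like this, assuming asm-x86/p2m.h is where the other
nested-p2m bits end up:

    /* Set an entry in a nested p2m table (defined in hap/hap.c). */
    int nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn,
                            mfn_t mfn, unsigned int page_order,
                            p2m_type_t p2mt);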
> +static int
> +p2m_init_nestedp2m(struct domain *d)
> +{
> + uint8_t i;
> + struct p2m_domain *p2m;
> +
> + spin_lock_init(&d->arch.nested_p2m_lock);
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
> + if (p2m == NULL)
> + return -ENOMEM;
> + p2m_initialise(d, p2m);
> + p2m->set_entry = nestedp2m_set_entry;
> + }
> +
> + return 0;
> +}
> +
> int p2m_init(struct domain *d)
> {
> struct p2m_domain *p2m;
> @@ -1739,7 +1764,11 @@ int p2m_init(struct domain *d)
> return -ENOMEM;
> p2m_initialise(d, p2m);
>
> - return 0;
> + /* Must initialise nestedp2m unconditionally
> + * since nestedhvm_enabled(d) returns false here.
> + * (p2m_init runs too early for HVM_PARAM_* options)
> + */
> + return p2m_init_nestedp2m(d);
> }
>
> void p2m_change_entry_type_global(struct p2m_domain *p2m,
> @@ -1836,6 +1865,9 @@ int p2m_alloc_table(struct p2m_domain *p
> p2m_invalid) )
> goto error;
>
> + if (p2m_is_nestedp2m(p2m))
> + goto nesteddone;
> +
> /* Copy all existing mappings from the page list and m2p */
> spin_lock(&p2m->domain->page_alloc_lock);
> page_list_for_each(page, &p2m->domain->page_list)
> @@ -1857,6 +1889,7 @@ int p2m_alloc_table(struct p2m_domain *p
> }
> spin_unlock(&p2m->domain->page_alloc_lock);
>
> + nesteddone:
> P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
> p2m_unlock(p2m);
> return 0;
> @@ -1881,6 +1914,9 @@ void p2m_teardown(struct p2m_domain *p2m
> mfn_t mfn;
> #endif
>
> + if (p2m == NULL)
> + return;
> +
> p2m_lock(p2m);
>
> #ifdef __x86_64__
> @@ -1899,11 +1935,26 @@ void p2m_teardown(struct p2m_domain *p2m
> p2m_unlock(p2m);
> }
>
> +static void p2m_teardown_nestedp2m(struct domain *d)
> +{
> + uint8_t i;
> +
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + xfree(d->arch.nested_p2m[i]);
> + d->arch.nested_p2m[i] = NULL;
> + }
> +}
> +
> void p2m_final_teardown(struct domain *d)
> {
> /* Iterate over all p2m tables per domain */
> xfree(d->arch.p2m);
> d->arch.p2m = NULL;
> +
> + /* We must teardown unconditionally because
> + * we initialise them unconditionally.
> + */
> + p2m_teardown_nestedp2m(d);
> }
>
> #if P2M_AUDIT
> @@ -2823,6 +2874,173 @@ void p2m_mem_paging_resume(struct p2m_do
> }
> #endif /* __x86_64__ */
>
> +static struct p2m_domain *
> +p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
> +{
> + int i, lru_index = -1;
> + struct p2m_domain *lrup2m, *tmp;
> +
> + if (p2m == NULL) {
> + lru_index = MAX_NESTEDP2M - 1;
> + lrup2m = d->arch.nested_p2m[lru_index];
> + } else {
> + lrup2m = p2m;
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + if (d->arch.nested_p2m[i] == p2m) {
> + lru_index = i;
> + break;
> + }
> + }
> + }
> +
> + ASSERT(lru_index >= 0);
> + if (lru_index == 0) {
> + return lrup2m;
> + }
> +
> + /* move the other's down the array "list" */
> + for (i = lru_index - 1; i >= 0; i--) {
> + tmp = d->arch.nested_p2m[i];
> + d->arch.nested_p2m[i+1] = tmp;
> + }
> +
> + /* make the entry the first one */
> + d->arch.nested_p2m[0] = lrup2m;
> +
> + return lrup2m;
> +}
> +
> +static int
> +p2m_flush_locked(struct p2m_domain *p2m)
> +{
> + struct page_info * (*alloc)(struct p2m_domain *);
> + void (*free)(struct p2m_domain *, struct page_info *);
> +
> + alloc = p2m->alloc_page;
> + free = p2m->free_page;
> +
> + if (p2m->cr3 == 0)
> + /* Microoptimisation: p2m is already empty.
> + * => about 0.3% speedup of overall system performance.
> + */
> + return 0;
What happens if a malicious guest uses 0 as its actual CR3 value?
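One option is to reserve a value that can never be a real CR3 as the
"unused/empty" marker instead of 0.  Sketch, with an invented name:

    #define CR3_EADDR (~0ULL)   /* can't be loaded into CR3 */

    /* p2m_initialise() and p2m_flush_locked() would then set
     * p2m->cr3 = CR3_EADDR, and this check (and the "found unused
     * p2m table" check in p2m_get_nestedp2m()) becomes: */
    if ( p2m->cr3 == CR3_EADDR )
        /* p2m is already empty */
        return 0;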
> +
> + p2m_teardown(p2m);
> + p2m_initialise(p2m->domain, p2m);
> + p2m->set_entry = nestedp2m_set_entry;
> + BUG_ON(p2m_alloc_table(p2m, alloc, free) != 0);
> +
> + ASSERT(p2m);
> + ASSERT(p2m->alloc_page);
> + return 0;
> +}
> +
> +void
> +p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
> +{
> + struct domain *d = p2m->domain;
> +
> + nestedhvm_vm_flushtlb(p2m);
> + ASSERT(v->domain == d);
> + vcpu_nestedhvm(v).nh_p2m = NULL;
> + spin_lock(&d->arch.nested_p2m_lock);
> + BUG_ON(p2m_flush_locked(p2m) != 0);
> + spin_unlock(&d->arch.nested_p2m_lock);
> + hvm_asid_flush_vcpu(v);
> +}
> +
> +void
> +p2m_flush_nestedp2m(struct domain *d)
> +{
> + int i;
> +
> + spin_lock(&d->arch.nested_p2m_lock);
> + for (i = 0; i < MAX_NESTEDP2M; i++)
> + BUG_ON(p2m_flush_locked(d->arch.nested_p2m[i]) != 0);
> + spin_unlock(&d->arch.nested_p2m_lock);
> + flush_tlb_mask(&d->domain_dirty_cpumask);
> +}
> +
> +struct p2m_domain *
> +p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
> +{
> + struct nestedhvm *hvm = &vcpu_nestedhvm(v);
> + struct domain *d;
> + struct p2m_domain *p2m;
> + int i, rv;
> +
> + if (cr3 == 0)
> + cr3 = v->arch.hvm_vcpu.guest_cr[3];
> +
> + if (hvm->nh_flushp2m && hvm->nh_p2m) {
> + nestedhvm_vm_flushtlb(hvm->nh_p2m);
> + hvm->nh_p2m = NULL;
> + }
> +
> + d = v->domain;
> + spin_lock(&d->arch.nested_p2m_lock);
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + p2m = d->arch.nested_p2m[i];
> + if (p2m->cr3 == cr3 && p2m == hvm->nh_p2m) {
> + p2m_getlru_nestedp2m(d, p2m);
> + if (hvm->nh_flushp2m) {
> + BUG_ON(p2m_flush_locked(p2m) != 0);
> + hvm->nh_flushp2m = 0;
> + hvm_asid_flush_vcpu(v);
> + }
> + p2m->cr3 = cr3;
> + spin_unlock(&d->arch.nested_p2m_lock);
> + return p2m;
> + }
> + if (p2m->cr3 == 0) { /* found unused p2m table */
> + hvm->nh_flushp2m = 0;
> + p2m_getlru_nestedp2m(d, p2m);
> + hvm->nh_p2m = p2m;
> + p2m->cr3 = cr3;
> + spin_unlock(&d->arch.nested_p2m_lock);
> + hvm_asid_flush_vcpu(v);
> + return p2m;
> + }
> + }
> +
> + /* All p2m's are or were in use. We know the least recent used one.
> + * Destroy and re-initialize it.
> + */
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + p2m = p2m_getlru_nestedp2m(d, NULL);
> + rv = p2m_flush_locked(p2m);
Is this enough? If this p2m might still be in host_vmcb->h_cr3 on
another vcpu, you need to make sure that vcpu is made to stop using it
before the p2m is reused.
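Something like this before recycling it, perhaps, plus whatever
flush/IPI is needed so a vcpu currently running on it really does drop
it:

    struct vcpu *v;

    /* Make sure no vcpu still refers to the p2m we're about to reuse. */
    for_each_vcpu ( d, v )
        if ( vcpu_nestedhvm(v).nh_p2m == p2m )
        {
            vcpu_nestedhvm(v).nh_p2m = NULL;
            hvm_asid_flush_vcpu(v);
        }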
Cheers,
Tim.
--
Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd. (Company #02937203, SL9 0BG)
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel