[Xen-devel] Re: [PATCH 13/13] Nested Virtualiztion: hap-on-hap

To: Christoph Egger <Christoph.Egger@xxxxxxx>
Subject: [Xen-devel] Re: [PATCH 13/13] Nested Virtualiztion: hap-on-hap
From: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Date: Wed, 8 Sep 2010 15:04:51 +0100
Cc: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>, "Dong, Eddie" <eddie.dong@xxxxxxxxx>
Delivery-date: Wed, 08 Sep 2010 07:05:54 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <201009011718.44873.Christoph.Egger@xxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <201009011718.44873.Christoph.Egger@xxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.5.18 (2008-05-17)
Hi, 

This is looking better - I think the TLB flushing is almost right.
A few more detailed comments below.

Content-Description: xen_nh13_haphap.diff
> # HG changeset patch
> # User cegger
> # Date 1283345895 -7200
> Implement Nested-on-Nested.
> This allows the guest to run nested guest with hap enabled.
> 
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/hvm/hvm.c
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1042,12 +1042,56 @@ void hvm_inject_exception(unsigned int t
>      hvm_funcs.inject_exception(trapnr, errcode, cr2);
>  }
>  
> -bool_t hvm_hap_nested_page_fault(unsigned long gfn)
> +bool_t hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs)
>  {
>      p2m_type_t p2mt;
>      mfn_t mfn;
>      struct vcpu *v = current;
>      struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
> +    unsigned long gfn = gpa >> PAGE_SHIFT;
> +    int rv;
> +
> +    /* On Nested Virtualization, walk the guest page table.
> +     * If this succeeds, all is fine.
> +     * If this fails, inject a nested page fault into the guest.
> +     */
> +    if ( nestedhvm_enabled(v->domain)
> +        && nestedhvm_vcpu_in_guestmode(v)
> +        && nestedhvm_paging_mode_hap(v) )
> +    {
> +        enum nestedhvm_vmexits nsret;
> +        struct nestedhvm *hvm = &vcpu_nestedhvm(v);
> +
> +        /* nested guest gpa == guest gva */

This comment is confusing to me.  The nested guest gpa isn't a virtual
address, and certainly not the same as an L1-guest VA.  Can you reword
it, maybe just to say what the call to nestedhvm_hap_nested_page_fault()
is going to do?
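
For instance, something like this (just a suggestion, based on the
behaviour I can see in nestedhvm_hap_nested_page_fault() further down):

    /* The vcpu is in guest mode and the L2 guest faulted on its
     * nested p2m: ask the nested-HAP code to translate the L2 gpa
     * through the L1 and L0 p2m tables and fix up the nested p2m,
     * or to tell us to reflect a #VMEXIT(NPF) to the L1 guest. */
    rv = nestedhvm_hap_nested_page_fault(v, gpa);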

> +        rv = nestedhvm_hap_nested_page_fault(v, gpa);
> +        switch (rv) {
> +        case NESTEDHVM_PAGEFAULT_DONE:
> +            return 1;
> +        case NESTEDHVM_PAGEFAULT_ERROR:
> +            return 0;
> +        case NESTEDHVM_PAGEFAULT_INJECT:
> +            break;
> +        }
> +
> +        /* inject #VMEXIT(NPF) into guest. */
> +        hvm->nh_forcevmexit.exitcode = NESTEDHVM_INTERCEPT_NPF;
> +        hvm->nh_forcevmexit.exitinfo1 = regs->error_code;
> +        hvm->nh_forcevmexit.exitinfo2 = gpa;
> +        hvm->nh_hostflags.fields.forcevmexit = 1;
> +        nsret = nestedhvm_vcpu_vmexit(v, regs, NESTEDHVM_INTERCEPT_NPF);
> +        hvm->nh_hostflags.fields.forcevmexit = 0;
> +        switch (nsret) {
> +        case NESTEDHVM_VMEXIT_DONE:
> +        case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */
> +            return 1;
> +        case NESTEDHVM_VMEXIT_HOST:
> +        case NESTEDHVM_VMEXIT_CONTINUE:
> +        case NESTEDHVM_VMEXIT_FATALERROR:
> +        default:
> +            gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret);
> +            return 0;
> +        }
> +    }
>  
>      mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
>  

> @@ -1843,7 +1908,7 @@ static enum hvm_copy_result __hvm_copy(
>  
>          if ( flags & HVMCOPY_virt )
>          {
> -            gfn = paging_gva_to_gfn(curr, addr, &pfec);
> +            gfn = paging_p2m_ga_to_gfn(curr, p2m, mode, cr3, addr, &pfec);

Do other callers of paging_gva_to_gfn need to handle nested-npt mode as
well?  If so, would it be better to update paging_gva_to_gfn() itself? 
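
Something along these lines, perhaps (completely untested, and
paging_get_nestedmode()/paging_get_hostmode() are names I'm inventing
for whatever paging_update_nestedmode() maintains):

    /* Sketch: do the nested dispatch inside the wrapper itself, so
     * callers don't need to know about nested NPT at all. */
    unsigned long paging_gva_to_gfn(struct vcpu *v, unsigned long va,
                                    uint32_t *pfec)
    {
        if ( nestedhvm_enabled(v->domain)
             && nestedhvm_vcpu_in_guestmode(v)
             && nestedhvm_paging_mode_hap(v) )
        {
            struct p2m_domain *p2m =
                p2m_get_nestedp2m(v, vcpu_nestedhvm(v).nh_vm_hostcr3);
            return paging_p2m_ga_to_gfn(v, p2m, paging_get_nestedmode(v),
                                        v->arch.hvm_vcpu.guest_cr[3],
                                        va, pfec);
        }
        /* otherwise fall through to whatever the wrapper does today */
        return paging_get_hostmode(v)->gva_to_gfn(v, va, pfec);
    }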

>              if ( gfn == INVALID_GFN )
>              {
>                  if ( pfec == PFEC_page_paged )
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/hvm/nestedhvm.c
> --- a/xen/arch/x86/hvm/nestedhvm.c
> +++ b/xen/arch/x86/hvm/nestedhvm.c
> @@ -20,6 +20,7 @@
>  #include <asm/msr.h>
>  #include <asm/hvm/support.h> /* for HVM_DELIVER_NO_ERROR_CODE */
>  #include <asm/hvm/hvm.h>
> +#include <asm/p2m.h>    /* for struct p2m_domain */
>  #include <asm/hvm/nestedhvm.h>
>  #include <asm/event.h>  /* for local_event_delivery_(en|dis)able */
>  #include <asm/paging.h> /* for paging_mode_hap() */
> @@ -477,6 +478,7 @@ nestedhvm_vcpu_vmexit(struct vcpu *v, st
>       enum hvm_copy_result hvm_rc;
>  
>       hvm->nh_hostflags.fields.vmentry = 1;
> +     paging_update_nestedmode(v);
>       if (nestedhvm_vcpu_in_guestmode(v)) {
>               ret = nestedhvm_vmexit(v, regs, exitcode);
>               switch (ret) {
> @@ -529,14 +531,30 @@ nestedhvm_vcpu_vmexit(struct vcpu *v, st
>  void
>  nestedhvm_vcpu_enter_guestmode(struct vcpu *v)
>  {
> +     struct p2m_domain *p2m;
>       vcpu_nestedhvm(v).nh_guestmode = 1;
> +
> +     p2m = vcpu_nestedhvm(v).nh_p2m;
> +     if (p2m == NULL)
> +             /* p2m has either been invalidated or not yet assigned. */
> +             return;
> +
> +     cpu_set(v->processor, p2m->p2m_dirty_cpumask);

Is this arch-specific code?  (and likewise the cpu_clear that follows)

Also, in the case where p2m is NULL here, you'll need to set the bit in
p2m->p2m_dirty_cpumask at the point where the p2m does get assigned later.
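
Wherever nh_p2m gets assigned (p2m_get_nestedp2m(), I assume), that would
be something like:

    /* Record that this pcpu may now hold TLB state derived from the
     * nested p2m, so that flushes of it reach this cpu too. */
    hvm->nh_p2m = p2m;
    if ( nestedhvm_vcpu_in_guestmode(v) )
        cpu_set(v->processor, p2m->p2m_dirty_cpumask);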

>  }
>  
>  void
>  nestedhvm_vcpu_exit_guestmode(struct vcpu *v)
>  {
> +     struct p2m_domain *p2m;
>       vcpu_nestedhvm(v).nh_guestmode = 0;
> -} 
> +
> +     p2m = vcpu_nestedhvm(v).nh_p2m;
> +     if (p2m == NULL)
> +             /* p2m has either been invalidated or not yet assigned. */
> +             return;
> +
> +     cpu_clear(v->processor, p2m->p2m_dirty_cpumask);
> +}
>  



> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/hap/hap.c
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -40,6 +40,7 @@
>  #include <asm/p2m.h>
>  #include <asm/domain.h>
>  #include <xen/numa.h>
> +#include <asm/hvm/nestedhvm.h>
>  
>  #include "private.h"
>  
> @@ -340,6 +341,30 @@ static void hap_free_p2m_page(struct p2m
>      hap_unlock(d);
>  }
>  
> +#define nestedp2m_alloc_p2m_page hap_alloc_p2m_page
> +
> +/* We must use hap_free() or flushing the nested p2m tables fails
> + * with "freeing ptp fails due to insufficient pages.

That's quite sensible -- other code that frees p2m pages directly to Xen
is basically wrong.  It's a relic of the shadow allocator, where freeing
individual p2m pages to the shadow pool would have been a bit tricky and 
I knew at the time that p2m pages were only ever freed on destruction. 

I really ought to update the shadow p2m-free path now that the shadow
pool is simpler.  It just needs a little care around the order things
happen in teardown.

> + * XXX This triggers a bug in p2m that causes a crash in
> + * xen/common/page_alloc.c:1201 on L1 guest shutdown/destroy.
> + */
> +static void
> +nestedp2m_free_p2m_page(struct p2m_domain *p2m, struct page_info *pg)
> +{
> +    struct domain *d = p2m->domain;
> +    hap_lock(d);
> +    ASSERT(page_get_owner(pg) == d);
> +    /* Should have just the one ref we gave it in alloc_p2m_page() */
> +    BUG_ON((pg->count_info & PGC_count_mask) != 1);
> +    pg->count_info = 0;
> +    page_set_owner(pg, NULL);
> +    hap_free(d, page_to_mfn(pg));
> +    d->arch.paging.hap.total_pages++;
> +    d->arch.paging.hap.p2m_pages--;
> +    ASSERT(d->arch.paging.hap.p2m_pages >= 0);
> +    hap_unlock(d);
> +}
> +
>  /* Return the size of the pool, rounded up to the nearest MB */
>  static unsigned int
>  hap_get_allocation(struct domain *d)


> +static int 
> +nestedp2m_next_level(struct p2m_domain *p2m, struct page_info **table_pg, 
> +                     void **table, unsigned long *gfn_remainder, 
> +                     unsigned long gfn, uint32_t shift, uint32_t max, 
> +                     unsigned long type)
> +{
> +    l1_pgentry_t *l1_entry;
> +    l1_pgentry_t *p2m_entry;
> +    l1_pgentry_t new_entry;
> +    void *next;
> +    int i;
> +
> +    ASSERT(p2m);
> +    ASSERT(p2m->alloc_page);
> +
> +    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, shift, max)) )
> +        return 0;
> +
> +    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
> +    {
> +        struct page_info *pg;
> +
> +        pg = p2m_alloc_ptp(p2m, type);
> +        if ( pg == NULL )
> +            return 0;
> +
> +        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> +                                 __PAGE_HYPERVISOR | _PAGE_USER);
> +
> +        switch ( type ) {
> +        case PGT_l3_page_table:
> +            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> +            break;
> +        case PGT_l2_page_table:
> +#if CONFIG_PAGING_LEVELS == 3
> +            /* for PAE mode, PDPE only has PCD/PWT/P bits available */
> +            new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
> +#endif
> +            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> +            break;
> +        case PGT_l1_page_table:
> +            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> +            break;
> +        default:
> +            BUG();
> +            break;
> +        }
> +    }
> +
> +    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
> +
> +    /* split single large page into 4KB page in P2M table */
> +    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
> +    {
> +        unsigned long flags, pfn;
> +        struct page_info *pg;
> +
> +        pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
> +        if ( pg == NULL )
> +            return 0;
> +
> +        /* New splintered mappings inherit the flags of the old superpage, 
> +         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
> +        flags = l1e_get_flags(*p2m_entry);
> +        pfn = l1e_get_pfn(*p2m_entry);
> +        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
> +            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
> +        else
> +            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
> +        
> +        l1_entry = __map_domain_page(pg);
> +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
> +        {
> +            new_entry = l1e_from_pfn(pfn + i, flags);
> +            nested_write_p2m_entry(p2m, l1_entry+i, new_entry);
> +        }
> +        unmap_domain_page(l1_entry);
> +        
> +        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> +                                 __PAGE_HYPERVISOR|_PAGE_USER);
> +        nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> +    }
> +
> +    *table_pg = l1e_get_page(*p2m_entry);
> +    next = __map_domain_page(*table_pg);
> +    unmap_domain_page(*table);
> +    *table = next;
> +
> +    return 1;
> +}

This function looks like it duplicates a lot of logic from an old
version of the normal p2m next-level function.  Would it be easy to
merge them?
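
One way might be to make the entry write pluggable per p2m and keep a
single next-level walker -- very rough sketch, and the hook name and
signature here are only illustrative:

    /* Rough idea: a per-p2m hook for writing entries, so one walker
     * can serve both the host and the nested p2m. */
    typedef void (*p2m_write_entry_fn)(struct p2m_domain *p2m,
                                       l1_pgentry_t *p, l1_pgentry_t new);

    /* The host p2m installs its existing write path, the nested p2m
     * installs nested_write_p2m_entry(), and p2m_next_level() calls
     * the hook instead of hard-coding the write; nestedp2m_next_level()
     * then goes away. */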

> +int 
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> +                    unsigned int page_order, p2m_type_t p2mt);
> +
> +int 
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> +                    unsigned int page_order, p2m_type_t p2mt)
> +{
> +    struct page_info *table_pg;
> +    void *table;
> +    unsigned long gfn_remainder = gfn;
> +    l1_pgentry_t *p2m_entry;
> +    l1_pgentry_t entry_content;
> +    l2_pgentry_t l2e_content;
> +    int rv = 0;
> +    
> +    ASSERT(p2m);
> +    ASSERT(p2m->alloc_page);
> +
> +    /* address of nested paging table */
> +    table_pg = pagetable_get_page(p2m_get_pagetable(p2m));
> +    table = __map_domain_page(table_pg);
> +
> +#if CONFIG_PAGING_LEVELS >= 4
> +    if ( !nestedp2m_next_level(p2m, &table_pg, &table, 
> +                               &gfn_remainder, gfn, 
> +                               L4_PAGETABLE_SHIFT - PAGE_SHIFT,
> +                               L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
> +        goto out;
> +#endif
> +
> +    if ( !nestedp2m_next_level(p2m, &table_pg, &table, &gfn_remainder, 
> +                               gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT,
> +                               ((CONFIG_PAGING_LEVELS == 3)
> +                                ? (paging_mode_hap(p2m->domain) ? 4 : 8)
> +                                : L3_PAGETABLE_ENTRIES),
> +                               PGT_l2_page_table) )
> +        goto out;
> +
> +    if ( page_order == 0 )
> +    {
> +        if ( !nestedp2m_next_level(p2m, &table_pg, &table, 
> +                                   &gfn_remainder, gfn,
> +                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
> +                                   L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
> +            goto out;
> +
> +        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
> +                                   0, L1_PAGETABLE_ENTRIES);
> +        ASSERT(p2m_entry);
> +        
> +        if ( mfn_valid(mfn) ) {
> +            entry_content = l1e_from_pfn(mfn_x(mfn), 
> +                                         p2m_type_to_flags(p2mt));
> +        } else {
> +            entry_content = l1e_empty();
> +        }
> +        
> +        /* level 1 entry */
> +        nested_write_p2m_entry(p2m, p2m_entry, entry_content);
> +    }
> +    else 
> +    {
> +        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
> +                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
> +                                   L2_PAGETABLE_ENTRIES);
> +        ASSERT(p2m_entry);
> +        
> +        /* FIXME: Deal with 4k replaced by 2MB pages */
> +        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
> +             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
> +        {
> +            domain_crash(p2m->domain);
> +            goto out;
> +        }
> +        
> +        if ( mfn_valid(mfn) )
> +            l2e_content = l2e_from_pfn(mfn_x(mfn),
> +                p2m_type_to_flags(p2mt) | _PAGE_PSE);
> +        else {
> +            l2e_content = l2e_empty();
> +     }
> +        
> +        entry_content.l1 = l2e_content.l2;
> +        nested_write_p2m_entry(p2m, p2m_entry, entry_content);
> +    }
> +
> +    /* Track the highest gfn for which we have ever had a valid mapping */
> +    if ( mfn_valid(mfn)
> +         && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
> +        p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
> +
> +    /* Success */
> +    rv = 1;
> +
> +out:
> +    unmap_domain_page(table);
> +    return rv;
> +}

Same for this: looks like it duplicates existing code. 


> +int
> +nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
> +{
> +    int rv;
> +    paddr_t L1_gpa, L0_gpa;
> +    struct domain *d = v->domain;
> +    struct p2m_domain *p2m, *nested_p2m;
> +
> +    p2m = p2m_get_hostp2m(d); /* L0 p2m */
> +    nested_p2m = p2m_get_nestedp2m(v, vcpu_nestedhvm(v).nh_vm_hostcr3);
> +
> +    /* walk the L1 P2M table, note we have to pass p2m
> +     * and not nested_p2m here or we fail the walk forever,
> +     * otherwise. */

Can you explain that more fully?  It looks like you're walking the L0
table to translate an L2 gfn into an L1 gfn, which surely can't be right. 
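
To spell out the chain I have in mind (just a sketch, using the names
from this patch; the comments are my reading, not what the code does):

    /* The translation chain as I understand the design:
     *
     *   L2 gpa --[L1's nested page table, i.e. nh_vm_hostcr3]--> L1 gpa
     *   L1 gpa --[host p2m]-------------------------------------> L0 gpa
     *
     * so I'd only expect the second step to be driven by the host p2m. */
    rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa); /* why the host p2m here? */
    /* (error handling elided) */
    rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);    /* host p2m: fine */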

> +    rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
> +
> +    /* let caller to handle these two cases */
> +    switch (rv) {
> +    case NESTEDHVM_PAGEFAULT_INJECT:
> +        return rv;
> +    case NESTEDHVM_PAGEFAULT_ERROR:
> +        return rv;
> +    case NESTEDHVM_PAGEFAULT_DONE:
> +        break;
> +    default:
> +        BUG();
> +        break;
> +    }
> +
> +    /* ==> we have to walk L0 P2M */
> +    rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
> +
> +    /* let upper level caller to handle these two cases */
> +    switch (rv) {
> +    case NESTEDHVM_PAGEFAULT_INJECT:
> +        return rv;
> +    case NESTEDHVM_PAGEFAULT_ERROR:
> +        return rv;
> +    case NESTEDHVM_PAGEFAULT_DONE:
> +        break;
> +    default:
> +        BUG();
> +        break;
> +    }
> +
> +    /* fix p2m_get_pagetable(nested_p2m) */
> +    nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa);
> +
> +    return NESTEDHVM_PAGEFAULT_DONE;
> +}
> +
> +/********************************************/
> +/*     NESTED VIRT INITIALIZATION FUNCS     */
> +/********************************************/
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/hap/private.h
> --- a/xen/arch/x86/mm/hap/private.h
> +++ b/xen/arch/x86/mm/hap/private.h
> @@ -30,4 +30,14 @@ unsigned long hap_gva_to_gfn_3_levels(st
>  unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
>                                       uint32_t *pfec);
>  
> +unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
> +    struct p2m_domain *p2m, unsigned long cr3,
> +    paddr_t ga, uint32_t *pfec);
> +unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
> +    struct p2m_domain *p2m, unsigned long cr3,
> +    paddr_t ga, uint32_t *pfec);
> +unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
> +    struct p2m_domain *p2m, unsigned long cr3,
> +    paddr_t ga, uint32_t *pfec);
> +
>  #endif /* __HAP_PRIVATE_H__ */
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/p2m.c
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -34,6 +34,7 @@
>  #include <public/mem_event.h>
>  #include <asm/mem_sharing.h>
>  #include <xen/event.h>
> +#include <asm/hvm/nestedhvm.h>
>  
>  /* Debugging and auditing of the P2M code? */
>  #define P2M_AUDIT     0
> @@ -72,7 +73,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
>  #define SUPERPAGE_PAGES (1UL << 9)
>  #define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
>  
> -static unsigned long p2m_type_to_flags(p2m_type_t t) 
> +unsigned long p2m_type_to_flags(p2m_type_t t) 
>  {
>      unsigned long flags;
>  #ifdef __x86_64__
> @@ -116,9 +117,9 @@ static void audit_p2m(struct p2m_domain 
>  // Find the next level's P2M entry, checking for out-of-range gfn's...
>  // Returns NULL on error.
>  //
> -static l1_pgentry_t *
> +l1_pgentry_t *
>  p2m_find_entry(void *table, unsigned long *gfn_remainder,
> -                   unsigned long gfn, u32 shift, u32 max)
> +                   unsigned long gfn, uint32_t shift, uint32_t max)
>  {
>      u32 index;
>  
> @@ -1719,10 +1720,12 @@ static void p2m_initialise(struct domain
>      INIT_PAGE_LIST_HEAD(&p2m->pod.single);
>  
>      p2m->domain = d;
> +    p2m->cr3 = 0;
>      p2m->set_entry = p2m_set_entry;
>      p2m->get_entry = p2m_gfn_to_mfn;
>      p2m->get_entry_current = p2m_gfn_to_mfn_current;
>      p2m->change_entry_type_global = p2m_change_type_global;
> +    cpus_clear(p2m->p2m_dirty_cpumask);
>  
>      if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
>          ept_p2m_init(d);
> @@ -1730,6 +1733,28 @@ static void p2m_initialise(struct domain
>      return;
>  }
>  
> +extern int
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> +                    unsigned int page_order, p2m_type_t p2mt);

Please define this in a header file.
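
e.g. alongside the other p2m declarations (asm-x86/p2m.h, or the hap
private header if you'd rather keep it local):

    /* in the header, replacing the extern in p2m.c: */
    int nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn,
                            mfn_t mfn, unsigned int page_order,
                            p2m_type_t p2mt);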

> +static int
> +p2m_init_nestedp2m(struct domain *d)
> +{
> +    uint8_t i;
> +    struct p2m_domain *p2m;
> +
> +    spin_lock_init(&d->arch.nested_p2m_lock);
> +    for (i = 0; i < MAX_NESTEDP2M; i++) {
> +        d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
> +        if (p2m == NULL)
> +            return -ENOMEM;
> +        p2m_initialise(d, p2m);
> +        p2m->set_entry = nestedp2m_set_entry;
> +    }
> +
> +    return 0;
> +}
> +
>  int p2m_init(struct domain *d)
>  {
>      struct p2m_domain *p2m;
> @@ -1739,7 +1764,11 @@ int p2m_init(struct domain *d)
>          return -ENOMEM;
>      p2m_initialise(d, p2m);
>  
> -    return 0;
> +    /* Must initialise nestedp2m unconditionally
> +     * since nestedhvm_enabled(d) returns false here.
> +     * (p2m_init runs too early for HVM_PARAM_* options)
> +     */
> +    return p2m_init_nestedp2m(d);
>  }
>  
>  void p2m_change_entry_type_global(struct p2m_domain *p2m,
> @@ -1836,6 +1865,9 @@ int p2m_alloc_table(struct p2m_domain *p
>                          p2m_invalid) )
>          goto error;
>  
> +    if (p2m_is_nestedp2m(p2m))
> +        goto nesteddone;
> +
>      /* Copy all existing mappings from the page list and m2p */
>      spin_lock(&p2m->domain->page_alloc_lock);
>      page_list_for_each(page, &p2m->domain->page_list)
> @@ -1857,6 +1889,7 @@ int p2m_alloc_table(struct p2m_domain *p
>      }
>      spin_unlock(&p2m->domain->page_alloc_lock);
>  
> + nesteddone:
>      P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
>      p2m_unlock(p2m);
>      return 0;
> @@ -1881,6 +1914,9 @@ void p2m_teardown(struct p2m_domain *p2m
>      mfn_t mfn;
>  #endif
>  
> +    if (p2m == NULL)
> +        return;
> +
>      p2m_lock(p2m);
>  
>  #ifdef __x86_64__
> @@ -1899,11 +1935,26 @@ void p2m_teardown(struct p2m_domain *p2m
>      p2m_unlock(p2m);
>  }
>  
> +static void p2m_teardown_nestedp2m(struct domain *d)
> +{
> +    uint8_t i;
> +
> +    for (i = 0; i < MAX_NESTEDP2M; i++) {
> +        xfree(d->arch.nested_p2m[i]);
> +        d->arch.nested_p2m[i] = NULL;
> +    }
> +}
> +
>  void p2m_final_teardown(struct domain *d)
>  {
>      /* Iterate over all p2m tables per domain */
>      xfree(d->arch.p2m);
>      d->arch.p2m = NULL;
> +
> +    /* We must teardown unconditionally because
> +     * we initialise them unconditionally.
> +     */
> +    p2m_teardown_nestedp2m(d);
>  }
>  
>  #if P2M_AUDIT
> @@ -2823,6 +2874,173 @@ void p2m_mem_paging_resume(struct p2m_do
>  }
>  #endif /* __x86_64__ */
>  
> +static struct p2m_domain *
> +p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
> +{
> +    int i, lru_index = -1;
> +    struct p2m_domain *lrup2m, *tmp;
> +
> +    if (p2m == NULL) {
> +        lru_index = MAX_NESTEDP2M - 1;
> +        lrup2m = d->arch.nested_p2m[lru_index];
> +    } else {
> +        lrup2m = p2m;
> +        for (i = 0; i < MAX_NESTEDP2M; i++) {
> +            if (d->arch.nested_p2m[i] == p2m) {
> +                lru_index = i;
> +                break;
> +            }
> +        }
> +    }
> +
> +    ASSERT(lru_index >= 0);
> +    if (lru_index == 0) {
> +        return lrup2m;
> +    }
> +
> +    /* move the other's down the array "list" */
> +    for (i = lru_index - 1; i >= 0; i--) {
> +        tmp = d->arch.nested_p2m[i];
> +        d->arch.nested_p2m[i+1] = tmp;        
> +    }
> +
> +    /* make the entry the first one */
> +    d->arch.nested_p2m[0] = lrup2m;
> +
> +    return lrup2m;
> +}
> +
> +static int 
> +p2m_flush_locked(struct p2m_domain *p2m)
> +{
> +    struct page_info * (*alloc)(struct p2m_domain *);
> +    void (*free)(struct p2m_domain *, struct page_info *);
> +
> +    alloc = p2m->alloc_page;
> +    free = p2m->free_page;
> +
> +    if (p2m->cr3 == 0)
> +        /* Microoptimisation: p2m is already empty.
> +         * => about 0.3% speedup of overall system performance.
> +         */
> +        return 0;

What happens if a malicious guest uses 0 as its actual CR3 value?
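
If you need an "unused/invalid" marker for nested p2ms, it's probably
better to use a value no guest can ever load than to overload 0 --
something like (CR3_EADDR is a name I'm making up here):

    /* Sketch: reserve an impossible CR3 value as the "unused" marker,
     * since 0 is a perfectly legal value for a guest to put in CR3. */
    #define CR3_EADDR  (~0ULL)

    if ( p2m->cr3 == CR3_EADDR )
        /* p2m is already empty, nothing to flush. */
        return 0;

(and then initialise p2m->cr3 to that marker in p2m_initialise() and reset
it to the marker when the p2m is flushed.)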

> +
> +    p2m_teardown(p2m);
> +    p2m_initialise(p2m->domain, p2m);
> +    p2m->set_entry = nestedp2m_set_entry;
> +    BUG_ON(p2m_alloc_table(p2m, alloc, free) != 0);
> +
> +    ASSERT(p2m);
> +    ASSERT(p2m->alloc_page);
> +    return 0;
> +}
> +
> +void
> +p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
> +{
> +    struct domain *d = p2m->domain;
> +
> +    nestedhvm_vm_flushtlb(p2m);
> +    ASSERT(v->domain == d);
> +    vcpu_nestedhvm(v).nh_p2m = NULL;
> +    spin_lock(&d->arch.nested_p2m_lock);
> +    BUG_ON(p2m_flush_locked(p2m) != 0);
> +    spin_unlock(&d->arch.nested_p2m_lock);
> +    hvm_asid_flush_vcpu(v);
> +}
> +
> +void
> +p2m_flush_nestedp2m(struct domain *d)
> +{
> +    int i;
> +
> +    spin_lock(&d->arch.nested_p2m_lock);
> +    for (i = 0; i < MAX_NESTEDP2M; i++)
> +        BUG_ON(p2m_flush_locked(d->arch.nested_p2m[i]) != 0);
> +    spin_unlock(&d->arch.nested_p2m_lock);
> +    flush_tlb_mask(&d->domain_dirty_cpumask);
> +}
> +
> +struct p2m_domain *
> +p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
> +{
> +    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
> +    struct domain *d;
> +    struct p2m_domain *p2m;
> +    int i, rv;
> +
> +    if (cr3 == 0)
> +        cr3 = v->arch.hvm_vcpu.guest_cr[3];
> +
> +    if (hvm->nh_flushp2m && hvm->nh_p2m) {
> +        nestedhvm_vm_flushtlb(hvm->nh_p2m);
> +        hvm->nh_p2m = NULL;
> +    }
> +
> +    d = v->domain;
> +    spin_lock(&d->arch.nested_p2m_lock);
> +    for (i = 0; i < MAX_NESTEDP2M; i++) {
> +        p2m = d->arch.nested_p2m[i];
> +        if (p2m->cr3 == cr3 && p2m == hvm->nh_p2m) {
> +            p2m_getlru_nestedp2m(d, p2m);
> +            if (hvm->nh_flushp2m) {
> +               BUG_ON(p2m_flush_locked(p2m) != 0);
> +               hvm->nh_flushp2m = 0;
> +               hvm_asid_flush_vcpu(v);
> +            }
> +            p2m->cr3 = cr3;
> +            spin_unlock(&d->arch.nested_p2m_lock);
> +            return p2m;
> +        }
> +        if (p2m->cr3 == 0) { /* found unused p2m table */
> +            hvm->nh_flushp2m = 0;
> +            p2m_getlru_nestedp2m(d, p2m);
> +            hvm->nh_p2m = p2m;
> +            p2m->cr3 = cr3;
> +            spin_unlock(&d->arch.nested_p2m_lock);
> +            hvm_asid_flush_vcpu(v);
> +            return p2m;
> +        }
> +    }
> +
> +    /* All p2m's are or were in use. We know the least recent used one.
> +     * Destroy and re-initialize it.
> +     */
> +    for (i = 0; i < MAX_NESTEDP2M; i++) {
> +        p2m = p2m_getlru_nestedp2m(d, NULL);
> +        rv = p2m_flush_locked(p2m);


Is this enough?  If this p2m might still be in host_vmcb->h_cr3 on
another vcpu, then you need to make sure that vcpu is made to stop
using it.
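
Before recycling it, something along these lines might do (sketch only;
the exact mechanism is up to you):

    /* Make sure no other vcpu keeps running on the recycled p2m:
     * clear its nh_p2m pointer and flush its ASID so it picks up a
     * fresh nested p2m on its next VM entry, then flush the pcpus
     * recorded in p2m_dirty_cpumask. */
    struct vcpu *v2;
    for_each_vcpu ( d, v2 )
        if ( vcpu_nestedhvm(v2).nh_p2m == p2m )
        {
            vcpu_nestedhvm(v2).nh_p2m = NULL;
            hvm_asid_flush_vcpu(v2);
        }
    flush_tlb_mask(&p2m->p2m_dirty_cpumask);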

Cheers,

Tim.

-- 
Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel