Hi,
This is looking better - I think the TLB flushing is almost right.
A few more detailed comments below.
Content-Description: xen_nh13_haphap.diff
> # HG changeset patch
> # User cegger
> # Date 1283345895 -7200
> Implement Nested-on-Nested.
> This allows the guest to run nested guest with hap enabled.
>
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/hvm/hvm.c
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1042,12 +1042,56 @@ void hvm_inject_exception(unsigned int t
> hvm_funcs.inject_exception(trapnr, errcode, cr2);
> }
>
> -bool_t hvm_hap_nested_page_fault(unsigned long gfn)
> +bool_t hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs)
> {
> p2m_type_t p2mt;
> mfn_t mfn;
> struct vcpu *v = current;
> struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
> + unsigned long gfn = gpa >> PAGE_SHIFT;
> + int rv;
> +
> + /* On Nested Virtualization, walk the guest page table.
> + * If this succeeds, all is fine.
> + * If this fails, inject a nested page fault into the guest.
> + */
> + if ( nestedhvm_enabled(v->domain)
> + && nestedhvm_vcpu_in_guestmode(v)
> + && nestedhvm_paging_mode_hap(v) )
> + {
> + enum nestedhvm_vmexits nsret;
> + struct nestedhvm *hvm = &vcpu_nestedhvm(v);
> +
> + /* nested guest gpa == guest gva */
This comment is confusing to me. The nested guest gpa isn't a virtual
address, and certainly not the same as an L1-guest VA. Can you reword
it, maybe just to say what the call to nestedhvm_hap_nested_page_fault()
is going to do?
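Something along these lines, maybe (wording is only a suggestion):

    /* The vcpu is in L2 guest mode with nested paging enabled, so this
     * is an L2 gpa.  Try to handle the fault by translating it through
     * the L1 guest's nested page tables and fixing up the nested p2m;
     * if the L1's tables don't map it, fall through and inject a
     * nested page fault (#VMEXIT(NPF)) into the L1 guest. */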
> + rv = nestedhvm_hap_nested_page_fault(v, gpa);
> + switch (rv) {
> + case NESTEDHVM_PAGEFAULT_DONE:
> + return 1;
> + case NESTEDHVM_PAGEFAULT_ERROR:
> + return 0;
> + case NESTEDHVM_PAGEFAULT_INJECT:
> + break;
> + }
> +
> + /* inject #VMEXIT(NPF) into guest. */
> + hvm->nh_forcevmexit.exitcode = NESTEDHVM_INTERCEPT_NPF;
> + hvm->nh_forcevmexit.exitinfo1 = regs->error_code;
> + hvm->nh_forcevmexit.exitinfo2 = gpa;
> + hvm->nh_hostflags.fields.forcevmexit = 1;
> + nsret = nestedhvm_vcpu_vmexit(v, regs, NESTEDHVM_INTERCEPT_NPF);
> + hvm->nh_hostflags.fields.forcevmexit = 0;
> + switch (nsret) {
> + case NESTEDHVM_VMEXIT_DONE:
> + case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */
> + return 1;
> + case NESTEDHVM_VMEXIT_HOST:
> + case NESTEDHVM_VMEXIT_CONTINUE:
> + case NESTEDHVM_VMEXIT_FATALERROR:
> + default:
> + gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret);
> + return 0;
> + }
> + }
>
> mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
>
> @@ -1843,7 +1908,7 @@ static enum hvm_copy_result __hvm_copy(
>
> if ( flags & HVMCOPY_virt )
> {
> - gfn = paging_gva_to_gfn(curr, addr, &pfec);
> + gfn = paging_p2m_ga_to_gfn(curr, p2m, mode, cr3, addr, &pfec);
Do other callers of paging_gva_to_gfn need to handle nested-npt mode as
well? If so, would it be better to update paging_gva_to_gfn() itself?
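I'm thinking of something like the sketch below.  paging_get_nestedmode()
and paging_get_hostmode() are my guesses at the accessor names, and the
p2m/mode/cr3 selection is a guess at what __hvm_copy() now does before
the quoted hunk, so treat this as shape rather than substance:

    unsigned long paging_gva_to_gfn(struct vcpu *v, unsigned long va,
                                    uint32_t *pfec)
    {
        if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v)
             && nestedhvm_paging_mode_hap(v) )
        {
            /* Running an L2 guest: translate through the L1's
             * paging structures instead. */
            struct p2m_domain *p2m =
                p2m_get_nestedp2m(v, vcpu_nestedhvm(v).nh_vm_hostcr3);
            const struct paging_mode *mode = paging_get_nestedmode(v);

            return paging_p2m_ga_to_gfn(v, p2m, mode,
                                        v->arch.hvm_vcpu.guest_cr[3],
                                        va, pfec);
        }

        /* Existing behaviour for the non-nested case. */
        return paging_get_hostmode(v)->gva_to_gfn(v, va, pfec);
    }

That way __hvm_copy() and the other callers wouldn't need to know about
nested mode at all.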
> if ( gfn == INVALID_GFN )
> {
> if ( pfec == PFEC_page_paged )
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/hvm/nestedhvm.c
> --- a/xen/arch/x86/hvm/nestedhvm.c
> +++ b/xen/arch/x86/hvm/nestedhvm.c
> @@ -20,6 +20,7 @@
> #include <asm/msr.h>
> #include <asm/hvm/support.h> /* for HVM_DELIVER_NO_ERROR_CODE */
> #include <asm/hvm/hvm.h>
> +#include <asm/p2m.h> /* for struct p2m_domain */
> #include <asm/hvm/nestedhvm.h>
> #include <asm/event.h> /* for local_event_delivery_(en|dis)able */
> #include <asm/paging.h> /* for paging_mode_hap() */
> @@ -477,6 +478,7 @@ nestedhvm_vcpu_vmexit(struct vcpu *v, st
> enum hvm_copy_result hvm_rc;
>
> hvm->nh_hostflags.fields.vmentry = 1;
> + paging_update_nestedmode(v);
> if (nestedhvm_vcpu_in_guestmode(v)) {
> ret = nestedhvm_vmexit(v, regs, exitcode);
> switch (ret) {
> @@ -529,14 +531,30 @@ nestedhvm_vcpu_vmexit(struct vcpu *v, st
> void
> nestedhvm_vcpu_enter_guestmode(struct vcpu *v)
> {
> + struct p2m_domain *p2m;
> vcpu_nestedhvm(v).nh_guestmode = 1;
> +
> + p2m = vcpu_nestedhvm(v).nh_p2m;
> + if (p2m == NULL)
> + /* p2m has either been invalidated or not yet assigned. */
> + return;
> +
> + cpu_set(v->processor, p2m->p2m_dirty_cpumask);
Is this arch-specific code?  (Likewise the cpu_clear() that follows.)
Also, in the case where p2m is NULL here, you'll need to set the bit in
p2m->p2m_dirty_cpumask later, at the point where the p2m is actually
assigned.
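That is, wherever nh_p2m gets (re)assigned while the vcpu may already be
in guest mode -- p2m_get_nestedp2m(), by the look of it -- something
along these lines (sketch):

    /* ...after hvm->nh_p2m = p2m; p2m->cr3 = cr3; ... */
    if ( nestedhvm_vcpu_in_guestmode(v) )
        cpu_set(v->processor, p2m->p2m_dirty_cpumask);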
> }
>
> void
> nestedhvm_vcpu_exit_guestmode(struct vcpu *v)
> {
> + struct p2m_domain *p2m;
> vcpu_nestedhvm(v).nh_guestmode = 0;
> -}
> +
> + p2m = vcpu_nestedhvm(v).nh_p2m;
> + if (p2m == NULL)
> + /* p2m has either been invalidated or not yet assigned. */
> + return;
> +
> + cpu_clear(v->processor, p2m->p2m_dirty_cpumask);
> +}
>
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/hap/hap.c
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -40,6 +40,7 @@
> #include <asm/p2m.h>
> #include <asm/domain.h>
> #include <xen/numa.h>
> +#include <asm/hvm/nestedhvm.h>
>
> #include "private.h"
>
> @@ -340,6 +341,30 @@ static void hap_free_p2m_page(struct p2m
> hap_unlock(d);
> }
>
> +#define nestedp2m_alloc_p2m_page hap_alloc_p2m_page
> +
> +/* We must use hap_free() or flushing the nested p2m tables fails
> + * with "freeing ptp fails due to insufficient pages.
That's quite sensible -- other code that frees p2m pages directly to Xen
is basically wrong. It's a relic of the shadow allocator, where freeing
individual p2m pages to the shadow pool would have been a bit tricky and
I knew at the time that p2m pages were only ever freed on destruction.
I really ought to update the shadow p2m-free path now that the shadow
pool is simpler. It just needs a little care around the order things
happen in teardown.
> + * XXX This triggers a bug in p2m that causes a crash in
> + * xen/common/page_alloc.c:1201 on L1 guest shutdown/destroy.
> + */
> +static void
> +nestedp2m_free_p2m_page(struct p2m_domain *p2m, struct page_info *pg)
> +{
> + struct domain *d = p2m->domain;
> + hap_lock(d);
> + ASSERT(page_get_owner(pg) == d);
> + /* Should have just the one ref we gave it in alloc_p2m_page() */
> + BUG_ON((pg->count_info & PGC_count_mask) != 1);
> + pg->count_info = 0;
> + page_set_owner(pg, NULL);
> + hap_free(d, page_to_mfn(pg));
> + d->arch.paging.hap.total_pages++;
> + d->arch.paging.hap.p2m_pages--;
> + ASSERT(d->arch.paging.hap.p2m_pages >= 0);
> + hap_unlock(d);
> +}
> +
> /* Return the size of the pool, rounded up to the nearest MB */
> static unsigned int
> hap_get_allocation(struct domain *d)
> +static int
> +nestedp2m_next_level(struct p2m_domain *p2m, struct page_info **table_pg,
> + void **table, unsigned long *gfn_remainder,
> + unsigned long gfn, uint32_t shift, uint32_t max,
> + unsigned long type)
> +{
> + l1_pgentry_t *l1_entry;
> + l1_pgentry_t *p2m_entry;
> + l1_pgentry_t new_entry;
> + void *next;
> + int i;
> +
> + ASSERT(p2m);
> + ASSERT(p2m->alloc_page);
> +
> +    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, shift, max)) )
> + return 0;
> +
> + if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
> + {
> + struct page_info *pg;
> +
> + pg = p2m_alloc_ptp(p2m, type);
> + if ( pg == NULL )
> + return 0;
> +
> + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> + __PAGE_HYPERVISOR | _PAGE_USER);
> +
> + switch ( type ) {
> + case PGT_l3_page_table:
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + break;
> + case PGT_l2_page_table:
> +#if CONFIG_PAGING_LEVELS == 3
> + /* for PAE mode, PDPE only has PCD/PWT/P bits available */
> + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
> +#endif
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + break;
> + case PGT_l1_page_table:
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + break;
> + default:
> + BUG();
> + break;
> + }
> + }
> +
> + ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
> +
> + /* split single large page into 4KB page in P2M table */
> +    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
> + {
> + unsigned long flags, pfn;
> + struct page_info *pg;
> +
> + pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
> + if ( pg == NULL )
> + return 0;
> +
> + /* New splintered mappings inherit the flags of the old superpage,
> + * with a little reorganisation for the _PAGE_PSE_PAT bit. */
> + flags = l1e_get_flags(*p2m_entry);
> + pfn = l1e_get_pfn(*p2m_entry);
> + if ( pfn & 1 ) /* ==> _PAGE_PSE_PAT was set */
> + pfn -= 1; /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
> + else
> + flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
> +
> + l1_entry = __map_domain_page(pg);
> + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
> + {
> + new_entry = l1e_from_pfn(pfn + i, flags);
> + nested_write_p2m_entry(p2m, l1_entry+i, new_entry);
> + }
> + unmap_domain_page(l1_entry);
> +
> + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> + __PAGE_HYPERVISOR|_PAGE_USER);
> + nested_write_p2m_entry(p2m, p2m_entry, new_entry);
> + }
> +
> + *table_pg = l1e_get_page(*p2m_entry);
> + next = __map_domain_page(*table_pg);
> + unmap_domain_page(*table);
> + *table = next;
> +
> + return 1;
> +}
This function looks like it duplicates a lot of logic from an old
version of the normal p2m next-level function. Would it be easy to
merge them?
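One way to merge them might be to make the entry writer a per-p2m hook,
so a single next-level walker serves both.  Rough sketch only, and the
hook name is my invention:

    /* In struct p2m_domain: */
    void (*write_p2m_entry)(struct p2m_domain *p2m, l1_pgentry_t *p,
                            l1_pgentry_t new);

    /* The host p2m installs its normal writer and the nested p2ms
     * install nested_write_p2m_entry() at init time; then every
     *
     *     nested_write_p2m_entry(p2m, p2m_entry, new_entry);
     *
     * in the walker above becomes
     *
     *     p2m->write_p2m_entry(p2m, p2m_entry, new_entry);
     *
     * and nestedp2m_next_level() can collapse into the existing
     * next-level code in p2m.c. */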
> +int
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> + unsigned int page_order, p2m_type_t p2mt);
> +
> +int
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> + unsigned int page_order, p2m_type_t p2mt)
> +{
> + struct page_info *table_pg;
> + void *table;
> + unsigned long gfn_remainder = gfn;
> + l1_pgentry_t *p2m_entry;
> + l1_pgentry_t entry_content;
> + l2_pgentry_t l2e_content;
> + int rv = 0;
> +
> + ASSERT(p2m);
> + ASSERT(p2m->alloc_page);
> +
> + /* address of nested paging table */
> + table_pg = pagetable_get_page(p2m_get_pagetable(p2m));
> + table = __map_domain_page(table_pg);
> +
> +#if CONFIG_PAGING_LEVELS >= 4
> + if ( !nestedp2m_next_level(p2m, &table_pg, &table,
> + &gfn_remainder, gfn,
> + L4_PAGETABLE_SHIFT - PAGE_SHIFT,
> + L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
> + goto out;
> +#endif
> +
> + if ( !nestedp2m_next_level(p2m, &table_pg, &table, &gfn_remainder,
> + gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT,
> + ((CONFIG_PAGING_LEVELS == 3)
> + ? (paging_mode_hap(p2m->domain) ? 4 : 8)
> + : L3_PAGETABLE_ENTRIES),
> + PGT_l2_page_table) )
> + goto out;
> +
> + if ( page_order == 0 )
> + {
> + if ( !nestedp2m_next_level(p2m, &table_pg, &table,
> + &gfn_remainder, gfn,
> + L2_PAGETABLE_SHIFT - PAGE_SHIFT,
> + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
> + goto out;
> +
> + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
> + 0, L1_PAGETABLE_ENTRIES);
> + ASSERT(p2m_entry);
> +
> + if ( mfn_valid(mfn) ) {
> + entry_content = l1e_from_pfn(mfn_x(mfn),
> + p2m_type_to_flags(p2mt));
> + } else {
> + entry_content = l1e_empty();
> + }
> +
> + /* level 1 entry */
> + nested_write_p2m_entry(p2m, p2m_entry, entry_content);
> + }
> + else
> + {
> + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
> + L2_PAGETABLE_SHIFT - PAGE_SHIFT,
> + L2_PAGETABLE_ENTRIES);
> + ASSERT(p2m_entry);
> +
> + /* FIXME: Deal with 4k replaced by 2MB pages */
> + if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
> + !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
> + {
> + domain_crash(p2m->domain);
> + goto out;
> + }
> +
> + if ( mfn_valid(mfn) )
> + l2e_content = l2e_from_pfn(mfn_x(mfn),
> + p2m_type_to_flags(p2mt) | _PAGE_PSE);
> + else {
> + l2e_content = l2e_empty();
> + }
> +
> + entry_content.l1 = l2e_content.l2;
> + nested_write_p2m_entry(p2m, p2m_entry, entry_content);
> + }
> +
> + /* Track the highest gfn for which we have ever had a valid mapping */
> + if ( mfn_valid(mfn)
> + && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
> + p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
> +
> + /* Success */
> + rv = 1;
> +
> +out:
> + unmap_domain_page(table);
> + return rv;
> +}
Same for this: looks like it duplicates existing code.
> +int
> +nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
> +{
> + int rv;
> + paddr_t L1_gpa, L0_gpa;
> + struct domain *d = v->domain;
> + struct p2m_domain *p2m, *nested_p2m;
> +
> + p2m = p2m_get_hostp2m(d); /* L0 p2m */
> + nested_p2m = p2m_get_nestedp2m(v, vcpu_nestedhvm(v).nh_vm_hostcr3);
> +
> + /* walk the L1 P2M table, note we have to pass p2m
> + * and not nested_p2m here or we fail the walk forever,
> + * otherwise. */
Can you explain that more fully? It looks like you're walking the L0
table to translate an L2 gfn into an L1 gfn, which surely can't be right.
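(If the host p2m is only needed so the walker can *read* the L1's
nested page-table entries from guest memory -- with the walk itself
rooted at nh_vm_hostcr3 -- then a comment saying so would clear this up,
e.g.:

    /* Walk the L1 guest's nested page tables, rooted at nh_vm_hostcr3,
     * to translate L2_gpa into an L1 gpa.  The host p2m is passed only
     * so the walker can fetch the L1's table entries from memory. */

If the host p2m really is the table being walked, that still looks
wrong to me.)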
> + rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
> +
> + /* let caller to handle these two cases */
> + switch (rv) {
> + case NESTEDHVM_PAGEFAULT_INJECT:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_ERROR:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_DONE:
> + break;
> + default:
> + BUG();
> + break;
> + }
> +
> + /* ==> we have to walk L0 P2M */
> + rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
> +
> + /* let upper level caller to handle these two cases */
> + switch (rv) {
> + case NESTEDHVM_PAGEFAULT_INJECT:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_ERROR:
> + return rv;
> + case NESTEDHVM_PAGEFAULT_DONE:
> + break;
> + default:
> + BUG();
> + break;
> + }
> +
> + /* fix p2m_get_pagetable(nested_p2m) */
> + nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa);
> +
> + return NESTEDHVM_PAGEFAULT_DONE;
> +}
> +
> +/********************************************/
> +/* NESTED VIRT INITIALIZATION FUNCS */
> +/********************************************/
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/hap/private.h
> --- a/xen/arch/x86/mm/hap/private.h
> +++ b/xen/arch/x86/mm/hap/private.h
> @@ -30,4 +30,14 @@ unsigned long hap_gva_to_gfn_3_levels(st
> unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
> uint32_t *pfec);
>
> +unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
> + struct p2m_domain *p2m, unsigned long cr3,
> + paddr_t ga, uint32_t *pfec);
> +unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
> + struct p2m_domain *p2m, unsigned long cr3,
> + paddr_t ga, uint32_t *pfec);
> +unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
> + struct p2m_domain *p2m, unsigned long cr3,
> + paddr_t ga, uint32_t *pfec);
> +
> #endif /* __HAP_PRIVATE_H__ */
> diff -r 50b3e6c73d7c -r 24daea64bdff xen/arch/x86/mm/p2m.c
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -34,6 +34,7 @@
> #include <public/mem_event.h>
> #include <asm/mem_sharing.h>
> #include <xen/event.h>
> +#include <asm/hvm/nestedhvm.h>
>
> /* Debugging and auditing of the P2M code? */
> #define P2M_AUDIT 0
> @@ -72,7 +73,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
> #define SUPERPAGE_PAGES (1UL << 9)
> #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
>
> -static unsigned long p2m_type_to_flags(p2m_type_t t)
> +unsigned long p2m_type_to_flags(p2m_type_t t)
> {
> unsigned long flags;
> #ifdef __x86_64__
> @@ -116,9 +117,9 @@ static void audit_p2m(struct p2m_domain
> // Find the next level's P2M entry, checking for out-of-range gfn's...
> // Returns NULL on error.
> //
> -static l1_pgentry_t *
> +l1_pgentry_t *
> p2m_find_entry(void *table, unsigned long *gfn_remainder,
> - unsigned long gfn, u32 shift, u32 max)
> + unsigned long gfn, uint32_t shift, uint32_t max)
> {
> u32 index;
>
> @@ -1719,10 +1720,12 @@ static void p2m_initialise(struct domain
> INIT_PAGE_LIST_HEAD(&p2m->pod.single);
>
> p2m->domain = d;
> + p2m->cr3 = 0;
> p2m->set_entry = p2m_set_entry;
> p2m->get_entry = p2m_gfn_to_mfn;
> p2m->get_entry_current = p2m_gfn_to_mfn_current;
> p2m->change_entry_type_global = p2m_change_type_global;
> + cpus_clear(p2m->p2m_dirty_cpumask);
>
> if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
> ept_p2m_init(d);
> @@ -1730,6 +1733,28 @@ static void p2m_initialise(struct domain
> return;
> }
>
> +extern int
> +nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
> + unsigned int page_order, p2m_type_t p2mt);
Please declare this in a header file rather than here.
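e.g. something like this, assuming asm-x86/p2m.h is where the other
nested-p2m bits end up:

    /* Set an entry in a nested p2m table (defined in hap/hap.c). */
    int nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn,
                            mfn_t mfn, unsigned int page_order,
                            p2m_type_t p2mt);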
> +static int
> +p2m_init_nestedp2m(struct domain *d)
> +{
> + uint8_t i;
> + struct p2m_domain *p2m;
> +
> + spin_lock_init(&d->arch.nested_p2m_lock);
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
> + if (p2m == NULL)
> + return -ENOMEM;
> + p2m_initialise(d, p2m);
> + p2m->set_entry = nestedp2m_set_entry;
> + }
> +
> + return 0;
> +}
> +
> int p2m_init(struct domain *d)
> {
> struct p2m_domain *p2m;
> @@ -1739,7 +1764,11 @@ int p2m_init(struct domain *d)
> return -ENOMEM;
> p2m_initialise(d, p2m);
>
> - return 0;
> + /* Must initialise nestedp2m unconditionally
> + * since nestedhvm_enabled(d) returns false here.
> + * (p2m_init runs too early for HVM_PARAM_* options)
> + */
> + return p2m_init_nestedp2m(d);
> }
>
> void p2m_change_entry_type_global(struct p2m_domain *p2m,
> @@ -1836,6 +1865,9 @@ int p2m_alloc_table(struct p2m_domain *p
> p2m_invalid) )
> goto error;
>
> + if (p2m_is_nestedp2m(p2m))
> + goto nesteddone;
> +
> /* Copy all existing mappings from the page list and m2p */
> spin_lock(&p2m->domain->page_alloc_lock);
> page_list_for_each(page, &p2m->domain->page_list)
> @@ -1857,6 +1889,7 @@ int p2m_alloc_table(struct p2m_domain *p
> }
> spin_unlock(&p2m->domain->page_alloc_lock);
>
> + nesteddone:
> P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
> p2m_unlock(p2m);
> return 0;
> @@ -1881,6 +1914,9 @@ void p2m_teardown(struct p2m_domain *p2m
> mfn_t mfn;
> #endif
>
> + if (p2m == NULL)
> + return;
> +
> p2m_lock(p2m);
>
> #ifdef __x86_64__
> @@ -1899,11 +1935,26 @@ void p2m_teardown(struct p2m_domain *p2m
> p2m_unlock(p2m);
> }
>
> +static void p2m_teardown_nestedp2m(struct domain *d)
> +{
> + uint8_t i;
> +
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + xfree(d->arch.nested_p2m[i]);
> + d->arch.nested_p2m[i] = NULL;
> + }
> +}
> +
> void p2m_final_teardown(struct domain *d)
> {
> /* Iterate over all p2m tables per domain */
> xfree(d->arch.p2m);
> d->arch.p2m = NULL;
> +
> + /* We must teardown unconditionally because
> + * we initialise them unconditionally.
> + */
> + p2m_teardown_nestedp2m(d);
> }
>
> #if P2M_AUDIT
> @@ -2823,6 +2874,173 @@ void p2m_mem_paging_resume(struct p2m_do
> }
> #endif /* __x86_64__ */
>
> +static struct p2m_domain *
> +p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
> +{
> + int i, lru_index = -1;
> + struct p2m_domain *lrup2m, *tmp;
> +
> + if (p2m == NULL) {
> + lru_index = MAX_NESTEDP2M - 1;
> + lrup2m = d->arch.nested_p2m[lru_index];
> + } else {
> + lrup2m = p2m;
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + if (d->arch.nested_p2m[i] == p2m) {
> + lru_index = i;
> + break;
> + }
> + }
> + }
> +
> + ASSERT(lru_index >= 0);
> + if (lru_index == 0) {
> + return lrup2m;
> + }
> +
> + /* move the other's down the array "list" */
> + for (i = lru_index - 1; i >= 0; i--) {
> + tmp = d->arch.nested_p2m[i];
> + d->arch.nested_p2m[i+1] = tmp;
> + }
> +
> + /* make the entry the first one */
> + d->arch.nested_p2m[0] = lrup2m;
> +
> + return lrup2m;
> +}
> +
> +static int
> +p2m_flush_locked(struct p2m_domain *p2m)
> +{
> + struct page_info * (*alloc)(struct p2m_domain *);
> + void (*free)(struct p2m_domain *, struct page_info *);
> +
> + alloc = p2m->alloc_page;
> + free = p2m->free_page;
> +
> + if (p2m->cr3 == 0)
> + /* Microoptimisation: p2m is already empty.
> + * => about 0.3% speedup of overall system performance.
> + */
> + return 0;
What happens if a malicious guest uses 0 as its actual CR3 value?
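One option is to reserve a value that can never be a real CR3 as the
"unused/empty" marker instead of 0.  Sketch, with an invented name:

    #define CR3_EADDR (~0ULL)   /* can't be loaded into CR3 */

    /* p2m_initialise() and p2m_flush_locked() would then set
     * p2m->cr3 = CR3_EADDR, and this check (and the "found unused
     * p2m table" check in p2m_get_nestedp2m()) becomes: */
    if ( p2m->cr3 == CR3_EADDR )
        /* p2m is already empty */
        return 0;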
> +
> + p2m_teardown(p2m);
> + p2m_initialise(p2m->domain, p2m);
> + p2m->set_entry = nestedp2m_set_entry;
> + BUG_ON(p2m_alloc_table(p2m, alloc, free) != 0);
> +
> + ASSERT(p2m);
> + ASSERT(p2m->alloc_page);
> + return 0;
> +}
> +
> +void
> +p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
> +{
> + struct domain *d = p2m->domain;
> +
> + nestedhvm_vm_flushtlb(p2m);
> + ASSERT(v->domain == d);
> + vcpu_nestedhvm(v).nh_p2m = NULL;
> + spin_lock(&d->arch.nested_p2m_lock);
> + BUG_ON(p2m_flush_locked(p2m) != 0);
> + spin_unlock(&d->arch.nested_p2m_lock);
> + hvm_asid_flush_vcpu(v);
> +}
> +
> +void
> +p2m_flush_nestedp2m(struct domain *d)
> +{
> + int i;
> +
> + spin_lock(&d->arch.nested_p2m_lock);
> + for (i = 0; i < MAX_NESTEDP2M; i++)
> + BUG_ON(p2m_flush_locked(d->arch.nested_p2m[i]) != 0);
> + spin_unlock(&d->arch.nested_p2m_lock);
> + flush_tlb_mask(&d->domain_dirty_cpumask);
> +}
> +
> +struct p2m_domain *
> +p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
> +{
> + struct nestedhvm *hvm = &vcpu_nestedhvm(v);
> + struct domain *d;
> + struct p2m_domain *p2m;
> + int i, rv;
> +
> + if (cr3 == 0)
> + cr3 = v->arch.hvm_vcpu.guest_cr[3];
> +
> + if (hvm->nh_flushp2m && hvm->nh_p2m) {
> + nestedhvm_vm_flushtlb(hvm->nh_p2m);
> + hvm->nh_p2m = NULL;
> + }
> +
> + d = v->domain;
> + spin_lock(&d->arch.nested_p2m_lock);
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + p2m = d->arch.nested_p2m[i];
> + if (p2m->cr3 == cr3 && p2m == hvm->nh_p2m) {
> + p2m_getlru_nestedp2m(d, p2m);
> + if (hvm->nh_flushp2m) {
> + BUG_ON(p2m_flush_locked(p2m) != 0);
> + hvm->nh_flushp2m = 0;
> + hvm_asid_flush_vcpu(v);
> + }
> + p2m->cr3 = cr3;
> + spin_unlock(&d->arch.nested_p2m_lock);
> + return p2m;
> + }
> + if (p2m->cr3 == 0) { /* found unused p2m table */
> + hvm->nh_flushp2m = 0;
> + p2m_getlru_nestedp2m(d, p2m);
> + hvm->nh_p2m = p2m;
> + p2m->cr3 = cr3;
> + spin_unlock(&d->arch.nested_p2m_lock);
> + hvm_asid_flush_vcpu(v);
> + return p2m;
> + }
> + }
> +
> + /* All p2m's are or were in use. We know the least recent used one.
> + * Destroy and re-initialize it.
> + */
> + for (i = 0; i < MAX_NESTEDP2M; i++) {
> + p2m = p2m_getlru_nestedp2m(d, NULL);
> + rv = p2m_flush_locked(p2m);
Is this enough? If this p2m might still be in host_vmcb->h_cr3 on
another vcpu, you need to make sure that vcpu is made to stop using it
before the p2m is reused.
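Something like this before recycling it, perhaps, plus whatever
flush/IPI is needed so a vcpu currently running on it really does drop
it:

    struct vcpu *v;

    /* Make sure no vcpu still refers to the p2m we're about to reuse. */
    for_each_vcpu ( d, v )
        if ( vcpu_nestedhvm(v).nh_p2m == p2m )
        {
            vcpu_nestedhvm(v).nh_p2m = NULL;
            hvm_asid_flush_vcpu(v);
        }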
Cheers,
Tim.
--
Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd. (Company #02937203, SL9 0BG)
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel