# HG changeset patch
# User cegger
# Date 1271330316 -7200
Implemented Nested-on-Nested. This allows the guest to run a nested guest with HAP.

diff -r 3f48b73b0a30 -r d88026a2afdc xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -955,12 +955,51 @@ void hvm_triple_fault(void)
     domain_shutdown(v->domain, SHUTDOWN_reboot);
 }
 
-bool_t hvm_hap_nested_page_fault(unsigned long gfn)
+bool_t hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs)
 {
     p2m_type_t p2mt;
     mfn_t mfn;
     struct vcpu *v = current;
     struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+    unsigned long gfn = gpa >> PAGE_SHIFT;
+    int rv;
+
+    /* On Nested Virtualization, walk the guest page table.
+     * If this succeeds, all is fine.
+     * If this fails, inject a nested page fault into the guest.
+     */
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
+    {
+        enum nestedhvm_vmexits nsret;
+
+        /* nested guest gpa == guest gva */
+        rv = nestedhvm_hap_nested_page_fault(v, gpa);
+        switch (rv) {
+        case NESTEDHVM_PAGEFAULT_DONE:
+            return 1;
+        case NESTEDHVM_PAGEFAULT_ERROR:
+            return 0;
+        case NESTEDHVM_PAGEFAULT_INJECT:
+            break;
+        }
+
+        /* inject #VMEXIT(NPF) into guest. */
+        VCPU_NESTEDHVM(v).nh_forcevmexit_exitcode = VMEXIT_NPF;
+        VCPU_NESTEDHVM(v).nh_hostflags.fields.forcevmexit = 1;
+        nsret = nestedhvm_vcpu_vmexit(v, regs, VMEXIT_NPF);
+        VCPU_NESTEDHVM(v).nh_hostflags.fields.forcevmexit = 0;
+        switch (nsret) {
+        case NESTEDHVM_VMEXIT_DONE:
+        case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */
+            return 1;
+        case NESTEDHVM_VMEXIT_HOST:
+        case NESTEDHVM_VMEXIT_CONTINUE:
+        case NESTEDHVM_VMEXIT_FATALERROR:
+        default:
+            gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret);
+            return 0;
+        }
+    }
 
     mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
 
diff -r 3f48b73b0a30 -r d88026a2afdc xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -892,6 +892,12 @@ static int nsvm_vcpu_features(struct vcp
 
     *edx = 0;
 
+    /* We require the host to use nested paging as
+     * nested-on-shadow is not supported.
+     */
+    if ( cpu_has_svm_npt && paging_mode_hap(v->domain) )
+        *edx |= (1U << SVM_FEATURE_NPT);
+
     if ( cpu_has_svm_lbrv )
         *edx |= (1U << SVM_FEATURE_LBRV);
 #if 0 /* not yet implemented */
@@ -1222,6 +1228,10 @@ static int nsvm_vmcb_prepare4vmrun(struc
     /* Nested paging mode */
     if (nestedhvm_paging_mode_hap(v)) {
         /* host nested paging + guest nested paging. */
+        host_vmcb->np_enable = 1;
+
+        host_vmcb->h_cr3 =
+            pagetable_get_paddr(p2m_get_pagetable(p2m_get_nestedp2m(v, ns_vmcb->h_cr3)));
 
         /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
         rc = hvm_set_cr3(ns_vmcb->cr3);
@@ -1531,14 +1541,15 @@ void start_svm(struct cpuinfo_x86 *c)
     hvm_enable(&svm_function_table);
 }
 
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+    struct cpu_user_regs *regs, paddr_t gpa)
 {
     unsigned long gfn = gpa >> PAGE_SHIFT;
     mfn_t mfn;
     p2m_type_t p2mt;
     struct p2m_domain *p2m;
 
-    p2m = p2m_get_hostp2m(current->domain);
+    p2m = p2m_get_p2m(v, VCPU_NESTEDHVM(v).nh_vmcb_hcr3);
 
     if ( tb_init_done )
     {
@@ -1556,14 +1567,14 @@ static void svm_do_nested_pgfault(paddr_
         __trace_var(TRC_HVM_NPF, 0, sizeof(_d), (unsigned char *)&_d);
     }
 
-    if ( hvm_hap_nested_page_fault(gfn) )
+    if ( hvm_hap_nested_page_fault(gpa, regs) )
        return;
 
    /* Everything else is an error. */
    mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
    gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
             gpa, mfn_x(mfn), p2mt);
-    domain_crash(current->domain);
+    domain_crash(v->domain);
 }
 
 static void svm_fpu_dirty_intercept(void)
@@ -2354,7 +2365,7 @@ asmlinkage void svm_vmexit_handler(struc
     case VMEXIT_NPF:
         perfc_incra(svmexits, VMEXIT_NPF_PERFC);
         regs->error_code = vmcb->exitinfo1;
-        svm_do_nested_pgfault(vmcb->exitinfo2);
+        svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
         break;
 
     case VMEXIT_IRET:
diff -r 3f48b73b0a30 -r d88026a2afdc xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2145,7 +2145,7 @@ static void ept_handle_violation(unsigne
     }
 
     if ( (qualification & EPT_GLA_VALID) &&
-         hvm_hap_nested_page_fault(gfn) )
+         hvm_hap_nested_page_fault(gpa, guest_cpu_user_regs()) )
         return;
 
     /* Everything else is an error. */
diff -r 3f48b73b0a30 -r d88026a2afdc xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile
+++ b/xen/arch/x86/mm/hap/Makefile
@@ -3,6 +3,7 @@ obj-y += guest_walk_2level.o
 obj-y += guest_walk_3level.o
 obj-y += guest_walk_4level.o
 obj-y += p2m-ept.o
+obj-y += nested_hap.o
 
 guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
 guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r 3f48b73b0a30 -r d88026a2afdc xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -0,0 +1,469 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2010 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "private.h"
+
+/* ALGORITHM for NESTED PAGE FAULT
+ *
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ *    1. If #NPF is from L1 guest, then we crash the guest VM (same as old
+ *       code)
+ *    2. If #NPF is from L2 guest, then we continue from (3)
+ *    3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ *    4. Walk the h_cr3 page table
+ *    5.    - if not present, then we inject #NPF back to L1 guest and
+ *            re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ *            or fix its p2m table for L2 guest)
+ *    6.    - if present, then we get a new translated value L1-GPA
+ *            (points to L1 machine memory)
+ *    7.       * Use L1-GPA to walk L0 P2M table
+ *    8.         - if not present, then crash the guest (should not happen)
+ *    9.         - if present, then we get a new translated value MPA
+ *                 (points to real machine memory)
+ *    10.      * Finally, use GPA and MPA to walk nested_p2m
+ *               and fix the bits.
+ * }
+ *
+ */
+
+
+/********************************************/
+/* NESTED VIRT P2M FUNCTIONS */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+#define NESTED_P2M_BASE_FLAGS \
+        (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
+
+static unsigned long
+nested_p2m_type_to_flags(p2m_type_t t)
+{
+    unsigned long flags;
+#ifdef __x86_64__
+    flags = (unsigned long)(t & 0x3fff) << 9;
+#else
+    flags = (t & 0x7UL) << 9;
+#endif
+#ifndef HAVE_GRANT_MAP_P2M
+    BUG_ON(p2m_is_grant(t));
+#endif
+    switch(t)
+    {
+    case p2m_invalid:
+    default:
+        return flags;
+    case p2m_ram_rw:
+    case p2m_grant_map_rw:
+        return flags | NESTED_P2M_BASE_FLAGS | _PAGE_RW;
+    case p2m_ram_logdirty:
+        return flags | NESTED_P2M_BASE_FLAGS;
+    case p2m_ram_ro:
+    case p2m_grant_map_ro:
+        return flags | NESTED_P2M_BASE_FLAGS;
+    case p2m_ram_shared:
+        return flags | NESTED_P2M_BASE_FLAGS;
+    case p2m_mmio_dm:
+        return flags;
+    case p2m_mmio_direct:
+        return flags | NESTED_P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+    case p2m_populate_on_demand:
+        return flags;
+    }
+}
+
+static void
+nested_write_p2m_entry(struct p2m_domain *p2m,
+    l1_pgentry_t *p, l1_pgentry_t new)
+{
+    struct domain *d = p2m->domain;
+    uint32_t old_flags;
+
+    hap_lock(d);
+
+    old_flags = l1e_get_flags(*p);
+    safe_write_pte(p, new);
+
+    hap_unlock(d);
+}
+
+static l1_pgentry_t *
+nested_p2m_find_entry(void *table, unsigned long *gfn_remainder,
+                      unsigned long gfn, uint32_t shift, uint32_t max)
+{
+    uint32_t index;
+
+    index = *gfn_remainder >> shift;
+    if ( index >= max )
+    {
+        gdprintk(XENLOG_ERR, "gfn=0x%lx out of range "
+                 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+                 gfn, *gfn_remainder, shift, index, max);
+        return NULL;
+    }
+
+    *gfn_remainder &= (1 << shift) - 1;
+    return (l1_pgentry_t *)table + index;
+}
+
+static int
+nested_p2m_next_level(struct p2m_domain *p2m, struct page_info **table_pg,
+                      void **table, unsigned long *gfn_remainder,
+                      unsigned long gfn, uint32_t shift, uint32_t max,
+                      unsigned long type)
+{
+    l1_pgentry_t *l1_entry;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t new_entry;
+    void *next;
+    int i;
+
+    ASSERT(p2m);
+    ASSERT(p2m->alloc_page);
+
+    if ( !(p2m_entry = nested_p2m_find_entry(*table, gfn_remainder, gfn,
+                                             shift, max)) )
+        return 0;
+
+    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    {
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, type);
+        if ( pg == NULL )
+            return 0;
+
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR | _PAGE_USER);
+
+        switch ( type ) {
+        case PGT_l3_page_table:
+            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+            break;
+        case PGT_l2_page_table:
+#if CONFIG_PAGING_LEVELS == 3
+            /* for PAE mode, PDPE only has PCD/PWT/P bits available */
+            new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
+#endif
+            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+            break;
+        case PGT_l1_page_table:
+            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+            break;
+        default:
+            BUG();
+            break;
+        }
+    }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
+
+    /* split single large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
+        if ( pg == NULL )
+            return 0;
+
+        /* New splintered mappings inherit the flags of the old superpage,
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+
+        l1_entry = __map_domain_page(pg);
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            nested_write_p2m_entry(p2m, l1_entry+i, new_entry);
+        }
+        unmap_domain_page(l1_entry);
+
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+    }
+
+    *table_pg = l1e_get_page(*p2m_entry);
+    next = __map_domain_page(*table_pg);
+    unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+static int
+nested_p2m_set_entry(struct p2m_domain *p2m,
+                     unsigned long gfn, mfn_t mfn, unsigned int page_order,
+                     p2m_type_t p2mt)
+{
+    struct page_info *table_pg;
+    void *table;
+    unsigned long gfn_remainder = gfn;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
+    int rv = 0;
+
+    ASSERT(p2m);
+
+    /* address of nested paging table */
+    table_pg = pagetable_get_page(p2m_get_pagetable(p2m));
+    table = __map_domain_page(table_pg);
+
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( !nested_p2m_next_level(p2m, &table_pg, &table,
+                                &gfn_remainder, gfn,
+                                L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+        goto out;
+#endif
+
+    if ( !nested_p2m_next_level(p2m, &table_pg, &table, &gfn_remainder,
+                                gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                ((CONFIG_PAGING_LEVELS == 3)
+                                 ? (paging_mode_hap(p2m->domain) ? 4 : 8)
+                                 : L3_PAGETABLE_ENTRIES),
+                                PGT_l2_page_table) )
+        goto out;
+
+    if ( page_order == 0 )
+    {
+        if ( !nested_p2m_next_level(p2m, &table_pg, &table,
+                                    &gfn_remainder, gfn,
+                                    L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                    L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = nested_p2m_find_entry(table, &gfn_remainder, gfn,
+                                          0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+
+        if ( mfn_valid(mfn) ) {
+            entry_content = l1e_from_pfn(mfn_x(mfn),
+                                         nested_p2m_type_to_flags(p2mt));
+        } else {
+            entry_content = l1e_empty();
+        }
+
+        /* level 1 entry */
+        nested_write_p2m_entry(p2m, p2m_entry, entry_content);
+
+        ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+    }
+    else
+    {
+        printk("%s: fix l2, page_order %i\n", __func__, page_order);
+        p2m_entry = nested_p2m_find_entry(table, &gfn_remainder, gfn,
+                                          L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                          L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+
+        /* FIXME: Deal with 4k replaced by 2MB pages */
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            printk("configure P2M table 4KB L2 entry with large page\n");
+            domain_crash(p2m->domain);
+            goto out;
+        }
+
+        printk("%s: fix l2, page_order %i, mfn %s\n",
+               __func__, page_order, mfn_valid(mfn) ? "valid" : "invalid");
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                              nested_p2m_type_to_flags(p2mt) | _PAGE_PSE);
+        else {
+            l2e_content = l2e_empty();
+        }
+
+        printk("%s: write l2 entry, page_order %i\n", __func__, page_order);
+        entry_content.l1 = l2e_content.l2;
+        nested_write_p2m_entry(p2m, p2m_entry, entry_content);
+    }
+
+    /* Success */
+    rv = 1;
+
+out:
+    unmap_domain_page(table);
+    return rv;
+}
+
+/********************************************/
+/* NESTED VIRT FUNCTIONS */
+/********************************************/
+static void
+nested_hap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa)
+{
+    nested_p2m_set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+                         page_to_mfn(maddr_to_page(L0_gpa)),
+                         0 /*4K*/, p2m_ram_rw);
+}
+
+/* This function uses L1_gpa to walk the P2M table in the L0 hypervisor. If the
+ * walk is successful, the translated value is returned as L0_gpa. The return
+ * value tells the upper level what to do.
+ */
+static int
+nested_hap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+    mfn_t mfn;
+    p2m_type_t p2mt;
+
+    /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+    mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+    if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    if ( !mfn_valid(mfn) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the
+ * walk is successful, the translated value is returned as L1_gpa.
+ * The return value tells the caller what to do next.
+ */
+static int
+nested_hap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+                       paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+    uint32_t pfec;
+    unsigned long nested_cr3, gfn;
+
+    nested_cr3 = VCPU_NESTEDHVM(v).nh_vmcb_hcr3;
+
+    /* walk the guest table */
+    gfn = paging_p2m_gva_to_gfn(v, p2m, nested_cr3, L2_gpa, &pfec);
+
+    if ( gfn == INVALID_GFN )
+        return NESTEDHVM_PAGEFAULT_INJECT;
+
+    *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * The following function, nestedhvm_hap_nested_page_fault(), implements
+ * steps (3)--(10) of the algorithm described at the top of this file.
+ *
+ * Returns one of the NESTEDHVM_PAGEFAULT_* codes.
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+    int rv;
+    paddr_t L1_gpa, L0_gpa;
+    struct domain *d = v->domain;
+    struct p2m_domain *p2m, *nested_p2m;
+
+    p2m = p2m_get_hostp2m(d); /* L0 p2m */
+    nested_p2m = p2m_get_nestedp2m(v, VCPU_NESTEDHVM(v).nh_vmcb_hcr3);
+
+    /* Walk the L1 P2M table. Note we have to pass p2m
+     * and not nested_p2m here, otherwise the walk never
+     * terminates. */
+    rv = nested_hap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+    /* let the caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        printk("%s: l1 p2m walk result: error\n", __func__);
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    /* ==> we have to walk L0 P2M */
+    rv = nested_hap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+    /* let the upper level caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        printk("%s: l0 p2m walk result: inject\n", __func__);
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        printk("%s: l0 p2m walk result: error\n", __func__);
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    /* fix p2m_get_pagetable(nested_p2m) */
+    nested_hap_fix_p2m(nested_p2m, L2_gpa, L0_gpa);
+
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/* NESTED VIRT INITIALIZATION FUNCS */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 3f48b73b0a30 -r d88026a2afdc xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 
 /* Debugging and auditing of the P2M code? */
 #define P2M_AUDIT 0
@@ -1561,6 +1562,7 @@ static void p2m_initialise(struct domain
     INIT_PAGE_LIST_HEAD(&p2m->pod.single);
 
     p2m->domain = d;
+    p2m->cr3 = 0;
     p2m->set_entry = p2m_set_entry;
     p2m->get_entry = p2m_gfn_to_mfn;
     p2m->change_entry_type_global = p2m_change_type_global;
@@ -1576,6 +1578,25 @@ static void p2m_initialise(struct domain
     return;
 }
 
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+    int rv;
+
+    ASSERT(nestedhvm_enabled(d));
+
+    spin_lock_init(&d->arch.nested_p2m_lock);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        rv = p2m_allocp2m(&d->arch.nested_p2m[i]);
+        if (rv)
+            return rv;
+        p2m_initialise(d, d->arch.nested_p2m[i], 0);
+    }
+
+    return 0;
+}
+
 int p2m_init(struct domain *d)
 {
     int rv;
@@ -1585,6 +1606,9 @@ int p2m_init(struct domain *d)
         return rv;
     p2m_initialise(d, p2m_get_hostp2m(d), 0);
 
+    if ( nestedhvm_enabled(d) )
+        return p2m_init_nestedp2m(d);
+
     return 0;
 }
 
@@ -1743,11 +1767,26 @@ void p2m_teardown(struct p2m_domain *p2m
     p2m_unlock(p2m);
 }
 
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        xfree(d->arch.nested_p2m[i]);
+        d->arch.nested_p2m[i] = NULL;
+    }
+}
+
 void p2m_final_teardown(struct domain *d)
 {
     /* Iterate over all p2m tables per domain */
     xfree(d->arch.p2m);
     d->arch.p2m = NULL;
+
+    if ( !nestedhvm_enabled(d) )
+        return;
+
+    p2m_teardown_nestedp2m(d);
 }
 
 #if P2M_AUDIT
@@ -2658,6 +2697,118 @@ void p2m_mem_paging_resume(struct p2m_do
     mem_event_unpause_vcpus(d);
 }
 
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+    int i, lru_index = -1;
+    struct p2m_domain *lrup2m, *tmp;
+
+    if (p2m == NULL) {
+        lru_index = MAX_NESTEDP2M - 1;
+        lrup2m = d->arch.nested_p2m[lru_index];
+    } else {
+        lrup2m = p2m;
+        for (i = 0; i < MAX_NESTEDP2M; i++) {
+            if (d->arch.nested_p2m[i] == p2m) {
+                lru_index = i;
+                break;
+            }
+        }
+    }
+
+    ASSERT(lru_index >= 0);
+    if (lru_index == 0) {
+        return lrup2m;
+    }
+
+    /* move the others down the array "list" */
+    for (i = lru_index - 1; i >= 0; i--) {
+        tmp = d->arch.nested_p2m[i];
+        d->arch.nested_p2m[i+1] = tmp;
+    }
+
+    /* make the entry the first one */
+    d->arch.nested_p2m[0] = lrup2m;
+
+    return lrup2m;
+}
+
+static void
+p2m_flush(struct p2m_domain *p2m)
+{
+    p2m_teardown(p2m);
+    p2m_initialise(p2m->domain, p2m, 1);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+    struct domain *d;
+    struct p2m_domain *p2m;
+    uint8_t i;
+    bool_t flush;
+    uint32_t asid, old_asid;
+
+    if (cr3 == 0)
+        cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+    flush = VCPU_NESTEDHVM(v).nh_tlb_control ? 1 : 0;
+    asid = VCPU_NESTEDHVM(v).nh_guest_asid;
+    old_asid = VCPU_NESTEDHVM(v).nh_old_guest_asid;
+
+    if (asid != old_asid) {
+        VCPU_NESTEDHVM(v).nh_p2m = NULL;
+        flush = 1;
+    }
+
+    d = v->domain;
+    spin_lock(&d->arch.nested_p2m_lock);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m = d->arch.nested_p2m[i];
+        if (p2m->cr3 == cr3 && p2m == VCPU_NESTEDHVM(v).nh_p2m) {
+            p2m_getlru_nestedp2m(d, p2m);
+            if (flush)
+                p2m_flush(p2m);
+            p2m->cr3 = cr3;
+            spin_unlock(&d->arch.nested_p2m_lock);
+            return p2m;
+        }
+        if (p2m->cr3 == 0) { /* found unused p2m table */
+            p2m_getlru_nestedp2m(d, p2m);
+            VCPU_NESTEDHVM(v).nh_p2m = p2m;
+            p2m->cr3 = cr3;
+            spin_unlock(&d->arch.nested_p2m_lock);
+            return p2m;
+        }
+    }
+
+    /* All p2m's are or were in use. We know the least recently used one.
+     * Destroy and re-initialize it.
+     */
+    p2m = p2m_getlru_nestedp2m(d, NULL);
+    p2m_flush(p2m);
+    VCPU_NESTEDHVM(v).nh_p2m = p2m;
+    p2m->cr3 = cr3;
+    spin_unlock(&d->arch.nested_p2m_lock);
+
+    return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v, uint64_t cr3)
+{
+    struct domain *d = v->domain;
+
+    if (!nestedhvm_enabled(d))
+        return p2m_get_hostp2m(d);
+
+    if (nestedhvm_vcpu_in_guestmode(v))
+        return p2m_get_nestedp2m(v, cr3);
+
+    return p2m_get_hostp2m(d);
+}
+
+
 /*
  * Local variables:
  * mode: C
diff -r 3f48b73b0a30 -r d88026a2afdc xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -213,6 +213,7 @@ struct paging_vcpu {
 #define MAX_CPUID_INPUT 40
 typedef xen_domctl_cpuid_t cpuid_input_t;
 
+#define MAX_NESTEDP2M 10
 struct p2m_domain;
 
 /* Define for GUEST MCA handling */
@@ -272,6 +273,10 @@ struct arch_domain
     struct paging_domain paging;
     struct p2m_domain *p2m;
 
+    /* NestedSVM: phys_table + guest phys_table */
+    struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+    spinlock_t nested_p2m_lock;
+
     /* NB. protected by d->event_lock and by irq_desc[irq].lock */
     int *irq_pirq;
     int *pirq_irq;
diff -r 3f48b73b0a30 -r d88026a2afdc xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -358,7 +358,7 @@ static inline void hvm_set_info_guest(st
 
 int hvm_debug_op(struct vcpu *v, int32_t op);
 
-bool_t hvm_hap_nested_page_fault(unsigned long gfn);
+bool_t hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs);
 
 #define hvm_msr_tsc_aux(v) ({ \
     struct domain *__d = (v)->domain; \
diff -r 3f48b73b0a30 -r d88026a2afdc xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -171,6 +171,7 @@ struct p2m_domain {
     pagetable_t        phys_table;
 
     struct domain     *domain;   /* back pointer to domain */
+    uint64_t           cr3;      /* to identify this p2m for re-use */
 
     /* Pages used to construct the p2m */
     struct page_list_head pages;
@@ -223,6 +224,17 @@ struct p2m_domain {
 /* get host p2m table */
 #define p2m_get_hostp2m(d)      ((d)->arch.p2m)
 
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v, uint64_t cr3);
+
 #define p2m_get_pagetable(p2m)  ((p2m)->phys_table)
 
 /*
diff -r 3f48b73b0a30 -r d88026a2afdc xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -109,6 +109,10 @@ struct paging_mode {
     int           (*invlpg                )(struct vcpu *v, unsigned long va);
     unsigned long (*gva_to_gfn            )(struct vcpu *v, unsigned long va,
                                             uint32_t *pfec);
+    unsigned long (*p2m_gva_to_gfn        )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long cr3,
+                                            paddr_t gva, uint32_t *pfec);
     void          (*update_cr3            )(struct vcpu *v, int do_locking);
     void          (*update_paging_modes   )(struct vcpu *v);
     void          (*write_p2m_entry       )(struct vcpu *v, unsigned long gfn,
@@ -257,6 +261,19 @@ static inline unsigned long paging_gva_t
     return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
 }
 
+/* With nested virtualization gva == nested gpa, hence we use paddr_t
+ * to not overflow. */
+static inline unsigned long paging_p2m_gva_to_gfn(struct vcpu *v,
+                                                  struct p2m_domain *p2m,
+                                                  unsigned long cr3,
+                                                  paddr_t gva,
+                                                  uint32_t *pfec)
+{
+    if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+        return v->arch.paging.mode->p2m_gva_to_gfn(v, p2m, cr3, gva, pfec);
+    return INVALID_GFN;
+}
+
 /* Update all the things that are derived from the guest's CR3.
  * Called when the guest changes CR3; the caller can then use v->arch.cr3
  * as the value to load into the host CR3 to schedule this vcpu */
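
Illustration (not part of the changeset): the sketch below only shows the offset-preserving, two-stage address arithmetic that nested_hap_walk_L1_p2m() and nested_hap_walk_L0_p2m() rely on, with the real page-table walks replaced by hypothetical identity-style stubs and PAGE_SHIFT/PAGE_MASK redefined locally for a 4KB page size.

/* Minimal user-space sketch, assuming 4KB pages. The two stub walkers are
 * placeholders for the L1 nested page-table walk and the L0 p2m lookup;
 * only the shift/mask arithmetic mirrors the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((uint64_t)(1 << PAGE_SHIFT) - 1))

typedef uint64_t paddr_t;

/* hypothetical stand-in for the L1 walk: L2 gfn -> L1 gfn */
static uint64_t stub_l1_walk(uint64_t l2_gfn) { return l2_gfn + 0x100; }

/* hypothetical stand-in for the L0 p2m lookup: L1 gfn -> mfn */
static uint64_t stub_l0_walk(uint64_t l1_gfn) { return l1_gfn + 0x1000; }

int main(void)
{
    paddr_t L2_gpa = 0x12345678;   /* faulting address seen by the L2 guest */

    /* step 1: translate the frame number via L1's table, keep the offset */
    uint64_t l1_gfn = stub_l1_walk(L2_gpa >> PAGE_SHIFT);
    paddr_t L1_gpa = (l1_gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);

    /* step 2: translate again via the L0 host p2m, keep the offset again */
    uint64_t mfn = stub_l0_walk(L1_gpa >> PAGE_SHIFT);
    paddr_t L0_gpa = (mfn << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);

    printf("L2-GPA 0x%llx -> L1-GPA 0x%llx -> L0-GPA 0x%llx\n",
           (unsigned long long)L2_gpa, (unsigned long long)L1_gpa,
           (unsigned long long)L0_gpa);
    return 0;
}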