x86/NPT: don't walk entire page tables when changing types on a range This builds on the fact that in order for no NPF VM exit to occur, _PAGE_USER must always be set. I.e. by clearing the flag we can force a VM exit allowing us to do similar lazy type changes as on EPT. That way, the generic entry-wise code can go away, and we could remove the range restriction enforced on HVMOP_track_dirty_vram for XSA-27. Signed-off-by: Jan Beulich --- v3: Rename p2m_npt_fault() to p2m_pt_handle_deferred_changes() and don't call it on not-present faults (we don't ever mark non-present entries for re-calculation). --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -2557,7 +2557,17 @@ void svm_vmexit_handler(struct cpu_user_ perfc_incra(svmexits, VMEXIT_NPF_PERFC); if ( cpu_has_svm_decode ) v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf; - svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2); + rc = vmcb->exitinfo1 & PFEC_page_present + ? p2m_pt_handle_deferred_changes(vmcb->exitinfo2) : 0; + if ( rc >= 0 ) + svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2); + else + { + printk(XENLOG_G_ERR + "%pv: Error %d handling NPF (gpa=%08lx ec=%04lx)\n", + v, rc, vmcb->exitinfo2, vmcb->exitinfo1); + domain_crash(v->domain); + } v->arch.hvm_svm.cached_insn_len = 0; break; --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -728,10 +728,7 @@ void p2m_change_type_range(struct domain unsigned long start, unsigned long end, p2m_type_t ot, p2m_type_t nt) { - p2m_access_t a; - p2m_type_t pt; unsigned long gfn = start; - mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); int rc = 0; @@ -750,47 +747,8 @@ void p2m_change_type_range(struct domain } end = p2m->max_mapped_pfn + 1; } - - if ( gfn < end && p2m->change_entry_type_range ) - { + if ( gfn < end ) rc = p2m->change_entry_type_range(p2m, ot, nt, gfn, end - 1); - gfn = end; - } - while ( !rc && gfn < end ) - { - unsigned int order; - - mfn = p2m->get_entry(p2m, gfn, &pt, 
&a, 0, &order); - while ( order > PAGE_ORDER_4K ) - { - unsigned long mask = ~0UL << order; - - /* - * Log-dirty ranges starting/ending in the middle of a super page - * (with a page split still pending) can't have a consistent type - * reported for the full range and hence need the split to be - * enforced here. - */ - if ( !p2m_is_changeable(pt) || - p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 ) - { - if ( pt != ot ) - break; - if ( !(gfn & ~mask) && end > (gfn | ~mask) ) - break; - } - if ( order == PAGE_ORDER_1G ) - order = PAGE_ORDER_2M; - else - order = PAGE_ORDER_4K; - } - if ( pt == ot ) - rc = p2m_set_entry(p2m, gfn, mfn, order, nt, a); - gfn += 1UL << order; - gfn &= -1UL << order; - if ( !gfn ) - break; - } if ( rc ) { printk(XENLOG_G_ERR "Error %d changing Dom%d GFNs [%lx,%lx] from %d to %d\n", --- a/xen/arch/x86/mm/p2m-pt.c +++ b/xen/arch/x86/mm/p2m-pt.c @@ -60,6 +60,19 @@ #define P2M_BASE_FLAGS \ (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) +#define RECALC_FLAGS (_PAGE_USER|_PAGE_ACCESSED) +#define set_recalc(level, ent) level##e_remove_flags(ent, RECALC_FLAGS) +#define clear_recalc(level, ent) level##e_add_flags(ent, RECALC_FLAGS) +#define _needs_recalc(flags) (!((flags) & _PAGE_USER)) +#define needs_recalc(level, ent) _needs_recalc(level##e_get_flags(ent)) +#define valid_recalc(level, ent) (!(level##e_get_flags(ent) & _PAGE_ACCESSED)) + +static const unsigned long pgt[] = { + PGT_l1_page_table, + PGT_l2_page_table, + PGT_l3_page_table +}; + static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) { unsigned long flags; @@ -272,6 +285,196 @@ p2m_next_level(struct p2m_domain *p2m, v return 0; } +/* + * Mark (via clearing the U flag) as needing P2M type re-calculation all valid + * present entries at the targeted level for the passed in GFN range, which is + * guaranteed to not cross a page (table) boundary at that level. 
+ */ +static int p2m_pt_set_recalc_range(struct p2m_domain *p2m, + unsigned int level, + unsigned long first_gfn, + unsigned long last_gfn) +{ + void *table; + unsigned long gfn_remainder = first_gfn, remainder; + unsigned int i; + l1_pgentry_t *pent, *plast; + int err = 0; + + table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m)))); + for ( i = 4; i-- > level; ) + { + remainder = gfn_remainder; + pent = p2m_find_entry(table, &remainder, first_gfn, + i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER); + if ( !pent ) + { + err = -EINVAL; + goto out; + } + + if ( !(l1e_get_flags(*pent) & _PAGE_PRESENT) ) + goto out; + + err = p2m_next_level(p2m, &table, &gfn_remainder, first_gfn, + i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER, + pgt[i - 1]); + if ( err ) + goto out; + } + + remainder = gfn_remainder + (last_gfn - first_gfn); + pent = p2m_find_entry(table, &gfn_remainder, first_gfn, + i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER); + plast = p2m_find_entry(table, &remainder, last_gfn, + i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER); + if ( pent && plast ) + for ( ; pent <= plast; ++pent ) + { + l1_pgentry_t e = *pent; + + if ( (l1e_get_flags(e) & _PAGE_PRESENT) && !needs_recalc(l1, e) ) + { + set_recalc(l1, e); + p2m->write_p2m_entry(p2m, first_gfn, pent, e, level); + } + first_gfn += 1UL << (i * PAGETABLE_ORDER); + } + else + err = -EIO; + + out: + unmap_domain_page(table); + + return err; +} + +/* + * Handle possibly necessary P2M type re-calculation (U flag clear for a + * present entry) for the entries in the page table hierarchy for the given + * GFN. Propagate the re-calculation flag down to the next page table level + * for entries not involved in the translation of the given GFN. 
+ */ +static int do_recalc(struct p2m_domain *p2m, unsigned long gfn) +{ + void *table; + unsigned long gfn_remainder = gfn; + unsigned int level = 4; + l1_pgentry_t *pent; + int err = 0; + + table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m)))); + while ( --level ) + { + unsigned long remainder = gfn_remainder; + + pent = p2m_find_entry(table, &remainder, gfn, + level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER); + if ( !pent || !(l1e_get_flags(*pent) & _PAGE_PRESENT) ) + goto out; + + if ( l1e_get_flags(*pent) & _PAGE_PSE ) + { + unsigned long mask = ~0UL << (level * PAGETABLE_ORDER); + + if ( !needs_recalc(l1, *pent) || + !p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(*pent))) || + p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 ) + break; + } + + err = p2m_next_level(p2m, &table, &gfn_remainder, gfn, + level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER, + pgt[level - 1]); + if ( err ) + goto out; + + if ( needs_recalc(l1, *pent) ) + { + l1_pgentry_t e = *pent, *ptab = table; + unsigned int i; + + if ( !valid_recalc(l1, e) ) + P2M_DEBUG("bogus recalc state at d%d:%lx:%u\n", + p2m->domain->domain_id, gfn, level); + remainder = gfn_remainder; + for ( i = 0; i < (1 << PAGETABLE_ORDER); ++i ) + { + l1_pgentry_t ent = ptab[i]; + + if ( (l1e_get_flags(ent) & _PAGE_PRESENT) && + !needs_recalc(l1, ent) ) + { + set_recalc(l1, ent); + p2m->write_p2m_entry(p2m, gfn - remainder, &ptab[i], + ent, level); + } + remainder -= 1UL << ((level - 1) * PAGETABLE_ORDER); + } + smp_wmb(); + clear_recalc(l1, e); + p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1); + } + } + + pent = p2m_find_entry(table, &gfn_remainder, gfn, + level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER); + if ( pent && (l1e_get_flags(*pent) & _PAGE_PRESENT) && + needs_recalc(l1, *pent) ) + { + l1_pgentry_t e = *pent; + + if ( !valid_recalc(l1, e) ) + P2M_DEBUG("bogus recalc leaf at d%d:%lx:%u\n", + p2m->domain->domain_id, gfn, level); + if ( 
p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(e))) ) + { + unsigned long mask = ~0UL << (level * PAGETABLE_ORDER); + p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) + ? p2m_ram_logdirty : p2m_ram_rw; + unsigned long mfn = l1e_get_pfn(e); + unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn)); + + if ( level ) + { + if ( flags & _PAGE_PAT ) + { + BUILD_BUG_ON(_PAGE_PAT != _PAGE_PSE); + mfn |= _PAGE_PSE_PAT >> PAGE_SHIFT; + } + else + mfn &= ~(_PAGE_PSE_PAT >> PAGE_SHIFT); + flags |= _PAGE_PSE; + } + e = l1e_from_pfn(mfn, flags); + p2m_add_iommu_flags(&e, level, + (p2mt == p2m_ram_rw) + ? IOMMUF_readable|IOMMUF_writable : 0); + ASSERT(!needs_recalc(l1, e)); + } + else + clear_recalc(l1, e); + p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1); + } + + out: + unmap_domain_page(table); + + return err; +} + +int p2m_pt_handle_deferred_changes(uint64_t gpa) +{ + struct p2m_domain *p2m = p2m_get_hostp2m(current->domain); + int rc; + + p2m_lock(p2m); + rc = do_recalc(p2m, PFN_DOWN(gpa)); + p2m_unlock(p2m); + + return rc; +} + /* Returns: 0 for success, -errno for failure */ static int p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, @@ -307,6 +510,11 @@ p2m_pt_set_entry(struct p2m_domain *p2m, __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t); } + /* Carry out any eventually pending earlier changes first. */ + rc = do_recalc(p2m, gfn); + if ( rc < 0 ) + return rc; + table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m)))); rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn, L4_PAGETABLE_SHIFT - PAGE_SHIFT, @@ -459,6 +667,15 @@ p2m_pt_set_entry(struct p2m_domain *p2m, return rc; } +static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t, + struct p2m_domain *p2m, unsigned long gfn) +{ + if ( !recalc || !p2m_is_changeable(t) ) + return t; + return p2m_is_logdirty_range(p2m, gfn, gfn) ? 
p2m_ram_logdirty + : p2m_ram_rw; +} + static mfn_t p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q, @@ -468,8 +685,9 @@ p2m_pt_get_entry(struct p2m_domain *p2m, paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; l2_pgentry_t *l2e; l1_pgentry_t *l1e; - unsigned long l1e_flags; + unsigned int flags; p2m_type_t l1t; + bool_t recalc; ASSERT(paging_mode_translate(p2m->domain)); @@ -496,15 +714,17 @@ p2m_pt_get_entry(struct p2m_domain *p2m, return _mfn(INVALID_MFN); } mfn = _mfn(l4e_get_pfn(*l4e)); + recalc = needs_recalc(l4, *l4e); unmap_domain_page(l4e); } { l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); l3e += l3_table_offset(addr); pod_retry_l3: - if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + flags = l3e_get_flags(*l3e); + if ( !(flags & _PAGE_PRESENT) ) { - if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand ) + if ( p2m_flags_to_type(flags) == p2m_populate_on_demand ) { if ( q & P2M_ALLOC ) { @@ -518,12 +738,13 @@ pod_retry_l3: unmap_domain_page(l3e); return _mfn(INVALID_MFN); } - else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) ) + if ( flags & _PAGE_PSE ) { mfn = _mfn(l3e_get_pfn(*l3e) + l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + l1_table_offset(addr)); - *t = p2m_flags_to_type(l3e_get_flags(*l3e)); + *t = recalc_type(recalc || _needs_recalc(flags), + p2m_flags_to_type(flags), p2m, gfn); unmap_domain_page(l3e); ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); @@ -533,6 +754,8 @@ pod_retry_l3: } mfn = _mfn(l3e_get_pfn(*l3e)); + if ( _needs_recalc(flags) ) + recalc = 1; unmap_domain_page(l3e); } @@ -540,10 +763,11 @@ pod_retry_l3: l2e += l2_table_offset(addr); pod_retry_l2: - if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + flags = l2e_get_flags(*l2e); + if ( !(flags & _PAGE_PRESENT) ) { /* PoD: Try to populate a 2-meg chunk */ - if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand ) + if ( p2m_flags_to_type(flags) == p2m_populate_on_demand ) { if ( q & P2M_ALLOC ) { if ( 
!p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) ) @@ -555,10 +779,11 @@ pod_retry_l2: unmap_domain_page(l2e); return _mfn(INVALID_MFN); } - else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) ) + if ( flags & _PAGE_PSE ) { mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr)); - *t = p2m_flags_to_type(l2e_get_flags(*l2e)); + *t = recalc_type(recalc || _needs_recalc(flags), + p2m_flags_to_type(flags), p2m, gfn); unmap_domain_page(l2e); ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); @@ -568,14 +793,16 @@ pod_retry_l2: } mfn = _mfn(l2e_get_pfn(*l2e)); + if ( needs_recalc(l2, *l2e) ) + recalc = 1; unmap_domain_page(l2e); l1e = map_domain_page(mfn_x(mfn)); l1e += l1_table_offset(addr); pod_retry_l1: - l1e_flags = l1e_get_flags(*l1e); - l1t = p2m_flags_to_type(l1e_flags); - if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) ) + flags = l1e_get_flags(*l1e); + l1t = p2m_flags_to_type(flags); + if ( !(flags & _PAGE_PRESENT) && !p2m_is_paging(l1t) ) { /* PoD: Try to populate */ if ( l1t == p2m_populate_on_demand ) @@ -591,7 +818,7 @@ pod_retry_l1: return _mfn(INVALID_MFN); } mfn = _mfn(l1e_get_pfn(*l1e)); - *t = l1t; + *t = recalc_type(recalc || _needs_recalc(flags), l1t, p2m, gfn); unmap_domain_page(l1e); ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t)); @@ -714,6 +941,47 @@ static void p2m_pt_change_entry_type_glo unmap_domain_page(l4e); } +static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m, + p2m_type_t ot, p2m_type_t nt, + unsigned long first_gfn, + unsigned long last_gfn) +{ + unsigned long mask = (1 << PAGETABLE_ORDER) - 1; + unsigned int i; + int err = 0; + + ASSERT(hap_enabled(p2m->domain)); + + for ( i = 1; i <= 4; ) + { + if ( first_gfn & mask ) + { + unsigned long end_gfn = min(first_gfn | mask, last_gfn); + + err = p2m_pt_set_recalc_range(p2m, i, first_gfn, end_gfn); + if ( err || end_gfn >= last_gfn ) + break; + first_gfn = end_gfn + 1; + } + else if ( (last_gfn & mask) != mask ) + { + unsigned long start_gfn = max(first_gfn, 
last_gfn & ~mask); + + err = p2m_pt_set_recalc_range(p2m, i, start_gfn, last_gfn); + if ( err || start_gfn <= first_gfn ) + break; + last_gfn = start_gfn - 1; + } + else + { + ++i; + mask |= mask << PAGETABLE_ORDER; + } + } + + return err; +} + #if P2M_AUDIT long p2m_pt_audit_p2m(struct p2m_domain *p2m) { @@ -872,6 +1140,7 @@ void p2m_pt_init(struct p2m_domain *p2m) p2m->set_entry = p2m_pt_set_entry; p2m->get_entry = p2m_pt_get_entry; p2m->change_entry_type_global = p2m_pt_change_entry_type_global; + p2m->change_entry_type_range = p2m_pt_change_entry_type_range; p2m->write_p2m_entry = paging_write_p2m_entry; #if P2M_AUDIT p2m->audit_p2m = p2m_pt_audit_p2m; --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -668,6 +668,8 @@ static inline p2m_type_t p2m_flags_to_ty return (flags >> 12) & 0x7f; } +int p2m_pt_handle_deferred_changes(uint64_t gpa); + /* * Nested p2m: shadow p2m tables used for nested HVM virtualization */