
[Xen-devel] [PATCH v3 3/4] x86/NPT: don't walk entire page tables when changing types on a range



This builds on the fact that, for no NPF VM exit to occur, _PAGE_USER
must always be set; i.e. by clearing the flag we can force a VM exit,
allowing us to do lazy type changes similar to those done on EPT.

That way, the generic entry-wise code can go away, and we can remove
the range restriction enforced on HVMOP_track_dirty_vram for XSA-27.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v3: Rename p2m_npt_fault() to p2m_pt_handle_deferred_changes() and
    don't call it on not-present faults (we don't ever mark non-present
    entries for re-calculation).
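
For illustration only (not part of the patch): below is a minimal
standalone sketch of the marking scheme this change relies on. The
entry type and helper names are simplified stand-ins, not Xen's real
ones; the point is that clearing _PAGE_USER on a present NPT entry
forces the next guest access to take an NPF VM exit, at which point
the deferred p2m type change can be applied lazily. In the patch
itself this corresponds to the set_recalc()/clear_recalc()/
needs_recalc() macros and p2m_pt_handle_deferred_changes().

#include <stdbool.h>

/* Real x86 flag values, but the entry type below is purely illustrative. */
#define _PAGE_PRESENT   0x001UL
#define _PAGE_USER      0x004UL
#define _PAGE_ACCESSED  0x020UL

typedef struct { unsigned long flags; } npt_entry_t;   /* hypothetical */

/* Mark an entry: the guest faults (NPF) on its next access through it. */
static void mark_for_recalc(npt_entry_t *e)
{
    e->flags &= ~(_PAGE_USER | _PAGE_ACCESSED);
}

/* A present entry with U clear is one whose type still needs recomputing. */
static bool entry_needs_recalc(const npt_entry_t *e)
{
    return (e->flags & _PAGE_PRESENT) && !(e->flags & _PAGE_USER);
}

/* NPF VM exit path: recompute the type, then make the entry usable again. */
static void handle_deferred_change(npt_entry_t *e)
{
    if ( !entry_needs_recalc(e) )
        return;
    /* ... recalculate and apply the p2m type for this GFN here ... */
    e->flags |= _PAGE_USER | _PAGE_ACCESSED;
}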

--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2557,7 +2557,17 @@ void svm_vmexit_handler(struct cpu_user_
         perfc_incra(svmexits, VMEXIT_NPF_PERFC);
         if ( cpu_has_svm_decode )
             v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
-        svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
+        rc = vmcb->exitinfo1 & PFEC_page_present
+             ? p2m_pt_handle_deferred_changes(vmcb->exitinfo2) : 0;
+        if ( rc >= 0 )
+            svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
+        else
+        {
+            printk(XENLOG_G_ERR
+                   "%pv: Error %d handling NPF (gpa=%08lx ec=%04lx)\n",
+                   v, rc, vmcb->exitinfo2, vmcb->exitinfo1);
+            domain_crash(v->domain);
+        }
         v->arch.hvm_svm.cached_insn_len = 0;
         break;
 
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -728,10 +728,7 @@ void p2m_change_type_range(struct domain
                            unsigned long start, unsigned long end,
                            p2m_type_t ot, p2m_type_t nt)
 {
-    p2m_access_t a;
-    p2m_type_t pt;
     unsigned long gfn = start;
-    mfn_t mfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     int rc = 0;
 
@@ -750,47 +747,8 @@ void p2m_change_type_range(struct domain
         }
         end = p2m->max_mapped_pfn + 1;
     }
-
-    if ( gfn < end && p2m->change_entry_type_range )
-    {
+    if ( gfn < end )
         rc = p2m->change_entry_type_range(p2m, ot, nt, gfn, end - 1);
-        gfn = end;
-    }
-    while ( !rc && gfn < end )
-    {
-        unsigned int order;
-
-        mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, &order);
-        while ( order > PAGE_ORDER_4K )
-        {
-            unsigned long mask = ~0UL << order;
-
-            /*
-             * Log-dirty ranges starting/ending in the middle of a super page
-             * (with a page split still pending) can't have a consistent type
-             * reported for the full range and hence need the split to be
-             * enforced here.
-             */
-            if ( !p2m_is_changeable(pt) ||
-                 p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
-            {
-                if ( pt != ot )
-                    break;
-                if ( !(gfn & ~mask) && end > (gfn | ~mask) )
-                    break;
-            }
-            if ( order == PAGE_ORDER_1G )
-                order = PAGE_ORDER_2M;
-            else
-                order = PAGE_ORDER_4K;
-        }
-        if ( pt == ot )
-            rc = p2m_set_entry(p2m, gfn, mfn, order, nt, a);
-        gfn += 1UL << order;
-        gfn &= -1UL << order;
-        if ( !gfn )
-            break;
-    }
     if ( rc )
     {
         printk(XENLOG_G_ERR "Error %d changing Dom%d GFNs [%lx,%lx] from %d to %d\n",
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -60,6 +60,19 @@
 #define P2M_BASE_FLAGS \
         (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
 
+#define RECALC_FLAGS (_PAGE_USER|_PAGE_ACCESSED)
+#define set_recalc(level, ent) level##e_remove_flags(ent, RECALC_FLAGS)
+#define clear_recalc(level, ent) level##e_add_flags(ent, RECALC_FLAGS)
+#define _needs_recalc(flags) (!((flags) & _PAGE_USER))
+#define needs_recalc(level, ent) _needs_recalc(level##e_get_flags(ent))
+#define valid_recalc(level, ent) (!(level##e_get_flags(ent) & _PAGE_ACCESSED))
+
+static const unsigned long pgt[] = {
+    PGT_l1_page_table,
+    PGT_l2_page_table,
+    PGT_l3_page_table
+};
+
 static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
 {
     unsigned long flags;
@@ -272,6 +285,196 @@ p2m_next_level(struct p2m_domain *p2m, v
     return 0;
 }
 
+/*
+ * Mark (via clearing the U flag) as needing P2M type re-calculation all valid
+ * present entries at the targeted level for the passed in GFN range, which is
+ * guaranteed to not cross a page (table) boundary at that level.
+ */
+static int p2m_pt_set_recalc_range(struct p2m_domain *p2m,
+                                   unsigned int level,
+                                   unsigned long first_gfn,
+                                   unsigned long last_gfn)
+{
+    void *table;
+    unsigned long gfn_remainder = first_gfn, remainder;
+    unsigned int i;
+    l1_pgentry_t *pent, *plast;
+    int err = 0;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    for ( i = 4; i-- > level; )
+    {
+        remainder = gfn_remainder;
+        pent = p2m_find_entry(table, &remainder, first_gfn,
+                              i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+        if ( !pent )
+        {
+            err = -EINVAL;
+            goto out;
+        }
+
+        if ( !(l1e_get_flags(*pent) & _PAGE_PRESENT) )
+            goto out;
+
+        err = p2m_next_level(p2m, &table, &gfn_remainder, first_gfn,
+                             i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER,
+                             pgt[i - 1]);
+        if ( err )
+            goto out;
+    }
+
+    remainder = gfn_remainder + (last_gfn - first_gfn);
+    pent = p2m_find_entry(table, &gfn_remainder, first_gfn,
+                          i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+    plast = p2m_find_entry(table, &remainder, last_gfn,
+                           i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+    if ( pent && plast )
+        for ( ; pent <= plast; ++pent )
+        {
+            l1_pgentry_t e = *pent;
+
+            if ( (l1e_get_flags(e) & _PAGE_PRESENT) && !needs_recalc(l1, e) )
+            {
+                set_recalc(l1, e);
+                p2m->write_p2m_entry(p2m, first_gfn, pent, e, level);
+            }
+            first_gfn += 1UL << (i * PAGETABLE_ORDER);
+        }
+    else
+        err = -EIO;
+
+ out:
+    unmap_domain_page(table);
+
+    return err;
+}
+
+/*
+ * Handle possibly necessary P2M type re-calculation (U flag clear for a
+ * present entry) for the entries in the page table hierarchy for the given
+ * GFN. Propagate the re-calculation flag down to the next page table level
+ * for entries not involved in the translation of the given GFN.
+ */
+static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
+{
+    void *table;
+    unsigned long gfn_remainder = gfn;
+    unsigned int level = 4;
+    l1_pgentry_t *pent;
+    int err = 0;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    while ( --level )
+    {
+        unsigned long remainder = gfn_remainder;
+
+        pent = p2m_find_entry(table, &remainder, gfn,
+                              level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+        if ( !pent || !(l1e_get_flags(*pent) & _PAGE_PRESENT) )
+            goto out;
+
+        if ( l1e_get_flags(*pent) & _PAGE_PSE )
+        {
+            unsigned long mask = ~0UL << (level * PAGETABLE_ORDER);
+
+            if ( !needs_recalc(l1, *pent) ||
+                 !p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(*pent))) ||
+                 p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
+                break;
+        }
+
+        err = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
+                             level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER,
+                             pgt[level - 1]);
+        if ( err )
+            goto out;
+
+        if ( needs_recalc(l1, *pent) )
+        {
+            l1_pgentry_t e = *pent, *ptab = table;
+            unsigned int i;
+
+            if ( !valid_recalc(l1, e) )
+                P2M_DEBUG("bogus recalc state at d%d:%lx:%u\n",
+                          p2m->domain->domain_id, gfn, level);
+            remainder = gfn_remainder;
+            for ( i = 0; i < (1 << PAGETABLE_ORDER); ++i )
+            {
+                l1_pgentry_t ent = ptab[i];
+
+                if ( (l1e_get_flags(ent) & _PAGE_PRESENT) &&
+                     !needs_recalc(l1, ent) )
+                {
+                    set_recalc(l1, ent);
+                    p2m->write_p2m_entry(p2m, gfn - remainder, &ptab[i],
+                                         ent, level);
+                }
+                remainder -= 1UL << ((level - 1) * PAGETABLE_ORDER);
+            }
+            smp_wmb();
+            clear_recalc(l1, e);
+            p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1);
+        }
+    }
+
+    pent = p2m_find_entry(table, &gfn_remainder, gfn,
+                          level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+    if ( pent && (l1e_get_flags(*pent) & _PAGE_PRESENT) &&
+         needs_recalc(l1, *pent) )
+    {
+        l1_pgentry_t e = *pent;
+
+        if ( !valid_recalc(l1, e) )
+            P2M_DEBUG("bogus recalc leaf at d%d:%lx:%u\n",
+                      p2m->domain->domain_id, gfn, level);
+        if ( p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(e))) )
+        {
+            unsigned long mask = ~0UL << (level * PAGETABLE_ORDER);
+            p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
+                              ? p2m_ram_logdirty : p2m_ram_rw;
+            unsigned long mfn = l1e_get_pfn(e);
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+
+            if ( level )
+            {
+                if ( flags & _PAGE_PAT )
+                {
+                     BUILD_BUG_ON(_PAGE_PAT != _PAGE_PSE);
+                     mfn |= _PAGE_PSE_PAT >> PAGE_SHIFT;
+                }
+                else
+                     mfn &= ~(_PAGE_PSE_PAT >> PAGE_SHIFT);
+                flags |= _PAGE_PSE;
+            }
+            e = l1e_from_pfn(mfn, flags);
+            p2m_add_iommu_flags(&e, level,
+                                (p2mt == p2m_ram_rw)
+                                ? IOMMUF_readable|IOMMUF_writable : 0);
+            ASSERT(!needs_recalc(l1, e));
+        }
+        else
+            clear_recalc(l1, e);
+        p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1);
+    }
+
+ out:
+    unmap_domain_page(table);
+
+    return err;
+}
+
+int p2m_pt_handle_deferred_changes(uint64_t gpa)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(current->domain);
+    int rc;
+
+    p2m_lock(p2m);
+    rc = do_recalc(p2m, PFN_DOWN(gpa));
+    p2m_unlock(p2m);
+
+    return rc;
+}
+
 /* Returns: 0 for success, -errno for failure */
 static int
 p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
@@ -307,6 +510,11 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
     }
 
+    /* Carry out any eventually pending earlier changes first. */
+    rc = do_recalc(p2m, gfn);
+    if ( rc < 0 )
+        return rc;
+
     table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
     rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
@@ -459,6 +667,15 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
     return rc;
 }
 
+static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t,
+                                     struct p2m_domain *p2m, unsigned long gfn)
+{
+    if ( !recalc || !p2m_is_changeable(t) )
+        return t;
+    return p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
+                                                : p2m_ram_rw;
+}
+
 static mfn_t
 p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
                  p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
@@ -468,8 +685,9 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
     paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     l2_pgentry_t *l2e;
     l1_pgentry_t *l1e;
-    unsigned long l1e_flags;
+    unsigned int flags;
     p2m_type_t l1t;
+    bool_t recalc;
 
     ASSERT(paging_mode_translate(p2m->domain));
 
@@ -496,15 +714,17 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
             return _mfn(INVALID_MFN);
         }
         mfn = _mfn(l4e_get_pfn(*l4e));
+        recalc = needs_recalc(l4, *l4e);
         unmap_domain_page(l4e);
     }
     {
         l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
         l3e += l3_table_offset(addr);
 pod_retry_l3:
-        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        flags = l3e_get_flags(*l3e);
+        if ( !(flags & _PAGE_PRESENT) )
         {
-            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+            if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
             {
                 if ( q & P2M_ALLOC )
                 {
@@ -518,12 +738,13 @@ pod_retry_l3:
             unmap_domain_page(l3e);
             return _mfn(INVALID_MFN);
         }
-        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+        if ( flags & _PAGE_PSE )
         {
             mfn = _mfn(l3e_get_pfn(*l3e) +
                        l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
                        l1_table_offset(addr));
-            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+            *t = recalc_type(recalc || _needs_recalc(flags),
+                             p2m_flags_to_type(flags), p2m, gfn);
             unmap_domain_page(l3e);
 
             ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -533,6 +754,8 @@ pod_retry_l3:
         }
 
         mfn = _mfn(l3e_get_pfn(*l3e));
+        if ( _needs_recalc(flags) )
+            recalc = 1;
         unmap_domain_page(l3e);
     }
 
@@ -540,10 +763,11 @@ pod_retry_l3:
     l2e += l2_table_offset(addr);
 
 pod_retry_l2:
-    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    flags = l2e_get_flags(*l2e);
+    if ( !(flags & _PAGE_PRESENT) )
     {
         /* PoD: Try to populate a 2-meg chunk */
-        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
         {
             if ( q & P2M_ALLOC ) {
                 if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) )
@@ -555,10 +779,11 @@ pod_retry_l2:
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
-    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    if ( flags & _PAGE_PSE )
     {
         mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
-        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        *t = recalc_type(recalc || _needs_recalc(flags),
+                         p2m_flags_to_type(flags), p2m, gfn);
         unmap_domain_page(l2e);
         
         ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -568,14 +793,16 @@ pod_retry_l2:
     }
 
     mfn = _mfn(l2e_get_pfn(*l2e));
+    if ( needs_recalc(l2, *l2e) )
+        recalc = 1;
     unmap_domain_page(l2e);
 
     l1e = map_domain_page(mfn_x(mfn));
     l1e += l1_table_offset(addr);
 pod_retry_l1:
-    l1e_flags = l1e_get_flags(*l1e);
-    l1t = p2m_flags_to_type(l1e_flags);
-    if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) )
+    flags = l1e_get_flags(*l1e);
+    l1t = p2m_flags_to_type(flags);
+    if ( !(flags & _PAGE_PRESENT) && !p2m_is_paging(l1t) )
     {
         /* PoD: Try to populate */
         if ( l1t == p2m_populate_on_demand )
@@ -591,7 +818,7 @@ pod_retry_l1:
         return _mfn(INVALID_MFN);
     }
     mfn = _mfn(l1e_get_pfn(*l1e));
-    *t = l1t;
+    *t = recalc_type(recalc || _needs_recalc(flags), l1t, p2m, gfn);
     unmap_domain_page(l1e);
 
     ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t));
@@ -714,6 +941,47 @@ static void p2m_pt_change_entry_type_glo
     unmap_domain_page(l4e);
 }
 
+static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m,
+                                          p2m_type_t ot, p2m_type_t nt,
+                                          unsigned long first_gfn,
+                                          unsigned long last_gfn)
+{
+    unsigned long mask = (1 << PAGETABLE_ORDER) - 1;
+    unsigned int i;
+    int err = 0;
+
+    ASSERT(hap_enabled(p2m->domain));
+
+    for ( i = 1; i <= 4; )
+    {
+        if ( first_gfn & mask )
+        {
+            unsigned long end_gfn = min(first_gfn | mask, last_gfn);
+
+            err = p2m_pt_set_recalc_range(p2m, i, first_gfn, end_gfn);
+            if ( err || end_gfn >= last_gfn )
+                break;
+            first_gfn = end_gfn + 1;
+        }
+        else if ( (last_gfn & mask) != mask )
+        {
+            unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
+
+            err = p2m_pt_set_recalc_range(p2m, i, start_gfn, last_gfn);
+            if ( err || start_gfn <= first_gfn )
+                break;
+            last_gfn = start_gfn - 1;
+        }
+        else
+        {
+            ++i;
+            mask |= mask << PAGETABLE_ORDER;
+        }
+    }
+
+    return err;
+}
+
 #if P2M_AUDIT
 long p2m_pt_audit_p2m(struct p2m_domain *p2m)
 {
@@ -872,6 +1140,7 @@ void p2m_pt_init(struct p2m_domain *p2m)
     p2m->set_entry = p2m_pt_set_entry;
     p2m->get_entry = p2m_pt_get_entry;
     p2m->change_entry_type_global = p2m_pt_change_entry_type_global;
+    p2m->change_entry_type_range = p2m_pt_change_entry_type_range;
     p2m->write_p2m_entry = paging_write_p2m_entry;
 #if P2M_AUDIT
     p2m->audit_p2m = p2m_pt_audit_p2m;
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -668,6 +668,8 @@ static inline p2m_type_t p2m_flags_to_ty
     return (flags >> 12) & 0x7f;
 }
 
+int p2m_pt_handle_deferred_changes(uint64_t gpa);
+
 /*
  * Nested p2m: shadow p2m tables used for nested HVM virtualization 
  */

