To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] hvm: make dirty logging stop requiring physical pages of order > 0
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 22 Nov 2007 12:00:29 -0800
Delivery-date: Thu, 22 Nov 2007 12:01:54 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1195238004 0
# Node ID 68c911f7733a0158056d10e9e7997a6acfe47eb1
# Parent  2052364cb456170a70ad5c8bfb876c95f7a9fe4a
hvm: make dirty logging stop requiring physical pages of order > 0

This patch re-implements the (x86) hypervisor dirty page log with a
simple four-level radix tree whose nodes are all single pages, thus
making migration require only order-0 pages (where before it required
at least an order-5 page).
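
To make the indexing concrete, here is a small standalone sketch of the
pfn-to-index split that the L?_LOGDIRTY_IDX macros later in this patch
perform; the constants assume 4 KiB pages (PAGE_SHIFT = 12) and
PAGETABLE_ORDER = 9, and the example pfn is illustrative only.  One 4 KiB
leaf holds 32768 bits, i.e. covers 128 MiB of guest memory, and each
interior node holds 512 entries.

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGETABLE_ORDER 9
#define NODE_ENTRIES    (1 << PAGETABLE_ORDER)      /* 512 slots per node  */
#define BITS_PER_LEAF   (1UL << (PAGE_SHIFT + 3))   /* 32768 bits per leaf */

int main(void)
{
    unsigned long pfn = 0x123456;   /* example pfn, ~4.5 GiB into the guest */
    unsigned long i1 = pfn & (BITS_PER_LEAF - 1);           /* bit in leaf  */
    unsigned long i2 = (pfn >> (PAGE_SHIFT + 3)) & (NODE_ENTRIES - 1);
    unsigned long i3 = (pfn >> (PAGE_SHIFT + 3 + PAGETABLE_ORDER)) & (NODE_ENTRIES - 1);
    unsigned long i4 = (pfn >> (PAGE_SHIFT + 3 + 2 * PAGETABLE_ORDER)) & (NODE_ENTRIES - 1);

    printf("pfn %#lx -> l4=%lu l3=%lu l2=%lu bit=%lu\n", pfn, i4, i3, i2, i1);
    return 0;
}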

Unlike the p2m radix tree implementation, the interior nodes of this
tree are NOT page table nodes.  I chose a lazy-allocation and -mapping
approach because most pages are not marked dirty while dirty-logging is
enabled.  There are doubtless situations (the 'stream' benchmark, for
example) where a more complex p2m-like approach is faster, but I'm not
sure they're worth the effort.
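
As a simplified, self-contained illustration of that lazy-allocation
pattern (plain calloc'd nodes and made-up names, not the domheap pages,
map_domain_page() calls and locking that the real paging_mark_dirty()
below uses):

#include <stdint.h>
#include <stdlib.h>

#define ENTRIES   512                  /* 1 << PAGETABLE_ORDER            */
#define LEAF_BITS (4096u * 8)          /* one 4 KiB leaf = 32768 pfn bits */

/* Allocate a zeroed child only when the first pfn in its range is
 * dirtied, so a mostly-clean guest costs just a few single pages. */
static void *get_child(void **node, unsigned long idx, size_t size)
{
    if ( node[idx] == NULL )
        node[idx] = calloc(1, size);
    return node[idx];
}

/* 'l4' is the pre-allocated top node (512 void * slots). */
static int mark_dirty(void **l4, unsigned long pfn)
{
    unsigned long leaf_nr = pfn / LEAF_BITS;
    void **l3, **l2;
    uint8_t *leaf;

    l3 = get_child(l4, (leaf_nr / (ENTRIES * ENTRIES)) % ENTRIES,
                   ENTRIES * sizeof(void *));
    if ( l3 == NULL ) return -1;
    l2 = get_child(l3, (leaf_nr / ENTRIES) % ENTRIES, ENTRIES * sizeof(void *));
    if ( l2 == NULL ) return -1;
    leaf = get_child(l2, leaf_nr % ENTRIES, LEAF_BITS / 8);
    if ( leaf == NULL ) return -1;

    leaf[(pfn % LEAF_BITS) / 8] |= 1u << (pfn % 8);
    return 0;
}

The real implementation additionally counts failed allocations, so the
later log-dirty query returns -ENOMEM rather than handing back an
incomplete bitmap, and sh_mfn_is_dirty() conservatively reports pages as
dirty in that case.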

Signed-off-by: Dave Lively <dlively@xxxxxxxxxxxxxxx>
---
 xen/arch/x86/mm/paging.c         |  251 +++++++++++++++++++++++++++------------
 xen/arch/x86/mm/shadow/private.h |   43 +++++-
 xen/include/asm-x86/domain.h     |    7 -
 xen/include/asm-x86/paging.h     |   22 +++
 4 files changed, 241 insertions(+), 82 deletions(-)

diff -r 2052364cb456 -r 68c911f7733a xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/paging.c  Fri Nov 16 18:33:24 2007 +0000
@@ -96,36 +96,97 @@
         spin_unlock(&(_d)->arch.paging.log_dirty.lock);                   \
     } while (0)
 
+static mfn_t paging_new_log_dirty_page(struct domain *d, void **mapping_p)
+{
+    mfn_t mfn;
+    struct page_info *page = alloc_domheap_page(NULL);
+
+    if ( unlikely(page == NULL) ) {
+        d->arch.paging.log_dirty.failed_allocs++;
+        return _mfn(INVALID_MFN);
+    }
+    d->arch.paging.log_dirty.allocs++;
+    mfn = page_to_mfn(page);
+    *mapping_p = map_domain_page(mfn_x(mfn));
+    return mfn;
+}
+
+
+static mfn_t paging_new_log_dirty_leaf(struct domain *d, uint8_t **leaf_p)
+{
+    mfn_t mfn = paging_new_log_dirty_page(d, (void **)leaf_p);
+    clear_page(*leaf_p);
+    return mfn;
+}
+        
+
+static mfn_t paging_new_log_dirty_node(struct domain *d, mfn_t **node_p)
+{
+    int i;
+    mfn_t mfn = paging_new_log_dirty_page(d, (void **)node_p);
+    for (i = 0; i < LOGDIRTY_NODE_ENTRIES; i++)
+        (*node_p)[i] = _mfn(INVALID_MFN);
+    return mfn;
+}
+        
+
 /* allocate bitmap resources for log dirty */
 int paging_alloc_log_dirty_bitmap(struct domain *d)
 {
-    if ( d->arch.paging.log_dirty.bitmap != NULL )
+    mfn_t *mapping;
+
+    if ( mfn_valid(d->arch.paging.log_dirty.top) )
         return 0;
 
-    d->arch.paging.log_dirty.bitmap_size =
-        (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
-    d->arch.paging.log_dirty.bitmap =
-        xmalloc_array(unsigned long,
-                      d->arch.paging.log_dirty.bitmap_size / BITS_PER_LONG);
-    if ( d->arch.paging.log_dirty.bitmap == NULL )
-    {
-        d->arch.paging.log_dirty.bitmap_size = 0;
+    d->arch.paging.log_dirty.top = paging_new_log_dirty_node(d, &mapping);
+    if ( unlikely(!mfn_valid(d->arch.paging.log_dirty.top)) ) {
+        /* Clear error indicator since we're reporting this one */
+        d->arch.paging.log_dirty.failed_allocs = 0;
         return -ENOMEM;
     }
-    memset(d->arch.paging.log_dirty.bitmap, 0,
-           d->arch.paging.log_dirty.bitmap_size/8);
+    unmap_domain_page(mapping);
 
     return 0;
 }
+
+
+static void paging_free_log_dirty_page(struct domain *d, mfn_t mfn)
+{
+    d->arch.paging.log_dirty.allocs--;
+    free_domheap_page(mfn_to_page(mfn));
+}    
 
 /* free bitmap resources */
 void paging_free_log_dirty_bitmap(struct domain *d)
 {
-    d->arch.paging.log_dirty.bitmap_size = 0;
-    if ( d->arch.paging.log_dirty.bitmap )
-    {
-        xfree(d->arch.paging.log_dirty.bitmap);
-        d->arch.paging.log_dirty.bitmap = NULL;
+    int i4, i3, i2;
+
+    if (mfn_valid(d->arch.paging.log_dirty.top)) {
+        mfn_t *l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+        printk("%s: used %d pages for domain %d dirty logging\n",
+               __FUNCTION__, d->arch.paging.log_dirty.allocs, d->domain_id);
+        for (i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++) {
+            if (mfn_valid(l4[i4])) {
+                mfn_t *l3 = map_domain_page(mfn_x(l4[i4]));
+                for (i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++) {
+                    if (mfn_valid(l3[i3])) {
+                        mfn_t *l2 = map_domain_page(mfn_x(l3[i3]));
+                        for (i2 = 0; i2 < LOGDIRTY_NODE_ENTRIES; i2++)
+                            if (mfn_valid(l2[i2]))
+                                paging_free_log_dirty_page(d, l2[i2]);
+                        unmap_domain_page(l2);
+                        paging_free_log_dirty_page(d, l3[i3]);
+                    }
+                }
+                unmap_domain_page(l3);
+                paging_free_log_dirty_page(d, l4[i4]);
+            }
+        }
+        unmap_domain_page(l4);
+        paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+        d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+        ASSERT(d->arch.paging.log_dirty.allocs == 0);
+        d->arch.paging.log_dirty.failed_allocs = 0;
     }
 }
 
@@ -187,15 +248,19 @@ void paging_mark_dirty(struct domain *d,
 {
     unsigned long pfn;
     mfn_t gmfn;
+    int changed;
+    mfn_t mfn, *l4, *l3, *l2;
+    uint8_t *l1;
+    int i1, i2, i3, i4;
 
     gmfn = _mfn(guest_mfn);
+
+    ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
 
     if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) )
         return;
 
     log_dirty_lock(d);
-
-    ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
 
     /* We /really/ mean PFN here, even for non-translated guests. */
     pfn = get_gpfn_from_mfn(mfn_x(gmfn));
@@ -206,37 +271,52 @@ void paging_mark_dirty(struct domain *d,
      * Nothing to do here...
      */
     if ( unlikely(!VALID_M2P(pfn)) )
-    {
-        log_dirty_unlock(d);
-        return;
-    }
-
-    if ( likely(pfn < d->arch.paging.log_dirty.bitmap_size) )
-    {
-        if ( !__test_and_set_bit(pfn, d->arch.paging.log_dirty.bitmap) )
-        {
-            PAGING_DEBUG(LOGDIRTY,
-                         "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
-                         mfn_x(gmfn), pfn, d->domain_id);
-            d->arch.paging.log_dirty.dirty_count++;
-        }
-    }
-    else
-    {
-        PAGING_PRINTK("mark_dirty OOR! "
-                      "mfn=%" PRI_mfn " pfn=%lx max=%x (dom %d)\n"
-                      "owner=%d c=%08x t=%" PRtype_info "\n",
-                      mfn_x(gmfn),
-                      pfn,
-                      d->arch.paging.log_dirty.bitmap_size,
-                      d->domain_id,
-                      (page_get_owner(mfn_to_page(gmfn))
-                       ? page_get_owner(mfn_to_page(gmfn))->domain_id
-                       : -1),
-                      mfn_to_page(gmfn)->count_info,
-                      mfn_to_page(gmfn)->u.inuse.type_info);
-    }
-
+        goto out;
+
+    i1 = L1_LOGDIRTY_IDX(pfn);
+    i2 = L2_LOGDIRTY_IDX(pfn);
+    i3 = L3_LOGDIRTY_IDX(pfn);
+    i4 = L4_LOGDIRTY_IDX(pfn);
+
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    mfn = l4[i4];
+    if ( !mfn_valid(mfn) )
+        mfn = l4[i4] = paging_new_log_dirty_node(d, &l3);
+    else
+        l3 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l4);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    mfn = l3[i3];
+    if ( !mfn_valid(mfn) )
+        mfn = l3[i3] = paging_new_log_dirty_node(d, &l2);
+    else
+        l2 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l3);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    mfn = l2[i2];
+    if ( !mfn_valid(mfn) )
+        mfn = l2[i2] = paging_new_log_dirty_leaf(d, &l1);
+    else
+        l1 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l2);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    changed = !__test_and_set_bit(i1, l1);
+    unmap_domain_page(l1);
+    if ( changed )
+    {
+        PAGING_DEBUG(LOGDIRTY, 
+                     "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
+                     mfn_x(gmfn), pfn, d->domain_id);
+        d->arch.paging.log_dirty.dirty_count++;
+    }
+
+ out:
     log_dirty_unlock(d);
 }
 
@@ -244,7 +324,11 @@ void paging_mark_dirty(struct domain *d,
  * clear the bitmap and stats as well. */
 int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
 {
-    int i, rv = 0, clean = 0, peek = 1;
+    int rv = 0, clean = 0, peek = 1;
+    unsigned long pages = 0;
+    mfn_t *l4, *l3, *l2;
+    uint8_t *l1;
+    int i4, i3, i2;
 
     domain_pause(d);
     log_dirty_lock(d);
@@ -270,37 +354,55 @@ int paging_log_dirty_op(struct domain *d
         /* caller may have wanted just to clean the state or access stats. */
         peek = 0;
 
-    if ( (peek || clean) && (d->arch.paging.log_dirty.bitmap == NULL) )
+    if ( (peek || clean) && !mfn_valid(d->arch.paging.log_dirty.top) )
     {
         rv = -EINVAL; /* perhaps should be ENOMEM? */
         goto out;
     }
 
-    if ( sc->pages > d->arch.paging.log_dirty.bitmap_size )
-        sc->pages = d->arch.paging.log_dirty.bitmap_size;
-
-#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
-    for ( i = 0; i < sc->pages; i += CHUNK )
-    {
-        int bytes = ((((sc->pages - i) > CHUNK)
-                      ? CHUNK
-                      : (sc->pages - i)) + 7) / 8;
-
-        if ( likely(peek) )
-        {
-            if ( copy_to_guest_offset(
-                sc->dirty_bitmap, i/8,
-                (uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), bytes) )
-            {
-                rv = -EFAULT;
-                goto out;
+    if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+        printk("%s: %d failed page allocs while logging dirty pages\n",
+               __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+        rv = -ENOMEM;
+        goto out;
+    }
+
+    pages = 0;
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    for ( i4 = 0; pages < sc->pages && i4 < LOGDIRTY_NODE_ENTRIES; i4++ ) {
+        l3 = mfn_valid(l4[i4]) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+        for ( i3 = 0; pages < sc->pages && i3 < LOGDIRTY_NODE_ENTRIES; i3++ ) {
+            l2 = l3 && mfn_valid(l3[i3]) ? map_domain_page(mfn_x(l3[i3])) : NULL;
+            for ( i2 = 0; pages < sc->pages && i2 < LOGDIRTY_NODE_ENTRIES; i2++ ) {
+                static uint8_t zeroes[PAGE_SIZE];
+                unsigned int bytes = PAGE_SIZE;
+                l1 = l2 && mfn_valid(l2[i2]) ? map_domain_page(mfn_x(l2[i2])) : zeroes;
+                if ( unlikely(((sc->pages - pages + 7) >> 3) < bytes) )
+                    bytes = (unsigned int)((sc->pages - pages + 7) >> 3);
+                if ( likely(peek) ) {
+                    if ( copy_to_guest_offset(sc->dirty_bitmap, pages >> 3, l1, bytes) != 0) {
+                        rv = -EFAULT;
+                        goto out;
+                    }
+                }
+
+                if ( clean && l1 != zeroes )
+                    clear_page(l1);
+
+                pages += bytes << 3;
+                if (l1 != zeroes)
+                    unmap_domain_page(l1);
             }
+            if (l2)
+                unmap_domain_page(l2);
         }
-
-        if ( clean )
-            memset((uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), 0, bytes);
-    }
-#undef CHUNK
+        if (l3)
+            unmap_domain_page(l3);
+    }
+    unmap_domain_page(l4);
+
+    if (pages < sc->pages)
+        sc->pages = pages;
 
     log_dirty_unlock(d);
 
@@ -338,6 +440,7 @@ void paging_log_dirty_init(struct domain
     d->arch.paging.log_dirty.enable_log_dirty = enable_log_dirty;
     d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty;
     d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
 }
 
 /* This function fress log dirty bitmap resources. */
diff -r 2052364cb456 -r 68c911f7733a xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/shadow/private.h  Fri Nov 16 18:33:24 2007 +0000
@@ -491,17 +491,50 @@ sh_mfn_is_dirty(struct domain *d, mfn_t 
 /* Is this guest page dirty?  Call only in log-dirty mode. */
 {
     unsigned long pfn;
+    mfn_t mfn, *l4, *l3, *l2;
+    uint8_t *l1;
+    int rv;
+
     ASSERT(shadow_mode_log_dirty(d));
-    ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
+    ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
 
     /* We /really/ mean PFN here, even for non-translated guests. */
     pfn = get_gpfn_from_mfn(mfn_x(gmfn));
-    if ( likely(VALID_M2P(pfn))
-         && likely(pfn < d->arch.paging.log_dirty.bitmap_size) 
-         && test_bit(pfn, d->arch.paging.log_dirty.bitmap) )
+    if ( unlikely(!VALID_M2P(pfn)) )
+        return 0;
+    
+    if (d->arch.paging.log_dirty.failed_allocs > 0)
+        /* If we have any failed allocations our dirty log is bogus.
+         * Since we can't signal an error here, be conservative and
+         * report "dirty" in this case.  (The only current caller,
+         * _sh_propagate, leaves known-dirty pages writable, preventing
+         * subsequent dirty-logging faults from them.)
+         */
         return 1;
 
-    return 0;
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    mfn = l4[L4_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l4);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l3 = map_domain_page(mfn_x(mfn));
+    mfn = l3[L3_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l3);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l2 = map_domain_page(mfn_x(mfn));
+    mfn = l2[L2_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l2);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l1 = map_domain_page(mfn_x(mfn));
+    rv = test_bit(L1_LOGDIRTY_IDX(pfn), l1);
+    unmap_domain_page(l1);
+
+    return rv;
 }
 
 
diff -r 2052364cb456 -r 68c911f7733a xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/domain.h      Fri Nov 16 18:33:24 2007 +0000
@@ -158,9 +158,10 @@ struct log_dirty_domain {
     int            locker; /* processor that holds the lock */
     const char    *locker_function; /* func that took it */
 
-    /* log-dirty bitmap to record dirty pages */
-    unsigned long *bitmap;
-    unsigned int   bitmap_size;  /* in pages, bit per page */
+    /* log-dirty radix tree to record dirty pages */
+    mfn_t          top;
+    unsigned int   allocs;
+    unsigned int   failed_allocs;
 
     /* log-dirty mode stats */
     unsigned int   fault_count;
diff -r 2052364cb456 -r 68c911f7733a xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h      Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/paging.h      Fri Nov 16 18:33:24 2007 +0000
@@ -152,6 +152,28 @@ void paging_log_dirty_init(struct domain
 /* mark a page as dirty */
 void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
 
+/*
+ * Log-dirty radix tree indexing:
+ *   All tree nodes are PAGE_SIZE bytes, mapped on-demand.
+ *   Leaf nodes are simple bitmaps; 1 bit per guest pfn.
+ *   Interior nodes are arrays of LOGDIRTY_NODE_ENTRIES mfns.
+ * TODO: Dynamic radix tree height. Most guests will only need 2 levels.
+ *       The fourth level is basically unusable on 32-bit Xen.
+ * TODO2: Abstract out the radix-tree mechanics?
+ */
+#define LOGDIRTY_NODE_ENTRIES (1 << PAGETABLE_ORDER)
+#define L1_LOGDIRTY_IDX(pfn) ((pfn) & ((1 << (PAGE_SHIFT+3)) - 1))
+#define L2_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#define L3_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#if BITS_PER_LONG == 64
+#define L4_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER*2)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#else
+#define L4_LOGDIRTY_IDX(pfn) 0
+#endif
+
 /*****************************************************************************
  * Entry points into the paging-assistance code */
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
