# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1195238004 0
# Node ID 68c911f7733a0158056d10e9e7997a6acfe47eb1
# Parent 2052364cb456170a70ad5c8bfb876c95f7a9fe4a
hvm: make dirty logging stop requiring physical pages of order > 0

This patch re-implements the (x86) hypervisor dirty page log with a
simple four-level radix tree whose nodes are all single pages, thus
making migration require only order-0 pages (where before it required
at least an order-5 page).

Unlike the p2m radix tree implementation, the interior nodes of this
tree are NOT page table nodes. I chose a lazy-allocation and -mapping
approach because most pages are not marked dirty while dirty-logging is
enabled. There are doubtless situations (the 'stream' benchmark, for
example) where a more complex p2m-like approach is faster, but I'm not
sure they're worth the effort.

Signed-off-by: Dave Lively <dlively@xxxxxxxxxxxxxxx>
---
xen/arch/x86/mm/paging.c | 251 +++++++++++++++++++++++++++------------
xen/arch/x86/mm/shadow/private.h | 43 +++++-
xen/include/asm-x86/domain.h | 7 -
xen/include/asm-x86/paging.h | 22 +++
4 files changed, 241 insertions(+), 82 deletions(-)
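
To illustrate the data structure before diving into the diff, here is a
minimal user-space sketch of a lazily allocated four-level radix-tree
bitmap. It is only an analogue of the patch, not its code: calloc()
stands in for alloc_domheap_page()/map_domain_page(), and mark_dirty(),
FANOUT and LEAF_BITS are illustrative names rather than Xen's
PAGETABLE_ORDER-derived constants.

    #include <stdint.h>
    #include <stdlib.h>
    #include <stdio.h>

    #define FANOUT    512              /* entries per interior node */
    #define LEAF_BITS (4096 * 8)       /* one 4kB bitmap page per leaf */

    struct node { void *slot[FANOUT]; };   /* interior node: child pointers */

    static int mark_dirty(struct node **top, unsigned long pfn)
    {
        unsigned long i1 = pfn % LEAF_BITS;
        unsigned long i2 = (pfn / LEAF_BITS) % FANOUT;
        unsigned long i3 = (pfn / LEAF_BITS / FANOUT) % FANOUT;
        unsigned long i4 = (pfn / LEAF_BITS / FANOUT / FANOUT) % FANOUT;
        struct node *l4, *l3, *l2;
        uint8_t *l1;

        /* Allocate each level only when a pfn in its range is first dirtied. */
        if ( *top == NULL && (*top = calloc(1, sizeof(struct node))) == NULL )
            return -1;
        l4 = *top;
        if ( l4->slot[i4] == NULL &&
             (l4->slot[i4] = calloc(1, sizeof(struct node))) == NULL )
            return -1;
        l3 = l4->slot[i4];
        if ( l3->slot[i3] == NULL &&
             (l3->slot[i3] = calloc(1, sizeof(struct node))) == NULL )
            return -1;
        l2 = l3->slot[i3];
        if ( l2->slot[i2] == NULL &&
             (l2->slot[i2] = calloc(1, LEAF_BITS / 8)) == NULL )
            return -1;
        l1 = l2->slot[i2];

        l1[i1 / 8] |= 1u << (i1 % 8);      /* set the per-pfn dirty bit */
        return 0;
    }

    int main(void)
    {
        struct node *top = NULL;
        mark_dirty(&top, 0x12345);     /* allocates at most four single pages */
        printf("marked pfn 0x12345 dirty\n");
        return 0;
    }

Marking one pfn touches at most four order-0 allocations, which is the
property the patch relies on to keep migration free of higher-order
allocations.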
diff -r 2052364cb456 -r 68c911f7733a xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/paging.c Fri Nov 16 18:33:24 2007 +0000
@@ -96,36 +96,97 @@
spin_unlock(&(_d)->arch.paging.log_dirty.lock); \
} while (0)
+static mfn_t paging_new_log_dirty_page(struct domain *d, void **mapping_p)
+{
+ mfn_t mfn;
+ struct page_info *page = alloc_domheap_page(NULL);
+
+ if ( unlikely(page == NULL) ) {
+ d->arch.paging.log_dirty.failed_allocs++;
+ return _mfn(INVALID_MFN);
+ }
+ d->arch.paging.log_dirty.allocs++;
+ mfn = page_to_mfn(page);
+ *mapping_p = map_domain_page(mfn_x(mfn));
+ return mfn;
+}
+
+
+static mfn_t paging_new_log_dirty_leaf(struct domain *d, uint8_t **leaf_p)
+{
+ mfn_t mfn = paging_new_log_dirty_page(d, (void **)leaf_p);
+ clear_page(*leaf_p);
+ return mfn;
+}
+
+
+static mfn_t paging_new_log_dirty_node(struct domain *d, mfn_t **node_p)
+{
+ int i;
+ mfn_t mfn = paging_new_log_dirty_page(d, (void **)node_p);
+ for (i = 0; i < LOGDIRTY_NODE_ENTRIES; i++)
+ (*node_p)[i] = _mfn(INVALID_MFN);
+ return mfn;
+}
+
+
/* allocate bitmap resources for log dirty */
int paging_alloc_log_dirty_bitmap(struct domain *d)
{
- if ( d->arch.paging.log_dirty.bitmap != NULL )
+ mfn_t *mapping;
+
+ if ( mfn_valid(d->arch.paging.log_dirty.top) )
return 0;
- d->arch.paging.log_dirty.bitmap_size =
- (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
- d->arch.paging.log_dirty.bitmap =
- xmalloc_array(unsigned long,
- d->arch.paging.log_dirty.bitmap_size / BITS_PER_LONG);
- if ( d->arch.paging.log_dirty.bitmap == NULL )
- {
- d->arch.paging.log_dirty.bitmap_size = 0;
+ d->arch.paging.log_dirty.top = paging_new_log_dirty_node(d, &mapping);
+ if ( unlikely(!mfn_valid(d->arch.paging.log_dirty.top)) ) {
+ /* Clear error indicator since we're reporting this one */
+ d->arch.paging.log_dirty.failed_allocs = 0;
return -ENOMEM;
}
- memset(d->arch.paging.log_dirty.bitmap, 0,
- d->arch.paging.log_dirty.bitmap_size/8);
+ unmap_domain_page(mapping);
return 0;
}
+
+
+static void paging_free_log_dirty_page(struct domain *d, mfn_t mfn)
+{
+ d->arch.paging.log_dirty.allocs--;
+ free_domheap_page(mfn_to_page(mfn));
+}
/* free bitmap resources */
void paging_free_log_dirty_bitmap(struct domain *d)
{
- d->arch.paging.log_dirty.bitmap_size = 0;
- if ( d->arch.paging.log_dirty.bitmap )
- {
- xfree(d->arch.paging.log_dirty.bitmap);
- d->arch.paging.log_dirty.bitmap = NULL;
+ int i4, i3, i2;
+
+ if (mfn_valid(d->arch.paging.log_dirty.top)) {
+ mfn_t *l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+ printk("%s: used %d pages for domain %d dirty logging\n",
+ __FUNCTION__, d->arch.paging.log_dirty.allocs, d->domain_id);
+ for (i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++) {
+ if (mfn_valid(l4[i4])) {
+ mfn_t *l3 = map_domain_page(mfn_x(l4[i4]));
+ for (i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++) {
+ if (mfn_valid(l3[i3])) {
+ mfn_t *l2 = map_domain_page(mfn_x(l3[i3]));
+ for (i2 = 0; i2 < LOGDIRTY_NODE_ENTRIES; i2++)
+ if (mfn_valid(l2[i2]))
+ paging_free_log_dirty_page(d, l2[i2]);
+ unmap_domain_page(l2);
+ paging_free_log_dirty_page(d, l3[i3]);
+ }
+ }
+ unmap_domain_page(l3);
+ paging_free_log_dirty_page(d, l4[i4]);
+ }
+ }
+ unmap_domain_page(l4);
+ paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+ d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+ ASSERT(d->arch.paging.log_dirty.allocs == 0);
+ d->arch.paging.log_dirty.failed_allocs = 0;
}
}
@@ -187,15 +248,19 @@ void paging_mark_dirty(struct domain *d,
{
unsigned long pfn;
mfn_t gmfn;
+ int changed;
+ mfn_t mfn, *l4, *l3, *l2;
+ uint8_t *l1;
+ int i1, i2, i3, i4;
gmfn = _mfn(guest_mfn);
+
+ ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) )
return;
log_dirty_lock(d);
-
- ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
/* We /really/ mean PFN here, even for non-translated guests. */
pfn = get_gpfn_from_mfn(mfn_x(gmfn));
@@ -206,37 +271,52 @@ void paging_mark_dirty(struct domain *d,
* Nothing to do here...
*/
if ( unlikely(!VALID_M2P(pfn)) )
- {
- log_dirty_unlock(d);
- return;
- }
-
- if ( likely(pfn < d->arch.paging.log_dirty.bitmap_size) )
- {
- if ( !__test_and_set_bit(pfn, d->arch.paging.log_dirty.bitmap) )
- {
- PAGING_DEBUG(LOGDIRTY,
- "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
- mfn_x(gmfn), pfn, d->domain_id);
- d->arch.paging.log_dirty.dirty_count++;
- }
- }
- else
- {
- PAGING_PRINTK("mark_dirty OOR! "
- "mfn=%" PRI_mfn " pfn=%lx max=%x (dom %d)\n"
- "owner=%d c=%08x t=%" PRtype_info "\n",
- mfn_x(gmfn),
- pfn,
- d->arch.paging.log_dirty.bitmap_size,
- d->domain_id,
- (page_get_owner(mfn_to_page(gmfn))
- ? page_get_owner(mfn_to_page(gmfn))->domain_id
- : -1),
- mfn_to_page(gmfn)->count_info,
- mfn_to_page(gmfn)->u.inuse.type_info);
- }
-
+ goto out;
+
+ i1 = L1_LOGDIRTY_IDX(pfn);
+ i2 = L2_LOGDIRTY_IDX(pfn);
+ i3 = L3_LOGDIRTY_IDX(pfn);
+ i4 = L4_LOGDIRTY_IDX(pfn);
+
+ l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+ mfn = l4[i4];
+ if ( !mfn_valid(mfn) )
+ mfn = l4[i4] = paging_new_log_dirty_node(d, &l3);
+ else
+ l3 = map_domain_page(mfn_x(mfn));
+ unmap_domain_page(l4);
+ if ( unlikely(!mfn_valid(mfn)) )
+ goto out;
+
+ mfn = l3[i3];
+ if ( !mfn_valid(mfn) )
+ mfn = l3[i3] = paging_new_log_dirty_node(d, &l2);
+ else
+ l2 = map_domain_page(mfn_x(mfn));
+ unmap_domain_page(l3);
+ if ( unlikely(!mfn_valid(mfn)) )
+ goto out;
+
+ mfn = l2[i2];
+ if ( !mfn_valid(mfn) )
+ mfn = l2[i2] = paging_new_log_dirty_leaf(d, &l1);
+ else
+ l1 = map_domain_page(mfn_x(mfn));
+ unmap_domain_page(l2);
+ if ( unlikely(!mfn_valid(mfn)) )
+ goto out;
+
+ changed = !__test_and_set_bit(i1, l1);
+ unmap_domain_page(l1);
+ if ( changed )
+ {
+ PAGING_DEBUG(LOGDIRTY,
+ "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
+ mfn_x(gmfn), pfn, d->domain_id);
+ d->arch.paging.log_dirty.dirty_count++;
+ }
+
+ out:
log_dirty_unlock(d);
}
@@ -244,7 +324,11 @@ void paging_mark_dirty(struct domain *d,
* clear the bitmap and stats as well. */
int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
{
- int i, rv = 0, clean = 0, peek = 1;
+ int rv = 0, clean = 0, peek = 1;
+ unsigned long pages = 0;
+ mfn_t *l4, *l3, *l2;
+ uint8_t *l1;
+ int i4, i3, i2;
domain_pause(d);
log_dirty_lock(d);
@@ -270,37 +354,55 @@ int paging_log_dirty_op(struct domain *d
/* caller may have wanted just to clean the state or access stats. */
peek = 0;
- if ( (peek || clean) && (d->arch.paging.log_dirty.bitmap == NULL) )
+ if ( (peek || clean) && !mfn_valid(d->arch.paging.log_dirty.top) )
{
rv = -EINVAL; /* perhaps should be ENOMEM? */
goto out;
}
- if ( sc->pages > d->arch.paging.log_dirty.bitmap_size )
- sc->pages = d->arch.paging.log_dirty.bitmap_size;
-
-#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
- for ( i = 0; i < sc->pages; i += CHUNK )
- {
- int bytes = ((((sc->pages - i) > CHUNK)
- ? CHUNK
- : (sc->pages - i)) + 7) / 8;
-
- if ( likely(peek) )
- {
- if ( copy_to_guest_offset(
- sc->dirty_bitmap, i/8,
- (uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), bytes) )
- {
- rv = -EFAULT;
- goto out;
+ if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+ printk("%s: %d failed page allocs while logging dirty pages\n",
+ __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ pages = 0;
+ l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+ for ( i4 = 0; pages < sc->pages && i4 < LOGDIRTY_NODE_ENTRIES; i4++ ) {
+ l3 = mfn_valid(l4[i4]) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+ for ( i3 = 0; pages < sc->pages && i3 < LOGDIRTY_NODE_ENTRIES; i3++ ) {
+ l2 = l3 && mfn_valid(l3[i3]) ? map_domain_page(mfn_x(l3[i3])) : NULL;
+ for ( i2 = 0; pages < sc->pages && i2 < LOGDIRTY_NODE_ENTRIES; i2++ ) {
+ static uint8_t zeroes[PAGE_SIZE];
+ unsigned int bytes = PAGE_SIZE;
+ l1 = l2 && mfn_valid(l2[i2]) ? map_domain_page(mfn_x(l2[i2])) : zeroes;
+ if ( unlikely(((sc->pages - pages + 7) >> 3) < bytes) )
+ bytes = (unsigned int)((sc->pages - pages + 7) >> 3);
+ if ( likely(peek) ) {
+ if ( copy_to_guest_offset(sc->dirty_bitmap, pages >> 3, l1, bytes) != 0) {
+ rv = -EFAULT;
+ goto out;
+ }
+ }
+
+ if ( clean && l1 != zeroes )
+ clear_page(l1);
+
+ pages += bytes << 3;
+ if (l1 != zeroes)
+ unmap_domain_page(l1);
}
+ if (l2)
+ unmap_domain_page(l2);
}
-
- if ( clean )
- memset((uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), 0, bytes);
- }
-#undef CHUNK
+ if (l3)
+ unmap_domain_page(l3);
+ }
+ unmap_domain_page(l4);
+
+ if (pages < sc->pages)
+ sc->pages = pages;
log_dirty_unlock(d);
@@ -338,6 +440,7 @@ void paging_log_dirty_init(struct domain
d->arch.paging.log_dirty.enable_log_dirty = enable_log_dirty;
d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty;
d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+ d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
}
/* This function fress log dirty bitmap resources. */
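
One design point in the readout loop above is worth calling out:
paging_log_dirty_op() copies from a single static page of zeroes
whenever a subtree was never allocated, so fully clean regions of the
guest cost no leaf pages at all, and only real leaf pages are cleared
on a clean operation. A stand-alone sketch of the idea follows;
read_leaf() and LEAF_BYTES are illustrative names, and memcpy() stands
in for copy_to_guest_offset().

    #include <stdint.h>
    #include <string.h>

    #define LEAF_BYTES 4096                  /* one bitmap page per leaf */

    static uint8_t zeroes[LEAF_BYTES];       /* shared all-clean page */

    /* Copy one leaf's worth of dirty bits out, optionally clearing the leaf. */
    static void read_leaf(uint8_t *dst, uint8_t *leaf, int clean)
    {
        uint8_t *src = leaf ? leaf : zeroes; /* absent leaf => nothing dirty */
        memcpy(dst, src, LEAF_BYTES);        /* copy_to_guest_offset() in Xen */
        if ( clean && leaf )                 /* never clear the shared page */
            memset(leaf, 0, LEAF_BYTES);
    }
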
diff -r 2052364cb456 -r 68c911f7733a xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/shadow/private.h Fri Nov 16 18:33:24 2007 +0000
@@ -491,17 +491,50 @@ sh_mfn_is_dirty(struct domain *d, mfn_t
/* Is this guest page dirty? Call only in log-dirty mode. */
{
unsigned long pfn;
+ mfn_t mfn, *l4, *l3, *l2;
+ uint8_t *l1;
+ int rv;
+
ASSERT(shadow_mode_log_dirty(d));
- ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
+ ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
/* We /really/ mean PFN here, even for non-translated guests. */
pfn = get_gpfn_from_mfn(mfn_x(gmfn));
- if ( likely(VALID_M2P(pfn))
- && likely(pfn < d->arch.paging.log_dirty.bitmap_size)
- && test_bit(pfn, d->arch.paging.log_dirty.bitmap) )
+ if ( unlikely(!VALID_M2P(pfn)) )
+ return 0;
+
+ if (d->arch.paging.log_dirty.failed_allocs > 0)
+ /* If we have any failed allocations our dirty log is bogus.
+ * Since we can't signal an error here, be conservative and
+ * report "dirty" in this case. (The only current caller,
+ * _sh_propagate, leaves known-dirty pages writable, preventing
+ * subsequent dirty-logging faults from them.)
+ */
return 1;
- return 0;
+ l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+ mfn = l4[L4_LOGDIRTY_IDX(pfn)];
+ unmap_domain_page(l4);
+ if (!mfn_valid(mfn))
+ return 0;
+
+ l3 = map_domain_page(mfn_x(mfn));
+ mfn = l3[L3_LOGDIRTY_IDX(pfn)];
+ unmap_domain_page(l3);
+ if (!mfn_valid(mfn))
+ return 0;
+
+ l2 = map_domain_page(mfn_x(mfn));
+ mfn = l2[L2_LOGDIRTY_IDX(pfn)];
+ unmap_domain_page(l2);
+ if (!mfn_valid(mfn))
+ return 0;
+
+ l1 = map_domain_page(mfn_x(mfn));
+ rv = test_bit(L1_LOGDIRTY_IDX(pfn), l1);
+ unmap_domain_page(l1);
+
+ return rv;
}
diff -r 2052364cb456 -r 68c911f7733a xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/domain.h Fri Nov 16 18:33:24 2007 +0000
@@ -158,9 +158,10 @@ struct log_dirty_domain {
int locker; /* processor that holds the lock */
const char *locker_function; /* func that took it */
- /* log-dirty bitmap to record dirty pages */
- unsigned long *bitmap;
- unsigned int bitmap_size; /* in pages, bit per page */
+ /* log-dirty radix tree to record dirty pages */
+ mfn_t top;
+ unsigned int allocs;
+ unsigned int failed_allocs;
/* log-dirty mode stats */
unsigned int fault_count;
diff -r 2052364cb456 -r 68c911f7733a xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/paging.h Fri Nov 16 18:33:24 2007 +0000
@@ -152,6 +152,28 @@ void paging_log_dirty_init(struct domain
/* mark a page as dirty */
void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
+/*
+ * Log-dirty radix tree indexing:
+ * All tree nodes are PAGE_SIZE bytes, mapped on-demand.
+ * Leaf nodes are simple bitmaps; 1 bit per guest pfn.
+ * Interior nodes are arrays of LOGDIRTY_NODE_ENTRIES mfns.
+ * TODO: Dynamic radix tree height. Most guests will only need 2 levels.
+ * The fourth level is basically unusable on 32-bit Xen.
+ * TODO2: Abstract out the radix-tree mechanics?
+ */
+#define LOGDIRTY_NODE_ENTRIES (1 << PAGETABLE_ORDER)
+#define L1_LOGDIRTY_IDX(pfn) ((pfn) & ((1 << (PAGE_SHIFT+3)) - 1))
+#define L2_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3)) & \
+ (LOGDIRTY_NODE_ENTRIES-1))
+#define L3_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER)) & \
+ (LOGDIRTY_NODE_ENTRIES-1))
+#if BITS_PER_LONG == 64
+#define L4_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER*2)) & \
+ (LOGDIRTY_NODE_ENTRIES-1))
+#else
+#define L4_LOGDIRTY_IDX(pfn) 0
+#endif
+
/*****************************************************************************
* Entry points into the paging-assistance code */
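
As a worked example of the index macros (assuming x86's usual PAGE_SHIFT
of 12 and PAGETABLE_ORDER of 9, so PAGE_SHIFT+3 = 15), take pfn
0x1234567 on 64-bit Xen:

    L1_LOGDIRTY_IDX(pfn) = 0x1234567 & 0x7fff        = 0x4567 (bit in the leaf)
    L2_LOGDIRTY_IDX(pfn) = (0x1234567 >> 15) & 0x1ff = 0x46
    L3_LOGDIRTY_IDX(pfn) = (0x1234567 >> 24) & 0x1ff = 0x1
    L4_LOGDIRTY_IDX(pfn) = (0x1234567 >> 33) & 0x1ff = 0x0

With those values each leaf page covers 2^15 = 32768 pfns (128MB of
guest memory), each L2 node covers 64GB and each L3 node covers 32TB.
A 32-bit pfn never reaches the fourth level (and shifting a 32-bit
unsigned long by 33 is not even valid), which is why L4_LOGDIRTY_IDX is
simply defined as 0 when BITS_PER_LONG is not 64.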