# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1236367119 0
# Node ID dd489125a2e7718efa0e7afe89e7875d7909353f
# Parent f1080b20cd15e06d5fc72062c35b627b2f947339
Page offline support in Xen side
This patch adds support for offlining a page. The basic idea is: when a
page is assigned, it will be marked offline-pending and moved out of the
buddy allocator when freed; when a page is free, it is moved out of the buddy allocator directly.
One thing to note after this change is that page->count_info is no longer
always 0, especially for shadow pages, since the PGC_offlining bit may be set.
Signed-off-by: Wang, Shane <shane.wang@xxxxxxxxx>
Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
---
xen/common/page_alloc.c | 341 +++++++++++++++++++++++++++++++++++++++++++-
xen/common/sysctl.c | 55 +++++++
xen/include/asm-x86/mm.h | 23 ++
xen/include/public/sysctl.h | 49 ++++++
xen/include/public/xen.h | 3
xen/include/xen/mm.h | 3
6 files changed, 470 insertions(+), 4 deletions(-)
diff -r f1080b20cd15 -r dd489125a2e7 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/common/page_alloc.c Fri Mar 06 19:18:39 2009 +0000
@@ -35,6 +35,7 @@
#include <xen/perfc.h>
#include <xen/numa.h>
#include <xen/nodemask.h>
+#include <public/sysctl.h>
#include <asm/page.h>
#include <asm/numa.h>
#include <asm/flushtlb.h>
@@ -74,6 +75,11 @@ PAGE_LIST_HEAD(page_scrub_list);
PAGE_LIST_HEAD(page_scrub_list);
static unsigned long scrub_pages;
+/* Offlined page list, protected by heap_lock */
+PAGE_LIST_HEAD(page_offlined_list);
+
+/* Broken page list, protected by heap_lock */
+PAGE_LIST_HEAD(page_broken_list);
/*********************
* ALLOCATION BITMAP
* One bit per page of memory. Bit set => page is allocated.
@@ -421,12 +427,92 @@ static struct page_info *alloc_heap_page
return pg;
}
+/*
+ * Remove any offlined page in the buddy poined by head
+ */
+static int reserve_offlined_page(struct page_info *head)
+{
+ unsigned int node = phys_to_nid(page_to_maddr(head));
+ int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
+ struct page_info *cur_head;
+ int cur_order;
+
+ ASSERT(spin_is_locked(&heap_lock));
+
+ cur_head = head;
+
+ page_list_del(head, &heap(node, zone, head_order));
+
+ while ( cur_head < (head + (1 << head_order)) )
+ {
+ struct page_info *pg;
+ int next_order;
+
+ if (test_bit(_PGC_offlined, &cur_head->count_info))
+ {
+ cur_head++;
+ continue;
+ }
+
+ next_order = cur_order = 0;
+
+ while (cur_order < head_order)
+ {
+ next_order = cur_order + 1;
+
+ if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)))
+ goto merge;
+
+ for (i = (1 << cur_order), pg = cur_head + (1 << cur_order);
+ i < (1 << next_order);
+ i++, pg ++)
+ if (test_bit(_PGC_offlined, &pg->count_info))
+ break;
+ if (i == ( 1 << next_order))
+ {
+ cur_order = next_order;
+ continue;
+ }
+ else
+ {
+ /*
+ * We don't need considering merge outside the head_order
+ */
+merge:
+ page_list_add_tail(cur_head, &heap(node, zone, cur_order));
+ PFN_ORDER(cur_head) = cur_order;
+ cur_head += (1 << cur_order);
+ break;
+ }
+ }
+ }
+
+ for (cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++)
+ {
+ if (!test_bit(_PGC_offlined, &cur_head->count_info))
+ continue;
+
+ avail[node][zone] --;
+
+ map_alloc(page_to_mfn(cur_head), 1);
+
+ if (test_bit(_PGC_broken, &cur_head->count_info))
+ page_list_add_tail(cur_head, &page_broken_list);
+ else
+ page_list_add_tail(cur_head, &page_offlined_list);
+
+ count ++;
+ }
+
+ return count;
+}
+
/* Free 2^@order set of pages. */
static void free_heap_pages(
struct page_info *pg, unsigned int order)
{
unsigned long mask;
- unsigned int i, node = phys_to_nid(page_to_maddr(pg));
+ unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
unsigned int zone = page_to_zone(pg);
ASSERT(order <= MAX_ORDER);
@@ -446,7 +532,14 @@ static void free_heap_pages(
* in its pseudophysical address space).
* In all the above cases there can be no guest mappings of this page.
*/
- pg[i].count_info = 0;
+ ASSERT(!(pg[i].count_info & PGC_offlined));
+ pg[i].count_info &= PGC_offlining | PGC_broken;
+ if (pg[i].count_info & PGC_offlining)
+ {
+ pg[i].count_info &= ~PGC_offlining;
+ pg[i].count_info |= PGC_offlined;
+ tainted = 1;
+ }
/* If a page has no owner it will need no safety TLB flush. */
pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
@@ -481,7 +574,7 @@ static void free_heap_pages(
break;
page_list_del(pg + mask, &heap(node, zone, order));
}
-
+
order++;
/* After merging, pg should remain in the same node. */
@@ -491,7 +584,249 @@ static void free_heap_pages(
PFN_ORDER(pg) = order;
page_list_add_tail(pg, &heap(node, zone, order));
+ if (tainted)
+ reserve_offlined_page(pg);
+
spin_unlock(&heap_lock);
+}
+
+
+/*
+ * Following possible status for a page:
+ * free and Online; free and offlined; free and offlined and broken;
+ * assigned and online; assigned and offlining; assigned and offling and broken
+ *
+ * Following rules applied for page offline:
+ * Once a page is broken, it can't be assigned anymore
+ * A page will be offlined only if it is free
+ * return original count_info
+ *
+ */
+static unsigned long mark_page_offline(struct page_info *pg, int broken)
+{
+ unsigned long nx, x, y = pg->count_info;
+
+ ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
+ /*
+ * Caller gurantee the page will not be reassigned during this process
+ */
+ ASSERT(spin_is_locked(&heap_lock));
+
+ do {
+ nx = x = y;
+
+ if ( ((x & PGC_offlined_broken) == PGC_offlined_broken) )
+ return y;
+ /* PGC_offlined means it is free pages */
+ if (x & PGC_offlined)
+ {
+ if (broken && !(nx & PGC_broken))
+ nx |= PGC_broken;
+ else
+ return y;
+ }
+ /* It is not offlined, not reserved page */
+ else if ( allocated_in_map(page_to_mfn(pg)) )
+ nx |= PGC_offlining;
+ else
+ nx |= PGC_offlined;
+
+ if (broken)
+ nx |= PGC_broken;
+ } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
+
+ return y;
+}
+
+static int reserve_heap_page(struct page_info *pg)
+{
+ struct page_info *head = NULL;
+ unsigned int i, node = phys_to_nid(page_to_maddr(pg));
+ unsigned int zone = page_to_zone(pg);
+
+ /* get the header */
+ for ( i = 0; i <= MAX_ORDER; i++ )
+ {
+ struct page_info *tmp;
+
+ if ( page_list_empty(&heap(node, zone, i)) )
+ continue;
+
+ page_list_for_each_safe(head, tmp, &heap(node, zone, i))
+ {
+ if ( (head <= pg) &&
+ (head + (1UL << i) > pg) )
+ return reserve_offlined_page(head);
+ }
+ }
+
+ return -EINVAL;
+
+}
+
+/*
+ * offline one page
+ */
+int offline_page(unsigned long mfn, int broken, uint32_t *status)
+{
+ unsigned long old_info = 0;
+ struct domain *owner;
+ int ret = 0;
+ struct page_info *pg;
+
+ if (mfn > max_page)
+ {
+ dprintk(XENLOG_WARNING,
+ "try to offline page out of range %lx\n", mfn);
+ return -EINVAL;
+ }
+
+ *status = 0;
+ pg = mfn_to_page(mfn);
+
+
+#if defined(__x86_64__)
+ /* Xen's txt mfn in x86_64 is reserved in e820 */
+ if ( is_xen_fixed_mfn(mfn) )
+#elif defined(__i386__)
+ if ( is_xen_heap_mfn(mfn) )
+#endif
+ {
+ *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
+ (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+ return -EPERM;
+ }
+
+ /*
+ * N.B. xen's txt in x86_64 is marked reserved and handled already
+ * Also kexec range is reserved
+ */
+ if (!page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL))
+ {
+ *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
+ return -EINVAL;
+ }
+
+ spin_lock(&heap_lock);
+
+ old_info = mark_page_offline(pg, broken);
+
+ if ( !allocated_in_map(mfn) )
+ {
+ /* Free pages are reserve directly */
+ reserve_heap_page(pg);
+ *status = PG_OFFLINE_OFFLINED;
+ }
+ else if (test_bit(_PGC_offlined, &pg->count_info))
+ {
+ *status = PG_OFFLINE_OFFLINED;
+ }
+ else if ((owner = page_get_owner_and_reference(pg)))
+ {
+ *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
+ (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
+ /* Release the reference since it will not be allocated anymore */
+ put_page(pg);
+ }
+ else if ( old_info & PGC_xen_heap)
+ {
+ *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
+ (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+ }
+ else
+ {
+ /*
+ * assign_pages does not hold heap_lock, so small window that the owner
+ * may be set later, but please notice owner will only change from
+ * NULL to be set, not verse, since page is offlining now.
+ * No windows If called from #MC handler, since all CPU are in softirq
+ * If called from user space like CE handling, tools can wait some time
+ * before call again.
+ */
+ *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
+ (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
+ }
+
+ if (broken)
+ *status |= PG_OFFLINE_BROKEN;
+
+ spin_unlock(&heap_lock);
+
+ return ret;
+}
+
+/*
+ * Online the memory.
+ * The caller should make sure end_pfn <= max_page,
+ * if not, expand_pages() should be called prior to online_page().
+ */
+unsigned int online_page(unsigned long mfn, uint32_t *status)
+{
+ struct page_info *pg;
+ int ret = 0, free = 0;
+
+ if ( mfn > max_page )
+ {
+ dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+ return -EINVAL;
+ }
+
+ pg = mfn_to_page(mfn);
+
+ *status = 0;
+
+ spin_lock(&heap_lock);
+
+ if ( unlikely(is_page_broken(pg)) )
+ {
+ ret = -EINVAL;
+ *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
+ }
+ else if (pg->count_info & PGC_offlined)
+ {
+ clear_bit(_PGC_offlined, &pg->count_info);
+ page_list_del(pg, &page_offlined_list);
+ *status = PG_ONLINE_ONLINED;
+ free = 1;
+ }
+ else if (pg->count_info & PGC_offlining)
+ {
+ clear_bit(_PGC_offlining, &pg->count_info);
+ *status = PG_ONLINE_ONLINED;
+ }
+ spin_unlock(&heap_lock);
+
+ if (free)
+ free_heap_pages(pg, 0);
+
+ return ret;
+}
+
+int query_page_offline(unsigned long mfn, uint32_t *status)
+{
+ struct page_info *pg;
+
+ if ( (mfn > max_page) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
+ {
+ dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+ return -EINVAL;
+ }
+
+ *status = 0;
+ spin_lock(&heap_lock);
+
+ pg = mfn_to_page(mfn);
+
+ if (pg->count_info & PGC_offlining)
+ *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
+ if (pg->count_info & PGC_broken)
+ *status |= PG_OFFLINE_STATUS_BROKEN;
+ if (pg->count_info & PGC_offlined)
+ *status |= PG_OFFLINE_STATUS_OFFLINED;
+
+ spin_unlock(&heap_lock);
+
+ return 0;
}
/*
diff -r f1080b20cd15 -r dd489125a2e7 xen/common/sysctl.c
--- a/xen/common/sysctl.c Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/common/sysctl.c Fri Mar 06 19:18:39 2009 +0000
@@ -233,6 +233,61 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
}
break;
+ case XEN_SYSCTL_page_offline_op:
+ {
+ uint32_t *status, *ptr;
+ unsigned long pfn;
+
+ ptr = status = xmalloc_bytes( sizeof(uint32_t) *
+ (op->u.page_offline.end -
+ op->u.page_offline.start + 1));
+ if (!status)
+ {
+ dprintk(XENLOG_WARNING, "Out of memory for page offline op\n");
+ ret = -ENOMEM;
+ break;
+ }
+
+ memset(status, PG_OFFLINE_INVALID, sizeof(uint32_t) *
+ (op->u.page_offline.end - op->u.page_offline.start + 1));
+
+ for ( pfn = op->u.page_offline.start;
+ pfn <= op->u.page_offline.end;
+ pfn ++ )
+ {
+ switch (op->u.page_offline.cmd)
+ {
+ /* Shall revert her if failed, or leave caller do it? */
+ case sysctl_page_offline:
+ ret = offline_page(pfn, 0, ptr++);
+ break;
+ case sysctl_page_online:
+ ret = online_page(pfn, ptr++);
+ break;
+ case sysctl_query_page_offline:
+ ret = query_page_offline(pfn, ptr++);
+ break;
+ default:
+ gdprintk(XENLOG_WARNING, "invalid page offline op %x\n",
+ op->u.page_offline.cmd);
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ break;
+ }
+
+ if (copy_to_guest(op->u.page_offline.status, status,
+ op->u.page_offline.end - op->u.page_offline.start +
1))
+ {
+ ret = -EFAULT;
+ break;
+ }
+ xfree(status);
+ }
+ break;
+
default:
ret = arch_do_sysctl(op, u_sysctl);
break;
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/asm-x86/mm.h Fri Mar 06 19:18:39 2009 +0000
@@ -198,8 +198,25 @@ struct page_info
/* 3-bit PAT/PCD/PWT cache-attribute hint. */
#define PGC_cacheattr_base PG_shift(6)
#define PGC_cacheattr_mask PG_mask(7, 6)
+
+ /* Page is broken? */
+ #define _PGC_broken PG_shift(7)
+ #define PGC_broken PG_mask(1, 7)
+ /* Page is offline pending ? */
+ #define _PGC_offlining PG_shift(8)
+ #define PGC_offlining PG_mask(1, 8)
+ /* Page is offlined */
+ #define _PGC_offlined PG_shift(9)
+ #define PGC_offlined PG_mask(1, 9)
+ #define PGC_offlined_broken (PGC_offlined | PGC_broken)
+
+ #define is_page_offlining(page) ((page)->count_info & PGC_offlining)
+ #define is_page_offlined(page) ((page)->count_info & PGC_offlined)
+ #define is_page_broken(page) ((page)->count_info & PGC_broken)
+ #define is_page_online(page) (!is_page_offlined(page))
+
/* Count of references to this frame. */
-#define PGC_count_width PG_shift(6)
+#define PGC_count_width PG_shift(9)
#define PGC_count_mask ((1UL<<PGC_count_width)-1)
#if defined(__i386__)
@@ -209,9 +226,13 @@ struct page_info
(_mfn < paddr_to_pfn(xenheap_phys_end)); \
})
#else
+extern unsigned long allocator_bitmap_end;
#define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap)
#define is_xen_heap_mfn(mfn) \
(__mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn)))
+#define is_xen_fixed_mfn(mfn) \
+ ( (mfn << PAGE_SHIFT) >= __pa(&_start) && \
+ (mfn << PAGE_SHIFT) <= allocator_bitmap_end )
#endif
#if defined(__i386__)
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/public/sysctl.h Fri Mar 06 19:18:39 2009 +0000
@@ -359,6 +359,54 @@ struct xen_sysctl_pm_op {
};
};
+#define XEN_SYSCTL_page_offline_op 14
+struct xen_sysctl_page_offline_op {
+ /* IN: range of page to be offlined */
+#define sysctl_page_offline 1
+#define sysctl_page_online 2
+#define sysctl_query_page_offline 3
+ uint32_t cmd;
+ uint32_t start;
+ uint32_t end;
+ /* OUT: result of page offline request */
+ /*
+ * bit 0~15: result flags
+ * bit 16~31: owner
+ */
+ XEN_GUEST_HANDLE(uint32) status;
+};
+
+#define PG_OFFLINE_STATUS_MASK (0xFFUL)
+
+/* The result is invalid, i.e. HV does not handle it */
+#define PG_OFFLINE_INVALID (0x1UL << 0)
+
+#define PG_OFFLINE_OFFLINED (0x1UL << 1)
+#define PG_OFFLINE_PENDING (0x1UL << 2)
+#define PG_OFFLINE_FAILED (0x1UL << 3)
+
+#define PG_ONLINE_FAILED PG_OFFLINE_FAILED
+#define PG_ONLINE_ONLINED PG_OFFLINE_OFFLINED
+
+#define PG_OFFLINE_STATUS_OFFLINED (0x1UL << 1)
+#define PG_OFFLINE_STATUS_ONLINE (0x1UL << 2)
+#define PG_OFFLINE_STATUS_OFFLINE_PENDING (0x1UL << 3)
+#define PG_OFFLINE_STATUS_BROKEN (0x1UL << 4)
+
+#define PG_OFFLINE_MISC_MASK (0xFFUL << 4)
+
+/* only valid when PG_OFFLINE_FAILED */
+#define PG_OFFLINE_XENPAGE (0x1UL << 8)
+#define PG_OFFLINE_DOM0PAGE (0x1UL << 9)
+#define PG_OFFLINE_ANONYMOUS (0x1UL << 10)
+#define PG_OFFLINE_NOT_CONV_RAM (0x1UL << 11)
+#define PG_OFFLINE_OWNED (0x1UL << 12)
+
+#define PG_OFFLINE_BROKEN (0x1UL << 13)
+#define PG_ONLINE_BROKEN PG_OFFLINE_BROKEN
+
+#define PG_OFFLINE_OWNER_SHIFT 16
+
struct xen_sysctl {
uint32_t cmd;
uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
@@ -375,6 +423,7 @@ struct xen_sysctl {
struct xen_sysctl_get_pmstat get_pmstat;
struct xen_sysctl_cpu_hotplug cpu_hotplug;
struct xen_sysctl_pm_op pm_op;
+ struct xen_sysctl_page_offline_op page_offline;
uint8_t pad[128];
} u;
};
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/xen.h
--- a/xen/include/public/xen.h Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/public/xen.h Fri Mar 06 19:18:39 2009 +0000
@@ -354,6 +354,9 @@ typedef uint16_t domid_t;
*/
#define DOMID_XEN (0x7FF2U)
+/* DOMID_INVALID is used to identity invalid domid */
+#define DOMID_INVALID (0x7FFFU)
+
/*
* Send an array of these to HYPERVISOR_mmu_update().
* NB. The fields are natural pointer/address size for this architecture.
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/xen/mm.h Fri Mar 06 19:18:39 2009 +0000
@@ -60,6 +60,9 @@ unsigned long avail_domheap_pages(void);
unsigned long avail_domheap_pages(void);
#define alloc_domheap_page(d,f) (alloc_domheap_pages(d,0,f))
#define free_domheap_page(p) (free_domheap_pages(p,0))
+unsigned int online_page(unsigned long mfn, uint32_t *status);
+int offline_page(unsigned long mfn, int broken, uint32_t *status);
+int query_page_offline(unsigned long mfn, uint32_t *status);
void scrub_heap_pages(void);
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|