WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] Page offline support in Xen side

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Page offline support in Xen side
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Sat, 07 Mar 2009 06:35:37 -0800
Delivery-date: Sat, 07 Mar 2009 06:36:48 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1236367119 0
# Node ID dd489125a2e7718efa0e7afe89e7875d7909353f
# Parent  f1080b20cd15e06d5fc72062c35b627b2f947339
Page offline support in Xen side

This patch adds support for offlining a page. The basic idea is: if a
page is currently assigned, it will be marked offline-pending and moved out of
the buddy allocator when freed; if a page is already free, it is moved out of the buddy allocator directly.

One thing to note after this change: page->count_info is no longer
always 0, especially for shadow pages, since the PGC_offlining bit may be set.

Signed-off-by: Wang, Shane <shane.wang@xxxxxxxxx>
Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
---
 xen/common/page_alloc.c     |  341 +++++++++++++++++++++++++++++++++++++++++++-
 xen/common/sysctl.c         |   55 +++++++
 xen/include/asm-x86/mm.h    |   23 ++
 xen/include/public/sysctl.h |   49 ++++++
 xen/include/public/xen.h    |    3 
 xen/include/xen/mm.h        |    3 
 6 files changed, 470 insertions(+), 4 deletions(-)

diff -r f1080b20cd15 -r dd489125a2e7 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/common/page_alloc.c   Fri Mar 06 19:18:39 2009 +0000
@@ -35,6 +35,7 @@
 #include <xen/perfc.h>
 #include <xen/numa.h>
 #include <xen/nodemask.h>
+#include <public/sysctl.h>
 #include <asm/page.h>
 #include <asm/numa.h>
 #include <asm/flushtlb.h>
@@ -74,6 +75,11 @@ PAGE_LIST_HEAD(page_scrub_list);
 PAGE_LIST_HEAD(page_scrub_list);
 static unsigned long scrub_pages;
 
+/* Offlined page list, protected by heap_lock */
+PAGE_LIST_HEAD(page_offlined_list);
+
+/* Broken page list, protected by heap_lock */
+PAGE_LIST_HEAD(page_broken_list);
 /*********************
  * ALLOCATION BITMAP
  *  One bit per page of memory. Bit set => page is allocated.
@@ -421,12 +427,92 @@ static struct page_info *alloc_heap_page
     return pg;
 }
 
+/*
+ * Remove any offlined page in the buddy pointed to by head
+ */
+static int reserve_offlined_page(struct page_info *head)
+{
+    unsigned int node = phys_to_nid(page_to_maddr(head));
+    int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
+    struct page_info *cur_head;
+    int cur_order;
+
+    ASSERT(spin_is_locked(&heap_lock));
+
+    cur_head = head;
+
+    page_list_del(head, &heap(node, zone, head_order));
+
+    while ( cur_head < (head + (1 << head_order)) )
+    {
+        struct page_info *pg;
+        int next_order;
+
+        if (test_bit(_PGC_offlined, &cur_head->count_info))
+        {
+            cur_head++;
+            continue;
+        }
+
+        next_order = cur_order = 0;
+
+        while (cur_order < head_order)
+        {
+            next_order = cur_order + 1;
+
+            if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)))
+                goto merge;
+
+            for (i = (1 << cur_order), pg = cur_head + (1 << cur_order);
+              i < (1 << next_order);
+              i++, pg ++)
+                if (test_bit(_PGC_offlined, &pg->count_info))
+                    break;
+            if (i == ( 1 << next_order))
+            {
+                cur_order = next_order;
+                continue;
+            }
+            else
+            {
+                /*
+                 * We don't need considering merge outside the head_order
+                 */
+merge:
+                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
+                PFN_ORDER(cur_head) = cur_order;
+                cur_head += (1 << cur_order);
+                break;
+            }
+        }
+    }
+
+    for (cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++)
+    {
+        if (!test_bit(_PGC_offlined, &cur_head->count_info))
+            continue;
+
+        avail[node][zone] --;
+
+        map_alloc(page_to_mfn(cur_head), 1);
+
+        if (test_bit(_PGC_broken, &cur_head->count_info))
+            page_list_add_tail(cur_head, &page_broken_list);
+        else
+            page_list_add_tail(cur_head, &page_offlined_list);
+
+        count ++;
+    }
+
+    return count;
+}
+
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
     struct page_info *pg, unsigned int order)
 {
     unsigned long mask;
-    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
+    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
     unsigned int zone = page_to_zone(pg);
 
     ASSERT(order <= MAX_ORDER);
@@ -446,7 +532,14 @@ static void free_heap_pages(
          *     in its pseudophysical address space).
          * In all the above cases there can be no guest mappings of this page.
          */
-        pg[i].count_info = 0;
+        ASSERT(!(pg[i].count_info & PGC_offlined));
+        pg[i].count_info &= PGC_offlining | PGC_broken;
+        if (pg[i].count_info & PGC_offlining)
+        {
+            pg[i].count_info &= ~PGC_offlining;
+            pg[i].count_info |= PGC_offlined;
+            tainted = 1;
+        }
 
         /* If a page has no owner it will need no safety TLB flush. */
         pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
@@ -481,7 +574,7 @@ static void free_heap_pages(
                 break;
             page_list_del(pg + mask, &heap(node, zone, order));
         }
-        
+
         order++;
 
         /* After merging, pg should remain in the same node. */
@@ -491,7 +584,249 @@ static void free_heap_pages(
     PFN_ORDER(pg) = order;
     page_list_add_tail(pg, &heap(node, zone, order));
 
+    if (tainted)
+        reserve_offlined_page(pg);
+
     spin_unlock(&heap_lock);
+}
+
+
+/*
+ * Following possible status for a page:
+ * free and Online; free and offlined; free and offlined and broken;
+ * assigned and online; assigned and offlining; assigned and offling and broken
+ *
+ * Following rules applied for page offline:
+ * Once a page is broken, it can't be assigned anymore
+ * A page will be offlined only if it is free
+ * return original count_info
+ *
+ */
+static unsigned long mark_page_offline(struct page_info *pg, int broken)
+{
+    unsigned long nx, x, y = pg->count_info;
+
+    ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
+    /*
+     * Caller guarantees the page will not be reassigned during this process
+     */
+    ASSERT(spin_is_locked(&heap_lock));
+
+    do {
+        nx = x = y;
+
+        if ( ((x & PGC_offlined_broken) == PGC_offlined_broken) )
+            return y;
+        /* PGC_offlined means it is free pages */
+        if (x & PGC_offlined)
+        {
+            if (broken && !(nx & PGC_broken))
+                nx |= PGC_broken;
+            else
+                return y;
+        }
+        /* It is not offlined, not reserved page */
+        else if ( allocated_in_map(page_to_mfn(pg)) )
+            nx |= PGC_offlining;
+        else
+            nx |= PGC_offlined;
+
+        if (broken)
+            nx |= PGC_broken;
+    } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
+
+    return y;
+}
+
+static int reserve_heap_page(struct page_info *pg)
+{
+    struct page_info *head = NULL;
+    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
+    unsigned int zone = page_to_zone(pg);
+
+    /* get the header */
+    for ( i = 0; i <= MAX_ORDER; i++ )
+    {
+        struct page_info *tmp;
+
+        if ( page_list_empty(&heap(node, zone, i)) )
+            continue;
+
+        page_list_for_each_safe(head, tmp, &heap(node, zone, i))
+        {
+            if ( (head <= pg) &&
+                 (head + (1UL << i) > pg) )
+                return reserve_offlined_page(head);
+        }
+    }
+
+    return -EINVAL;
+
+}
+
+/*
+ * offline one page
+ */
+int offline_page(unsigned long mfn, int broken, uint32_t *status)
+{
+    unsigned long old_info = 0;
+    struct domain *owner;
+    int ret = 0;
+    struct page_info *pg;
+
+    if (mfn > max_page)
+    {
+        dprintk(XENLOG_WARNING,
+                "try to offline page out of range %lx\n", mfn);
+        return -EINVAL;
+    }
+
+    *status = 0;
+    pg = mfn_to_page(mfn);
+
+
+#if defined(__x86_64__)
+     /* Xen's txt mfn in x86_64 is reserved in e820 */
+    if ( is_xen_fixed_mfn(mfn) )
+#elif defined(__i386__)
+    if ( is_xen_heap_mfn(mfn) )
+#endif
+    {
+        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
+          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+        return -EPERM;
+    }
+
+    /*
+     * N.B. xen's txt in x86_64 is marked reserved and handled already
+     *  Also kexec range is reserved
+     */
+     if (!page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL))
+     {
+        *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
+        return -EINVAL;
+     }
+
+    spin_lock(&heap_lock);
+
+    old_info = mark_page_offline(pg, broken);
+
+    if ( !allocated_in_map(mfn) )
+    {
+        /* Free pages are reserved directly */
+        reserve_heap_page(pg);
+        *status = PG_OFFLINE_OFFLINED;
+    }
+    else if (test_bit(_PGC_offlined, &pg->count_info))
+    {
+        *status = PG_OFFLINE_OFFLINED;
+    }
+    else if ((owner = page_get_owner_and_reference(pg)))
+    {
+            *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
+              (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
+            /* Release the reference since it will not be allocated anymore */
+            put_page(pg);
+    }
+    else if ( old_info & PGC_xen_heap)
+    {
+        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
+          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+    }
+    else
+    {
+        /*
+         * assign_pages does not hold heap_lock, so small window that the owner
+         * may be set later, but please notice owner will only change from
+         * NULL to be set, not verse, since page is offlining now.
+         * No windows If called from #MC handler, since all CPU are in softirq
+         * If called from user space like CE handling, tools can wait some time
+         * before call again.
+         */
+        *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
+                  (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
+    }
+
+    if (broken)
+        *status |= PG_OFFLINE_BROKEN;
+
+    spin_unlock(&heap_lock);
+
+    return ret;
+}
+
+/*
+ * Online the memory.
+ *   The caller should make sure end_pfn <= max_page,
+ *   if not, expand_pages() should be called prior to online_page().
+ */
+unsigned int online_page(unsigned long mfn, uint32_t *status)
+{
+    struct page_info *pg;
+    int ret = 0, free = 0;
+
+    if ( mfn > max_page )
+    {
+        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+        return -EINVAL;
+    }
+
+    pg = mfn_to_page(mfn);
+
+    *status = 0;
+
+    spin_lock(&heap_lock);
+
+    if ( unlikely(is_page_broken(pg)) )
+    {
+        ret = -EINVAL;
+        *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
+    }
+    else if (pg->count_info & PGC_offlined)
+    {
+        clear_bit(_PGC_offlined, &pg->count_info);
+        page_list_del(pg, &page_offlined_list);
+        *status = PG_ONLINE_ONLINED;
+        free = 1;
+    }
+    else if (pg->count_info & PGC_offlining)
+    {
+        clear_bit(_PGC_offlining, &pg->count_info);
+        *status = PG_ONLINE_ONLINED;
+    }
+    spin_unlock(&heap_lock);
+
+    if (free)
+        free_heap_pages(pg, 0);
+
+    return ret;
+}
+
+int query_page_offline(unsigned long mfn, uint32_t *status)
+{
+    struct page_info *pg;
+
+    if ( (mfn > max_page) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
+    {
+        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+        return -EINVAL;
+    }
+
+    *status = 0;
+    spin_lock(&heap_lock);
+
+    pg = mfn_to_page(mfn);
+
+    if (pg->count_info & PGC_offlining)
+        *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
+    if (pg->count_info & PGC_broken)
+        *status |= PG_OFFLINE_STATUS_BROKEN;
+    if (pg->count_info & PGC_offlined)
+        *status |= PG_OFFLINE_STATUS_OFFLINED;
+
+    spin_unlock(&heap_lock);
+
+    return 0;
 }
 
 /*
diff -r f1080b20cd15 -r dd489125a2e7 xen/common/sysctl.c
--- a/xen/common/sysctl.c       Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/common/sysctl.c       Fri Mar 06 19:18:39 2009 +0000
@@ -233,6 +233,61 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
     }
     break;
 
+    case XEN_SYSCTL_page_offline_op:
+    {
+        uint32_t *status, *ptr;
+        unsigned long pfn;
+
+        ptr = status = xmalloc_bytes( sizeof(uint32_t) *
+                                (op->u.page_offline.end -
+                                  op->u.page_offline.start + 1));
+        if (!status)
+        {
+            dprintk(XENLOG_WARNING, "Out of memory for page offline op\n");
+            ret = -ENOMEM;
+            break;
+        }
+
+        memset(status, PG_OFFLINE_INVALID, sizeof(uint32_t) *
+                      (op->u.page_offline.end - op->u.page_offline.start + 1));
+
+        for ( pfn = op->u.page_offline.start;
+              pfn <= op->u.page_offline.end;
+              pfn ++ )
+        {
+            switch (op->u.page_offline.cmd)
+            {
+                /* Shall revert her if failed, or leave caller do it? */
+                case sysctl_page_offline:
+                    ret = offline_page(pfn, 0, ptr++);
+                    break;
+                case sysctl_page_online:
+                    ret = online_page(pfn, ptr++);
+                    break;
+                case sysctl_query_page_offline:
+                    ret = query_page_offline(pfn, ptr++);
+                    break;
+                default:
+                    gdprintk(XENLOG_WARNING, "invalid page offline op %x\n",
+                            op->u.page_offline.cmd);
+                    ret = -EINVAL;
+                    break;
+            }
+
+            if (ret)
+                break;
+        }
+
+        if (copy_to_guest(op->u.page_offline.status, status,
+                          op->u.page_offline.end - op->u.page_offline.start + 1))
+        {
+            ret = -EFAULT;
+            break;
+        }
+        xfree(status);
+    }
+    break;
+
     default:
         ret = arch_do_sysctl(op, u_sysctl);
         break;
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/asm-x86/mm.h  Fri Mar 06 19:18:39 2009 +0000
@@ -198,8 +198,25 @@ struct page_info
  /* 3-bit PAT/PCD/PWT cache-attribute hint. */
 #define PGC_cacheattr_base PG_shift(6)
 #define PGC_cacheattr_mask PG_mask(7, 6)
+
+  /* Page is broken? */
+ #define _PGC_broken         PG_shift(7)
+ #define PGC_broken          PG_mask(1, 7)
+  /* Page is offline pending ? */
+ #define _PGC_offlining      PG_shift(8)
+ #define PGC_offlining       PG_mask(1, 8)
+  /* Page is offlined */
+ #define _PGC_offlined       PG_shift(9)
+ #define PGC_offlined        PG_mask(1, 9)
+ #define PGC_offlined_broken (PGC_offlined | PGC_broken)
+
+ #define is_page_offlining(page)          ((page)->count_info & PGC_offlining)
+ #define is_page_offlined(page)          ((page)->count_info & PGC_offlined)
+ #define is_page_broken(page)           ((page)->count_info & PGC_broken)
+ #define is_page_online(page)           (!is_page_offlined(page))
+
  /* Count of references to this frame. */
-#define PGC_count_width   PG_shift(6)
+#define PGC_count_width   PG_shift(9)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
 #if defined(__i386__)
@@ -209,9 +226,13 @@ struct page_info
     (_mfn < paddr_to_pfn(xenheap_phys_end));            \
 })
 #else
+extern unsigned long allocator_bitmap_end;
 #define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap)
 #define is_xen_heap_mfn(mfn) \
     (__mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn)))
+#define is_xen_fixed_mfn(mfn) \
+    ( (mfn << PAGE_SHIFT) >= __pa(&_start) &&    \
+          (mfn << PAGE_SHIFT) <= allocator_bitmap_end )
 #endif
 
 #if defined(__i386__)
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h       Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/public/sysctl.h       Fri Mar 06 19:18:39 2009 +0000
@@ -359,6 +359,54 @@ struct xen_sysctl_pm_op {
     };
 };
 
+#define XEN_SYSCTL_page_offline_op        14
+struct xen_sysctl_page_offline_op {
+    /* IN: range of page to be offlined */
+#define sysctl_page_offline     1
+#define sysctl_page_online      2
+#define sysctl_query_page_offline  3
+    uint32_t cmd;
+    uint32_t start;
+    uint32_t end;
+    /* OUT: result of page offline request */
+    /*
+     * bit 0~15: result flags
+     * bit 16~31: owner
+     */
+    XEN_GUEST_HANDLE(uint32) status;
+};
+
+#define PG_OFFLINE_STATUS_MASK    (0xFFUL)
+
+/* The result is invalid, i.e. HV does not handle it */
+#define PG_OFFLINE_INVALID   (0x1UL << 0)
+
+#define PG_OFFLINE_OFFLINED  (0x1UL << 1)
+#define PG_OFFLINE_PENDING   (0x1UL << 2)
+#define PG_OFFLINE_FAILED    (0x1UL << 3)
+
+#define PG_ONLINE_FAILED     PG_OFFLINE_FAILED
+#define PG_ONLINE_ONLINED    PG_OFFLINE_OFFLINED
+
+#define PG_OFFLINE_STATUS_OFFLINED              (0x1UL << 1)
+#define PG_OFFLINE_STATUS_ONLINE                (0x1UL << 2)
+#define PG_OFFLINE_STATUS_OFFLINE_PENDING       (0x1UL << 3)
+#define PG_OFFLINE_STATUS_BROKEN                (0x1UL << 4)
+
+#define PG_OFFLINE_MISC_MASK    (0xFFUL << 4)
+
+/* only valid when PG_OFFLINE_FAILED */
+#define PG_OFFLINE_XENPAGE   (0x1UL << 8)
+#define PG_OFFLINE_DOM0PAGE  (0x1UL << 9)
+#define PG_OFFLINE_ANONYMOUS (0x1UL << 10)
+#define PG_OFFLINE_NOT_CONV_RAM   (0x1UL << 11)
+#define PG_OFFLINE_OWNED     (0x1UL << 12)
+
+#define PG_OFFLINE_BROKEN    (0x1UL << 13)
+#define PG_ONLINE_BROKEN     PG_OFFLINE_BROKEN
+
+#define PG_OFFLINE_OWNER_SHIFT 16
+
 struct xen_sysctl {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
@@ -375,6 +423,7 @@ struct xen_sysctl {
         struct xen_sysctl_get_pmstat        get_pmstat;
         struct xen_sysctl_cpu_hotplug       cpu_hotplug;
         struct xen_sysctl_pm_op             pm_op;
+        struct xen_sysctl_page_offline_op   page_offline;
         uint8_t                             pad[128];
     } u;
 };
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/xen.h
--- a/xen/include/public/xen.h  Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/public/xen.h  Fri Mar 06 19:18:39 2009 +0000
@@ -354,6 +354,9 @@ typedef uint16_t domid_t;
  */
 #define DOMID_XEN  (0x7FF2U)
 
+/* DOMID_INVALID is used to identify an invalid domid */
+#define DOMID_INVALID (0x7FFFU)
+
 /*
  * Send an array of these to HYPERVISOR_mmu_update().
  * NB. The fields are natural pointer/address size for this architecture.
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/xen/mm.h      Fri Mar 06 19:18:39 2009 +0000
@@ -60,6 +60,9 @@ unsigned long avail_domheap_pages(void);
 unsigned long avail_domheap_pages(void);
 #define alloc_domheap_page(d,f) (alloc_domheap_pages(d,0,f))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
+unsigned int online_page(unsigned long mfn, uint32_t *status);
+int offline_page(unsigned long mfn, int broken, uint32_t *status);
+int query_page_offline(unsigned long mfn, uint32_t *status);
 
 void scrub_heap_pages(void);
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] Page offline support in Xen side, Xen patchbot-unstable <=