WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] Implement clean return from save/restore

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Implement clean return from save/restore failure (so that original
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 22 Jan 2007 11:00:12 -0800
Delivery-date: Mon, 22 Jan 2007 11:00:14 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1169478932 0
# Node ID 207523704fb15ae92b1852bb7e1f0e739ed01fb3
# Parent  baa9b76ea3e1de27dbe46ba9b3fb117e09637518
Implement clean return from save/restore failure (so that original
domain can continue execution).
Signed-off-by: Andrei Petrov <andrei.petrov@xxxxxxxxxxxxx>
---
 tools/libxc/xc_resume.c                 |  156 +++++++++++++++++++++++++++++---
 tools/libxc/xg_save_restore.h           |    9 -
 tools/python/xen/lowlevel/xc/xc.c       |    4 
 tools/python/xen/xend/XendCheckpoint.py |   24 ++++
 tools/python/xen/xend/XendDomain.py     |    1 
 tools/python/xen/xend/XendDomainInfo.py |   29 +++++
 6 files changed, 200 insertions(+), 23 deletions(-)

diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c   Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/libxc/xc_resume.c   Mon Jan 22 15:15:32 2007 +0000
@@ -1,5 +1,6 @@
 #include "xc_private.h"
-
+#include "xg_private.h"
+#include "xg_save_restore.h"
 
 #if defined(__i386__) || defined(__x86_64__)
 static int modify_returncode(int xc_handle, uint32_t domid)
@@ -22,19 +23,7 @@ static int modify_returncode(int xc_hand
 }
 #endif
 
-
-/*
- * Resume execution of a domain after suspend shutdown.
- * This can happen in one of two ways:
- *  1. Resume with special return code.
- *  2. Reset guest environment so it believes it is resumed in a new
- *     domain context.
- * (2) should be used only for guests which cannot handle the special
- * new return code. (1) is always safe (but slower).
- * 
- * XXX Only (2) is implemented below. We need to use (1) by default!
- */
-int xc_domain_resume(int xc_handle, uint32_t domid)
+static int xc_domain_resume_cooperative(int xc_handle, uint32_t domid)
 {
     DECLARE_DOMCTL;
     int rc;
@@ -50,3 +39,142 @@ int xc_domain_resume(int xc_handle, uint
     domctl.domain = domid;
     return do_domctl(xc_handle, &domctl);
 }
+
+static int xc_domain_resume_any(int xc_handle, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    int i, rc = -1;
+
+    /*
+     * (x86 only) Rewrite store_mfn and console_mfn back to MFN (from PFN).
+     */
+#if defined(__i386__) || defined(__x86_64__)
+    xc_dominfo_t info;
+    unsigned long mfn, max_pfn = 0;
+    vcpu_guest_context_t ctxt;
+    start_info_t *start_info;
+    shared_info_t *shinfo = NULL;
+    xen_pfn_t *p2m_frame_list_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
+    xen_pfn_t *p2m = NULL;
+
+    if ( xc_domain_getinfo(xc_handle, domid, 1, &info) != 1 )
+    {
+        PERROR("Could not get domain info");
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    shinfo = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+                                  PROT_READ, info.shared_info_frame);
+    if ( shinfo == NULL )
+    {
+        ERROR("Couldn't map shared info");
+        goto out;
+    }
+
+    max_pfn = shinfo->arch.max_pfn;
+
+    p2m_frame_list_list =
+        xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ,
+                             shinfo->arch.pfn_to_mfn_frame_list_list);
+    if ( p2m_frame_list_list == NULL )
+    {
+        ERROR("Couldn't map p2m_frame_list_list");
+        goto out;
+    }
+
+    p2m_frame_list = xc_map_foreign_batch(xc_handle, domid, PROT_READ,
+                                          p2m_frame_list_list,
+                                          P2M_FLL_ENTRIES);
+    if ( p2m_frame_list == NULL )
+    {
+        ERROR("Couldn't map p2m_frame_list");
+        goto out;
+    }
+
+    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+       the guest must not change which frames are used for this purpose.
+       (it's not clear why it would want to change them, and we'll be OK
+       from a safety POV anyhow.) */
+    p2m = xc_map_foreign_batch(xc_handle, domid, PROT_READ,
+                               p2m_frame_list,
+                               P2M_FL_ENTRIES);
+    if ( p2m == NULL )
+    {
+        ERROR("Couldn't map p2m table");
+        goto out;
+    }
+
+    if ( lock_pages(&ctxt, sizeof(ctxt)) )
+    {
+        ERROR("Unable to lock ctxt");
+        goto out;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt) )
+    {
+        ERROR("Could not get vcpu context");
+        goto out;
+    }
+
+    mfn = ctxt.user_regs.edx;
+
+    start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+                                      PROT_READ | PROT_WRITE, mfn);
+    if ( start_info == NULL )
+    {
+        ERROR("Couldn't map start_info");
+        goto out;
+    }
+
+    start_info->store_mfn        = p2m[start_info->store_mfn];
+    start_info->console.domU.mfn = p2m[start_info->console.domU.mfn];
+
+    munmap(start_info, PAGE_SIZE);
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+    /* Reset all secondary CPU states. */
+    for ( i = 1; i <= info.max_vcpu_id; i++ )
+        xc_vcpu_setcontext(xc_handle, domid, i, NULL);
+
+    /* Ready to resume domain execution now. */
+    domctl.cmd = XEN_DOMCTL_resumedomain;
+    domctl.domain = domid;
+    rc = do_domctl(xc_handle, &domctl);
+
+#if defined(__i386__) || defined(__x86_64__)
+ out:
+    unlock_pages((void *)&ctxt, sizeof ctxt);
+    if (p2m)
+        munmap(p2m, P2M_FL_ENTRIES*PAGE_SIZE);
+    if (p2m_frame_list)
+        munmap(p2m_frame_list, P2M_FLL_ENTRIES*PAGE_SIZE);
+    if (p2m_frame_list_list)
+        munmap(p2m_frame_list_list, PAGE_SIZE);
+    if (shinfo)
+        munmap(shinfo, PAGE_SIZE);
+#endif
+
+    return rc;
+}
+
+/*
+ * Resume execution of a domain after suspend shutdown.
+ * This can happen in one of two ways:
+ *  1. Resume with special return code.
+ *  2. Reset guest environment so it believes it is resumed in a new
+ *     domain context.
+ * (2) should be used only for guests which cannot handle the special
+ * new return code. (1) is always safe (but slower).
+ */
+int xc_domain_resume(int xc_handle, uint32_t domid)
+{
+    /*
+     * XXX: Implement a way to select between options (1) and (2).
+     * Or expose the options as two different methods to Python.
+     */
+    return (0
+            ? xc_domain_resume_cooperative(xc_handle, domid)
+            : xc_domain_resume_any(xc_handle, domid));
+}
diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/libxc/xg_save_restore.h     Mon Jan 22 15:15:32 2007 +0000
@@ -34,11 +34,10 @@
 **
 ** Returns 1 on success, 0 on failure.
 */
-static int get_platform_info(int xc_handle, uint32_t dom,
-                             /* OUT */ unsigned long *max_mfn,
-                             /* OUT */ unsigned long *hvirt_start,
-                             /* OUT */ unsigned int *pt_levels)
-
+static inline int get_platform_info(int xc_handle, uint32_t dom,
+                                    /* OUT */ unsigned long *max_mfn,
+                                    /* OUT */ unsigned long *hvirt_start,
+                                    /* OUT */ unsigned int *pt_levels)
 {
     xen_capabilities_info_t xen_caps = "";
     xen_platform_parameters_t xen_params;
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 15:15:32 2007 +0000
@@ -1064,9 +1064,9 @@ static PyMethodDef pyxc_methods[] = {
       "Destroy a domain.\n"
       " dom [int]:    Identifier of domain to be destroyed.\n\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
-    
+
     { "domain_resume", 
-      (PyCFunction)pyxc_domain_resume, 
+      (PyCFunction)pyxc_domain_resume,
       METH_VARARGS, "\n"
       "Resume execution of a suspended domain.\n"
       " dom [int]: Identifier of domain to be resumed.\n\n"
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py   Mon Jan 22 15:15:32 2007 +0000
@@ -122,6 +122,8 @@ def save(fd, dominfo, network, live, dst
             os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid())
 
         dominfo.destroyDomain()
+        dominfo.testDeviceComplete()
+
         try:
             dominfo.setName(domain_name)
         except VmError:
@@ -134,11 +136,31 @@ def save(fd, dominfo, network, live, dst
     except Exception, exn:
         log.exception("Save failed on domain %s (%s).", domain_name,
                       dominfo.getDomid())
+
+        dominfo._releaseDevices()
+        dominfo.testDeviceComplete()
+        dominfo.testvifsComplete()
+        log.debug("XendCheckpoint.save: devices released")
+
+        dominfo._resetChannels()
+
+        dominfo._removeDom('control/shutdown')
+        dominfo._removeDom('device-misc/vif/nextDeviceID')
+
+        dominfo._createChannels()
+        dominfo._introduceDomain()
+        dominfo._storeDomDetails()
+
+        dominfo._createDevices()
+        log.debug("XendCheckpoint.save: devices created")
+
+        dominfo.resumeDomain()
+        log.debug("XendCheckpoint.save: resumeDomain")
+
         try:
             dominfo.setName(domain_name)
         except:
             log.exception("Failed to reset the migrating domain's name")
-        raise Exception, exn
 
 
 def restore(xd, fd, dominfo = None, paused = False):
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendDomain.py       Mon Jan 22 15:15:32 2007 +0000
@@ -1166,7 +1166,6 @@ class XendDomain:
         sock.send("receive\n")
         sock.recv(80)
         XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst)
-        dominfo.testDeviceComplete()
         sock.close()
 
     def domain_save(self, domid, dst):
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendDomainInfo.py   Mon Jan 22 15:15:32 2007 +0000
@@ -1580,6 +1580,16 @@ class XendDomainInfo:
             log.exception("Exception in alloc_unbound(%d)", self.domid)
             raise
 
+    def _resetChannels(self):
+        """Reset all event channels in the domain.
+        """
+        try:
+            return xc.evtchn_reset(dom=self.domid)
+        except:
+            log.exception("Exception in evtcnh_reset(%d)", self.domid)
+            raise
+
+
     #
     # Bootloader configuration
     #
@@ -1727,6 +1737,25 @@ class XendDomainInfo:
             test = 0
             diff = time.time() - start
             for i in self.getDeviceController('vbd').deviceIDs():
+                test = 1
+                log.info("Dev %s still active, looping...", i)
+                time.sleep(0.1)
+                
+            if test == 0:
+                break
+            if diff >= MIGRATE_TIMEOUT:
+                log.info("Dev still active but hit max loop timeout")
+                break
+
+    def testvifsComplete(self):
+        """ In case vifs are released and then created for the same
+        domain, we need to wait for the devices to shut down.
+        """
+        start = time.time()
+        while True:
+            test = 0
+            diff = time.time() - start
+            for i in self.getDeviceController('vif').deviceIDs():
                 test = 1
                 log.info("Dev %s still active, looping...", i)
                 time.sleep(0.1)

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] Implement clean return from save/restore failure (so that original, Xen patchbot-unstable <=