[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] - fix/improve error handling for failed suspend/migrate



Error handling for a failed suspend/migrate has been broken since
cset 16964:5d84464dc1fc.
Also deal better with very early errors (close the sender-side socket).

Signed-off-by: Steven Hand <steven.hand@xxxxxxxxxxxx>


diff -r b0d7780794eb tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Thu May 08 13:40:40 2008 +0100
+++ b/tools/python/xen/xend/XendCheckpoint.py   Thu May 08 14:08:39 2008 +0100
@@ -81,8 +81,6 @@ def save(fd, dominfo, network, live, dst
     # thing is useful for debugging.
     dominfo.setName('migrating-' + domain_name)
 
-    done_suspend = 0
-
     try:
         dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP1, domain_name)
 
@@ -110,7 +108,6 @@ def save(fd, dominfo, network, live, dst
                 log.debug("Suspending %d ...", dominfo.getDomid())
                 dominfo.shutdown('suspend')
                 dominfo.waitForShutdown()
-                done_suspend = 1
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP2,
                                        domain_name)
                 log.info("Domain %d suspended.", dominfo.getDomid())
@@ -154,16 +151,9 @@ def save(fd, dominfo, network, live, dst
             pass
 
     except Exception, exn:
-        log.exception("Save failed on domain %s (%s).", domain_name,
+        log.exception("Save failed on domain %s (%s) - resuming.", domain_name,
                       dominfo.getDomid())
-        
-        # If we didn't get as far as suspending the domain (for
-        # example, we couldn't balloon enough memory for the new
-        # domain), then we don't want to re-plumb the devices, as the
-        # domU will not be expecting it.
-        if done_suspend:
-            log.debug("XendCheckpoint.save: resumeDomain")
-            dominfo.resumeDomain()
+        dominfo.resumeDomain()
  
         try:
             dominfo.setName(domain_name)
diff -r b0d7780794eb tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Thu May 08 13:40:40 2008 +0100
+++ b/tools/python/xen/xend/XendDomain.py       Thu May 08 14:05:56 2008 +0100
@@ -1308,8 +1308,10 @@ class XendDomain:
 
         sock.send("receive\n")
         sock.recv(80)
-        XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node)
-        sock.close()
+        try:
+            XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node)
+        finally:
+            sock.close()
 
     def domain_save(self, domid, dst, checkpoint=False):
         """Start saving a domain to file.
diff -r b0d7780794eb tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Thu May 08 13:40:40 2008 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py   Thu May 08 14:07:20 2008 +0100
@@ -2378,8 +2378,19 @@ class XendDomainInfo:
     def resumeDomain(self):
         log.debug("XendDomainInfo.resumeDomain(%s)", str(self.domid))
 
-        if self.domid is None:
+        # resume a suspended domain (e.g. after live checkpoint, or after
+        # a later error during save or migrate); checks that the domain
+        # is currently suspended first so safe to call from anywhere
+
+        xeninfo = dom_get(self.domid)
+        if xeninfo is None: 
             return
+        if not xeninfo['shutdown']:
+            return
+        reason = shutdown_reason(xeninfo['shutdown_reason'])
+        if reason != 'suspend':
+            return
+
         try:
             # could also fetch a parsed note from xenstore
             fast = self.info.get_notes().get('SUSPEND_CANCEL') and 1 or 0

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.