[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC PATCH 6/7] XendCheckpoint: implement colo



In colo mode, XendCheckpoint.py communicates with both the master and
xc_restore. This patch implements that communication. In colo mode,
the guest state file signature is "GuestColoRestore".

Signed-off-by: Ye Wei <wei.ye1987@xxxxxxxxx>
Signed-off-by: Jiang Yunhong <yunhong.jiang@xxxxxxxxx>
Signed-off-by: Wen Congyang <wency@xxxxxxxxxxxxxx>

---
 tools/python/xen/xend/XendCheckpoint.py | 138 +++++++++++++++++++++++---------
 1 file changed, 101 insertions(+), 37 deletions(-)

diff --git a/tools/python/xen/xend/XendCheckpoint.py 
b/tools/python/xen/xend/XendCheckpoint.py
index fa09757..261d9d1 100644
--- a/tools/python/xen/xend/XendCheckpoint.py
+++ b/tools/python/xen/xend/XendCheckpoint.py
@@ -25,6 +25,7 @@ from xen.xend.XendConstants import *
 from xen.xend import XendNode
 
 SIGNATURE = "LinuxGuestRecord"
+COLO_SIGNATURE = "GuestColoRestore"
 QEMU_SIGNATURE = "QemuDeviceModelRecord"
 dm_batch = 512
 XC_SAVE = "xc_save"
@@ -203,10 +204,15 @@ def restore(xd, fd, dominfo = None, paused = False, 
relocating = False):
 
     signature = read_exact(fd, len(SIGNATURE),
         "not a valid guest state file: signature read")
-    if signature != SIGNATURE:
+    if signature != SIGNATURE and signature != COLO_SIGNATURE:
         raise XendError("not a valid guest state file: found '%s'" %
                         signature)
 
+    if signature == COLO_SIGNATURE:
+        colo = True
+    else
+        colo = False
+
     l = read_exact(fd, sizeof_int,
                    "not a valid guest state file: config size read")
     vmconfig_size = unpack("!i", l)[0]
@@ -305,6 +311,7 @@ def restore(xd, fd, dominfo = None, paused = False, 
relocating = False):
         log.debug("[xc_restore]: %s", string.join(cmd))
 
         handler = RestoreInputHandler()
+        restore_handler = RestoreHandler(fd, colo, dominfo, inputHandler)
 
         forkHelper(cmd, fd, handler.handler, True)
 
@@ -321,35 +328,9 @@ def restore(xd, fd, dominfo = None, paused = False, 
relocating = False):
             raise XendError('Could not read store MFN')
 
         if not is_hvm and handler.console_mfn is None:
-            raise XendError('Could not read console MFN')        
-
-        restore_image.setCpuid()
-
-        # xc_restore will wait for source to close connection
-        
-        dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
-
-        #
-        # We shouldn't hold the domains_lock over a waitForDevices
-        # As this function sometime gets called holding this lock,
-        # we must release it and re-acquire it appropriately
-        #
-        from xen.xend import XendDomain
+            raise XendError('Could not read console MFN')
 
-        lock = True;
-        try:
-            XendDomain.instance().domains_lock.release()
-        except:
-            lock = False;
-
-        try:
-            dominfo.waitForDevices() # Wait for backends to set up
-        finally:
-            if lock:
-                XendDomain.instance().domains_lock.acquire()
-
-        if not paused:
-            dominfo.unpause()
+        restorehandler.resume(True, paused, None)
 
         return dominfo
     except Exception, exn:
@@ -358,23 +339,106 @@ def restore(xd, fd, dominfo = None, paused = False, 
relocating = False):
         raise exn
 
 
+class RestoreHandler:
+    def __init__(self, fd, colo, dominfo, inputHandler):
+        self.fd = fd
+        self.colo = colo
+        self.firsttime = True
+        self.inputHandler = inputHandler
+        self.dominfo = dominfo
+
+    def resume(self, finish, paused, child):
+        fd = self.fd
+        dominfo = self.dominfo
+        handler = self.inputHandler
+        restore_image.setCpuid()
+        dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
+
+        if self.colo and not finish:
+            # notify master that checkpoint finishes
+            write_exact(fd, "finish", "failed to write finish done")
+            buf = read_exact(fd, 6, "failed to read resume flag")
+            if buf != "resume":
+                return False
+
+        from xen.xend import XendDomain
+
+        if self.firsttime:
+            lock = True;
+            try:
+                XendDomain.instance().domains_lock.release()
+            except:
+                lock = False;
+
+            try:
+                dominfo.waitForDevices() # Wait for backends to set up
+            finally:
+                if lock:
+                    XendDomain.instance().domains_lock.acquire()
+            if not paused:
+                dominfo.unpause()
+        else:
+            # colo
+            xc.domain_resume(dominfo.domid, 0)
+            ResumeDomain(dominfo.domid)
+
+        if self.colo and not finish:
+            child.tochild.write("resume\n")
+            child.tochild.flush()
+            buf = child.fromchild.readline()
+            if buf != "resume\n":
+                return False
+            if self.firsttime:
+                util.runcmd("/etc/xen/scripts/HA_fw_runtime.sh slaver")
+            # notify master side VM resumed
+            write_exact(fd, "resume", "failed to write resume done");
+
+            # wait new checkpoint
+            buf = read_exact(fd, 8, "failed to read continue flag")
+            if buf != "continue":
+                return False
+
+            child.tochild.write("suspend\n")
+            buf = child.fromchild.readline()
+            if buf != "suspend\n":
+                return False
+
+            # notify master side suspend done.
+            write_exact(fd, "suspend", "failed to write suspend done")
+            buf = read_exact(fd, 5, "failed to read start flag")
+            if buf != "start":
+                return False
+
+            child.tochild.write("start\n")
+            child.tochild.flush()
+
+            self.firsttime = False
+
 class RestoreInputHandler:
-    def __init__(self):
+    def __init__(self, colo):
         self.store_mfn = None
         self.console_mfn = None
 
 
-    def handler(self, line, _):
+    def handler(self, line, child, restorehandler):
+        if line == "finish\n":
+            # colo
+            return restorehandler.resume(False, False, child)
+
         m = re.match(r"^(store-mfn) (\d+)$", line)
         if m:
             self.store_mfn = int(m.group(2))
-        else:
-            m = re.match(r"^(console-mfn) (\d+)$", line)
-            if m:
-                self.console_mfn = int(m.group(2))
+            return True
+
+        m = re.match(r"^(console-mfn) (\d+)$", line)
+        if m:
+            self.console_mfn = int(m.group(2))
+            return True
+
+        return False
 
 
-def forkHelper(cmd, fd, inputHandler, closeToChild):
+def forkHelper(cmd, fd, inputHandler, closeToChild, restorehandler):
     child = xPopen3(cmd, True, -1, [fd])
 
     if closeToChild:
@@ -392,7 +456,7 @@ def forkHelper(cmd, fd, inputHandler, closeToChild):
                 else:
                     line = line.rstrip()
                     log.debug('%s', line)
-                    inputHandler(line, child.tochild)
+                    inputHandler(line, child, restorehandler)
 
         except IOError, exn:
             raise XendError('Error reading from child process for %s: %s' %
-- 
1.8.0


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.