
Re: [Xen-devel] [PATCH] Fix xend xenstore handling



On Fri, Dec 21, 2007 at 03:08:03PM +0000, Daniel P. Berrange wrote:

> > This is the cause of the "reboot loop" xend failures I reported earlier.
> > Note that I've only tested this patch against 3.1; however, the code,
> > and the fix, is the same in unstable. I realise it's late in the cycle
> > but this bug is bad enough to need fixing IMHO.
> 
> I think the entire code segment from the point at which xstransact() is
> created needs to be in a try...finally block to be safe against the
> code throwing exceptions; otherwise you could very occasionally get the
> final xs.abort() being missed in error conditions.
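
To spell out the pattern (a sketch only; the real fix is in the patch
below), the point is that abort() runs no matter how the block exits:

    t = xstransact("%s/device" % self.dompath)
    try:
        ...                      # anything in here may raise
    finally:
        t.abort()                # always end the transaction

(Both of the fixed paths below are read-only, so aborting unconditionally
in the finally arm is correct.)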

OK, how about this one? (I'm starting to test it now)

regards
john

# HG changeset patch
# User john.levon@xxxxxxx
# Date 1198250774 28800
# Node ID c847f62cbad09ccbf0ba63ee30204228c16d60fb
# Parent  5092403708afc964a908cf1a193f3fc68fb4b950
Fix xend xenstore handling.

xend can get into a situation where two processes are attempting to
interact with the xenstore socket, with disastrous results. Fix the two
bad users of xstransact, add a big warning, and fix the destructor so
future mistakes will be detected earlier.

Signed-off-by: John Levon <john.levon@xxxxxxx>

diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -1529,18 +1529,19 @@ class XendDomainInfo:
 
         log.debug("Releasing devices")
         t = xstransact("%s/device" % self.dompath)
-        for devclass in XendDevices.valid_devices():
-            for dev in t.list(devclass):
-                try:
-                    log.debug("Removing %s", dev);
-                    self.destroyDevice(devclass, dev, False);
-                except:
-                    # Log and swallow any exceptions in removal --
-                    # there's nothing more we can do.
+        try:
+            for devclass in XendDevices.valid_devices():
+                for dev in t.list(devclass):
+                    try:
+                        log.debug("Removing %s", dev)
+                        self.destroyDevice(devclass, dev, False)
+                    except:
+                        # Log and swallow any exceptions in removal --
+                        # there's nothing more we can do.
                         log.exception("Device release failed: %s; %s; %s",
                                       self.info['name_label'], devclass, dev)
-
-            
+        finally:
+            t.abort()
 
     def getDeviceController(self, name):
         """Get the device controller for this domain, and if it
@@ -1852,16 +1853,18 @@ class XendDomainInfo:
         # build list of phantom devices to be removed after normal devices
         plist = []
         if self.domid is not None:
-            from xen.xend.xenstore.xstransact import xstransact
             t = xstransact("%s/device/vbd" % GetDomainPath(self.domid))
-            for dev in t.list():
-                backend_phantom_vbd = xstransact.Read("%s/device/vbd/%s/phantom_vbd" \
-                                      % (self.dompath, dev))
-                if backend_phantom_vbd is not None:
-                    frontend_phantom_vbd =  xstransact.Read("%s/frontend" \
-                                      % backend_phantom_vbd)
-                    plist.append(backend_phantom_vbd)
-                    plist.append(frontend_phantom_vbd)
+            try:
+                for dev in t.list():
+                    backend_phantom_vbd = xstransact.Read("%s/device/vbd/%s/phantom_vbd" \
+                                          % (self.dompath, dev))
+                    if backend_phantom_vbd is not None:
+                        frontend_phantom_vbd =  xstransact.Read("%s/frontend" \
+                                          % backend_phantom_vbd)
+                        plist.append(backend_phantom_vbd)
+                        plist.append(frontend_phantom_vbd)
+            finally:
+                t.abort()
         return plist
 
     def _cleanup_phantom_devs(self, plist):
diff --git a/tools/python/xen/xend/server/pciif.py b/tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py
+++ b/tools/python/xen/xend/server/pciif.py
@@ -22,8 +22,6 @@ from xen.xend import sxp
 from xen.xend import sxp
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
-
-from xen.xend.xenstore.xstransact import xstransact
 
 from xen.xend.server.DevController import DevController
 
diff --git a/tools/python/xen/xend/xenstore/xstransact.py b/tools/python/xen/xend/xenstore/xstransact.py
--- a/tools/python/xen/xend/xenstore/xstransact.py
+++ b/tools/python/xen/xend/xenstore/xstransact.py
@@ -7,8 +7,16 @@
 
 from xen.xend.xenstore.xsutil import xshandle
 
+class xstransact:
+    """WARNING: Be very careful if you're instantiating an xstransact object
+       yourself (i.e. not using the capitalized static helpers like .Read()).
+       It is essential that you clean up the object in place via
+       t.commit() or t.abort(): GC can happen at any time, including contexts
+       where it's not safe to use the shared xenstore socket fd. In
+       particular, if xend forks and GC occurs, we can have two processes
+       trying to use the same xenstore fd, and all hell breaks loose.
+       """
 
-class xstransact:
 
     def __init__(self, path = ""):
         
@@ -22,8 +30,9 @@ class xstransact:
         self.in_transaction = True
 
     def __del__(self):
+        # See the class docstring: a live transaction must never be GC'd.
         if self.in_transaction:
-            xshandle().transaction_end(self.transaction, True)
+            raise RuntimeError("ERROR: GC of live transaction")
 
     def commit(self):
         if not self.in_transaction:

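
To make the docstring's warning concrete, here is a minimal standalone
sketch of the failure mode (toy code, not xend's: socketpair() stands in
for the real xenstore connection, and Txn for xstransact):

    import os, socket

    class Txn:
        """Toy stand-in for xstransact: holds a transaction open on a
           shared connection."""
        def __init__(self, sock):
            self.sock = sock            # plays the role of the xenstore fd
            self.in_transaction = True

        def __del__(self):
            # Pre-patch behaviour: quietly end the transaction on GC.
            if self.in_transaction:
                self.sock.sendall(b"TXN_END\n")

    ours, xenstored = socket.socketpair()
    t = Txn(ours)

    if os.fork() == 0:
        # The child inherits the open fd. When its copy of t is collected,
        # __del__ writes TXN_END on the same socket the parent is still
        # using: two processes interleaving on one connection.
        del t
        os._exit(0)

With the patched __del__, the stray collection raises immediately instead
of silently injecting traffic onto the shared connection, which is the
"detected earlier" that the changeset description promises.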
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel