On Fri, Dec 21, 2007 at 03:08:03PM +0000, Daniel P. Berrange wrote:
> > This is the cause of the "reboot loop" xend failures I reported earlier.
> > Note that I've only tested this patch against 3.1, however the code,
> > and the fix, is the same in unstable. I realise it's late in the cycle
> > but this bug is bad enough to need fixing IMHO.
>
> I think the entire code segment from the point at which xstransact() is
> created needs to be in a try...finally block to be safe against the
> code throwing exceptions, otherwise you could very occasionally get the
> final xs.abort() being missed in error conditions.
OK, how about this one? (I'm starting testing it now)
regards
john
# HG changeset patch
# User john.levon@xxxxxxx
# Date 1198250774 28800
# Node ID c847f62cbad09ccbf0ba63ee30204228c16d60fb
# Parent 5092403708afc964a908cf1a193f3fc68fb4b950
Fix xend xenstore handling.
xend can get into a situation where two processes are attempting to
interact with the xenstore socket, with disastrous results. Fix the two
bad users of xstransact, add a big warning, and fix the destructor so
future mistakes will be detected earlier.
Signed-off-by: John Levon <john.levon@xxxxxxx>
diff --git a/tools/python/xen/xend/XendDomainInfo.py
b/tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -1529,18 +1529,19 @@ class XendDomainInfo:
log.debug("Releasing devices")
t = xstransact("%s/device" % self.dompath)
- for devclass in XendDevices.valid_devices():
- for dev in t.list(devclass):
- try:
- log.debug("Removing %s", dev);
- self.destroyDevice(devclass, dev, False);
- except:
- # Log and swallow any exceptions in removal --
- # there's nothing more we can do.
+ try:
+ for devclass in XendDevices.valid_devices():
+ for dev in t.list(devclass):
+ try:
+ log.debug("Removing %s", dev);
+ self.destroyDevice(devclass, dev, False);
+ except:
+ # Log and swallow any exceptions in removal --
+ # there's nothing more we can do.
log.exception("Device release failed: %s; %s; %s",
self.info['name_label'], devclass, dev)
-
-
+ finally:
+ t.abort()
def getDeviceController(self, name):
"""Get the device controller for this domain, and if it
@@ -1852,16 +1853,18 @@ class XendDomainInfo:
# build list of phantom devices to be removed after normal devices
plist = []
if self.domid is not None:
- from xen.xend.xenstore.xstransact import xstransact
t = xstransact("%s/device/vbd" % GetDomainPath(self.domid))
- for dev in t.list():
- backend_phantom_vbd =
xstransact.Read("%s/device/vbd/%s/phantom_vbd" \
- % (self.dompath, dev))
- if backend_phantom_vbd is not None:
- frontend_phantom_vbd = xstransact.Read("%s/frontend" \
- % backend_phantom_vbd)
- plist.append(backend_phantom_vbd)
- plist.append(frontend_phantom_vbd)
+ try:
+ for dev in t.list():
+ backend_phantom_vbd =
xstransact.Read("%s/device/vbd/%s/phantom_vbd" \
+ % (self.dompath, dev))
+ if backend_phantom_vbd is not None:
+ frontend_phantom_vbd = xstransact.Read("%s/frontend" \
+ % backend_phantom_vbd)
+ plist.append(backend_phantom_vbd)
+ plist.append(frontend_phantom_vbd)
+ finally:
+ t.abort()
return plist
def _cleanup_phantom_devs(self, plist):
diff --git a/tools/python/xen/xend/server/pciif.py
b/tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py
+++ b/tools/python/xen/xend/server/pciif.py
@@ -22,8 +22,6 @@ from xen.xend import sxp
from xen.xend import sxp
from xen.xend.XendError import VmError
from xen.xend.XendLogging import log
-
-from xen.xend.xenstore.xstransact import xstransact
from xen.xend.server.DevController import DevController
diff --git a/tools/python/xen/xend/xenstore/xstransact.py
b/tools/python/xen/xend/xenstore/xstransact.py
--- a/tools/python/xen/xend/xenstore/xstransact.py
+++ b/tools/python/xen/xend/xenstore/xstransact.py
@@ -7,8 +7,16 @@
from xen.xend.xenstore.xsutil import xshandle
+class xstransact:
+ """WARNING: Be very careful if you're instantiating an xstransact object
+ yourself (i.e. not using the capitalized static helpers like .Read()).
+ It is essential that you clean up the object in place via
+ t.commit/abort(): GC can happen at any time, including contexts where
+ it's not safe to use the shared xenstore socket fd. In particular,
+ if xend forks, and GC occurs, we can have two processes trying to
+ use the same xenstore fd, and all hell breaks loose.
+ """
-class xstransact:
def __init__(self, path = ""):
@@ -22,8 +30,9 @@ class xstransact:
self.in_transaction = True
def __del__(self):
+ # see above.
if self.in_transaction:
- xshandle().transaction_end(self.transaction, True)
+ raise RuntimeError("ERROR: GC of live transaction")
def commit(self):
if not self.in_transaction:
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|