WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [Xen-devel] xl: pci completion error

On Wed, 2010-10-06 at 14:54 +0100, Stefano Stabellini wrote:
> On Wed, 6 Oct 2010, Sergey Tovpeko wrote:
> > The guest is Windows XP sp2 32 bit.
> > 
> > When I do pci hot-unplug on the fly
> > xl pci-detach 11:0b.0
> > the guest detaches the pci card correctly with 'pci-removed' signal, but 
> > on the shutdown, it doesn't.
> > 
> > In the code the flow is the following:
> > qemu-dm receives the 'pci-rem' command and sends the SCI interrupt to 
> > the guest. I think guest should catch this interrupt and write to the 
> > magic ACPI port that causes qemu-dm send 'pci-removed' signal.
> > 
> > On guest shutdown,  libxl sends 'pci-rem' command to qemu-dm and waits 
> > for the answer.
> > I suppose, by the time 'pci-rem' signal my guest have finished its work, 
> > and doesn't react on SCI interrupt. As a result libxl doesn't receive 
> > pci-removed signal.
> > 
> > It's my observation.
> > 
> 
> We could probably benefit from a boolean "force" parameter to do_pci_remove
> and libxl_device_pci_remove, that would cause do_pci_remove not to
> return if libxl__wait_for_device_model returns error.
> Of course libxl_device_pci_remove would be called with force=1 by
> libxl_device_pci_shutdown.
> Gianni, what do you think?

Sergeys analysis sounds very plausible to me actually. The co-ordination
required between qemu and libxl for PCI passthrough is very complicated
and one ought to have fairly low confidence in it's correctness :P

Below is a less hacky version of the patch I just sent. Stefano, please
consider this for inclusion.

---
xl: Implement PCI passthrough force removal

This fixes two errors with removing PCI devices from HVM domains. The
first error is that the handling of "pci-rem" device-model command is
erroneously implemented in qemu and difficult (impossible?) to get
right.

For example, during domain shutdown there can be a race where the guest
OS unloads it's drivers and perhaps even shuts down PCI subsystem before
the pci-rem command has been received by qemu. This means that no OS is
present to write to the port which causes the dm command to be
acknowledged.

We fix this by implementing a 'force removal' option to
libxl_device_pci_remove which is always set to 1 during guest shutdown.
It can be optionally enabled on the xl command line for other occasions.

The second error is that if a guest OS doesn't respond to the SCI
interrupt and therefore the pci-rem dm command, which can happen if the
guest OS has no ACPI PCI hotplug support, then device removal bails with
an error but only AFTER removing the device from xenstore. This means
that xenstore gets in to an inconsistent state where an assigned device
also appears to be assignable.

This is fixed by moving xenstore device removal to occur only after the
device has really been removed.

Signed-off-by: Gianni Tedesco <gianni.tedesco@xxxxxxxxxx>

diff -r 02e199c96ece tools/libxl/libxl.h
--- a/tools/libxl/libxl.h       Wed Oct 06 11:00:19 2010 +0100
+++ b/tools/libxl/libxl.h       Wed Oct 06 15:29:12 2010 +0100
@@ -406,7 +406,7 @@ int libxl_device_vfb_clean_shutdown(libx
 int libxl_device_vfb_hard_shutdown(libxl_ctx *ctx, uint32_t domid);
 
 int libxl_device_pci_add(libxl_ctx *ctx, uint32_t domid, libxl_device_pci 
*pcidev);
-int libxl_device_pci_remove(libxl_ctx *ctx, uint32_t domid, libxl_device_pci 
*pcidev);
+int libxl_device_pci_remove(libxl_ctx *ctx, uint32_t domid, libxl_device_pci 
*pcidev, int force);
 int libxl_device_pci_shutdown(libxl_ctx *ctx, uint32_t domid);
 int libxl_device_pci_list_assigned(libxl_ctx *ctx, libxl_device_pci **list, 
uint32_t domid, int *num);
 int libxl_device_pci_list_assignable(libxl_ctx *ctx, libxl_device_pci **list, 
int *num);
diff -r 02e199c96ece tools/libxl/libxl_pci.c
--- a/tools/libxl/libxl_pci.c   Wed Oct 06 11:00:19 2010 +0100
+++ b/tools/libxl/libxl_pci.c   Wed Oct 06 15:29:12 2010 +0100
@@ -841,7 +841,8 @@ out:
     return rc;
 }
 
-static int do_pci_remove(libxl__gc *gc, uint32_t domid, libxl_device_pci 
*pcidev)
+static int do_pci_remove(libxl__gc *gc, uint32_t domid,
+                         libxl_device_pci *pcidev, int force)
 {
     libxl_ctx *ctx = libxl__gc_owner(gc);
     libxl_device_pci *assigned;
@@ -858,8 +859,6 @@ static int do_pci_remove(libxl__gc *gc, 
         }
     }
 
-    libxl_device_pci_remove_xenstore(gc, domid, pcidev);
-
     hvm = libxl__domain_is_hvm(ctx, domid);
     if (hvm) {
         if (libxl__wait_for_device_model(ctx, domid, "running", NULL, NULL) < 
0) {
@@ -878,7 +877,12 @@ static int do_pci_remove(libxl__gc *gc, 
             xs_write(ctx->xsh, XBT_NULL, path, "pci-rem", strlen("pci-rem"));
             if (libxl__wait_for_device_model(ctx, domid, "pci-removed", NULL, 
NULL) < 0) {
                 LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "Device Model didn't respond 
in time");
-                return ERROR_FAIL;
+                /* This depends on guest operating system acknowledging the
+                 * SCI, if it doesn't respond in time then we may wish to 
+                 * force the removal.
+                 */
+                if ( !force )
+                    return ERROR_FAIL;
             }
         }
         path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/state", 
domid);
@@ -924,7 +928,7 @@ skip1:
         if ((fscanf(f, "%u", &irq) == 1) && irq) {
             rc = xc_physdev_unmap_pirq(ctx->xch, domid, irq);
             if (rc < 0) {
-                LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc, 
"xc_physdev_map_pirq irq=%d", irq);
+                LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc, 
"xc_physdev_unmap_pirq irq=%d", irq);
             }
             rc = xc_domain_irq_permission(ctx->xch, domid, irq, 0);
             if (rc < 0) {
@@ -948,13 +952,16 @@ out:
     stubdomid = libxl_get_stubdom_id(ctx, domid);
     if (stubdomid != 0) {
         libxl_device_pci pcidev_s = *pcidev;
-        libxl_device_pci_remove(ctx, stubdomid, &pcidev_s);
+        libxl_device_pci_remove(ctx, stubdomid, &pcidev_s, force);
     }
 
+    libxl_device_pci_remove_xenstore(gc, domid, pcidev);
+
     return 0;
 }
 
-int libxl_device_pci_remove(libxl_ctx *ctx, uint32_t domid, libxl_device_pci 
*pcidev)
+int libxl_device_pci_remove(libxl_ctx *ctx, uint32_t domid,
+                            libxl_device_pci *pcidev, int force)
 {
     libxl__gc gc = LIBXL_INIT_GC(ctx);
     unsigned int orig_vdev, pfunc_mask;
@@ -980,7 +987,7 @@ int libxl_device_pci_remove(libxl_ctx *c
             }else{
                 pcidev->vdevfn = orig_vdev;
             }
-            if ( do_pci_remove(&gc, domid, pcidev) )
+            if ( do_pci_remove(&gc, domid, pcidev, force) )
                 rc = ERROR_FAIL;
         }
     }
@@ -1049,7 +1056,11 @@ int libxl_device_pci_shutdown(libxl_ctx 
     if ( rc )
         return rc;
     for (i = 0; i < num; i++) {
-        if (libxl_device_pci_remove(ctx, domid, pcidevs + i) < 0)
+        /* Force remove on shutdown since, on HVM, qemu will not always
+         * respond to SCI interrupt because the guest kernel has shut down the
+         * devices by the time we even get here!
+         */
+        if (libxl_device_pci_remove(ctx, domid, pcidevs + i, 1) < 0)
             return ERROR_FAIL;
     }
     free(pcidevs);
diff -r 02e199c96ece tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c  Wed Oct 06 11:00:19 2010 +0100
+++ b/tools/libxl/xl_cmdimpl.c  Wed Oct 06 15:29:12 2010 +0100
@@ -2262,7 +2262,7 @@ int main_pcilist(int argc, char **argv)
     return 0;
 }
 
-static void pcidetach(char *dom, char *bdf)
+static void pcidetach(char *dom, char *bdf, int force)
 {
     libxl_device_pci pcidev;
 
@@ -2273,20 +2273,24 @@ static void pcidetach(char *dom, char *b
         fprintf(stderr, "pci-detach: malformed BDF specification \"%s\"\n", 
bdf);
         exit(2);
     }
-    libxl_device_pci_remove(&ctx, domid, &pcidev);
+    libxl_device_pci_remove(&ctx, domid, &pcidev, force);
     libxl_device_pci_destroy(&pcidev);
 }
 
 int main_pcidetach(int argc, char **argv)
 {
     int opt;
+    int force = 0;
     char *domname = NULL, *bdf = NULL;
 
-    while ((opt = getopt(argc, argv, "h")) != -1) {
+    while ((opt = getopt(argc, argv, "hf")) != -1) {
         switch (opt) {
         case 'h':
             help("pci-detach");
             return 0;
+        case 'f':
+            force = 1;
+            break;
         default:
             fprintf(stderr, "option not supported\n");
             break;
@@ -2300,7 +2304,7 @@ int main_pcidetach(int argc, char **argv
     domname = argv[optind];
     bdf = argv[optind + 1];
 
-    pcidetach(domname, bdf);
+    pcidetach(domname, bdf, force);
     return 0;
 }
 static void pciattach(char *dom, char *bdf, char *vs)



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel