[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] Xen-unstable: xen panic RIP: dpci_softirq



On Tue, Nov 18, 2014 at 05:20:44PM +0000, Jan Beulich wrote:
> >>> On 18.11.14 at 18:03, <linux@xxxxxxxxxxxxxx> wrote:
> > Tuesday, November 18, 2014, 5:16:50 PM, you wrote:
> >>  1) test_and_[set|clear]_bit sometimes return unexpected values.
> >>     [But this might be invalid as the addition of the ffff8303faaf25a8
> >>      might be correct - as the second dpci the softirq is processing
> >>      could be the MSI one]
> > 
> > Would there be an easy way to stress test this function separately in some 
> > debugging function to see if it indeed is returning unexpected values ?
> 
> I don't think there's a reasonable chance of these functions to return
> "unexpected" values - they're being used elsewhere, and you'd have
> had other problems in the past if they didn't behave as expected.

Interestingly most of the 'test_and_[set|clear]_bit operate on 'unsigned long'
while we do 'unsigned int' here. But the 'test_and'' are all btXl so double-word
safe.

The fact that Sander is able to get 'test_and_clear_bit(STATE_SCHED)' to return
zero - when in fact it should return a positive value - implies that some actor
is messing with the 'state' outside our raise/softirq dialogue.

pt_irq_guest_eoi does pirq_dpci->state = 0 unconditionally!

Lets see if this debug + potential patch helps. This should be applied
on top of the other patch you had. Just in case I am attaching all four
to this email.

From 8093140e374fceb9121ccd07726fb3898b212bfb Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Date: Tue, 18 Nov 2014 15:08:15 -0500
Subject: [PATCH 4/5] DEBUG4: Add an 'stamp' and potential fix.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
---
 xen/drivers/passthrough/io.c | 57 +++++++++++++++++++++++++++++++-------------
 xen/include/xen/hvm/irq.h    |  1 +
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index b786bd2..8a8fc62 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -26,6 +26,8 @@
 #include <asm/hvm/iommu.h>
 #include <asm/hvm/support.h>
 #include <xen/hvm/irq.h>
+#include <xen/console.h>
+
 
 static DEFINE_PER_CPU(struct list_head, dpci_list);
 
@@ -61,21 +63,29 @@ struct _debug_f {
     struct list_head list;
     unsigned int state;
     struct hvm_pirq_dpci *dpci;
+    unsigned long stamp;
 };
 
 struct __debug {
     struct _debug_f ok;
     struct _debug_f poison;
     struct _debug_f raise;
+    struct _debug_f busy_raise;
     struct _debug_f reset;
     struct _debug_f zombie_softirq;
     struct _debug_f zombie_raise;
+    struct _debug_f timeout;
+    struct _debug_f clear;
 };
 
 static DEFINE_PER_CPU(struct __debug, _d);
 
 void _record(struct _debug_f *d, struct hvm_pirq_dpci *pirq_dpci)
 {
+
+    if (pirq_dpci->pirq)
+        return;
+
     if (pirq_dpci->dom)
         d->domid = pirq_dpci->dom->domain_id;
     else
@@ -86,6 +96,7 @@ void _record(struct _debug_f *d, struct hvm_pirq_dpci 
*pirq_dpci)
     d->list.prev = pirq_dpci->softirq_list.prev;
     d->state = pirq_dpci->state;
     d->dpci = pirq_dpci;
+    d->stamp = pirq_dpci->stamp++;
 }
 
 enum {
@@ -95,6 +106,9 @@ enum {
     OK_SOFTIRQ,
     OK_RAISE,
     OK_RESET,
+    OK_TIMEOUT,
+    OK_BUSY,
+    OK_CLEAR,
 };
 
 static void dump_record(struct _debug_f *d, unsigned int type)
@@ -106,7 +120,11 @@ static void dump_record(struct _debug_f *d, unsigned int 
type)
         [OK_SOFTIRQ] = "OK-softirq",
         [OK_RAISE]   = "OK-raise  ",
         [OK_RESET]   = "OK-reset  ",
+        [OK_TIMEOUT] = "OK-timeout",
+        [OK_BUSY]    = "OK-busy   ",
+        [OK_CLEAR]   = "OK-clear  ",
     };
+#if 0
 #define LONG(x) [_HVM_IRQ_DPCI_##x] = #x
     static const char *const names_flag[] = {
         LONG(MACH_PCI_SHIFT),
@@ -117,6 +135,7 @@ static void dump_record(struct _debug_f *d, unsigned int 
type)
     };
 #undef LONG
     unsigned int i;
+#endif
     s_time_t now;
 
     if ( d->domid == 0 )
@@ -126,20 +145,21 @@ static void dump_record(struct _debug_f *d, unsigned int 
type)
         BUG();
 
     now = NOW();
-    printk("d%d %s %lumsec ago, state:%x, %ld count, [prev:%p, next:%p] %p",
+    printk("d%d %s %lumsec ago, state:%x, %ld count, [prev:%p, next:%p] %p 
%lx",
            d->domid, names[type],
            (unsigned long)((now - d->last) / MILLISECS(1)),
-            d->state, d->count, d->list.prev, d->list.next, d->dpci);
+            d->state, d->count, d->list.prev, d->list.next, d->dpci, d->stamp);
 
     if ( d->dpci )
     {
         struct hvm_pirq_dpci *pirq_dpci = d->dpci;
-
+#if 0
         for ( i = 0; i <= _HVM_IRQ_DPCI_GUEST_MSI_SHIFT; i++ )
             if ( pirq_dpci->flags & (1 << i) )
                 printk("%s ", names_flag[i]);
 
         printk(" PIRQ:%d", pirq_dpci->pirq);
+#endif
         if (pirq_dpci->line)
             printk(" LINE: %d", pirq_dpci->line);
     }
@@ -159,7 +179,10 @@ static void dump_debug(unsigned char key)
         printk("CPU%02d: \n" ,cpu);
         dump_record(&d->ok, OK_SOFTIRQ);
         dump_record(&d->raise, OK_RAISE);
+        dump_record(&d->busy_raise, OK_RAISE);
         dump_record(&d->reset, OK_RESET);
+        dump_record(&d->timeout, OK_TIMEOUT);
+        dump_record(&d->clear, OK_TIMEOUT);
         dump_record(&d->poison, ERR_POISON);
         dump_record(&d->zombie_softirq, Z_SOFTIRQ);
         dump_record(&d->zombie_raise, Z_RAISE);
@@ -184,8 +207,10 @@ static void raise_softirq_for(struct hvm_pirq_dpci 
*pirq_dpci)
         return;
     }
     if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
+    {
+        _record(&d->busy_raise, pirq_dpci);
         return;
-
+    }
     _record(&d->raise, pirq_dpci);
 
     get_knownalive_domain(pirq_dpci->dom);
@@ -264,10 +289,14 @@ bool_t pt_irq_need_timer(uint32_t flags)
 static int pt_irq_guest_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
                             void *arg)
 {
+    struct __debug *debug = &__get_cpu_var(_d);
+
     if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT,
                               &pirq_dpci->flags) )
     {
-        pirq_dpci->state = 0;
+        _record(&debug->clear, pirq_dpci);
+        pt_pirq_softirq_reset(pirq_dpci);
+        /* pirq_dpci->state = 0; <= OUCH! */
         pirq_dpci->pending = 0;
         pirq_guest_eoi(dpci_pirq(pirq_dpci));
     }
@@ -280,11 +309,13 @@ static void pt_irq_time_out(void *data)
     struct hvm_pirq_dpci *irq_map = data;
     const struct hvm_irq_dpci *dpci;
     const struct dev_intx_gsi_link *digl;
-
+    struct __debug *d = &__get_cpu_var(_d);
     spin_lock(&irq_map->dom->event_lock);
 
     dpci = domain_get_irq_dpci(irq_map->dom);
     ASSERT(dpci);
+
+    _record(&d->timeout, irq_map);
     list_for_each_entry ( digl, &irq_map->digl_list, list )
     {
         unsigned int guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
@@ -302,6 +333,9 @@ static void pt_irq_time_out(void *data)
     pt_pirq_iterate(irq_map->dom, pt_irq_guest_eoi, NULL);
 
     spin_unlock(&irq_map->dom->event_lock);
+    console_start_sync();
+    dump_debug((char)0);
+    console_end_sync();
 }
 
 struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *d)
@@ -901,8 +935,6 @@ unlock:
     spin_unlock(&d->event_lock);
 }
 
-#include <xen/console.h>
-
 /*
  * Note: 'pt_pirq_softirq_reset' can clear the STATE_SCHED before we get to
  * doing it. If that is the case we let 'pt_pirq_softirq_reset' do 
ref-counting.
@@ -919,26 +951,17 @@ static void dpci_softirq(void)
             struct hvm_pirq_dpci *dpci;
     } l[4];
     unsigned int i = 0;
-#if DIFF_LIST
-    struct hvm_pirq_dpci *n;
-#endif
     debug = &per_cpu(_d, cpu);
 
     local_irq_disable();
     list_splice_init(&per_cpu(dpci_list, cpu), &our_list);
     local_irq_enable();
-#if DIFF_LIST
-    list_for_each_entry_safe(pirq_dpci, n, &our_list, softirq_list)
-#else
     while ( !list_empty(&our_list) )
-#endif
     {
         struct domain *d;
         struct list_head *entry;
 
-#ifndef DIFF_LIST
         pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci, 
softirq_list);
-#endif
         entry = &pirq_dpci->softirq_list;
         if ( i <= 3 )
         {
diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
index 1fb1292..f5b6061 100644
--- a/xen/include/xen/hvm/irq.h
+++ b/xen/include/xen/hvm/irq.h
@@ -102,6 +102,7 @@ struct hvm_pirq_dpci {
     struct list_head softirq_list;
     unsigned int pirq;
     unsigned int line;
+    unsigned long stamp;
 };
 
 void pt_pirq_init(struct domain *, struct hvm_pirq_dpci *);
-- 
1.9.3

Attachment: 0001-dpci-Add-ZOMBIE-state.patch
Description: Text document

Attachment: 0002-debug.patch
Description: Text document

Attachment: 0003-DEBUG-2.patch
Description: Text document

Attachment: 0004-DEBUG4-Add-an-stamp-and-potential-fix.patch
Description: Text document

Attachment: 0005-debug-Remove-the-MARK-code.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.