[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] netback Oops then xenwatch stuck in D state



>>> On 14.02.13 at 13:45, "Jan Beulich" <JBeulich@xxxxxxxx> wrote:
>>>> On 14.02.13 at 13:33, "Jan Beulich" <JBeulich@xxxxxxxx> wrote:
>>>>> On 14.02.13 at 13:13, Wei Liu <wei.liu2@xxxxxxxxxx> wrote:
>>> On Thu, 2013-02-14 at 11:48 +0000, Jan Beulich wrote:
>>>> I don't think this patch will fix his problems, which - as described
>>>> yesterday - I'm relatively certain result from the harsh action
>>>> netbk_fatal_tx_err() does.
>>> 
>>> Yes, having this one only is not enough. It will need to either disable
>>> the vif timer or check vif->netbk != NULL inside timer callback.
>> 
>> I continue to be unclear what timer you refer to.
>> 
>> If we keep the carrier-off in fatal_tx_err, then _all_
>> asynchronously callable interfaces need to check whether the
>> vif -> netbk association went away. But, especially in the
>> context of using tasklets instead of kthreads, I meanwhile
>> think that simply setting a flag (along with turning off the IRQ)
>> would be better (and keep the turning off of the carrier the way
>> it had been done before. The flag would basically need checking
>> anywhere we look at the shared ring (which ought to be very
>> few places), and perhaps should also cause xenvif_schedulable()
>> to return false.
> 
> Or a "light" version of xenvif_down(), i.e. simply
> 
>       disable_irq(vif->irq);
>       xen_netbk_deschedule_xenvif(vif);
> 
> If I'm not mistaken, doing this instead of xenvif_carrier_off()
> could be all that's needed.

Like the below/attached (compile tested only so far).

Jan

netback: fix shutting down the ring if it contains garbage

Using rtnl_lock() in tasklet context is not permitted.

This undoes the part of 1219:5108c6901b30 that split off
xenvif_carrier_off() from netif_disconnect().

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -78,6 +78,8 @@ typedef struct netif_st {
        u8 can_queue:1; /* can queue packets for receiver? */
        u8 copying_receiver:1;  /* copy packets to receiver?       */
 
+       u8 busted:1;
+
        /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
        RING_IDX rx_req_cons_peek;
 
@@ -195,7 +197,8 @@ int netif_map(netif_t *netif, unsigned l
 void netif_xenbus_init(void);
 
 #define netif_schedulable(netif)                               \
-       (netif_running((netif)->dev) && netback_carrier_ok(netif))
+       (likely(!(netif)->busted) &&                            \
+        netif_running((netif)->dev) && netback_carrier_ok(netif))
 
 void netif_schedule_work(netif_t *netif);
 void netif_deschedule_work(netif_t *netif);
@@ -204,9 +207,6 @@ int netif_be_start_xmit(struct sk_buff *
 struct net_device_stats *netif_be_get_stats(struct net_device *dev);
 irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
 
-/* Prevent the device from generating any further traffic. */
-void xenvif_carrier_off(netif_t *netif);
-
 static inline int netbk_can_queue(struct net_device *dev)
 {
        netif_t *netif = netdev_priv(dev);
--- a/drivers/xen/netback/interface.c
+++ b/drivers/xen/netback/interface.c
@@ -56,6 +56,7 @@ module_param_named(queue_length, netbk_q
 
 static void __netif_up(netif_t *netif)
 {
+       netif->busted = 0;
        enable_irq(netif->irq);
        netif_schedule_work(netif);
 }
@@ -347,23 +348,19 @@ err_rx:
        return err;
 }
 
-void xenvif_carrier_off(netif_t *netif)
-{
-       rtnl_lock();
-       netback_carrier_off(netif);
-       netif_carrier_off(netif->dev); /* discard queued packets */
-       if (netif_running(netif->dev))
-               __netif_down(netif);
-       rtnl_unlock();
-       netif_put(netif);
-}
-
 void netif_disconnect(struct backend_info *be)
 {
        netif_t *netif = be->netif;
 
-       if (netback_carrier_ok(netif))
-               xenvif_carrier_off(netif);
+       if (netback_carrier_ok(netif)) {
+               rtnl_lock();
+               netback_carrier_off(netif);
+               netif_carrier_off(netif->dev); /* discard queued packets */
+               if (netif_running(netif->dev))
+                       __netif_down(netif);
+               rtnl_unlock();
+               netif_put(netif);
+       }
 
        atomic_dec(&netif->refcnt);
        wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -845,7 +845,7 @@ void netif_schedule_work(netif_t *netif)
        RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
 #endif
 
-       if (more_to_do) {
+       if (more_to_do && likely(!netif->busted)) {
                add_to_net_schedule_list_tail(netif);
                maybe_schedule_tx_action();
        }
@@ -1024,7 +1024,9 @@ static void netbk_fatal_tx_err(netif_t *
 {
        printk(KERN_ERR "%s: fatal error; disabling device\n",
               netif->dev->name);
-       xenvif_carrier_off(netif);
+       netif->busted = 1;
+       disable_irq(netif->irq);
+       netif_deschedule_work(netif);
        netif_put(netif);
 }
 
@@ -1292,6 +1294,11 @@ static void net_tx_action(unsigned long 
                if (!netif)
                        continue;
 
+               if (unlikely(netif->busted)) {
+                       netif_put(netif);
+                       continue;
+               }
+
                if (netif->tx.sring->req_prod - netif->tx.req_cons >
                    NET_TX_RING_SIZE) {
                        printk(KERN_ERR "%s: Impossible number of requests. "


Attachment: xen-netback-garbage-ring-fix.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.