[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [8/11] [NET] front: Add SG support



Hi:

[NET] front: Add SG support

This patch adds scatter-gather (SG) support to the frontend.  It also
advertises the feature through xenbus (as "feature-sg") so that the
backend can detect it and send SG requests only when the frontend
supports them.

SG support is required to handle skbs larger than one page.  This
in turn is needed for either jumbo MTU or TSO.  At least one of these
is required to bring local networking performance up to an acceptable
level.

Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@xxxxxxxxxxxxxxxxxxx>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff -r c754083146bc -r c82398c08669 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Fri Jul 07 23:36:58 2006 +1000
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Fri Jul 07 23:37:45 2006 +1000
@@ -46,11 +46,11 @@
 #include <linux/ethtool.h>
 #include <linux/in.h>
 #include <linux/if_ether.h>
+#include <linux/io.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/arp.h>
 #include <net/route.h>
-#include <asm/io.h>
 #include <asm/uaccess.h>
 #include <xen/evtchn.h>
 #include <xen/xenbus.h>
@@ -62,17 +62,12 @@
 #include <xen/interface/grant_table.h>
 #include <xen/gnttab.h>
 
+#define RX_COPY_THRESHOLD 256
+
 #define GRANT_INVALID_REF      0
 
 #define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
 #define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
-
-static inline void init_skb_shinfo(struct sk_buff *skb)
-{
-       atomic_set(&(skb_shinfo(skb)->dataref), 1);
-       skb_shinfo(skb)->nr_frags = 0;
-       skb_shinfo(skb)->frag_list = NULL;
-}
 
 struct netfront_info {
        struct list_head list;
@@ -329,6 +324,12 @@ again:
        err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
        if (err) {
                message = "writing feature-rx-notify";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
+       if (err) {
+               message = "writing feature-sg";
                goto abort_transaction;
        }
 
@@ -575,10 +576,13 @@ static void network_alloc_rx_buffers(str
        unsigned short id;
        struct netfront_info *np = netdev_priv(dev);
        struct sk_buff *skb;
+       struct page *page;
        int i, batch_target;
        RING_IDX req_prod = np->rx.req_prod_pvt;
        struct xen_memory_reservation reservation;
        grant_ref_t ref;
+       unsigned long pfn;
+       void *vaddr;
 
        if (unlikely(!netif_carrier_ok(dev)))
                return;
@@ -591,15 +595,16 @@ static void network_alloc_rx_buffers(str
         */
        batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
        for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
-               /*
-                * Subtract dev_alloc_skb headroom (16 bytes) and shared info
-                * tailroom then round down to SKB_DATA_ALIGN boundary.
-                */
-               skb = __dev_alloc_skb(
-                       ((PAGE_SIZE - sizeof(struct skb_shared_info)) &
-                        (-SKB_DATA_ALIGN(1))) - 16,
-                       GFP_ATOMIC|__GFP_NOWARN);
-               if (skb == NULL) {
+               /* Allocate an skb and a page. */
+               skb = __dev_alloc_skb(RX_COPY_THRESHOLD,
+                                     GFP_ATOMIC | __GFP_NOWARN);
+               if (unlikely(!skb))
+                       goto no_skb;
+
+               page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+               if (!page) {
+                       kfree_skb(skb);
+no_skb:
                        /* Any skbuffs queued for refill? Force them out. */
                        if (i != 0)
                                goto refill;
@@ -608,6 +613,9 @@ static void network_alloc_rx_buffers(str
                                  jiffies + (HZ/10));
                        break;
                }
+
+               skb_shinfo(skb)->frags[0].page = page;
+               skb_shinfo(skb)->nr_frags = 1;
                __skb_queue_tail(&np->rx_batch, skb);
        }
 
@@ -639,18 +647,20 @@ static void network_alloc_rx_buffers(str
                ref = gnttab_claim_grant_reference(&np->gref_rx_head);
                BUG_ON((signed short)ref < 0);
                np->grant_rx_ref[id] = ref;
+
+               pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
+               vaddr = page_address(skb_shinfo(skb)->frags[0].page);
+
                gnttab_grant_foreign_transfer_ref(ref,
-                                                 np->xbdev->otherend_id,
-                                                 __pa(skb->head)>>PAGE_SHIFT);
+                                                 np->xbdev->otherend_id, pfn);
                RING_GET_REQUEST(&np->rx, req_prod + i)->gref = ref;
-               np->rx_pfn_array[i] = virt_to_mfn(skb->head);
+               np->rx_pfn_array[i] = pfn_to_mfn(pfn);
 
                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                        /* Remove this page before passing back to Xen. */
-                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
-                                           INVALID_P2M_ENTRY);
+                       set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
                        MULTI_update_va_mapping(np->rx_mcl+i,
-                                               (unsigned long)skb->head,
+                                               (unsigned long)vaddr,
                                                __pte(0), 0);
                }
        }
@@ -888,41 +898,29 @@ static void xennet_move_rx_slot(struct n
        np->rx.req_prod_pvt++;
 }
 
-static int netif_poll(struct net_device *dev, int *pbudget)
-{
-       struct netfront_info *np = netdev_priv(dev);
-       struct sk_buff *skb, *nskb;
-       struct netif_rx_response *rx;
-       RING_IDX i, rp;
-       struct mmu_update *mmu = np->rx_mmu;
-       struct multicall_entry *mcl = np->rx_mcl;
-       int work_done, budget, more_to_do = 1;
-       struct sk_buff_head rxq;
-       unsigned long flags;
-       unsigned long mfn;
-       grant_ref_t ref;
-
-       spin_lock(&np->rx_lock);
-
-       if (unlikely(!netif_carrier_ok(dev))) {
-               spin_unlock(&np->rx_lock);
-               return 0;
-       }
-
-       skb_queue_head_init(&rxq);
-
-       if ((budget = *pbudget) > dev->quota)
-               budget = dev->quota;
-       rp = np->rx.sring->rsp_prod;
-       rmb(); /* Ensure we see queued responses up to 'rp'. */
-
-       for (i = np->rx.rsp_cons, work_done = 0;
-            (i != rp) && (work_done < budget);
-            i++, work_done++) {
-               rx = RING_GET_RESPONSE(&np->rx, i);
-
-               skb = xennet_get_rx_skb(np, i);
-               ref = xennet_get_rx_ref(np, i);
+static int xennet_get_responses(struct netfront_info *np,
+                               struct netif_rx_response *rx, RING_IDX rp,
+                               struct sk_buff_head *list, int count)
+{
+       struct mmu_update *mmu = np->rx_mmu + count;
+       struct multicall_entry *mcl = np->rx_mcl + count;
+       RING_IDX cons = np->rx.rsp_cons;
+       struct sk_buff *skb = xennet_get_rx_skb(np, cons);
+       grant_ref_t ref = xennet_get_rx_ref(np, cons);
+       int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
+       int frags = 1;
+       int err = 0;
+
+       for (;;) {
+               unsigned long mfn;
+
+               if (unlikely(rx->status < 0 ||
+                            rx->offset + rx->status > PAGE_SIZE)) {
+                       if (net_ratelimit())
+                               WPRINTK("rx->offset: %x, size: %u\n",
+                                       rx->offset, rx->status);
+                       err = -EINVAL;
+               }
 
                /*
                 * This definitely indicates a bug, either in this driver or in
@@ -931,8 +929,8 @@ static int netif_poll(struct net_device 
                 */
                if (ref == GRANT_INVALID_REF) {
                        WPRINTK("Bad rx response id %d.\n", rx->id);
-                       work_done--;
-                       continue;
+                       err = -EINVAL;
+                       goto next;
                }
 
                /* Memory pressure, insufficient buffer headroom, ... */
@@ -941,16 +939,161 @@ static int netif_poll(struct net_device 
                                WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n",
                                        rx->id, rx->status);
                        xennet_move_rx_slot(np, skb, ref);
+                       err = -ENOMEM;
+                       goto next;
+               }
+
+               gnttab_release_grant_reference(&np->gref_rx_head, ref);
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Remap the page. */
+                       struct page *page = skb_shinfo(skb)->frags[0].page;
+                       unsigned long pfn = page_to_pfn(page);
+                       void *vaddr = page_address(page);
+
+                       MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
+                                               pfn_pte_ma(mfn, PAGE_KERNEL),
+                                               0);
+                       mcl++;
+                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+                               | MMU_MACHPHYS_UPDATE;
+                       mmu->val = pfn;
+                       mmu++;
+
+                       set_phys_to_machine(pfn, mfn);
+               }
+
+               __skb_queue_tail(list, skb);
+
+next:
+               if (!(rx->flags & NETRXF_more_data))
+                       break;
+
+               if (cons + frags == rp) {
+                       if (net_ratelimit())
+                               WPRINTK("Need more frags\n");
+                       err = -ENOENT;
+                       break;
+               }
+
+               rx = RING_GET_RESPONSE(&np->rx, cons + frags);
+               skb = xennet_get_rx_skb(np, cons + frags);
+               ref = xennet_get_rx_ref(np, cons + frags);
+               frags++;
+       }
+
+       if (unlikely(frags > max)) {
+               if (net_ratelimit())
+                       WPRINTK("Too many frags\n");
+               err = -E2BIG;
+       }
+
+       return err;
+}
+
+static RING_IDX xennet_fill_frags(struct netfront_info *np,
+                                 struct sk_buff *skb,
+                                 struct sk_buff_head *list)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int nr_frags = shinfo->nr_frags;
+       RING_IDX cons = np->rx.rsp_cons;
+       skb_frag_t *frag = shinfo->frags + nr_frags;
+       struct sk_buff *nskb;
+
+       while ((nskb = __skb_dequeue(list))) {
+               struct netif_rx_response *rx =
+                       RING_GET_RESPONSE(&np->rx, ++cons);
+
+               frag->page = skb_shinfo(nskb)->frags[0].page;
+               frag->page_offset = rx->offset;
+               frag->size = rx->status;
+
+               skb->data_len += rx->status;
+
+               skb_shinfo(nskb)->nr_frags = 0;
+               kfree_skb(nskb);
+
+               frag++;
+               nr_frags++;
+       }
+
+       shinfo->nr_frags = nr_frags;
+       return cons;
+}
+
+static int netif_poll(struct net_device *dev, int *pbudget)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       struct sk_buff *skb;
+       struct netif_rx_response *rx;
+       RING_IDX i, rp;
+       struct multicall_entry *mcl;
+       int work_done, budget, more_to_do = 1;
+       struct sk_buff_head rxq;
+       struct sk_buff_head errq;
+       struct sk_buff_head tmpq;
+       unsigned long flags;
+       unsigned int len;
+       int pages_done;
+       int err;
+
+       spin_lock(&np->rx_lock);
+
+       if (unlikely(!netif_carrier_ok(dev))) {
+               spin_unlock(&np->rx_lock);
+               return 0;
+       }
+
+       skb_queue_head_init(&rxq);
+       skb_queue_head_init(&errq);
+       skb_queue_head_init(&tmpq);
+
+       if ((budget = *pbudget) > dev->quota)
+               budget = dev->quota;
+       rp = np->rx.sring->rsp_prod;
+       rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+       for (i = np->rx.rsp_cons, work_done = 0, pages_done = 0;
+            (i != rp) && (work_done < budget);
+            np->rx.rsp_cons = ++i, work_done++) {
+               rx = RING_GET_RESPONSE(&np->rx, i);
+
+               err = xennet_get_responses(np, rx, rp, &tmpq, pages_done);
+               pages_done += skb_queue_len(&tmpq);
+
+               if (unlikely(err)) {
+                       i = np->rx.rsp_cons + skb_queue_len(&tmpq) - 1;
                        work_done--;
+                       while ((skb = __skb_dequeue(&tmpq)))
+                               __skb_queue_tail(&errq, skb);
+                       np->stats.rx_errors++;
                        continue;
                }
 
-               gnttab_release_grant_reference(&np->gref_rx_head, ref);
-
-               /* NB. We handle skb overflow later. */
-               skb->data = skb->head + rx->offset;
-               skb->len  = rx->status;
-               skb->tail = skb->data + skb->len;
+               skb = __skb_dequeue(&tmpq);
+
+               skb->nh.raw = (void *)skb_shinfo(skb)->frags[0].page;
+               skb->h.raw = skb->nh.raw + rx->offset;
+
+               len = rx->status;
+               if (len > RX_COPY_THRESHOLD)
+                       len = RX_COPY_THRESHOLD;
+               skb_put(skb, len);
+
+               if (rx->status > len) {
+                       skb_shinfo(skb)->frags[0].page_offset =
+                               rx->offset + len;
+                       skb_shinfo(skb)->frags[0].size = rx->status - len;
+                       skb->data_len = rx->status - len;
+               } else {
+                       skb_shinfo(skb)->frags[0].page = NULL;
+                       skb_shinfo(skb)->nr_frags = 0;
+               }
+
+               i = xennet_fill_frags(np, skb, &tmpq);
+               skb->truesize += skb->data_len;
+               skb->len += skb->data_len;
 
                /*
                 * Old backends do not assert data_validated but we
@@ -966,96 +1109,38 @@ static int netif_poll(struct net_device 
                skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
 
                np->stats.rx_packets++;
-               np->stats.rx_bytes += rx->status;
-
-               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-                       /* Remap the page. */
-                       MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
-                                               pfn_pte_ma(mfn, PAGE_KERNEL),
-                                               0);
-                       mcl++;
-                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
-                               | MMU_MACHPHYS_UPDATE;
-                       mmu->val = __pa(skb->head) >> PAGE_SHIFT;
-                       mmu++;
-
-                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
-                                           mfn);
-               }
+               np->stats.rx_bytes += skb->len;
 
                __skb_queue_tail(&rxq, skb);
        }
 
        /* Some pages are no longer absent... */
-       balloon_update_driver_allowance(-work_done);
+       balloon_update_driver_allowance(-pages_done);
 
        /* Do all the remapping work, and M2P updates, in one big hypercall. */
-       if (likely((mcl - np->rx_mcl) != 0)) {
+       if (likely(pages_done)) {
+               mcl = np->rx_mcl + pages_done;
                mcl->op = __HYPERVISOR_mmu_update;
                mcl->args[0] = (unsigned long)np->rx_mmu;
-               mcl->args[1] = mmu - np->rx_mmu;
+               mcl->args[1] = pages_done;
                mcl->args[2] = 0;
                mcl->args[3] = DOMID_SELF;
-               mcl++;
-               (void)HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
-       }
+               (void)HYPERVISOR_multicall(np->rx_mcl, pages_done + 1);
+       }
+
+       while ((skb = __skb_dequeue(&errq)))
+               kfree_skb(skb);
 
        while ((skb = __skb_dequeue(&rxq)) != NULL) {
-               if (skb->len > (dev->mtu + ETH_HLEN + 4)) {
-                       if (net_ratelimit())
-                               printk(KERN_INFO "Received packet too big for "
-                                      "MTU (%d > %d)\n",
-                                      skb->len - ETH_HLEN - 4, dev->mtu);
-                       skb->len  = 0;
-                       skb->tail = skb->data;
-                       init_skb_shinfo(skb);
-                       dev_kfree_skb(skb);
-                       continue;
-               }
-
-               /*
-                * Enough room in skbuff for the data we were passed? Also,
-                * Linux expects at least 16 bytes headroom in each rx buffer.
-                */
-               if (unlikely(skb->tail > skb->end) ||
-                   unlikely((skb->data - skb->head) < 16)) {
-                       if (net_ratelimit()) {
-                               if (skb->tail > skb->end)
-                                       printk(KERN_INFO "Received packet "
-                                              "is %zd bytes beyond tail.\n",
-                                              skb->tail - skb->end);
-                               else
-                                       printk(KERN_INFO "Received packet "
-                                              "is %zd bytes before head.\n",
-                                              16 - (skb->data - skb->head));
-                       }
-
-                       nskb = __dev_alloc_skb(skb->len + 2,
-                                              GFP_ATOMIC|__GFP_NOWARN);
-                       if (nskb != NULL) {
-                               skb_reserve(nskb, 2);
-                               skb_put(nskb, skb->len);
-                               memcpy(nskb->data, skb->data, skb->len);
-                               /* Copy any other fields we already set up. */
-                               nskb->dev = skb->dev;
-                               nskb->ip_summed = skb->ip_summed;
-                               nskb->proto_data_valid = skb->proto_data_valid;
-                               nskb->proto_csum_blank = skb->proto_csum_blank;
-                       }
-
-                       /* Reinitialise and then destroy the old skbuff. */
-                       skb->len  = 0;
-                       skb->tail = skb->data;
-                       init_skb_shinfo(skb);
-                       dev_kfree_skb(skb);
-
-                       /* Switch old for new, if we copied the buffer. */
-                       if ((skb = nskb) == NULL)
-                               continue;
-               }
-
-               /* Set the shinfo area, which is hidden behind the data. */
-               init_skb_shinfo(skb);
+               struct page *page = (struct page *)skb->nh.raw;
+               void *vaddr = page_address(page);
+
+               memcpy(skb->data, vaddr + (skb->h.raw - skb->nh.raw),
+                      skb_headlen(skb));
+
+               if (page != skb_shinfo(skb)->frags[0].page)
+                       __free_page(page);
+
                /* Ethernet work: Delayed to here as it peeks the header. */
                skb->protocol = eth_type_trans(skb, dev);
 
@@ -1063,8 +1148,6 @@ static int netif_poll(struct net_device 
                netif_receive_skb(skb);
                dev->last_rx = jiffies;
        }
-
-       np->rx.rsp_cons = i;
 
        /* If we get a callback with very few responses, reduce fill target. */
        /* NB. Note exponential increase, linear decrease. */
@@ -1205,7 +1288,7 @@ static void network_connect(struct net_d
                        break;
                gnttab_grant_foreign_transfer_ref(
                        np->grant_rx_ref[i], np->xbdev->otherend_id,
-                       __pa(np->rx_skbs[i]->data) >> PAGE_SHIFT);
+                       page_to_pfn(skb_shinfo(np->rx_skbs[i])->frags->page));
                RING_GET_REQUEST(&np->rx, i)->gref = np->grant_rx_ref[i];
                RING_GET_REQUEST(&np->rx, i)->id = i;
        }
@@ -1216,7 +1299,7 @@ static void network_connect(struct net_d
                ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
                gnttab_grant_foreign_transfer_ref(
                        ref, np->xbdev->otherend_id,
-                       __pa(skb->data) >> PAGE_SHIFT);
+                       page_to_pfn(skb_shinfo(skb)->frags->page));
                RING_GET_REQUEST(&np->rx, requeue_idx)->gref = ref;
                RING_GET_REQUEST(&np->rx, requeue_idx)->id = requeue_idx;
                requeue_idx++;
diff -r c754083146bc -r c82398c08669 xen/include/public/io/netif.h
--- a/xen/include/public/io/netif.h     Fri Jul 07 23:36:58 2006 +1000
+++ b/xen/include/public/io/netif.h     Fri Jul 07 23:37:45 2006 +1000
@@ -123,6 +123,10 @@ typedef struct netif_rx_request netif_rx
 #define _NETRXF_csum_blank     (1)
 #define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
 
+/* Packet continues in the next request descriptor. */
+#define _NETRXF_more_data      (2)
+#define  NETRXF_more_data      (1U<<_NETRXF_more_data)
+
 struct netif_rx_response {
     uint16_t id;
     uint16_t offset;       /* Offset in page of start of received packet  */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.