WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH] Turn blktap tapfds into a link list

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH] Turn blktap tapfds into a link list
From: Steven Rostedt <srostedt@xxxxxxxxxx>
Date: Fri, 29 Sep 2006 00:00:39 -0400
Cc: andrew.warfield@xxxxxxxxxxxx
Delivery-date: Thu, 28 Sep 2006 21:00:42 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5.0.4 (X11/20060614)
The current implementation of blktap has the tapfds descriptors as a static array. Which means that what isn't used is taking up memory. It also means that if we ever want to dynamically increase the number of descriptors that we allow, we can't do so.

So this patch converts the tapfds from an array into a link list.

I use the list_add_rcu and list_for_each_entry_rcu so that I don't need to protect the list while walking or adding items to it. I never delete a device, so once it is added to the list, it stays. But it wouldn't be hard with the rcu list operations to add a deletion in the future.

This patch also fixed a few bugs that were lying in there. Some where the tapfds[idx] was offset outside the checking of the idx size (although they would probably be very hard to hit).

Anyway, I booted this up and it runs. I started a guest with a block device on the blktap, twice, and there was no problems.

But... I have to admit, I wrote this when I was tired, so it may have introduced a few bugs as well. I went over it twice, but I could have missed something twice. Please review! But there shouldn't be anything major wrong with it.

-- Steve

Signed-off-by: Steven Rostedt <srostedt@xxxxxxxxxx>
diff -r 0de1d48970aa linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c  Thu Sep 28 23:36:01 
2006 -0400
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c  Thu Sep 28 23:45:27 
2006 -0400
@@ -9,6 +9,9 @@
  * Based on the blkback driver code.
  * 
  * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
+ *
+ * Clean ups and fix ups:
+ *    Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License version 2
@@ -105,8 +108,15 @@ static int mmap_pages = MMAP_PAGES;
                      * memory rings.
                      */
 
+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
+typedef struct domid_translate {
+       unsigned short domid;
+       unsigned short busid;
+} domid_translate_t ;
+
 /*Data struct associated with each of the tapdisk devices*/
 typedef struct tap_blkif {
+       struct list_head list;        /*Link list of all the tapfds          */
        struct vm_area_struct *vma;   /*Shared memory area                   */
        unsigned long rings_vstart;   /*Kernel memory mapping                */
        unsigned long user_vstart;    /*User memory mapping                  */
@@ -123,17 +133,13 @@ typedef struct tap_blkif {
        unsigned long *idx_map;       /*Record the user ring id to kern 
                                        [req id, idx] tuple                  */
        blkif_t *blkif;               /*Associate blkif with tapdev          */
-       int sysfs_set;                /*Set if it has a class device.        */
+       struct domid_translate trans; /*Translation from domid to bus.       */
 } tap_blkif_t;
 
-/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
-typedef struct domid_translate {
-       unsigned short domid;
-       unsigned short busid;
-} domid_translate_t ;
-
-static domid_translate_t  translate_domid[MAX_TAP_DEV];
-static tap_blkif_t *tapfds[MAX_TAP_DEV];
+static LIST_HEAD(tapfds);
+static int blktap_next_minor;
+
+#define INFO_NOT_FOUND(info) (unlikely((&(info)->list == &tapfds)))
 
 static int __init set_blkif_reqs(char *str)
 {
@@ -320,7 +326,7 @@ struct vm_operations_struct blktap_vm_op
  */
  
 /*Function Declarations*/
-static int get_next_free_dev(void);
+static tap_blkif_t *get_next_free_dev(void);
 static int blktap_open(struct inode *inode, struct file *filp);
 static int blktap_release(struct inode *inode, struct file *filp);
 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
@@ -338,51 +344,83 @@ static struct file_operations blktap_fop
 };
 
 
-static int get_next_free_dev(void)
+static tap_blkif_t *get_next_free_dev(void)
 {
        tap_blkif_t *info;
-       int i = 0, ret = -1;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pending_free_lock, flags);
-       
-       while (i < MAX_TAP_DEV) {
-               info = tapfds[i];
-               if ( (tapfds[i] != NULL) && (info->dev_inuse == 0)
-                       && (info->dev_pending == 0) ) {
+       int minor = -1;
+
+       /*
+        * This is called only from the ioctl, which
+        * means we should always have interrupts enabled.
+        */
+       BUG_ON(irqs_disabled());
+
+       spin_lock_irq(&pending_free_lock);
+       
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if ((info->dev_inuse == 0) &&
+                   (info->dev_pending == 0) ) {
                        info->dev_pending = 1;
-                       ret = i;
-                       goto done;
-               }
-               i++;
-       }
-       
-done:
-       spin_unlock_irqrestore(&pending_free_lock, flags);
+                       goto found;
+               }
+       }
+       info = NULL;
 
        /*
-        * We are protected by having the dev_pending set.
+        * We didn't find a free device. If we can still allocate
+        * more, then we grab the next device minor that is
+        * available.  This is done while we are still under
+        * the protection of the pending_free_lock.
         */
-       if (!tapfds[i]->sysfs_set && xen_class) {
+       if (blktap_next_minor < MAX_TAP_DEV)
+               minor = blktap_next_minor++;
+found:
+       spin_unlock_irq(&pending_free_lock);
+
+       /* minor should never be zero with info NULL */
+       BUG_ON(!info && !minor);
+
+       if (!info && minor > 0) {
+               info = kzalloc(sizeof(*info), GFP_KERNEL);
+               if (unlikely(!info)) {
+                       /*
+                        * If we failed here, try to put back
+                        * the next minor number. But if one
+                        * was just taken, then we just lose this
+                        * minor.
+                        */
+                       spin_lock_irq(&pending_free_lock);
+                       if (blktap_next_minor == minor+1)
+                               blktap_next_minor--;
+                       spin_unlock_irq(&pending_free_lock);
+                       goto out;
+               }
+
+               info->minor = minor;
+               list_add_rcu(&info->list, &tapfds);
                class_device_create(xen_class, NULL,
-                                   MKDEV(blktap_major, ret), NULL,
-                                   "blktap%d", ret);
-               tapfds[i]->sysfs_set = 1;
-       }
-       return ret;
+                                   MKDEV(blktap_major, minor), NULL,
+                                   "blktap%d", minor);
+               devfs_mk_cdev(MKDEV(blktap_major, minor),
+                       S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", minor);
+       }
+
+out:
+       return info;
 }
 
 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
 {
-       int i;
-               
-       for (i = 0; i < MAX_TAP_DEV; i++)
-               if ( (translate_domid[i].domid == domid)
-                   && (translate_domid[i].busid == xenbus_id) ) {
-                       tapfds[i]->blkif = blkif;
-                       tapfds[i]->status = RUNNING;
-                       return i;
-               }
+       tap_blkif_t *info;
+
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if ( (info->trans.domid == domid)
+                   && (info->trans.busid == xenbus_id) ) {
+                       info->blkif = blkif;
+                       info->status = RUNNING;
+                       return info->minor;
+               }
+       }
        return -1;
 }
 
@@ -391,13 +429,17 @@ void signal_tapdisk(int idx)
        tap_blkif_t *info;
        struct task_struct *ptask;
 
-       info = tapfds[idx];
-       if ( (idx > 0) && (idx < MAX_TAP_DEV) && (info->pid > 0) ) {
-               ptask = find_task_by_pid(info->pid);
-               if (ptask)
-                       info->status = CLEANSHUTDOWN;
-       }
-       info->blkif = NULL;
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if (info->minor == idx) {
+                       if (info->pid > 0) {
+                               ptask = find_task_by_pid(info->pid);
+                               if (ptask)
+                                       info->status = CLEANSHUTDOWN;
+                       }
+                       info->blkif = NULL;
+                       break;
+               }
+       }
        return;
 }
 
@@ -408,14 +450,16 @@ static int blktap_open(struct inode *ino
        tap_blkif_t *info;
        int i;
        
-       if (tapfds[idx] == NULL) {
-               WPRINTK("Unable to open device /dev/xen/blktap%d\n",
-                      idx);
-               return -ENOMEM;
-       }
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if (info->minor == idx)
+                       goto found;
+       }
+       WPRINTK("Unable to open device /dev/xen/blktap%d\n",
+               idx);
+       return -ENODEV;
+
+found:
        DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
-       
-       info = tapfds[idx];
        
        /*Only one process can access device at a time*/
        if (test_and_set_bit(0, &info->dev_inuse))
@@ -617,19 +661,18 @@ static int blktap_ioctl(struct inode *in
        {               
                uint64_t val = (uint64_t)arg;
                domid_translate_t *tr = (domid_translate_t *)&val;
-               int newdev;
 
                DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
                       tr->domid, tr->busid);
-               newdev = get_next_free_dev();
-               if (newdev < 1) {
+               info = get_next_free_dev();
+               if (!info) {
                        WPRINTK("Error initialising /dev/xen/blktap - "
                                "No more devices\n");
                        return -1;
                }
-               translate_domid[newdev].domid = tr->domid;
-               translate_domid[newdev].busid = tr->busid;
-               return newdev;
+               info->trans.domid = tr->domid;
+               info->trans.busid = tr->busid;
+               return info->minor;
        }
        case BLKTAP_IOCTL_FREEINTF:
        {
@@ -637,13 +680,16 @@ static int blktap_ioctl(struct inode *in
                unsigned long flags;
 
                /* Looking at another device */
-               info = NULL;
-
-               if ( (dev > 0) && (dev < MAX_TAP_DEV) )
-                       info = tapfds[dev];
+               list_for_each_entry_rcu(info, &tapfds, list) {
+                       if (dev == info->minor)
+                               break;
+               }
+
+               if (INFO_NOT_FOUND(info))
+                       return 0; /* should this be an error? */
 
                spin_lock_irqsave(&pending_free_lock, flags);
-               if ( (info != NULL) && (info->dev_pending) )
+               if (info->dev_pending)
                        info->dev_pending = 0;
                spin_unlock_irqrestore(&pending_free_lock, flags);
 
@@ -654,15 +700,15 @@ static int blktap_ioctl(struct inode *in
                unsigned long dev = arg;
 
                /* Looking at another device */
-               info = NULL;
-               
-               if ( (dev > 0) && (dev < MAX_TAP_DEV) )
-                       info = tapfds[dev];
-               
-               if (info != NULL)
-                       return info->minor;
-               else
-                       return -1;
+               list_for_each_entry_rcu(info, &tapfds, list) {
+                       if (dev == info->minor)
+                               break;
+               }
+
+               if (INFO_NOT_FOUND(info))
+                       return -EINVAL;
+
+               return info->minor;
        }
        case BLKTAP_IOCTL_MAJOR:
                return blktap_major;
@@ -705,11 +751,13 @@ void blktap_kick_user(int idx)
 
        if (idx == 0)
                return;
-       
-       info = tapfds[idx];
-       
-       if (info != NULL)
-               wake_up_interruptible(&info->wait);
+
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if (info->minor == idx) {
+                       wake_up_interruptible(&info->wait);
+                       break;
+               }
+       }
 
        return;
 }
@@ -913,13 +961,18 @@ static void fast_flush_area(pending_req_
        uint64_t ptep;
        int ret, mmap_idx;
        unsigned long kvaddr, uvaddr;
-
-       tap_blkif_t *info = tapfds[tapidx];
-       
-       if (info == NULL) {
-               WPRINTK("fast_flush: Couldn't get info!\n");
-               return;
-       }
+       tap_blkif_t *info;
+       
+
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if (info->minor == tapidx)
+                       goto found;
+       }
+
+       WPRINTK("fast_flush: Couldn't get info!\n");
+       return;
+
+found:
        mmap_idx = req->mem_idx;
 
        for (i = 0; i < req->nr_pages; i++) {
@@ -1134,8 +1187,12 @@ static int do_block_io_op(blkif_t *blkif
                return 0;
        }
 
-       info = tapfds[blkif->dev_num];
-       if (info == NULL || !info->dev_inuse) {
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if (info->minor == blkif->dev_num)
+                       break;
+       }
+
+       if (INFO_NOT_FOUND(info) || !info->dev_inuse) {
                if (print_dbug) {
                        WPRINTK("Can't get UE info!\n");
                        print_dbug = 0;
@@ -1203,16 +1260,26 @@ static void dispatch_rw_block_io(blkif_t
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
        unsigned int nseg;
        int ret, i;
-       tap_blkif_t *info = tapfds[blkif->dev_num];
+       tap_blkif_t *info;
        uint64_t sector;
        
        blkif_request_t *target;
        int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
-       int usr_idx = GET_NEXT_REQ(info->idx_map);
+       int usr_idx;
        uint16_t mmap_idx = pending_req->mem_idx;
 
+       list_for_each_entry_rcu(info, &tapfds, list) {
+               if (info->minor == blkif->dev_num)
+                       break;
+       }
+       if (INFO_NOT_FOUND(info))
+               goto fail_response;
+
+       usr_idx = GET_NEXT_REQ(info->idx_map);
+
        /*Check we have space on user ring - should never fail*/
-       if(usr_idx == INVALID_REQ) goto fail_flush;
+       if (usr_idx == INVALID_REQ)
+               goto fail_flush;
        
        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
@@ -1426,9 +1493,6 @@ static int __init blkif_init(void)
 
        tap_blkif_xenbus_init();
 
-       /*Create the blktap devices, but do not map memory or waitqueue*/
-       for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF;
-
        /* Dynamically allocate a major for this device */
        ret = register_chrdev(0, "blktap", &blktap_fops);
        blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
@@ -1440,24 +1504,22 @@ static int __init blkif_init(void)
        
        blktap_major = ret;
 
-       for(i = 0; i < MAX_TAP_DEV; i++ ) {
-               info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL);
-               if(tapfds[i] == NULL)
-                       return -ENOMEM;
-               info->minor = i;
-               info->pid = 0;
-               info->blkif = NULL;
-
-               ret = devfs_mk_cdev(MKDEV(blktap_major, i),
-                       S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
-
-               if(ret != 0)
-                       return -ENOMEM;
-               info->dev_pending = info->dev_inuse = 0;
-
-               DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
-       }
-       
+       info = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       blktap_next_minor++;
+
+       ret = devfs_mk_cdev(MKDEV(blktap_major, i),
+                           S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
+
+       if(ret != 0)
+               return -ENOMEM;
+
+       DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
+
+       list_add_rcu(&info->list, &tapfds);
+
        /* Make sure the xen class exists */
        if (!setup_xen_class()) {
                /*
@@ -1470,7 +1532,6 @@ static int __init blkif_init(void)
                class_device_create(xen_class, NULL,
                                    MKDEV(blktap_major, 0), NULL,
                                    "blktap0");
-               tapfds[0]->sysfs_set = 1;
        } else {
                /* this is bad, but not fatal */
                WPRINTK("blktap: sysfs xen_class not created\n");
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>