[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v3] xen/blkfront: convert to blk-mq APIs



Hi Jens & Christoph,

Rafal reported an issue with this patch: after it is applied, no more
merges happen and performance drops when the device is set up with
"modprobe null_blk irqmode=2 completion_nsec=1000000",
but everything works fine with a plain "modprobe null_blk".

I'm not sure whether this is the expected behavior or not.
Do you have any suggestions? Thank you!

Here is the test result:

fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
    --time_based=1 --runtime=30 --bs=4KB --filename=/dev/xvdb \
    --direct=1 --group_reporting=1 --iodepth_batch=16

========================================================================
modprobe null_blk
========================================================================
------------------------------------------------------------------------
*no patch* (avgrq-sz = 8.00 avgqu-sz=5.00)
------------------------------------------------------------------------
READ: io=10655MB, aggrb=363694KB/s, minb=363694KB/s, maxb=363694KB/s, 
mint=30001msec, maxt=30001msec

Disk stats (read/write):
  xvdb: ios=2715852/0, merge=1089/0, ticks=126572/0, in_queue=127456, 
util=100.00%

------------------------------------------------------------------------
*with patch* (avgrq-sz = 8.00 avgqu-sz=8.00)
------------------------------------------------------------------------
READ: io=20655MB, aggrb=705010KB/s, minb=705010KB/s, maxb=705010KB/s, 
mint=30001msec, maxt=30001msec

Disk stats (read/write):
  xvdb: ios=5274633/0, merge=22/0, ticks=243208/0, in_queue=242908, util=99.98%

========================================================================
modprobe null_blk irqmode=2 completion_nsec=1000000
========================================================================
------------------------------------------------------------------------
*no patch* (avgrq-sz = 34.00 avgqu-sz=38.00)
------------------------------------------------------------------------
READ: io=10372MB, aggrb=354008KB/s, minb=354008KB/s, maxb=354008KB/s, 
mint=30003msec, maxt=30003msec

Disk stats (read/write):
  xvdb: ios=621760/0, *merge=1988170/0*, ticks=1136700/0, in_queue=1146020, 
util=99.76%

------------------------------------------------------------------------
*with patch* (avgrq-sz = 8.00 avgqu-sz=28.00)
------------------------------------------------------------------------
READ: io=2876.8MB, aggrb=98187KB/s, minb=98187KB/s, maxb=98187KB/s, 
mint=30002msec, maxt=30002msec

Disk stats (read/write):
  xvdb: ios=734048/0, merge=0/0, ticks=843584/0, in_queue=843080, util=99.72%

Regards,
-Bob

On 07/13/2015 05:55 PM, Bob Liu wrote:
> Note: This patch is based on original work of Arianna's internship for
> GNOME's Outreach Program for Women.
> 
> Only one hardware queue is used now, so there is no performance change.
> 
> The legacy non-mq code is deleted completely which is the same as other
> drivers like virtio, mtip, and nvme.
> 
> Also dropped one unnecessary holding of info->io_lock when calling
> blk_mq_stop_hw_queues().
> 
> Changes in v2:
>  - Reorganized blk_mq_queue_rq()
>  - Restored most io_locks in place
> 
> Change in v3:
>  - Rename blk_mq_queue_rq to blkif_queue_rq
> 
> Signed-off-by: Arianna Avanzini <avanzini.arianna@xxxxxxxxx>
> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
> Reviewed-by: Christoph Hellwig <hch@xxxxxx>
> Acked-by: Jens Axboe <axboe@xxxxxx>
> ---
>  drivers/block/xen-blkfront.c |  146 
> +++++++++++++++++-------------------------
>  1 file changed, 60 insertions(+), 86 deletions(-)
> 
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 6d89ed3..5b45ee5 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -37,6 +37,7 @@
>  
>  #include <linux/interrupt.h>
>  #include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
>  #include <linux/hdreg.h>
>  #include <linux/cdrom.h>
>  #include <linux/module.h>
> @@ -148,6 +149,7 @@ struct blkfront_info
>       unsigned int feature_persistent:1;
>       unsigned int max_indirect_segments;
>       int is_ready;
> +     struct blk_mq_tag_set tag_set;
>  };
>  
>  static unsigned int nr_minors;
> @@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct 
> request *req,
>                !(info->feature_flush & REQ_FUA)));
>  }
>  
> -/*
> - * do_blkif_request
> - *  read a block; request is in a request queue
> - */
> -static void do_blkif_request(struct request_queue *rq)
> +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
> +                        const struct blk_mq_queue_data *qd)
>  {
> -     struct blkfront_info *info = NULL;
> -     struct request *req;
> -     int queued;
> -
> -     pr_debug("Entered do_blkif_request\n");
> -
> -     queued = 0;
> +     struct blkfront_info *info = qd->rq->rq_disk->private_data;
>  
> -     while ((req = blk_peek_request(rq)) != NULL) {
> -             info = req->rq_disk->private_data;
> -
> -             if (RING_FULL(&info->ring))
> -                     goto wait;
> +     blk_mq_start_request(qd->rq);
> +     spin_lock_irq(&info->io_lock);
> +     if (RING_FULL(&info->ring))
> +             goto out_busy;
>  
> -             blk_start_request(req);
> +     if (blkif_request_flush_invalid(qd->rq, info))
> +             goto out_err;
>  
> -             if (blkif_request_flush_invalid(req, info)) {
> -                     __blk_end_request_all(req, -EOPNOTSUPP);
> -                     continue;
> -             }
> +     if (blkif_queue_request(qd->rq))
> +             goto out_busy;
>  
> -             pr_debug("do_blk_req %p: cmd %p, sec %lx, "
> -                      "(%u/%u) [%s]\n",
> -                      req, req->cmd, (unsigned long)blk_rq_pos(req),
> -                      blk_rq_cur_sectors(req), blk_rq_sectors(req),
> -                      rq_data_dir(req) ? "write" : "read");
> -
> -             if (blkif_queue_request(req)) {
> -                     blk_requeue_request(rq, req);
> -wait:
> -                     /* Avoid pointless unplugs. */
> -                     blk_stop_queue(rq);
> -                     break;
> -             }
> +     flush_requests(info);
> +     spin_unlock_irq(&info->io_lock);
> +     return BLK_MQ_RQ_QUEUE_OK;
>  
> -             queued++;
> -     }
> +out_err:
> +     spin_unlock_irq(&info->io_lock);
> +     return BLK_MQ_RQ_QUEUE_ERROR;
>  
> -     if (queued != 0)
> -             flush_requests(info);
> +out_busy:
> +     spin_unlock_irq(&info->io_lock);
> +     blk_mq_stop_hw_queue(hctx);
> +     return BLK_MQ_RQ_QUEUE_BUSY;
>  }
>  
> +static struct blk_mq_ops blkfront_mq_ops = {
> +     .queue_rq = blkif_queue_rq,
> +     .map_queue = blk_mq_map_queue,
> +};
> +
>  static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
>                               unsigned int physical_sector_size,
>                               unsigned int segments)
> @@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 
> sector_size,
>       struct request_queue *rq;
>       struct blkfront_info *info = gd->private_data;
>  
> -     rq = blk_init_queue(do_blkif_request, &info->io_lock);
> -     if (rq == NULL)
> +     memset(&info->tag_set, 0, sizeof(info->tag_set));
> +     info->tag_set.ops = &blkfront_mq_ops;
> +     info->tag_set.nr_hw_queues = 1;
> +     info->tag_set.queue_depth =  BLK_RING_SIZE(info);
> +     info->tag_set.numa_node = NUMA_NO_NODE;
> +     info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
> +     info->tag_set.cmd_size = 0;
> +     info->tag_set.driver_data = info;
> +
> +     if (blk_mq_alloc_tag_set(&info->tag_set))
>               return -1;
> +     rq = blk_mq_init_queue(&info->tag_set);
> +     if (IS_ERR(rq)) {
> +             blk_mq_free_tag_set(&info->tag_set);
> +             return -1;
> +     }
>  
>       queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
>  
> @@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
>  static void xlvbd_release_gendisk(struct blkfront_info *info)
>  {
>       unsigned int minor, nr_minors;
> -     unsigned long flags;
>  
>       if (info->rq == NULL)
>               return;
>  
> -     spin_lock_irqsave(&info->io_lock, flags);
> -
>       /* No more blkif_request(). */
> -     blk_stop_queue(info->rq);
> +     blk_mq_stop_hw_queues(info->rq);
>  
>       /* No more gnttab callback work. */
>       gnttab_cancel_free_callback(&info->callback);
> -     spin_unlock_irqrestore(&info->io_lock, flags);
>  
>       /* Flush gnttab callback work. Must be done with no locks held. */
>       flush_work(&info->work);
> @@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info 
> *info)
>       xlbd_release_minors(minor, nr_minors);
>  
>       blk_cleanup_queue(info->rq);
> +     blk_mq_free_tag_set(&info->tag_set);
>       info->rq = NULL;
>  
>       put_disk(info->gd);
>       info->gd = NULL;
>  }
>  
> +/* Must be called with io_lock holded */
>  static void kick_pending_request_queues(struct blkfront_info *info)
>  {
> -     if (!RING_FULL(&info->ring)) {
> -             /* Re-enable calldowns. */
> -             blk_start_queue(info->rq);
> -             /* Kick things off immediately. */
> -             do_blkif_request(info->rq);
> -     }
> +     if (!RING_FULL(&info->ring))
> +             blk_mq_start_stopped_hw_queues(info->rq, true);
>  }
>  
>  static void blkif_restart_queue(struct work_struct *work)
> @@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int 
> suspend)
>               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
>       /* No more blkif_request(). */
>       if (info->rq)
> -             blk_stop_queue(info->rq);
> +             blk_mq_stop_hw_queues(info->rq);
>  
>       /* Remove all persistent grants */
>       if (!list_empty(&info->grants)) {
> @@ -1144,7 +1140,6 @@ static irqreturn_t blkif_interrupt(int irq, void 
> *dev_id)
>       RING_IDX i, rp;
>       unsigned long flags;
>       struct blkfront_info *info = (struct blkfront_info *)dev_id;
> -     int error;
>  
>       spin_lock_irqsave(&info->io_lock, flags);
>  
> @@ -1185,37 +1180,37 @@ static irqreturn_t blkif_interrupt(int irq, void 
> *dev_id)
>                       continue;
>               }
>  
> -             error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
> +             req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
>               switch (bret->operation) {
>               case BLKIF_OP_DISCARD:
>                       if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>                               struct request_queue *rq = info->rq;
>                               printk(KERN_WARNING "blkfront: %s: %s op 
> failed\n",
>                                          info->gd->disk_name, 
> op_name(bret->operation));
> -                             error = -EOPNOTSUPP;
> +                             req->errors = -EOPNOTSUPP;
>                               info->feature_discard = 0;
>                               info->feature_secdiscard = 0;
>                               queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
>                               queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
>                       }
> -                     __blk_end_request_all(req, error);
> +                     blk_mq_complete_request(req);
>                       break;
>               case BLKIF_OP_FLUSH_DISKCACHE:
>               case BLKIF_OP_WRITE_BARRIER:
>                       if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>                               printk(KERN_WARNING "blkfront: %s: %s op 
> failed\n",
>                                      info->gd->disk_name, 
> op_name(bret->operation));
> -                             error = -EOPNOTSUPP;
> +                             req->errors = -EOPNOTSUPP;
>                       }
>                       if (unlikely(bret->status == BLKIF_RSP_ERROR &&
>                                    info->shadow[id].req.u.rw.nr_segments == 
> 0)) {
>                               printk(KERN_WARNING "blkfront: %s: empty %s op 
> failed\n",
>                                      info->gd->disk_name, 
> op_name(bret->operation));
> -                             error = -EOPNOTSUPP;
> +                             req->errors = -EOPNOTSUPP;
>                       }
> -                     if (unlikely(error)) {
> -                             if (error == -EOPNOTSUPP)
> -                                     error = 0;
> +                     if (unlikely(req->errors)) {
> +                             if (req->errors == -EOPNOTSUPP)
> +                                     req->errors = 0;
>                               info->feature_flush = 0;
>                               xlvbd_flush(info);
>                       }
> @@ -1226,7 +1221,7 @@ static irqreturn_t blkif_interrupt(int irq, void 
> *dev_id)
>                               dev_dbg(&info->xbdev->dev, "Bad return from 
> blkdev data "
>                                       "request: %x\n", bret->status);
>  
> -                     __blk_end_request_all(req, error);
> +                     blk_mq_complete_request(req);
>                       break;
>               default:
>                       BUG();
> @@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
>  
>       kfree(copy);
>  
> -     /*
> -      * Empty the queue, this is important because we might have
> -      * requests in the queue with more segments than what we
> -      * can handle now.
> -      */
> -     spin_lock_irq(&info->io_lock);
> -     while ((req = blk_fetch_request(info->rq)) != NULL) {
> -             if (req->cmd_flags &
> -                 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
> -                     list_add(&req->queuelist, &requests);
> -                     continue;
> -             }
> -             merge_bio.head = req->bio;
> -             merge_bio.tail = req->biotail;
> -             bio_list_merge(&bio_list, &merge_bio);
> -             req->bio = NULL;
> -             if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
> -                     pr_alert("diskcache flush request found!\n");
> -             __blk_end_request_all(req, 0);
> -     }
> -     spin_unlock_irq(&info->io_lock);
> -
>       xenbus_switch_state(info->xbdev, XenbusStateConnected);
>  
>       spin_lock_irq(&info->io_lock);
> @@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
>               /* Requeue pending requests (flush or discard) */
>               list_del_init(&req->queuelist);
>               BUG_ON(req->nr_phys_segments > segs);
> -             blk_requeue_request(info->rq, req);
> +             blk_mq_requeue_request(req);
>       }
>       spin_unlock_irq(&info->io_lock);
> +     blk_mq_kick_requeue_list(info->rq);
>  
>       while ((bio = bio_list_pop(&bio_list)) != NULL) {
>               /* Traverse the list of pending bios and re-queue them */
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.