Hi,
Relying on DM is not advisable, since it is yet another layer that
breaks write barriers, and it also kills portability to non-Linux OSes.
BTW:
It would be cool if a block-shaping patch finally made it into the Xen
mainline, since the last one submitted in 2009 was apparently
forgotten...
Regards,
Florian
2011/6/21 Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>:
> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
>> Hi all,
>>
>> I have added a blkback QoS patch.
>
> What tree is this against? There is a xen-blkback in 3.0-rc4; can you rebase
> it against that, please?
>
> What is the patch solving? Why can't it be done with dm-ioband?
>
>> With this patch you can configure (dynamically or statically) a different
>> I/O speed for each VM disk.
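>>
>> For example (an illustration only -- the exact xenstore path depends on the
>> guest's domain id and virtual device id; the rate value is taken as KB/s,
>> since the backend multiplies it by 2 to get 512-byte sectors per second),
>> limiting a disk to roughly 10 MB/s would look like:
>>
>>   xenstore-write /local/domain/<domid>/device/vbd/<vdev>/tokens-rate 10240
>>
>> The backend also watches the "tokens-rate" node, so rewriting it adjusts
>> the limit while the guest is running.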
>>
>> ----------------------------------------------------------------------------
>>
>> diff -urNp blkback/blkback.c blkback-qos/blkback.c
>> --- blkback/blkback.c 2011-06-22 07:54:19.000000000 +0800
>> +++ blkback-qos/blkback.c 2011-06-22 07:53:18.000000000 +0800
>> @@ -44,6 +44,11 @@
>> #include <asm/hypervisor.h>
>> #include "common.h"
>>
>> +#undef DPRINTK
>> +#define DPRINTK(fmt, args...) \
>> + printk("blkback/blkback (%s:%d) " fmt ".\n", \
>> + __FUNCTION__, __LINE__, ##args)
>> +
>> /*
>> * These are rather arbitrary. They are fairly large because adjacent
>> requests
>> * pulled from a communication ring are quite likely to end up being part of
>> @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin
>> static int do_block_io_op(blkif_t *blkif);
>> static int dispatch_rw_block_io(blkif_t *blkif,
>> blkif_request_t *req,
>> - pending_req_t *pending_req);
>> + pending_req_t *pending_req,
>> + int *done_nr_sects);
>> static void make_response(blkif_t *blkif, u64 id,
>> unsigned short op, int st);
>>
>> @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif)
>> blkif->st_pk_req = 0;
>> }
>>
>> +static void refill_reqcount(blkif_t *blkif)
>> +{
>> + blkif->reqtime = jiffies + msecs_to_jiffies(1000);
>> + blkif->reqcount = blkif->reqrate;
>> + if (blkif->reqcount < blkif->reqmin)
>> + blkif->reqcount = blkif->reqmin;
>> +}
>> +
>> int blkif_schedule(void *arg)
>> {
>> blkif_t *blkif = arg;
>> struct vbd *vbd = &blkif->vbd;
>> + int ret = 0;
>> + struct timeval cur_time;
>>
>> blkif_get(blkif);
>>
>> @@ -232,12 +248,34 @@ int blkif_schedule(void *arg)
>> blkif->waiting_reqs = 0;
>> smp_mb(); /* clear flag *before* checking for work */
>>
>> - if (do_block_io_op(blkif))
>> + ret = do_block_io_op(blkif);
>> + if (ret)
>> blkif->waiting_reqs = 1;
>> unplug_queue(blkif);
>>
>> + if(blkif->reqmin){
>> + if(2 == ret && (blkif->reqtime > jiffies)){
>> + jiffies_to_timeval(jiffies, &cur_time);
>> + if (log_stats && (cur_time.tv_sec % 10 == 1))
>> + printk(KERN_DEBUG "%s: going to sleep %d msecs (rate=%d)\n",
>> + current->comm,
>> + jiffies_to_msecs(blkif->reqtime - jiffies),
>> + blkif->reqrate);
>> +
>> + set_current_state(TASK_INTERRUPTIBLE);
>> + schedule_timeout(blkif->reqtime - jiffies);
>> +
>> + if (log_stats && (cur_time.tv_sec % 10 == 1))
>> + printk(KERN_DEBUG "%s: sleep end (rate=%d)\n",
>> + current->comm, blkif->reqrate);
>> + }
>> + if (time_after(jiffies, blkif->reqtime))
>> + refill_reqcount(blkif);
>> + }
>> +
>> if (log_stats && time_after(jiffies, blkif->st_print))
>> print_stats(blkif);
>> +
>> }
>>
>> if (log_stats)
>> @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void *
>> /******************************************************************
>> * DOWNWARD CALLS -- These interface with the block-device layer proper.
>> */
>> -
>> static int do_block_io_op(blkif_t *blkif)
>> {
>> blkif_back_rings_t *blk_rings = &blkif->blk_rings;
>> @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif
>> pending_req_t *pending_req;
>> RING_IDX rc, rp;
>> int more_to_do = 0, ret;
>> + static int last_done_nr_sects = 0;
>>
>> rc = blk_rings->common.req_cons;
>> rp = blk_rings->common.sring->req_prod;
>> rmb(); /* Ensure we see queued requests up to 'rp'. */
>> +
>> + if (blkif->reqmin && blkif->reqcount <= 0)
>> + return (rc != rp) ? 2 : 0;
>>
>> while ((rc != rp) || (blkif->is_suspended_req)) {
>>
>> if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
>> break;
>> +
>> + if(blkif->reqmin){
>> + blkif->reqcount -= last_done_nr_sects;
>> + if (blkif->reqcount <= 0) {
>> + more_to_do = 2;
>> + break;
>> + }
>> + }
>>
>> if (kthread_should_stop()) {
>> more_to_do = 1;
>> @@ -367,14 +416,14 @@ handle_request:
>> switch (req.operation) {
>> case BLKIF_OP_READ:
>> blkif->st_rd_req++;
>> - ret = dispatch_rw_block_io(blkif, &req, pending_req);
>> + ret = dispatch_rw_block_io(blkif, &req, pending_req, &last_done_nr_sects);
>> break;
>> case BLKIF_OP_WRITE_BARRIER:
>> blkif->st_br_req++;
>> /* fall through */
>> case BLKIF_OP_WRITE:
>> blkif->st_wr_req++;
>> - ret = dispatch_rw_block_io(blkif, &req, pending_req);
>> + ret = dispatch_rw_block_io(blkif, &req, pending_req, &last_done_nr_sects);
>> break;
>> case BLKIF_OP_PACKET:
>> DPRINTK("error: block operation BLKIF_OP_PACKET not
>> implemented\n");
>> @@ -412,9 +461,29 @@ handle_request:
>> return more_to_do;
>> }
>>
>> +static char* operation2str(int operation)
>> +{
>> + char* ret_str = NULL;
>> + switch (operation) {
>> + case BLKIF_OP_READ:
>> + ret_str = "READ";
>> + break;
>> + case BLKIF_OP_WRITE:
>> + ret_str = "WRITE";
>> + break;
>> + case BLKIF_OP_WRITE_BARRIER:
>> + ret_str = "WRITE_BARRIER";
>> + break;
>> + default:
>> + ret_str = "0";
>> + }
>> + return ret_str;
>> +}
>> +
>> static int dispatch_rw_block_io(blkif_t *blkif,
>> blkif_request_t *req,
>> - pending_req_t *pending_req)
>> + pending_req_t *pending_req,
>> + int *done_nr_sects)
>> {
>> extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
>> struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>> @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t
>> struct bio *bio = NULL;
>> int ret, i;
>> int operation;
>> + struct timeval cur_time;
>> +
>> + *done_nr_sects = 0;
>>
>> switch (req->operation) {
>> case BLKIF_OP_READ:
>> @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t
>> else if (operation == WRITE || operation == WRITE_BARRIER)
>> blkif->st_wr_sect += preq.nr_sects;
>>
>> + *done_nr_sects = preq.nr_sects;
>> + jiffies_to_timeval(jiffies, &cur_time);
>> + if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 ))
>> + printk(KERN_DEBUG " operation=%s sects=%d\n",
>> + operation2str(req->operation),preq.nr_sects);
>> +
>> return 0;
>>
>> fail_flush:
>> @@ -695,6 +773,8 @@ static int __init blkif_init(void)
>>
>> blkif_xenbus_init();
>>
>> + DPRINTK("blkif_inited\n");
>> +
>> return 0;
>>
>> out_of_memory:
>> diff -urNp blkback/cdrom.c blkback-qos/cdrom.c
>> --- blkback/cdrom.c 2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/cdrom.c 2011-06-22 07:34:50.000000000 +0800
>> @@ -35,9 +35,9 @@
>> #include "common.h"
>>
>> #undef DPRINTK
>> -#define DPRINTK(_f, _a...) \
>> - printk("(%s() file=%s, line=%d) " _f "\n", \
>> - __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a )
>> +#define DPRINTK(fmt, args...) \
>> + printk("blkback/cdrom (%s:%d) " fmt ".\n", \
>> + __FUNCTION__, __LINE__, ##args)
>>
>>
>> #define MEDIA_PRESENT "media-present"
>> diff -urNp blkback/common.h blkback-qos/common.h
>> --- blkback/common.h 2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/common.h 2011-06-22 07:34:50.000000000 +0800
>> @@ -100,8 +100,17 @@ typedef struct blkif_st {
>>
>> grant_handle_t shmem_handle;
>> grant_ref_t shmem_ref;
>> +
>> + /* qos information */
>> + unsigned long reqtime;
>> + int reqcount;
>> + int reqmin;
>> + int reqrate;
>> +
>> } blkif_t;
>>
>> +#define VBD_QOS_MIN_RATE_LIMIT 2*1024 /* 1 MB/s */
>> +
>> struct backend_info
>> {
>> struct xenbus_device *dev;
>> @@ -111,6 +120,8 @@ struct backend_info
>> unsigned major;
>> unsigned minor;
>> char *mode;
>> + struct xenbus_watch rate_watch;
>> + int have_rate_watch;
>> };
>>
>> blkif_t *blkif_alloc(domid_t domid);
>> diff -urNp blkback/vbd.c blkback-qos/vbd.c
>> --- blkback/vbd.c 2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/vbd.c 2011-06-22 07:34:50.000000000 +0800
>> @@ -35,6 +35,11 @@
>> #define vbd_sz(_v) ((_v)->bdev->bd_part ? \
>> (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
>>
>> +#undef DPRINTK
>> +#define DPRINTK(fmt, args...) \
>> + printk("blkback/vbd (%s:%d) " fmt ".\n", \
>> + __FUNCTION__, __LINE__, ##args)
>> +
>> unsigned long long vbd_size(struct vbd *vbd)
>> {
>> return vbd_sz(vbd);
>> @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde
>> if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
>> vbd->type |= VDISK_REMOVABLE;
>>
>> - DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
>> + DPRINTK("Successful creation of handle=%04x (dom=%u)",
>> handle, blkif->domid);
>> return 0;
>> }
>> diff -urNp blkback/xenbus.c blkback-qos/xenbus.c
>> --- blkback/xenbus.c 2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/xenbus.c 2011-06-22 07:34:50.000000000 +0800
>> @@ -25,13 +25,14 @@
>>
>> #undef DPRINTK
>> #define DPRINTK(fmt, args...) \
>> - pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
>> + printk("blkback/xenbus (%s:%d) " fmt ".\n", \
>> __FUNCTION__, __LINE__, ##args)
>>
>> static void connect(struct backend_info *);
>> static int connect_ring(struct backend_info *);
>> static void backend_changed(struct xenbus_watch *, const char **,
>> unsigned int);
>> +static void unregister_rate_watch(struct backend_info *be);
>>
>> static int blkback_name(blkif_t *blkif, char *buf)
>> {
>> @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t
>> char name[TASK_COMM_LEN];
>>
>> /* Not ready to connect? */
>> - if (!blkif->irq || !blkif->vbd.bdev)
>> + if (!blkif->irq || !blkif->vbd.bdev){
>> + DPRINTK("Not ready to connect");
>> return;
>> + }
>>
>> /* Already connected? */
>> if (blkif->be->dev->state == XenbusStateConnected)
>> @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_
>> be->cdrom_watch.node = NULL;
>> }
>>
>> + unregister_rate_watch(be);
>> +
>> if (be->blkif) {
>> blkif_disconnect(be->blkif);
>> vbd_free(&be->blkif->vbd);
>> @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d
>>
>> err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
>> &be->backend_watch, backend_changed);
>> +
>> + DPRINTK("blkback_probe called");
>> + DPRINTK("dev->nodename=%s/physical-device",dev->nodename);
>> +
>> if (err)
>> goto fail;
>>
>> @@ -266,7 +275,6 @@ fail:
>> return err;
>> }
>>
>> -
>> /**
>> * Callback received when the hotplug scripts have placed the
>> physical-device
>> * node. Read it and the mode node, and create a vbd. If the frontend is
>> @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu
>> struct xenbus_device *dev = be->dev;
>> int cdrom = 0;
>> char *device_type;
>> + char name[TASK_COMM_LEN];
>>
>> - DPRINTK("");
>> + DPRINTK("backend_changed called");
>>
>> err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
>> &major, &minor);
>> @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu
>> kfree(device_type);
>> }
>>
>> + /* gather information about QoS policy for this device. */
>> + err = blkback_name(be->blkif, name);
>> + if (err) {
>> + xenbus_dev_error(be->dev, err, "get blkback dev name");
>> + return;
>> + }
>> +
>> + err = xenbus_gather(XBT_NIL, dev->otherend,
>> + "tokens-rate", "%d", &be->blkif->reqrate,
>> + NULL);
>> + if (err) {
>> + DPRINTK("%s xenbus_gather(tokens-rate) error", name);
>> + } else {
>> + if (be->blkif->reqrate <= 0) {
>> + be->blkif->reqmin = 0;
>> + DPRINTK("%s tokens-rate == 0, no limit", name);
>> + } else {
>> + DPRINTK("%s xenbus_gather(tokens-rate=%d)", name, be->blkif->reqrate);
>> + be->blkif->reqrate *= 2;
>> + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
>> + if (be->blkif->reqmin > be->blkif->reqrate) {
>> + be->blkif->reqrate = be->blkif->reqmin;
>> + DPRINTK("%s reset default value (tokens-rate=%d)", name, be->blkif->reqrate);
>> + }
>> + }
>> + }
>> + be->blkif->reqtime = jiffies;
>> +
>> if (be->major == 0 && be->minor == 0) {
>> /* Front end dir is a number, which is used as the handle. */
>>
>> @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb
>>
>> /* ** Connection ** */
>>
>> +static void unregister_rate_watch(struct backend_info *be)
>> +{
>> + if (be->have_rate_watch) {
>> + unregister_xenbus_watch(&be->rate_watch);
>> + kfree(be->rate_watch.node);
>> + }
>> + be->have_rate_watch = 0;
>> +}
>> +
>> +static void rate_changed(struct xenbus_watch *watch,
>> + const char **vec, unsigned int len)
>> +{
>> +
>> + struct backend_info *be = container_of(watch, struct backend_info, rate_watch);
>> + int err;
>> + char name[TASK_COMM_LEN];
>> +
>> + err = blkback_name(be->blkif, name);
>> + if (err) {
>> + xenbus_dev_error(be->dev, err, "get blkback dev name");
>> + return;
>> + }
>> +
>> + err = xenbus_gather(XBT_NIL,be->dev->otherend,
>> + "tokens-rate", "%d",
>> + &be->blkif->reqrate,NULL);
>> + if (err) {
>> + DPRINTK("%s xenbus_gather(tokens-rate) error", name);
>> + } else {
>> + if (be->blkif->reqrate <= 0) {
>> + be->blkif->reqmin = 0;
>> + DPRINTK("%s tokens-rate == 0, no limit", name);
>> + } else {
>> + DPRINTK("%s xenbus_gather(tokens-rate=%d)", name, be->blkif->reqrate);
>> + be->blkif->reqrate *= 2;
>> + be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
>> + if (be->blkif->reqmin > be->blkif->reqrate) {
>> + be->blkif->reqrate = be->blkif->reqmin;
>> + DPRINTK("%s reset default value (tokens-rate=%d)", name, be->blkif->reqrate);
>> + }
>> + }
>> + }
>> +}
>>
>> /**
>> * Write the physical details regarding the block device to the store, and
>> @@ -439,6 +519,14 @@ again:
>> if (err)
>> goto abort;
>>
>> + /* added by Andrew for CentOS PV guests */
>> + err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1");
>> + if (err){
>> + xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache",
>> + dev->nodename);
>> + goto abort;
>> + }
>> +
>> err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
>> vbd_size(&be->blkif->vbd));
>> if (err) {
>> @@ -469,11 +557,22 @@ again:
>> if (err)
>> xenbus_dev_fatal(dev, err, "ending transaction");
>>
>> + DPRINTK("xenbus_switch_to XenbusStateConnected");
>> +
>> err = xenbus_switch_state(dev, XenbusStateConnected);
>> if (err)
>> xenbus_dev_fatal(dev, err, "switching to Connected state",
>> dev->nodename);
>>
>> + unregister_rate_watch(be);
>> + err = xenbus_watch_path2(dev, dev->otherend, "tokens-rate",
>> + &be->rate_watch, rate_changed);
>> + if (!err)
>> + be->have_rate_watch = 1;
>> + else
>> + xenbus_dev_fatal(dev, err, "watching tokens-rate",
>> + dev->nodename);
>> +
>> return;
>> abort:
>> xenbus_transaction_end(xbt, 1);
>
>
--
the purpose of libvirt is to provide an abstraction layer hiding all
xen features added since 2006 until they were finally understood and
copied by the kvm devs.
_______________________________________________
Xen-users mailing list
Xen-users@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-users