WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-users

Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch

To: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Subject: Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
From: Florian Heigl <florian.heigl@xxxxxxxxx>
Date: Wed, 22 Jun 2011 14:16:09 +0200
Cc: Andrew Xu <xu.an@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxx, xen-users@xxxxxxxxxxxxxxxxxxx
Delivery-date: Wed, 22 Jun 2011 05:17:11 -0700
Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:mime-version:in-reply-to:references:date :message-id:subject:from:to:cc:content-type :content-transfer-encoding; bh=EERNiuYX429zlhTEk9r6LLvcdY1Cse1rgM4xzgBJcSw=; b=ixq8Bzea+uWNb6AkRbe7idIgdFRuZsj9xc75KNaCOU9QaolO8Slrk9oAfsUqs0UIzN Fb90XDpuox5N9Cw3hr6u0dP7ncyZSSKm79lOUGvlsJVGHG7nl5WHoIhmASq/uh4bjnuk CaxnsYpXZCzbRkVuO9uN2lRhHPRSedZ2ebSo8=
Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=mime-version:in-reply-to:references:date:message-id:subject:from:to :cc:content-type:content-transfer-encoding; b=tru9dGs9chmHePPGeggbI96Eo7b+rFy0O1VsVAwznVtu60+x2s0oB2cV92QhuFS95F ZrZWUrwSObZ8NAoEC6qEOBhnzjFzpOGUYripAiW1Qb4FLP72bKTKG8f8CC2U1iToTSeu LCflnaUvxKtVQEdkx54F8e8RYtvtdAlxPgu30=
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <20110621133337.GD5650@xxxxxxxxxxxx>
List-help: <mailto:xen-users-request@lists.xensource.com?subject=help>
List-id: Xen user discussion <xen-users.lists.xensource.com>
List-post: <mailto:xen-users@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-users>, <mailto:xen-users-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-users>, <mailto:xen-users-request@lists.xensource.com?subject=unsubscribe>
References: <20110621162935.F4A1.3A8D29D5@xxxxxxxxxx> <20110621133337.GD5650@xxxxxxxxxxxx>
Sender: xen-users-bounces@xxxxxxxxxxxxxxxxxxx
Hi,

relying on DM is not advisable since this is yet another layer that
breaks write barriers, plus it kills portability to non-linux OS.

btw:
It would be cool if a block shaping patch finally made it in the Xen
mainline, since the last one that was submitted in 2009 was apparently
forgotten...

Regards,
Florian

2011/6/21 Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>:
> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
>> Hi all,
>>
>> I add a blkback QoS patch.
>
> What tree is this against? There is a xen-blkback in 3.0-rc4, can you rebase
> it against that please.
>
> What is the patch solving? Why can't it be done with dm-ioband?
>
>> You can config(dynamic/static) different I/O speed for different VM disk
>> by this patch.
>>
>> ----------------------------------------------------------------------------
>>
>> diff -urNp blkback/blkback.c blkback-qos/blkback.c
>> --- blkback/blkback.c 2011-06-22 07:54:19.000000000 +0800
>> +++ blkback-qos/blkback.c     2011-06-22 07:53:18.000000000 +0800
>> @@ -44,6 +44,11 @@
>>  #include <asm/hypervisor.h>
>>  #include "common.h"
>>
>> +#undef DPRINTK
>> +#define DPRINTK(fmt, args...)                                \
>> +     printk("blkback/blkback (%s:%d) " fmt ".\n",    \
>> +              __FUNCTION__, __LINE__, ##args)
>> +
>>  /*
>>   * These are rather arbitrary. They are fairly large because adjacent 
>> requests
>>   * pulled from a communication ring are quite likely to end up being part of
>> @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin
>>  static int do_block_io_op(blkif_t *blkif);
>>  static int dispatch_rw_block_io(blkif_t *blkif,
>>                                blkif_request_t *req,
>> -                              pending_req_t *pending_req);
>> +                              pending_req_t *pending_req,
>> +                              int *done_nr_sects);
>>  static void make_response(blkif_t *blkif, u64 id,
>>                         unsigned short op, int st);
>>
>> @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif)
>>       blkif->st_pk_req = 0;
>>  }
>>
>> +static void refill_reqcount(blkif_t *blkif)
>> +{
>> +     blkif->reqtime = jiffies + msecs_to_jiffies(1000);
>> +     blkif->reqcount = blkif->reqrate;
>> +     if (blkif->reqcount < blkif->reqmin)
>> +             blkif->reqcount = blkif->reqmin;
>> +}
>> +
>>  int blkif_schedule(void *arg)
>>  {
>>       blkif_t *blkif = arg;
>>       struct vbd *vbd = &blkif->vbd;
>> +     int     ret = 0;
>> +     struct timeval cur_time;
>>
>>       blkif_get(blkif);
>>
>> @@ -232,12 +248,34 @@ int blkif_schedule(void *arg)
>>               blkif->waiting_reqs = 0;
>>               smp_mb(); /* clear flag *before* checking for work */
>>
>> -             if (do_block_io_op(blkif))
>> +             ret = do_block_io_op(blkif);
>> +             if (ret)
>>                       blkif->waiting_reqs = 1;
>>               unplug_queue(blkif);
>>
>> +             if(blkif->reqmin){
>> +                     if(2 == ret && (blkif->reqtime > jiffies)){
>> +                             jiffies_to_timeval(jiffies, &cur_time);
>> +                             if(log_stats && (cur_time.tv_sec % 10 ==1 ))
>> +                                     printk(KERN_DEBUG "%s: going to sleep 
>> %d millsecs(rate=%d)\n",
>> +                                                     current->comm,
>> +                                                     
>> jiffies_to_msecs(blkif->reqtime - jiffies),
>> +                                                     blkif->reqrate);
>> +
>> +                             set_current_state(TASK_INTERRUPTIBLE);
>> +                             schedule_timeout(blkif->reqtime - jiffies);
>> +
>> +                             if(log_stats && (cur_time.tv_sec % 10 ==1 ))
>> +                                     printk(KERN_DEBUG "%s: sleep 
>> end(rate=%d)\n",
>> +                                                     
>> current->comm,blkif->reqrate);
>> +                     }
>> +                     if (time_after(jiffies, blkif->reqtime))
>> +                             refill_reqcount(blkif);
>> +             }
>> +
>>               if (log_stats && time_after(jiffies, blkif->st_print))
>>                       print_stats(blkif);
>> +
>>       }
>>
>>       if (log_stats)
>> @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void *
>>  /******************************************************************
>>   * DOWNWARD CALLS -- These interface with the block-device layer proper.
>>   */
>> -
>>  static int do_block_io_op(blkif_t *blkif)
>>  {
>>       blkif_back_rings_t *blk_rings = &blkif->blk_rings;
>> @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif
>>       pending_req_t *pending_req;
>>       RING_IDX rc, rp;
>>       int more_to_do = 0, ret;
>> +     static int last_done_nr_sects = 0;
>>
>>       rc = blk_rings->common.req_cons;
>>       rp = blk_rings->common.sring->req_prod;
>>       rmb(); /* Ensure we see queued requests up to 'rp'. */
>> +
>> +     if (blkif->reqmin && blkif->reqcount <= 0)
>> +             return (rc != rp) ? 2 : 0;
>>
>>       while ((rc != rp) || (blkif->is_suspended_req)) {
>>
>>               if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
>>                       break;
>> +
>> +             if(blkif->reqmin){
>> +                     blkif->reqcount -= last_done_nr_sects;
>> +                     if (blkif->reqcount <= 0) {
>> +                             more_to_do = 2;
>> +                             break;
>> +                     }
>> +             }
>>
>>               if (kthread_should_stop()) {
>>                       more_to_do = 1;
>> @@ -367,14 +416,14 @@ handle_request:
>>               switch (req.operation) {
>>               case BLKIF_OP_READ:
>>                       blkif->st_rd_req++;
>> -                     ret = dispatch_rw_block_io(blkif, &req, pending_req);
>> +                     ret = dispatch_rw_block_io(blkif, &req, 
>> pending_req,&last_done_nr_sects);
>>                       break;
>>               case BLKIF_OP_WRITE_BARRIER:
>>                       blkif->st_br_req++;
>>                       /* fall through */
>>               case BLKIF_OP_WRITE:
>>                       blkif->st_wr_req++;
>> -                     ret = dispatch_rw_block_io(blkif, &req, pending_req);
>> +                     ret = dispatch_rw_block_io(blkif, &req, 
>> pending_req,&last_done_nr_sects);
>>                       break;
>>               case BLKIF_OP_PACKET:
>>                       DPRINTK("error: block operation BLKIF_OP_PACKET not 
>> implemented\n");
>> @@ -412,9 +461,29 @@ handle_request:
>>       return more_to_do;
>>  }
>>
>> +static char* operation2str(int operation)
>> +{
>> +     char* ret_str = NULL;
>> +     switch (operation) {
>> +     case BLKIF_OP_READ:
>> +             ret_str = "READ";
>> +             break;
>> +     case BLKIF_OP_WRITE:
>> +             ret_str = "WRITE";
>> +             break;
>> +     case BLKIF_OP_WRITE_BARRIER:
>> +             ret_str = "WRITE_BARRIER";
>> +             break;
>> +     default:
>> +             ret_str = "0";
>> +     }
>> +     return ret_str;
>> +}
>> +
>>  static int dispatch_rw_block_io(blkif_t *blkif,
>>                                blkif_request_t *req,
>> -                              pending_req_t *pending_req)
>> +                              pending_req_t *pending_req,
>> +                              int *done_nr_sects)
>>  {
>>       extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
>>       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>> @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t
>>       struct bio *bio = NULL;
>>       int ret, i;
>>       int operation;
>> +     struct timeval cur_time;
>> +
>> +     *done_nr_sects = 0;
>>
>>       switch (req->operation) {
>>       case BLKIF_OP_READ:
>> @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t
>>       else if (operation == WRITE || operation == WRITE_BARRIER)
>>               blkif->st_wr_sect += preq.nr_sects;
>>
>> +     *done_nr_sects = preq.nr_sects;
>> +     jiffies_to_timeval(jiffies, &cur_time);
>> +     if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 ))
>> +             printk(KERN_DEBUG "  operation=%s sects=%d\n",
>> +                     operation2str(req->operation),preq.nr_sects);
>> +
>>       return 0;
>>
>>   fail_flush:
>> @@ -695,6 +773,8 @@ static int __init blkif_init(void)
>>
>>       blkif_xenbus_init();
>>
>> +     DPRINTK("blkif_inited\n");
>> +
>>       return 0;
>>
>>   out_of_memory:
>> diff -urNp blkback/cdrom.c blkback-qos/cdrom.c
>> --- blkback/cdrom.c   2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/cdrom.c       2011-06-22 07:34:50.000000000 +0800
>> @@ -35,9 +35,9 @@
>>  #include "common.h"
>>
>>  #undef DPRINTK
>> -#define DPRINTK(_f, _a...)                   \
>> -     printk("(%s() file=%s, line=%d) " _f "\n",      \
>> -              __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a )
>> +#define DPRINTK(fmt, args...)                                \
>> +     printk("blkback/cdrom (%s:%d) " fmt ".\n",      \
>> +              __FUNCTION__, __LINE__, ##args)
>>
>>
>>  #define MEDIA_PRESENT "media-present"
>> diff -urNp blkback/common.h blkback-qos/common.h
>> --- blkback/common.h  2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/common.h      2011-06-22 07:34:50.000000000 +0800
>> @@ -100,8 +100,17 @@ typedef struct blkif_st {
>>
>>       grant_handle_t shmem_handle;
>>       grant_ref_t    shmem_ref;
>> +
>> +     /* qos information */
>> +     unsigned long   reqtime;
>> +     int    reqcount;
>> +     int    reqmin;
>> +     int    reqrate;
>> +
>>  } blkif_t;
>>
>> +#define VBD_QOS_MIN_RATE_LIMIT                       2*1024          /*     
>>  1MBs    */
>> +
>>  struct backend_info
>>  {
>>       struct xenbus_device *dev;
>> @@ -111,6 +120,8 @@ struct backend_info
>>       unsigned major;
>>       unsigned minor;
>>       char *mode;
>> +     struct xenbus_watch rate_watch;
>> +     int have_rate_watch;
>>  };
>>
>>  blkif_t *blkif_alloc(domid_t domid);
>> diff -urNp blkback/vbd.c blkback-qos/vbd.c
>> --- blkback/vbd.c     2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/vbd.c 2011-06-22 07:34:50.000000000 +0800
>> @@ -35,6 +35,11 @@
>>  #define vbd_sz(_v)   ((_v)->bdev->bd_part ?                          \
>>       (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
>>
>> +#undef DPRINTK
>> +#define DPRINTK(fmt, args...)                                \
>> +     printk("blkback/vbd (%s:%d) " fmt ".\n",        \
>> +              __FUNCTION__, __LINE__, ##args)
>> +
>>  unsigned long long vbd_size(struct vbd *vbd)
>>  {
>>       return vbd_sz(vbd);
>> @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde
>>       if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
>>               vbd->type |= VDISK_REMOVABLE;
>>
>> -     DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
>> +     DPRINTK("Successful creation of handle=%04x (dom=%u)",
>>               handle, blkif->domid);
>>       return 0;
>>  }
>> diff -urNp blkback/xenbus.c blkback-qos/xenbus.c
>> --- blkback/xenbus.c  2010-05-20 18:07:00.000000000 +0800
>> +++ blkback-qos/xenbus.c      2011-06-22 07:34:50.000000000 +0800
>> @@ -25,13 +25,14 @@
>>
>>  #undef DPRINTK
>>  #define DPRINTK(fmt, args...)                                \
>> -     pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",   \
>> +     printk("blkback/xenbus (%s:%d) " fmt ".\n",     \
>>                __FUNCTION__, __LINE__, ##args)
>>
>>  static void connect(struct backend_info *);
>>  static int connect_ring(struct backend_info *);
>>  static void backend_changed(struct xenbus_watch *, const char **,
>>                           unsigned int);
>> +static void unregister_rate_watch(struct backend_info *be);
>>
>>  static int blkback_name(blkif_t *blkif, char *buf)
>>  {
>> @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t
>>       char name[TASK_COMM_LEN];
>>
>>       /* Not ready to connect? */
>> -     if (!blkif->irq || !blkif->vbd.bdev)
>> +     if (!blkif->irq || !blkif->vbd.bdev){
>> +             DPRINTK("Not ready to connect");
>>               return;
>> +     }
>>
>>       /* Already connected? */
>>       if (blkif->be->dev->state == XenbusStateConnected)
>> @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_
>>               be->cdrom_watch.node = NULL;
>>       }
>>
>> +     unregister_rate_watch(be);
>> +
>>       if (be->blkif) {
>>               blkif_disconnect(be->blkif);
>>               vbd_free(&be->blkif->vbd);
>> @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d
>>
>>       err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
>>                                &be->backend_watch, backend_changed);
>> +
>> +     DPRINTK("blkback_probe called");
>> +     DPRINTK("dev->nodename=%s/physical-device",dev->nodename);
>> +
>>       if (err)
>>               goto fail;
>>
>> @@ -266,7 +275,6 @@ fail:
>>       return err;
>>  }
>>
>> -
>>  /**
>>   * Callback received when the hotplug scripts have placed the 
>> physical-device
>>   * node.  Read it and the mode node, and create a vbd.  If the frontend is
>> @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu
>>       struct xenbus_device *dev = be->dev;
>>       int cdrom = 0;
>>       char *device_type;
>> +     char name[TASK_COMM_LEN];
>>
>> -     DPRINTK("");
>> +     DPRINTK("backend_changed called");
>>
>>       err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
>>                          &major, &minor);
>> @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu
>>               kfree(device_type);
>>       }
>>
>> +     /* gather information about QoS policy for this device. */
>> +     err = blkback_name(be->blkif, name);
>> +     if (err) {
>> +             xenbus_dev_error(be->dev, err, "get blkback dev name");
>> +             return;
>> +     }
>> +
>> +     err = xenbus_gather(XBT_NIL, dev->otherend,
>> +                             "tokens-rate", "%d", &be->blkif->reqrate,
>> +                             NULL);
>> +     if(err){
>> +             DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name);
>> +     }else{
>> +             if(be->blkif->reqrate <= 0){
>> +                     be->blkif->reqmin = 0 ;
>> +                     DPRINTK("%s tokens-rate == 0,no limit",name);
>> +             }else{
>> +                     DPRINTK("%s 
>> xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
>> +                     be->blkif->reqrate *= 2;
>> +                     be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
>> +                     if(be->blkif->reqmin > be->blkif->reqrate){
>> +                             be->blkif->reqrate = be->blkif->reqmin;
>> +                             DPRINTK("%s reset default 
>> value(tokens-rate=%d)",name,be->blkif->reqrate);
>> +                     }
>> +             }
>> +     }
>> +     be->blkif->reqtime = jiffies;
>> +
>>       if (be->major == 0 && be->minor == 0) {
>>               /* Front end dir is a number, which is used as the handle. */
>>
>> @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb
>>
>>  /* ** Connection ** */
>>
>> +static void unregister_rate_watch(struct backend_info *be)
>> +{
>> +     if (be->have_rate_watch) {
>> +             unregister_xenbus_watch(&be->rate_watch);
>> +             kfree(be->rate_watch.node);
>> +     }
>> +     be->have_rate_watch = 0;
>> +}
>> +
>> +static void rate_changed(struct xenbus_watch *watch,
>> +                       const char **vec, unsigned int len)
>> +{
>> +
>> +     struct backend_info *be=container_of(watch,struct backend_info, 
>> rate_watch);
>> +     int err;
>> +     char name[TASK_COMM_LEN];
>> +
>> +     err = blkback_name(be->blkif, name);
>> +     if (err) {
>> +             xenbus_dev_error(be->dev, err, "get blkback dev name");
>> +             return;
>> +     }
>> +
>> +     err = xenbus_gather(XBT_NIL,be->dev->otherend,
>> +                                     "tokens-rate",  "%d",
>> +                                     &be->blkif->reqrate,NULL);
>> +     if(err){
>> +             DPRINTK("%s xenbus_gather(tokens-rate) error",name);
>> +     }else{
>> +             if(be->blkif->reqrate <= 0){
>> +                     be->blkif->reqmin = 0;
>> +                     DPRINTK("%s tokens-rate == 0,no limit",name);
>> +             }else{
>> +                     DPRINTK("%s 
>> xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
>> +                     be->blkif->reqrate *= 2;
>> +                     be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
>> +                     if(be->blkif->reqmin > be->blkif->reqrate){
>> +                             be->blkif->reqrate = be->blkif->reqmin;
>> +                             DPRINTK("%s reset default 
>> value(tokens-rate=%d)",name,be->blkif->reqrate);
>> +                     }
>> +             }
>> +     }
>> +}
>>
>>  /**
>>   * Write the physical details regarding the block device to the store, and
>> @@ -439,6 +519,14 @@ again:
>>       if (err)
>>               goto abort;
>>
>> +     /*add by andrew for centos pv*/
>> +     err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1");
>> +     if (err){
>> +             xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache",
>> +                     dev->nodename);
>> +             goto abort;
>> +     }
>> +
>>       err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
>>                           vbd_size(&be->blkif->vbd));
>>       if (err) {
>> @@ -469,11 +557,22 @@ again:
>>       if (err)
>>               xenbus_dev_fatal(dev, err, "ending transaction");
>>
>> +     DPRINTK("xenbus_switch_to XenbusStateConnected");
>> +
>>       err = xenbus_switch_state(dev, XenbusStateConnected);
>>       if (err)
>>               xenbus_dev_fatal(dev, err, "switching to Connected state",
>>                                dev->nodename);
>>
>> +     unregister_rate_watch(be);
>> +     err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate",
>> +                                                             
>> &be->rate_watch,rate_changed);
>> +     if (!err)
>> +             be->have_rate_watch = 1;
>> +     else
>> +             xenbus_dev_fatal(dev, err, "watching tokens-rate",
>> +                              dev->nodename);
>> +
>>       return;
>>   abort:
>>       xenbus_transaction_end(xbt, 1);
>
>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@xxxxxxxxxxxxxxxxxxx
>> http://lists.xensource.com/xen-devel
>
>
> _______________________________________________
> Xen-users mailing list
> Xen-users@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-users
>



-- 
the purpose of libvirt is to provide an abstraction layer hiding all
xen features added since 2006 until they were finally understood and
copied by the kvm devs.

_______________________________________________
Xen-users mailing list
Xen-users@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-users