
Re: [Xen-devel] [PATCH v2 1/4] xen: add real time scheduler rt



On Sun, 2014-09-07 at 15:40 -0400, Meng Xu wrote:
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> new file mode 100644
> index 0000000..412f8b1

> +/*
> + * Debug only. Used to printout debug information
> + */
> +#define printtime()\
> +        ({s_time_t now = NOW(); \
> +          printk("%u : %3ld.%3ldus : %-19s\n",smp_processor_id(),\
> +          now/MICROSECS(1), now%MICROSECS(1)/1000, __func__);} )
> +
You probably don't need this. As I said yesterday, you can keep it in an
out-of-series debug commit/patch.

> +/*
> + * rt tracing events ("only" 512 available!). Check
> + * include/public/trace.h for more details.
> + */
> +#define TRC_RT_TICKLE           TRC_SCHED_CLASS_EVT(RT, 1)
> +#define TRC_RT_RUNQ_PICK        TRC_SCHED_CLASS_EVT(RT, 2)
> +#define TRC_RT_BUDGET_BURN      TRC_SCHED_CLASS_EVT(RT, 3)
> +#define TRC_RT_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RT, 4)
> +#define TRC_RT_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(RT, 5)
> +#define TRC_RT_VCPU_DUMP        TRC_SCHED_CLASS_EVT(RT, 6)
>
Ditto about the uselessness of TRC_RT_VCPU_DUMP.

Also, as already said, RTDS here and everywhere else.
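
Something like this is what I have in mind (a sketch only; exact names
pending on what the scheduler ends up being called):

    #define TRC_RTDS_TICKLE           TRC_SCHED_CLASS_EVT(RTDS, 1)
    #define TRC_RTDS_RUNQ_PICK        TRC_SCHED_CLASS_EVT(RTDS, 2)
    #define TRC_RTDS_BUDGET_BURN      TRC_SCHED_CLASS_EVT(RTDS, 3)
    #define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
    #define TRC_RTDS_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(RTDS, 5)

(with TRC_RT_VCPU_DUMP dropped, as per the above.)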

> +
> +/*
> + * System-wide private data, including a global RunQueue
> + * Global lock is referenced by schedule_data.schedule_lock from all 
> + * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
> + */
> +struct rt_private {
> +    spinlock_t lock;           /* The global coarse grand lock */
> +    struct list_head sdom;     /* list of available domains, used for dump */
> +    struct list_head runq;     /* Ordered list of runnable VMs */
                                                     runnable vcpus ?

> +    struct rt_vcpu *flag_vcpu; /* position of the first depleted vcpu */
> +    cpumask_t cpus;            /* cpumask_t of available physical cpus */
> +    cpumask_t tickled;         /* cpus been tickled */
> +};

> +/*
> + * Debug related code, dump vcpu/cpu information
> + */
> +static void
> +rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
> +{
> +    struct rt_private *prv = RT_PRIV(ops);
> +    char cpustr[1024];
> +    cpumask_t *cpupool_mask;
> +
> +    ASSERT(svc != NULL);
> +    /* flag vcpu */
> +    if( svc->sdom == NULL )
> +        return;
> +
> +    cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
> +    printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
> +           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime
> +           " onR=%d runnable=%d cpu_hard_affinity=%s ",
>
How does this come up in the console? Should we break it with a '\n'
somewhere? It looks rather long...

> +            svc->vcpu->domain->domain_id,
> +            svc->vcpu->vcpu_id,
> +            svc->vcpu->processor,
> +            svc->period,
> +            svc->budget,
> +            svc->cur_budget,
> +            svc->cur_deadline,
> +            svc->last_start,
> +            __vcpu_on_runq(svc),
> +            vcpu_runnable(svc->vcpu),
> +            cpustr);
> +    memset(cpustr, 0, sizeof(cpustr));
> +    cpupool_mask = cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool);
> +    cpumask_scnprintf(cpustr, sizeof(cpustr), cpupool_mask);
> +    printk("cpupool=%s ", cpustr);
> +    memset(cpustr, 0, sizeof(cpustr));
> +    cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->cpus);
> +    printk("prv->cpus=%s\n", cpustr);
> +    
> +    /* TRACE */
> +    {
> +        struct {
> +            unsigned dom:16,vcpu:16;
> +            unsigned processor;
> +            unsigned cur_budget_lo, cur_budget_hi;
> +            unsigned cur_deadline_lo, cur_deadline_hi;
> +            unsigned is_vcpu_on_runq:16,is_vcpu_runnable:16;
> +        } d;
> +        d.dom = svc->vcpu->domain->domain_id;
> +        d.vcpu = svc->vcpu->vcpu_id;
> +        d.processor = svc->vcpu->processor;
> +        d.cur_budget_lo = (unsigned) svc->cur_budget;
> +        d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
> +        d.cur_deadline_lo = (unsigned) svc->cur_deadline;
> +        d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32);
> +        d.is_vcpu_on_runq = __vcpu_on_runq(svc);
> +        d.is_vcpu_runnable = vcpu_runnable(svc->vcpu);
> +        trace_var(TRC_RT_VCPU_DUMP, 1,
> +                  sizeof(d),
> +                  (unsigned char *)&d);
> +    }
> +}
> +

> +/*
> + * update deadline and budget when deadline is in the past,
> + * it needs to be updated to the deadline of the current period
> + */
> +static void
> +rt_update_helper(s_time_t now, struct rt_vcpu *svc)
> +{
>
While you're reworking this function, I'd also consider a different name
like 'rt_update_deadline', or 'rt_update_bandwidth', or something else
(it's the _helper part I don't like).

> +    s_time_t diff = now - svc->cur_deadline;
> +
> +    if ( diff >= 0 ) 
> +    {
> +        /* now can be later for several periods */
> +        long count = ( diff/svc->period ) + 1;
> +        svc->cur_deadline += count * svc->period;
> +        svc->cur_budget = svc->budget;
> +
> +        /* TRACE */
> +        {
> +            struct {
> +                unsigned dom:16,vcpu:16;
> +                unsigned cur_budget_lo, cur_budget_hi;
> +            } d;
> +            d.dom = svc->vcpu->domain->domain_id;
> +            d.vcpu = svc->vcpu->vcpu_id;
> +            d.cur_budget_lo = (unsigned) svc->cur_budget;
> +            d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
> +            trace_var(TRC_RT_BUDGET_REPLENISH, 1,
> +                      sizeof(d),
> +                      (unsigned char *) &d);
> +        }
> +
> +        return;
> +    }
> +}
> +
> +static inline void
> +__runq_remove(struct rt_vcpu *svc)
> +{
> +    if ( __vcpu_on_runq(svc) )
> +        list_del_init(&svc->runq_elem);
> +}
> +
> +/*
> + * Insert svc in the RunQ according to EDF: vcpus with smaller deadlines
> + * goes first.
      go

And if it was me that wrote 'goes', apologies for that. :-D

> + */
> +static void
> +__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
> +    struct rt_private *prv = RT_PRIV(ops);
> +    struct list_head *runq = RUNQ(ops);
> +    struct list_head *iter;
> +    spinlock_t *schedule_lock;
> +    
>
This empty line above seems to be actually empty, but looking more
carefully, it does contain 4 spaces, doesn't it?

If that's the case, avoid doing this, i.e., make sure that empty lines
are actually empty. :-D

Looking at each patch with `git show' should highlight occurrences of
this phenomenon, as well as any trailing white space, by marking
them in red.
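
For instance (plain git here, nothing Xen specific):

    # show the commit, with trailing whitespace and space-only lines
    # highlighted in red (with the default color configuration)
    git show <commit>

    # or have git itself list the whitespace errors a commit introduces
    git diff --check <commit>^ <commit>

is usually enough to spot all of them before sending.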

> +    schedule_lock = per_cpu(schedule_data, 
> svc->vcpu->processor).schedule_lock;
> +    ASSERT( spin_is_locked(schedule_lock) );
> +    
As of now, the only lock around is prv->lock, isn't it? So this
per_cpu(xxx) is just a complex way to get to prv->lock, or am I missing
something?

In credit, the pre-inited set of locks is actually used "as is",
while in credit2, there is some remapping going on, but there is more
than one lock anyway. That's why you find things like the above in those
two schedulers. Here, you should not need anything like that: just go
ahead and use prv->lock (as you do everywhere else).

Of course, that does not mean you don't need the lock remapping in
rt_alloc_pdata(). That code looks ok to me, just adapt this bit above,
as, like this, it makes things harder to understand.

Or am I overlooking something?
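
IOW, I'd expect something as simple as this to do (a sketch, given that
the remapping done in rt_alloc_pdata() makes every pcpu's schedule_lock
point at prv->lock):

    static void
    __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
    {
        struct rt_private *prv = RT_PRIV(ops);
        struct list_head *runq = RUNQ(ops);
        struct list_head *iter;

        /* the one global lock serializes all runq manipulations */
        ASSERT( spin_is_locked(&prv->lock) );
        ASSERT( !__vcpu_on_runq(svc) );
        ...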

> +    ASSERT( !__vcpu_on_runq(svc) );
> +
> +    /* svc still has budget */
> +    if ( svc->cur_budget > 0 ) 
> +    {
> +        list_for_each(iter, runq) 
> +        {
> +            struct rt_vcpu * iter_svc = __runq_elem(iter);
> +            if ( iter_svc->cur_budget == 0 ||
> +                 svc->cur_deadline <= iter_svc->cur_deadline )
> +                    break;
> +         }
> +        list_add_tail(&svc->runq_elem, iter);
> +     }
> +    else 
> +    {
> +        list_add(&svc->runq_elem, &prv->flag_vcpu->runq_elem);
> +    }
> +}
> +
I agree with George about the queue splitting.
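
FWIW, here's roughly what I picture (a sketch only; among other things,
it would let you drop the flag_vcpu sentinel entirely):

    struct rt_private {
        spinlock_t lock;            /* the global coarse grained lock */
        struct list_head sdom;      /* list of available domains */
        struct list_head runq;      /* vcpus with budget, in EDF order */
        struct list_head depletedq; /* vcpus waiting for replenishment */
        cpumask_t cpus;             /* available physical cpus */
        cpumask_t tickled;          /* cpus that have been tickled */
    };

    static void
    __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
    {
        struct rt_private *prv = RT_PRIV(ops);
        struct list_head *iter;

        ASSERT( spin_is_locked(&prv->lock) );
        ASSERT( !__vcpu_on_runq(svc) );

        if ( svc->cur_budget > 0 )
        {
            /* EDF: keep the runq sorted by increasing deadline */
            list_for_each( iter, &prv->runq )
            {
                struct rt_vcpu *iter_svc = __runq_elem(iter);
                if ( svc->cur_deadline <= iter_svc->cur_deadline )
                    break;
            }
            list_add_tail(&svc->runq_elem, iter);
        }
        else
            /* depleted vcpus just park here until replenished */
            list_add_tail(&svc->runq_elem, &prv->depletedq);
    }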

> +static void
> +rt_deinit(const struct scheduler *ops)
> +{
> +    struct rt_private *prv = RT_PRIV(ops);
> +
> +    printtime();
> +    printk("\n");
>
As said, when removing all the calls to rt_dump_vcpu(), also remove
both the definition of printtime() and all the calls to it; they're of
very little value, IMO.

> +    xfree(prv->flag_vcpu);
> +    xfree(prv);
> +}

> +static void *
> +rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
> +{
> +    unsigned long flags;
> +    struct rt_dom *sdom;
> +    struct rt_private * prv = RT_PRIV(ops);
> +
> +    sdom = xzalloc(struct rt_dom);
> +    if ( sdom == NULL ) 
> +    {
> +        printk("%s, xzalloc failed\n", __func__);
> +        return NULL;
>
Just `return NULL'; the printk() is pretty useless. Failures like this
will be identified anyway, without the need for it.

> +    }
> +
> +    INIT_LIST_HEAD(&sdom->vcpu);
> +    INIT_LIST_HEAD(&sdom->sdom_elem);
> +    sdom->dom = dom;
> +
> +    /* spinlock here to insert the dom */
> +    spin_lock_irqsave(&prv->lock, flags);
> +    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
> +    spin_unlock_irqrestore(&prv->lock, flags);
> +
> +    return sdom;
> +}

> +static void *
> +rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
> +{
> +    struct rt_vcpu *svc;
> +    s_time_t now = NOW();
> +
> +    /* Allocate per-VCPU info */
> +    svc = xzalloc(struct rt_vcpu);
> +    if ( svc == NULL ) 
> +    {
> +        printk("%s, xzalloc failed\n", __func__);
> +        return NULL;
> +    }
> +
> +    INIT_LIST_HEAD(&svc->runq_elem);
> +    INIT_LIST_HEAD(&svc->sdom_elem);
> +    svc->flags = 0U;
> +    svc->sdom = dd;
> +    svc->vcpu = vc;
> +    svc->last_start = 0;
> +
> +    svc->period = RT_DS_DEFAULT_PERIOD;
> +    if ( !is_idle_vcpu(vc) )
> +        svc->budget = RT_DS_DEFAULT_BUDGET;
> +
> +    rt_update_helper(now, svc);
> +
And one more point in favour of pulling the check out of the helper: in
this case (independently of whether you want to keep the division,
because it's the first time we set the deadline, or use the while loop),
you don't need to check if the deadline is in the past... You already
know it is!! :-D

That would mean you can just call rt_update_helper(), without further
checking, neither here nor inside the helper. Faster, but that does not
matter much in this case. Cleaner, and that _always_ does. :-)

> +    /* Debug only: dump new vcpu's info */
> +    rt_dump_vcpu(ops, svc);
> +
> +    return svc;
> +}

> +/*
> + * Burn budget in nanosecond granularity
> + */
> +static void
> +burn_budgets(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now) 
> +{
>
burn_budget()? I mean, why the trailing 's'?

(yes, this is a very minor thing.)

> +    s_time_t delta;
> +
> +    /* don't burn budget for idle VCPU */
> +    if ( is_idle_vcpu(svc->vcpu) ) 
> +        return;
> +
> +    rt_update_helper(now, svc);
> +
> +    /* do not burn budget when the vcpu misses its deadline */
> +    if ( now >= svc->cur_deadline )
> +        return;
> +
How can this be true?

Unless I'm missing something, in rt_update_helper(), if the deadline is
behind now, you move it ahead of it (and replenish the budget). Here you
check again whether the deadline is behind now, which should not be
possible, as you just took care of that... Isn't it so?

Considering both my suggestion and George's, if you rework the helper
and move the check out of it, then this one is fine (and you just call
the helper if the condition is verified). If you don't want to do that,
then I guess you can have the helper returning 0|1 depending on whether
or not the update happened, and use such value here, for deciding
whether to bail or not.

I think I'd prefer the former (pulling the check out of the helper).
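
Putting the two things together, here's the shape of what I'm suggesting
(a sketch, using the rt_update_deadline name from above; whether
burn_budget() really should bail after a replenishment is for you to
confirm, of course):

    static void
    rt_update_deadline(s_time_t now, struct rt_vcpu *svc)
    {
        /* callers only invoke this when the deadline has passed */
        ASSERT( now >= svc->cur_deadline );

        /* now can be ahead of the old deadline by several periods */
        svc->cur_deadline += ( (now - svc->cur_deadline) / svc->period + 1 )
                             * svc->period;
        svc->cur_budget = svc->budget;
    }

with the callers doing the checking themselves:

    /* in burn_budget(): replenish, and skip burning, on a missed deadline */
    if ( now >= svc->cur_deadline )
    {
        rt_update_deadline(now, svc);
        return;
    }

    /* in rt_alloc_vdata(): the deadline (0) is in the past by definition */
    rt_update_deadline(now, svc);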

> +    /* burn at nanoseconds level */
> +    delta = now - svc->last_start;
> +    /* 
> +     * delta < 0 only happens in nested virtualization;
> +     * TODO: how should we handle delta < 0 in a better way? 
> +     */
> +    if ( delta < 0 ) 
> +    {
> +        printk("%s, ATTENTION: now is behind last_start! delta = %ld",
> +                __func__, delta);
> +        rt_dump_vcpu(ops, svc);
> +        svc->last_start = now;
> +        svc->cur_budget = 0;
> +        return;
> +    }
> +
> +    if ( svc->cur_budget == 0 ) 
> +        return;
> +
> +    svc->cur_budget -= delta;
> +    if ( svc->cur_budget < 0 ) 
> +        svc->cur_budget = 0;
> +
> +    /* TRACE */
> +    {
> +        struct {
> +            unsigned dom:16, vcpu:16;
> +            unsigned cur_budget_lo;
> +            unsigned cur_budget_hi;
> +            int delta;
> +        } d;
> +        d.dom = svc->vcpu->domain->domain_id;
> +        d.vcpu = svc->vcpu->vcpu_id;
> +        d.cur_budget_lo = (unsigned) svc->cur_budget;
> +        d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
> +        d.delta = delta;
> +        trace_var(TRC_RT_BUDGET_BURN, 1,
> +                  sizeof(d),
> +                  (unsigned char *) &d);
> +    }
> +}
> +
> +/* 
> + * RunQ is sorted. Pick first one within cpumask. If no one, return NULL
> + * lock is grabbed before calling this function 
> + */
> +static struct rt_vcpu *
> +__runq_pick(const struct scheduler *ops, cpumask_t mask)
                                            cpumask_t *mask

would be better, I think.
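
I.e. (sketch):

    static struct rt_vcpu *
    __runq_pick(const struct scheduler *ops, const cpumask_t *mask)

with the call sites passing a pointer. Passing a whole cpumask_t by
value means copying NR_CPUS bits worth of data on the stack at every
invocation, which is exactly the kind of thing we try to avoid.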

> +/*
> + * Update vcpu's budget and sort runq by inserting the modified vcpu back to runq
> + * lock is grabbed before calling this function 
> + */
> +static void
> +__repl_update(const struct scheduler *ops, s_time_t now)
> +{
> +    struct list_head *runq = RUNQ(ops);
> +    struct list_head *iter;
> +    struct list_head *tmp;
> +    struct rt_vcpu *svc = NULL;
> +
> +    list_for_each_safe(iter, tmp, runq) 
> +    {
> +        svc = __runq_elem(iter);
> +
> +        /* not update flag_vcpu's budget */
> +        if(svc->sdom == NULL)
> +            continue;
> +
> +        rt_update_helper(now, svc);
> +        /* reinsert the vcpu if its deadline is updated */
> +        if ( now >= 0 )
> +        {
>
This is wrong, and I saw you noticed this already.

> +            __runq_remove(svc);
> +            __runq_insert(ops, svc);
> +        }
> +    }
> +}
> +
> +/* 
> + * schedule function for rt scheduler.
> + * The lock is already grabbed in schedule.c, no need to lock here 
> + */
> +static struct task_slice
> +rt_schedule(const struct scheduler *ops, s_time_t now, bool_t 
> tasklet_work_scheduled)
> +{
> +    const int cpu = smp_processor_id();
> +    struct rt_private * prv = RT_PRIV(ops);
> +    struct rt_vcpu * const scurr = RT_VCPU(current);
> +    struct rt_vcpu * snext = NULL;
> +    struct task_slice ret = { .migrated = 0 };
> +
> +    /* clear ticked bit now that we've been scheduled */
> +    if ( cpumask_test_cpu(cpu, &prv->tickled) )
> +        cpumask_clear_cpu(cpu, &prv->tickled);
> +
Is the test important? cpumask operations may be quite expensive, and I
think always clearing is better than always testing and only sometimes
(rather often, I think) clearing.

I'm open to other views on this, though. :-)
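
I.e., just (sketch):

    /* clear the tickled bit now that we've been scheduled */
    cpumask_clear_cpu(cpu, &prv->tickled);

Unconditionally clearing is no less correct, and it spares a test that
would evaluate to true most of the time anyway.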

> +    /* burn_budget would return for IDLE VCPU */
> +    burn_budgets(ops, scurr, now);
> +
> +    __repl_update(ops, now);

> +/*
> + * Pick a vcpu on a cpu to kick out to place the running candidate
>
Rather than 'Pick a vcpu on a cpu to kick out...', I'd say 'Pick a cpu
where to run a vcpu, possibly kicking out the vcpu running there'.

Right. For this round, while looking at the patch, I tried as hard as I
could to concentrate on the algorithm, and on how the Xen scheduling
framework is being used here. As a result, I confirm my previous
impression that this code is in a fair state and that, as an
experimental and in-development feature, it could well be checked in
soon (as long as the comments are addressed, of course :-D).

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

