
[Xen-devel] [PATCH 3/6] xen: credit1: increase efficiency and scalability of load balancing.



During load balancing, we check the non-idle pCPUs to
see if they have runnable but not running vCPUs that
can be stolen by, and set to run on, currently idle
pCPUs.

If a pCPU has only one running (or runnable) vCPU,
though, we don't want to steal it from there, and it
is therefore pointless to bother with that pCPU at
all (especially considering that bothering means
trying to take its runqueue lock!).

On large systems, when load is only slightly higher
than the number of pCPUs (i.e., there are just a few
more active vCPUs than pCPUs), this may mean that
(as roughly sketched below):
 - we go through all the pCPUs,
 - for each one, we (try to) take its runqueue lock,
 - we find out there's actually nothing to steal!
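
This is, simplifying a bit and not quoting the actual
code literally, the shape of the csched_load_balance()
inner loop before this patch (names such as workers,
peer_cpu and speer match the existing code; the rest
is paraphrased):

    /* All the non-idle pCPUs are candidates... */
    cpumask_andnot(&workers, online, prv->idlers);
    for_each_cpu ( peer_cpu, &workers )
    {
        /*
         * ...and we poke at each of them, which means trying to
         * take a remote runqueue lock...
         */
        spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);

        if ( !lock )
            continue;

        /*
         * ...just to discover, most of the times, that there is
         * nothing we can steal.
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri, balance_step);
        pcpu_schedule_unlock(lock, peer_cpu);

        if ( speer != NULL )
            return speer;
    }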

To mitigate this, we introduce here the concept of
an overloaded runqueue, and a cpumask in which to
record which pCPUs are in such a state.

An overloaded runqueue has at least 2 runnable vCPUs
(plus the idle one, which is always there). Typically,
this means 1 vCPU is running, and 1 is sitting in the
runqueue, and can hence be stolen.
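
Note that nr_runnable counts the idle vCPU too, so the
invariant is: 1 means only the idle vCPU is runnable,
2 means the idle vCPU plus one "regular" vCPU (which
is running), and anything above 2 means at least one
vCPU is queued and stealable. A minimal sketch of the
bookkeeping (the helper names below are made up just
for illustration; the patch open-codes this logic in
__runq_insert() and __runq_remove()):

    /* Hypothetical helpers, only to illustrate the idea. */
    static inline void runq_track_insert(struct csched_private *prv,
                                         struct csched_pcpu *spc,
                                         unsigned int cpu)
    {
        /* Idle + running + at least 1 queued vCPU: work can be stolen. */
        if ( ++spc->nr_runnable > 2 && !cpumask_test_cpu(cpu, prv->overloaded) )
            cpumask_set_cpu(cpu, prv->overloaded);
    }

    static inline void runq_track_remove(struct csched_private *prv,
                                         struct csched_pcpu *spc,
                                         unsigned int cpu)
    {
        /* Back to idle + (at most) 1 running vCPU: nothing to steal here. */
        if ( --spc->nr_runnable <= 2 && cpumask_test_cpu(cpu, prv->overloaded) )
            cpumask_clear_cpu(cpu, prv->overloaded);
        ASSERT(spc->nr_runnable >= 1);
    }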

Then, in csched_load_balance(), it is enough to go
over just the overloaded pCPUs, instead of all the
non-idle ones, which is both cheaper and more
scalable.

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
---
Cc: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
I'm Cc-ing Andy on this patch because we once discussed doing something
like this upstream.
---
 xen/common/sched_credit.c |   56 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 9 deletions(-)

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 2b13e99..529b6c7 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -171,6 +171,7 @@ struct csched_pcpu {
     struct timer ticker;
     unsigned int tick;
     unsigned int idle_bias;
+    unsigned int nr_runnable;
 };
 
 /*
@@ -221,6 +222,7 @@ struct csched_private {
     uint32_t ncpus;
     struct timer  master_ticker;
     unsigned int master;
+    cpumask_var_t overloaded;
     cpumask_var_t idlers;
     cpumask_var_t cpus;
     uint32_t weight;
@@ -263,7 +265,10 @@ static inline bool_t is_runq_idle(unsigned int cpu)
 static inline void
 __runq_insert(struct csched_vcpu *svc)
 {
-    const struct list_head * const runq = RUNQ(svc->vcpu->processor);
+    unsigned int cpu = svc->vcpu->processor;
+    const struct list_head * const runq = RUNQ(cpu);
+    struct csched_private * const prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct list_head *iter;
 
     BUG_ON( __vcpu_on_runq(svc) );
@@ -288,12 +293,37 @@ __runq_insert(struct csched_vcpu *svc)
     }
 
     list_add_tail(&svc->runq_elem, iter);
+
+    /*
+     * If there is more than just the idle vCPU and a "regular" vCPU runnable
+     * on the runqueue of this pCPU, mark it as overloaded (so that other
+     * pCPUs can come and pick up some work).
+     */
+    if ( ++spc->nr_runnable > 2 &&
+         !cpumask_test_cpu(cpu, prv->overloaded) )
+        cpumask_set_cpu(cpu, prv->overloaded);
 }
 
 static inline void
 __runq_remove(struct csched_vcpu *svc)
 {
+    unsigned int cpu = svc->vcpu->processor;
+    struct csched_private * const prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+
     BUG_ON( !__vcpu_on_runq(svc) );
+
+    /*
+     * Mark the pCPU as no longer overloaded when, after this removal, at
+     * most the idle vCPU and one "regular" vCPU are runnable on it, i.e.,
+     * when there is nothing left that could be stolen.
+     */
+    if ( --spc->nr_runnable <= 2 &&
+         cpumask_test_cpu(cpu, prv->overloaded) )
+        cpumask_clear_cpu(cpu, prv->overloaded);
+
+    ASSERT(spc->nr_runnable >= 1);
+
     list_del_init(&svc->runq_elem);
 }
 
@@ -590,6 +620,7 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
     cpumask_set_cpu(cpu, prv->idlers);
+    spc->nr_runnable = 1;
 }
 
 static void
@@ -1704,8 +1735,8 @@ csched_load_balance(struct csched_private *prv, int cpu,
         peer_node = node;
         do
         {
-            /* Find out what the !idle are in this node */
-            cpumask_andnot(&workers, online, prv->idlers);
+            /* Select the pCPUs in this node that have work we can steal. */
+            cpumask_and(&workers, online, prv->overloaded);
             cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
             __cpumask_clear_cpu(cpu, &workers);
 
@@ -1989,7 +2020,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
     runq = &spc->runq;
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
-    printk("CPU[%02d] sort=%d, sibling=%s, ", cpu, spc->runq_sort_last, 
cpustr);
+    printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ",
+           cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
     printk("core=%s\n", cpustr);
 
@@ -2027,7 +2059,7 @@ csched_dump(const struct scheduler *ops)
 
     spin_lock_irqsave(&prv->lock, flags);
 
-#define idlers_buf keyhandler_scratch
+#define cpumask_buf keyhandler_scratch
 
     printk("info:\n"
            "\tncpus              = %u\n"
@@ -2055,8 +2087,10 @@ csched_dump(const struct scheduler *ops)
            prv->ticks_per_tslice,
            vcpu_migration_delay);
 
-    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
-    printk("idlers: %s\n", idlers_buf);
+    cpumask_scnprintf(cpumask_buf, sizeof(cpumask_buf), prv->idlers);
+    printk("idlers: %s\n", cpumask_buf);
+    cpumask_scnprintf(cpumask_buf, sizeof(cpumask_buf), prv->overloaded);
+    printk("overloaded: %s\n", cpumask_buf);
 
     printk("active vcpus:\n");
     loop = 0;
@@ -2079,7 +2113,7 @@ csched_dump(const struct scheduler *ops)
             vcpu_schedule_unlock(lock, svc->vcpu);
         }
     }
-#undef idlers_buf
+#undef cpumask_buf
 
     spin_unlock_irqrestore(&prv->lock, flags);
 }
@@ -2093,8 +2127,11 @@ csched_init(struct scheduler *ops)
     if ( prv == NULL )
         return -ENOMEM;
     if ( !zalloc_cpumask_var(&prv->cpus) ||
-         !zalloc_cpumask_var(&prv->idlers) )
+         !zalloc_cpumask_var(&prv->idlers) ||
+         !zalloc_cpumask_var(&prv->overloaded) )
     {
+        free_cpumask_var(prv->overloaded);
+        free_cpumask_var(prv->idlers);
         free_cpumask_var(prv->cpus);
         xfree(prv);
         return -ENOMEM;
@@ -2141,6 +2178,7 @@ csched_deinit(struct scheduler *ops)
         ops->sched_data = NULL;
         free_cpumask_var(prv->cpus);
         free_cpumask_var(prv->idlers);
+        free_cpumask_var(prv->overloaded);
         xfree(prv);
     }
 }

