
[Xen-devel] [PATCH 3/6] xen: credit1: increase efficiency and scalability of load balancing.



During load balancing, we check the non-idle pCPUs to
see if they have runnable but not running vCPUs that
can be stolen by, and set to run on, currently idle
pCPUs.

If a pCPU has only one running (or runnable) vCPU,
though, we don't want to steal it from there, and it
is therefore pointless to bother with that pCPU at
all (especially considering that bothering means
trying to take its runqueue lock!).

On large systems, when load is only slightly higher
than the number of pCPUs (i.e., there are just a few
more active vCPUs than pCPUs), this may mean that
(as roughly sketched below):
 - we go through all the pCPUs,
 - for each one, we (try to) take its runqueue lock,
 - we find out there's actually nothing to steal!
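
This is, simplifying a bit and not quoting the actual
code literally, the shape of the csched_load_balance()
inner loop before this patch (names such as workers,
peer_cpu and speer match the existing code; the rest
is paraphrased):

    /* All the non-idle pCPUs are candidates... */
    cpumask_andnot(&workers, online, prv->idlers);
    for_each_cpu ( peer_cpu, &workers )
    {
        /*
         * ...and we poke at each of them, which means trying to
         * take a remote runqueue lock...
         */
        spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);

        if ( !lock )
            continue;

        /*
         * ...just to discover, most of the times, that there is
         * nothing we can steal.
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri, balance_step);
        pcpu_schedule_unlock(lock, peer_cpu);

        if ( speer != NULL )
            return speer;
    }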

To mitigate this, we introduce here the concept of
an overloaded runqueue, and a cpumask in which to
record which pCPUs are in such a state.

An overloaded runqueue has at least 2 runnable vCPUs
(plus the idle one, which is always there). Typically,
this means 1 vCPU is running, and 1 is sitting in the
runqueue, and can hence be stolen.
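
Note that nr_runnable counts the idle vCPU too, so the
invariant is: 1 means only the idle vCPU is runnable,
2 means the idle vCPU plus one "regular" vCPU (which
is running), and anything above 2 means at least one
vCPU is queued and stealable. A minimal sketch of the
bookkeeping (the helper names below are made up just
for illustration; the patch open-codes this logic in
__runq_insert() and __runq_remove()):

    /* Hypothetical helpers, only to illustrate the idea. */
    static inline void runq_track_insert(struct csched_private *prv,
                                         struct csched_pcpu *spc,
                                         unsigned int cpu)
    {
        /* Idle + running + at least 1 queued vCPU: work can be stolen. */
        if ( ++spc->nr_runnable > 2 && !cpumask_test_cpu(cpu, prv->overloaded) )
            cpumask_set_cpu(cpu, prv->overloaded);
    }

    static inline void runq_track_remove(struct csched_private *prv,
                                         struct csched_pcpu *spc,
                                         unsigned int cpu)
    {
        /* Back to idle + (at most) 1 running vCPU: nothing to steal here. */
        if ( --spc->nr_runnable <= 2 && cpumask_test_cpu(cpu, prv->overloaded) )
            cpumask_clear_cpu(cpu, prv->overloaded);
        ASSERT(spc->nr_runnable >= 1);
    }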

Then, in csched_load_balance(), it is enough to go
over just the overloaded pCPUs, instead of all the
non-idle ones, which is both cheaper and more
scalable.

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
---
Cc: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
I'm Cc-ing Andy on this patch because we once discussed doing something
like this upstream.
---
 xen/common/sched_credit.c |   56 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 9 deletions(-)

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 2b13e99..529b6c7 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -171,6 +171,7 @@ struct csched_pcpu {
     struct timer ticker;
     unsigned int tick;
     unsigned int idle_bias;
+    unsigned int nr_runnable;
 };
 
 /*
@@ -221,6 +222,7 @@ struct csched_private {
     uint32_t ncpus;
     struct timer  master_ticker;
     unsigned int master;
+    cpumask_var_t overloaded;
     cpumask_var_t idlers;
     cpumask_var_t cpus;
     uint32_t weight;
@@ -263,7 +265,10 @@ static inline bool_t is_runq_idle(unsigned int cpu)
 static inline void
 __runq_insert(struct csched_vcpu *svc)
 {
-    const struct list_head * const runq = RUNQ(svc->vcpu->processor);
+    unsigned int cpu = svc->vcpu->processor;
+    const struct list_head * const runq = RUNQ(cpu);
+    struct csched_private * const prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct list_head *iter;
 
     BUG_ON( __vcpu_on_runq(svc) );
@@ -288,12 +293,37 @@ __runq_insert(struct csched_vcpu *svc)
     }
 
     list_add_tail(&svc->runq_elem, iter);
+
+    /*
+     * If there is more than just the idle vCPU and a "regular" vCPU runnable
+     * on the runqueue of this pCPU, mark it as overloaded (so that other
+     * pCPUs can come and pick up some work).
+     */
+    if ( ++spc->nr_runnable > 2 &&
+         !cpumask_test_cpu(cpu, prv->overloaded) )
+        cpumask_set_cpu(cpu, prv->overloaded);
 }
 
 static inline void
 __runq_remove(struct csched_vcpu *svc)
 {
+    unsigned int cpu = svc->vcpu->processor;
+    struct csched_private * const prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+
     BUG_ON( !__vcpu_on_runq(svc) );
+
+    /*
+     * Mark the pCPU as no longer overloaded when, after this removal, at
+     * most the idle vCPU and one "regular" vCPU are runnable on it, i.e.,
+     * when there is nothing left that could be stolen.
+     */
+    if ( --spc->nr_runnable <= 2 &&
+         cpumask_test_cpu(cpu, prv->overloaded) )
+        cpumask_clear_cpu(cpu, prv->overloaded);
+
+    ASSERT(spc->nr_runnable >= 1);
+
     list_del_init(&svc->runq_elem);
 }
 
@@ -590,6 +620,7 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
     cpumask_set_cpu(cpu, prv->idlers);
+    spc->nr_runnable = 1;
 }
 
 static void
@@ -1704,8 +1735,8 @@ csched_load_balance(struct csched_private *prv, int cpu,
         peer_node = node;
         do
         {
-            /* Find out what the !idle are in this node */
-            cpumask_andnot(&workers, online, prv->idlers);
+            /* Select the pCPUs in this node that have work we can steal. */
+            cpumask_and(&workers, online, prv->overloaded);
             cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
             __cpumask_clear_cpu(cpu, &workers);
 
@@ -1989,7 +2020,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
     runq = &spc->runq;
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
-    printk("CPU[%02d] sort=%d, sibling=%s, ", cpu, spc->runq_sort_last, 
cpustr);
+    printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ",
+           cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
     printk("core=%s\n", cpustr);
 
@@ -2027,7 +2059,7 @@ csched_dump(const struct scheduler *ops)
 
     spin_lock_irqsave(&prv->lock, flags);
 
-#define idlers_buf keyhandler_scratch
+#define cpumask_buf keyhandler_scratch
 
     printk("info:\n"
            "\tncpus              = %u\n"
@@ -2055,8 +2087,10 @@ csched_dump(const struct scheduler *ops)
            prv->ticks_per_tslice,
            vcpu_migration_delay);
 
-    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
-    printk("idlers: %s\n", idlers_buf);
+    cpumask_scnprintf(cpumask_buf, sizeof(cpumask_buf), prv->idlers);
+    printk("idlers: %s\n", cpumask_buf);
+    cpumask_scnprintf(cpumask_buf, sizeof(cpumask_buf), prv->overloaded);
+    printk("overloaded: %s\n", cpumask_buf);
 
     printk("active vcpus:\n");
     loop = 0;
@@ -2079,7 +2113,7 @@ csched_dump(const struct scheduler *ops)
             vcpu_schedule_unlock(lock, svc->vcpu);
         }
     }
-#undef idlers_buf
+#undef cpumask_buf
 
     spin_unlock_irqrestore(&prv->lock, flags);
 }
@@ -2093,8 +2127,11 @@ csched_init(struct scheduler *ops)
     if ( prv == NULL )
         return -ENOMEM;
     if ( !zalloc_cpumask_var(&prv->cpus) ||
-         !zalloc_cpumask_var(&prv->idlers) )
+         !zalloc_cpumask_var(&prv->idlers) ||
+         !zalloc_cpumask_var(&prv->overloaded) )
     {
+        free_cpumask_var(prv->overloaded);
+        free_cpumask_var(prv->idlers);
         free_cpumask_var(prv->cpus);
         xfree(prv);
         return -ENOMEM;
@@ -2141,6 +2178,7 @@ csched_deinit(struct scheduler *ops)
         ops->sched_data = NULL;
         free_cpumask_var(prv->cpus);
         free_cpumask_var(prv->idlers);
+        free_cpumask_var(prv->overloaded);
         xfree(prv);
     }
 }

