
[Xen-devel] [RFC PATCH v1 09/16] xen: Credit1: SMT-aware domain co-scheduling parameter and data structs



In fact, we want to be able to enforce that only vcpus belonging to the
same domain are executed on pcpus that are part of one core (i.e., that
are 'sibling hyperthreads', or just 'hyperthreads').

To achieve that, we introduce a new data structure, representing a
physical core, and use it to track which domains (i.e., the vcpus of
which domains) are currently running on the pcpus of each core.
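
To give an idea of how this tracking is meant to be used, here is a
minimal, illustrative-only sketch (the helper below is not part of this
patch or of the series, and its name is made up) of the kind of check
that the enforcement logic, added by later patches, has to perform:

    /*
     * A pcpu of a core can pick a vcpu only if the core is currently
     * "owned" by no domain (i.e., all of its threads are idle), or if
     * it is owned by the vcpu's own domain.
     */
    static inline bool core_can_run_vcpu(const struct csched_core *core,
                                         const struct csched_vcpu *svc)
    {
        return core->sdom == NULL || core->sdom == svc->sdom;
    }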

This commit, however, only implements the boot command line parameter
(to enable or disable the feature), the data structures and the domain
tracking logic.

Of course, until we actually enforce that only vcpus of the same
domain run on each core, whatever the tracking shows (e.g., via
`xl debug-keys r') is not valid, and should be ignored.
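
Once the enforcement is actually in place (later in this series), the
effect will be observable by booting with sched_smt_cosched=true and
looking at the per-core `sdom=' field in the output of
`xl debug-keys r'.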

Signed-off-by: Dario Faggioli <dfaggioli@xxxxxxxx>
---
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>
Cc: Jan Beulich <jbeulich@xxxxxxxx>
Cc: Wei Liu <wei.liu2@xxxxxxxxxx>
---
TODO:
 - (in this patch and in all the following ones) most of the tracking
   and serialization logic necessary to implement the feature is not
   really executed only when sched_smt_cosched=true, but all the time.
   Basically, setting sched_smt_cosched=false does disable the feature,
   but we still get some of the overhead (see the sketch below for one
   way the tracking could be gated on the parameter).
---
 docs/misc/xen-command-line.markdown |   11 ++++
 xen/common/sched_credit.c           |  103 ++++++++++++++++++++++++++++++++++-
 xen/common/schedule.c               |    7 ++
 xen/include/xen/sched.h             |    7 ++
 4 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 559c0662fa..3f3b3dec41 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1692,6 +1692,17 @@ amount of time that a vcpu can be scheduled for before preempting it,
 in microseconds.  The default is 1000us (1ms).  Setting this to 0
 disables it altogether.
 
+### sched\_smt\_cosched
+> `= <boolean>`
+
+If true, forces the scheduler to run, at any given point in time, only
+vCPUs that belong to one domain (or nothing!) on the various pCPUs
+that belong to one physical core (the so called SMT-siblings, or
+SMT-hyperthreads, or just hyperthreads).
+
+This feature is referred to as SMT domain co-scheduling, or SMT
+co-scheduling, or even just co-scheduling.
+
 ### sched\_smt\_power\_savings
 > `= <boolean>`
 
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index cd5524c3ba..fb418ffb2f 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -155,8 +155,19 @@ struct csched_pcpu {
 
     unsigned int tick;
     struct timer ticker;
+    struct csched_core *core;
 };
 
+/* For dealing with threads in the same core */
+struct csched_core {
+    spinlock_t lock;
+    cpumask_t cpus, idlers;
+    struct csched_dom *sdom;
+    bool init_done;
+};
+
+static struct csched_core *cores;
+
 /*
  * Virtual CPU
  */
@@ -605,6 +616,9 @@ csched_alloc_pdata(const struct scheduler *ops, int cpu)
 static void
 init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
 {
+    unsigned int i;
+    unsigned long flags;
+
     ASSERT(spin_is_locked(&prv->lock));
     /* cpu data needs to be allocated, but STILL uninitialized. */
     ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL);
@@ -631,6 +645,50 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
     spc->runq_sort_last = prv->runq_sort;
     spc->idle_bias = nr_cpu_ids - 1;
 
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        /*
+         * We do this _only_ the first time that this pcpu is assigned to
+         * an instance of the Credit scheduler. This is ok, as, no matter
+         * to what cpupool and scheduler instance the CPU is then moved,
+         * topology does not change.
+         */
+        if ( !cores[i].init_done )
+        {
+            /*
+             * The absolute first time we run the loop, we initialize cores[0],
+             * we put the CPU in it, and break out.
+             */
+            spin_lock_init(&cores[i].lock);
+            ASSERT(cores[i].sdom == NULL);
+            printk("activating core %d for cpu %d\n", i, cpu);
+            cpumask_set_cpu(cpu, &cores[i].cpus);
+            cpumask_set_cpu(cpu, &cores[i].idlers);
+            spc->core = &cores[i];
+            cores[i].init_done = true;
+            break;
+        }
+
+        /*
+         * If we are here, at least one element of the cores array has been
+         * initialized, and one CPU has "been put" in it. Check if this CPU
+         * is a sibling, and should refer to that element too. If not, stay
+         * in the loop; at some point we'll find a non initialised element
+         * and use it.
+         */
+        ASSERT(!cpumask_empty(&cores[i].cpus));
+        if ( cpumask_test_cpu(cpu, per_cpu(cpu_sibling_mask, cpumask_first(&cores[i].cpus))) )
+        {
+            printk("putting cpu %d in core %d\n", cpu, i);
+            spin_lock_irqsave(&cores[i].lock, flags);
+            cpumask_set_cpu(cpu, &cores[i].cpus);
+            cpumask_set_cpu(cpu, &cores[i].idlers);
+            spin_unlock_irqrestore(&cores[i].lock, flags);
+            spc->core = &cores[i];
+            break;
+        }
+    }
+
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
     cpumask_set_cpu(cpu, prv->idlers);
@@ -1857,6 +1915,7 @@ csched_schedule(
     const int cpu = smp_processor_id();
     struct list_head * const runq = RUNQ(cpu);
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct csched_private *prv = CSCHED_PRIV(ops);
     struct csched_vcpu *snext;
     struct task_slice ret = { .migrated = 0 };
@@ -1988,6 +2047,8 @@ csched_schedule(
         snext = csched_load_balance(prv, cpu, snext, &ret.migrated);
 
  out:
+    spin_lock(&spc->core->lock);
+
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
      * will tickle us when they get extra work.
@@ -1999,13 +2060,27 @@ csched_schedule(
             cpumask_set_cpu(cpu, prv->idlers);
             csched_smt_idle_mask_set(cpu, prv);
         }
+        cpumask_set_cpu(cpu, &spc->core->idlers);
+        if ( cpumask_equal(per_cpu(cpu_sibling_mask, cpu), &spc->core->idlers) )
+            spc->core->sdom = NULL;
     }
-    else if ( cpumask_test_cpu(cpu, prv->idlers) )
+    else
     {
-        cpumask_clear_cpu(cpu, prv->idlers);
-        csched_smt_idle_mask_clear(cpu, prv);
+        if ( cpumask_test_cpu(cpu, prv->idlers) )
+        {
+            cpumask_clear_cpu(cpu, prv->idlers);
+            csched_smt_idle_mask_clear(cpu, prv);
+        }
+
+        if ( !tasklet_work_scheduled )
+        {
+            cpumask_clear_cpu(cpu, &spc->core->idlers);
+            spc->core->sdom = snext->sdom;
+        }
     }
 
+    spin_unlock(&spc->core->lock);
+
     if ( !is_idle_vcpu(snext->vcpu) )
         snext->start_time += now;
 
@@ -2085,8 +2160,20 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
     printk("CPU[%02d] nr_runbl=%d, sort=%d, sibling=%s, ",
            cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
+    cpumask_scnprintf(cpustr, sizeof(cpustr), &spc->core->idlers);
+    printk("idle_sibling=%s, ", cpustr);
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
-    printk("core=%s\n", cpustr);
+    printk("core=%s", cpustr);
+    ASSERT(spc->core->init_done);
+    ASSERT(cpumask_equal(per_cpu(cpu_sibling_mask, cpu), &spc->core->cpus));
+    if ( sched_smt_cosched )
+    {
+        if ( spc->core->sdom )
+            printk(", sdom=d%d", spc->core->sdom->dom->domain_id);
+        else
+           printk(", sdom=/");
+    }
+    printk("\n");
 
     /* current VCPU (nothing to say if that's the idle vcpu). */
     svc = CSCHED_VCPU(curr_on_cpu(cpu));
@@ -2220,6 +2307,14 @@ csched_init(struct scheduler *ops)
     if ( prv == NULL )
         return -ENOMEM;
 
+    /* Allocate all core structures, and mark them as un-initialized */
+    cores = xzalloc_array(struct csched_core, nr_cpu_ids);
+    if ( !cores )
+    {
+        xfree(prv);
+        return -ENOMEM;
+    }
+
     prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES);
     if ( prv->balance_bias == NULL )
     {
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 05281d6af7..ef28576d77 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -49,6 +49,13 @@ string_param("sched", opt_sched);
 bool_t sched_smt_power_savings = 0;
 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
 
+/*
+ * If enabled, only vcpus of the same domain will be scheduled on sibling
+ * hyperthreads of the same core.
+ */
+bool sched_smt_cosched = 0;
+boolean_param("sched_smt_cosched", sched_smt_cosched);
+
 /* Default scheduling rate limit: 1ms
  * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined
  * */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 09c25bfdd2..1c2383cccb 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -905,6 +905,13 @@ static inline bool is_vcpu_online(const struct vcpu *v)
  */
 extern bool sched_smt_power_savings;
 
+/*
+ * If sched_smt_cosched = 1, vcpus which are not from the same domain will
+ * never be scheduled to run, at the same time, on two sibling hyperthreads
+ * of the same core.
+ */
+extern bool sched_smt_cosched;
+
 /*
  * If all the siblings of cpu (including cpu itself) are idle, set
  * their bits in mask.

