
[Xen-devel] [PATCH] xen: Credit2: stop hitting a BUG_ON() when creating a Credit2 cpupool



This fixes hitting this BUG_ON():

xen/common/sched_credit2.c:3452
 rqd = c2rqd(ops, cpu);
 BUG_ON(!cpumask_test_cpu(cpu, &rqd->active));

when Credit2 is used as the default scheduler, and
a cpupool, also using Credit2, is created.

The BUG_ON() triggers because c2rqd() accesses
rqd[per_cpu(runq_map,cpu)]->active when the value
of runq_map for this CPU is -1 (and so, we basically
access random memory).
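
For reference, this is the lookup path that ends up doing the out of
bounds read (c2r() is as in the tree; c2rqd() is abridged, but is
roughly what the code does):

 /* CPU to runq_id */
 static inline int c2r(unsigned int cpu)
 {
     return per_cpu(runq_map, cpu);  /* -1 after csched2_deinit_pdata() */
 }

 /* CPU to runqueue struct: with c2r(cpu) == -1, this indexes
  * before the start of the rqd[] array. */
 static inline struct csched2_runqueue_data *
 c2rqd(const struct scheduler *ops, unsigned int cpu)
 {
     return &csched2_priv(ops)->rqd[c2r(cpu)];
 }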

The bug was introduced by a2c4e5ab ("xen: credit2: make
the cpu to runqueue map per-cpu").

In fact, in that commit the CPU to runqueue map
became a global data structure (instead of a
per-scheduler array, as it was before). So, when a
CPU is deassigned from a scheduler, its value in the
runq map is set to -1; and this makes things explode
if the CPU is, at the same time, being assigned to
another scheduler.

Basically, when moving a CPU from one scheduler
(cpupool) to another, we call:

 - csched2_init_pdata(new_ops, cpu);
   |
   --> per_cpu(runq_map,cpu)= _new_runq_id_ ;
 - csched2_deinit_pdata(old_ops, cpu);
   |
   --> per_cpu(runq_map,cpu)= -1;

So the -1 stored by csched2_deinit_pdata() for the
old scheduler overwrites the runqueue ID that
csched2_init_pdata() has just set up for the new
one, and the CPU ends up mapped to no runqueue at
all.

The solution is to store the information about
which runqueue a CPU belongs to in a 'csched2_pcpu'
data structure (as, e.g., Credit does for all the
per-pCPU pieces of information it needs to keep),
and to reach it via the sched_priv field of the
per_cpu(schedule_data) structure.
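
In code, the new arrangement looks like this (taken from the patch
below):

 struct csched2_pcpu {
     int runq_id;   /* -1 means: not part of any runqueue */
 };

 static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
 {
     return per_cpu(schedule_data, cpu).sched_priv;
 }

 /* CPU to runq_id, now going through the per-pCPU data. */
 static inline int c2r(unsigned int cpu)
 {
     return csched2_pcpu(cpu)->runq_id;
 }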

This solution has the following properties:
- it prevents Xen from crashing; :-)

- it's more in line with how the schedule.c<-->sched_foo.c
  interface has been designed, and is supposed to be
  used (the sched_priv field is specifically meant
  for this kind of data);

- the improvement brought by a2c4e5ab is retained.
  The large array of int-s that each instance of
  Credit2 was (needlessly) keeping for the CPU to
  runq mapping, and which that commit made
  unnecessary, is *not* reintroduced. Actually, we
  use even less memory: instead of a per-cpu int
  (which existed even for CPUs that don't belong to
  any Credit2 instance), we allocate a 1-field-only
  (an int) struct, and only for the CPUs that do
  belong to a Credit2 instance (see the sketch
  below).
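
As a summary of the three arrangements (the pre-a2c4e5ab layout is
sketched from memory, so take its field names as indicative only):

 /* Before a2c4e5ab: one whole array per Credit2 instance. */
 struct csched2_private {
     /* ... */
     int runq_map[NR_CPUS];
     /* ... */
 };

 /* After a2c4e5ab: one int per CPU, whether in Credit2 or not. */
 static DEFINE_PER_CPU(int, runq_map);

 /* With this patch: one int-sized struct, allocated only for
  * the CPUs that actually are in a Credit2 instance. */
 struct csched2_pcpu {
     int runq_id;
 };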

Note that this also:
* allows the init_pdata() internal helper function to
  return void, simplifying its callers (and making it
  more similar to its counterparts in other schedulers);
* enables more extensive use of the c2rqd() helper,
  making the code more uniform and easier to read;
* requires the csched2_{alloc,free}_pdata hooks to be
  defined for Credit2;
* requires the assignment of new values to the
  'scheduler' and 'schedule_data.sched_priv' per-cpu
  variables to be moved up a bit in csched2_switch_sched().

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
---
Cc: George Dunlap <george.dunlap@xxxxxxxxxx>
Cc: Anshul Makkar <anshulmakkar@xxxxxxxxx>
Cc: Julien Grall <julien.grall@xxxxxxx>
---
Release-wise, Julien, considering that:
- we're not even frozen yet,
- in any case, this is a bugfix,

I'd say it should go in.
---
 xen/common/sched_credit2.c |  133 +++++++++++++++++++++++++++++---------------
 1 file changed, 89 insertions(+), 44 deletions(-)

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 18f39ca..4a08a9f 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -504,9 +504,11 @@ struct csched2_private {
  * Physical CPU
  *
  * The only per-pCPU information we need to maintain is of which runqueue
- * each CPU is part of.
+ * each CPU is part of (-1 meaning 'none').
  */
-static DEFINE_PER_CPU(int, runq_map);
+struct csched2_pcpu {
+    int runq_id;
+};
 
 /*
  * Virtual CPU
@@ -565,6 +567,11 @@ static inline struct csched2_private *csched2_priv(const struct scheduler *ops)
     return ops->sched_data;
 }
 
+static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
+{
+    return per_cpu(schedule_data, cpu).sched_priv;
+}
+
 static inline struct csched2_vcpu *csched2_vcpu(const struct vcpu *v)
 {
     return v->sched_priv;
@@ -578,7 +585,7 @@ static inline struct csched2_dom *csched2_dom(const struct domain *d)
 /* CPU to runq_id macro */
 static inline int c2r(unsigned int cpu)
 {
-    return per_cpu(runq_map, cpu);
+    return csched2_pcpu(cpu)->runq_id;
 }
 
 /* CPU to runqueue struct macro */
@@ -3769,31 +3776,33 @@ csched2_dump(const struct scheduler *ops)
 #undef cpustr
 }
 
-/* Returns the ID of the runqueue the cpu is assigned to. */
-static unsigned
-init_pdata(struct csched2_private *prv, unsigned int cpu)
+static void
+init_pdata(struct csched2_private *prv,
+           struct csched2_pcpu *spc,
+           unsigned int cpu)
 {
-    unsigned rqi;
     struct csched2_runqueue_data *rqd;
 
     ASSERT(rw_is_write_locked(&prv->lock));
+    /* The CPU must not have been initialized already, in this scheduler. */
     ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
+    /* The per-CPU data needs to be allocated, but STILL uninitialized. */
+    ASSERT(spc && spc->runq_id == -1);
+    /* alloc_pdata must have been called already. */
+    ASSERT(per_cpu(schedule_data, cpu).sched_priv == spc);
 
     /* Figure out which runqueue to put it in */
-    rqi = cpu_to_runqueue(prv, cpu);
+    spc->runq_id = cpu_to_runqueue(prv, cpu);
 
-    rqd = prv->rqd + rqi;
+    rqd = prv->rqd + spc->runq_id;
 
-    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqi);
-    if ( ! cpumask_test_cpu(rqi, &prv->active_queues) )
+    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id);
+    if ( !cpumask_test_cpu(spc->runq_id, &prv->active_queues) )
     {
         printk(XENLOG_INFO " First cpu on runqueue, activating\n");
-        activate_runqueue(prv, rqi);
+        activate_runqueue(prv, spc->runq_id);
     }
     
-    /* Set the runqueue map */
-    per_cpu(runq_map, cpu) = rqi;
-    
     __cpumask_set_cpu(cpu, &rqd->idle);
     __cpumask_set_cpu(cpu, &rqd->active);
     __cpumask_set_cpu(cpu, &prv->initialized);
@@ -3801,8 +3810,21 @@ init_pdata(struct csched2_private *prv, unsigned int cpu)
 
     if ( cpumask_weight(&rqd->active) == 1 )
         rqd->pick_bias = cpu;
+}
 
-    return rqi;
+static void *
+csched2_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+    struct csched2_pcpu *spc;
+
+    spc = xzalloc(struct csched2_pcpu);
+    if ( spc == NULL )
+        return ERR_PTR(-ENOMEM);
+
+    /* Initial situation: not part of any runqueue. */
+    spc->runq_id = -1;
+
+    return spc;
 }
 
 static void
@@ -3811,20 +3833,14 @@ csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
     struct csched2_private *prv = csched2_priv(ops);
     spinlock_t *old_lock;
     unsigned long flags;
-    unsigned rqi;
-
-    /*
-     * pdata contains what alloc_pdata returned. But since we don't (need to)
-     * implement alloc_pdata, either that's NULL, or something is very wrong!
-     */
-    ASSERT(!pdata);
 
     write_lock_irqsave(&prv->lock, flags);
     old_lock = pcpu_schedule_lock(cpu);
 
-    rqi = init_pdata(prv, cpu);
+    init_pdata(prv, pdata, cpu);
+
     /* Move the scheduler lock to the new runq lock. */
-    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
+    per_cpu(schedule_data, cpu).schedule_lock = &c2rqd(ops, cpu)->lock;
 
     /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
     spin_unlock(old_lock);
@@ -3838,9 +3854,8 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
 {
     struct csched2_private *prv = csched2_priv(new_ops);
     struct csched2_vcpu *svc = vdata;
-    unsigned rqi;
 
-    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
+    ASSERT(pdata && svc && is_idle_vcpu(svc->vcpu));
 
     /*
      * We own one runqueue lock already (from schedule_cpu_switch()). This
@@ -3855,7 +3870,10 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
 
     idle_vcpu[cpu]->sched_priv = vdata;
 
-    rqi = init_pdata(prv, cpu);
+    per_cpu(scheduler, cpu) = new_ops;
+    per_cpu(schedule_data, cpu).sched_priv = pdata;
+
+    init_pdata(prv, pdata, cpu);
 
     /*
      * Now that we know what runqueue we'll go in, double check what's said
@@ -3863,10 +3881,7 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
      * this scheduler, and so it's safe to have taken it /before/ our
      * private global lock.
      */
-    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock);
-
-    per_cpu(scheduler, cpu) = new_ops;
-    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
+    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &c2rqd(new_ops, cpu)->lock);
 
     /*
      * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
@@ -3874,7 +3889,7 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
      * taking it, find all the initializations we've done above in place.
      */
     smp_mb();
-    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
+    per_cpu(schedule_data, cpu).schedule_lock = &c2rqd(new_ops, cpu)->lock;
 
     write_unlock(&prv->lock);
 }
@@ -3885,25 +3900,27 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     unsigned long flags;
     struct csched2_private *prv = csched2_priv(ops);
     struct csched2_runqueue_data *rqd;
-    int rqi;
+    struct csched2_pcpu *spc = pcpu;
 
     write_lock_irqsave(&prv->lock, flags);
 
+    /* Both alloc_pdata and init_pdata must have been called on this CPU. */
+    ASSERT(spc && spc->runq_id != -1 &&
+           cpumask_test_cpu(cpu, &prv->initialized));
+
     /*
-     * alloc_pdata is not implemented, so pcpu must be NULL. On the other
-     * hand, init_pdata must have been called for this pCPU.
+     * NB. Since per_cpu(schedule_data,cpu).sched_priv may have been set to
+     * something that is different than pcpu already, we **can't** use the
+     * csched2_pcpu(), c2r() and c2rqd() helpers in here!
      */
-    ASSERT(!pcpu && cpumask_test_cpu(cpu, &prv->initialized));
-    
-    /* Find the old runqueue and remove this cpu from it */
-    rqi = per_cpu(runq_map, cpu);
 
-    rqd = prv->rqd + rqi;
+    /* Find the old runqueue and remove this cpu from it */
+    rqd = prv->rqd + spc->runq_id;
 
     /* No need to save IRQs here, they're already disabled */
     spin_lock(&rqd->lock);
 
-    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqi);
+    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, 
spc->runq_id);
 
     __cpumask_clear_cpu(cpu, &rqd->idle);
     __cpumask_clear_cpu(cpu, &rqd->smt_idle);
@@ -3912,12 +3929,12 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     if ( cpumask_empty(&rqd->active) )
     {
         printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
-        deactivate_runqueue(prv, rqi);
+        deactivate_runqueue(prv, spc->runq_id);
     }
     else if ( rqd->pick_bias == cpu )
         rqd->pick_bias = cpumask_first(&rqd->active);
 
-    per_cpu(runq_map, cpu) = -1;
+    spc->runq_id = -1;
 
     spin_unlock(&rqd->lock);
 
@@ -3928,6 +3945,32 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     return;
 }
 
+static void
+csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched2_private *prv = csched2_priv(ops);
+
+    /*
+     * We want to be sure that either init_pdata has never been called, for
+     * this CPU in this scheduler, or that deinit_pdata has been called on
+     * it already.
+     */
+    ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
+
+    /*
+     * pcpu can be NULL, in case we are coming from CPU_UP_CANCELLED
+     * (because bringing up the CPU failed). And if it is not NULL, its
+     * runq_id must, for the same reasons explained above, be -1.
+     */
+    if ( pcpu != NULL )
+    {
+        struct csched2_pcpu *spc = pcpu;
+
+        ASSERT(spc->runq_id == -1);
+        xfree(spc);
+    }
+}
+
 static int
 csched2_init(struct scheduler *ops)
 {
@@ -4045,8 +4088,10 @@ static const struct scheduler sched_credit2_def = {
     .deinit         = csched2_deinit,
     .alloc_vdata    = csched2_alloc_vdata,
     .free_vdata     = csched2_free_vdata,
+    .alloc_pdata    = csched2_alloc_pdata,
     .init_pdata     = csched2_init_pdata,
     .deinit_pdata   = csched2_deinit_pdata,
+    .free_pdata     = csched2_free_pdata,
     .switch_sched   = csched2_switch_sched,
     .alloc_domdata  = csched2_alloc_domdata,
     .free_domdata   = csched2_free_domdata,

