# HG changeset patch
# Parent 1452fb248cd513832cfbbd1100b9b72a0dde7ea6
Rework locking for sched_adjust.

The main idea is to move as much as possible of the locking logic from
the generic code into the various pluggable schedulers. While at it,
the following is also accomplished:
 - pausing all the non-current VCPUs of a domain while changing its
   scheduling parameters is not effective in avoiding races and is
   prone to deadlock, so it is removed;
 - sedf needs a global lock to prevent races while adjusting domains'
   scheduling parameters (as credit and credit2 already have), so one
   is added.

Signed-off-by: Dario Faggioli

diff -r 1452fb248cd5 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Fri Dec 16 15:45:40 2011 +0100
+++ b/xen/common/sched_credit.c	Fri Dec 16 17:49:46 2011 +0100
@@ -161,6 +161,7 @@ struct csched_dom {
  * System-wide private data
  */
 struct csched_private {
+    /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
     spinlock_t lock;
     struct list_head active_sdom;
     uint32_t ncpus;
@@ -800,6 +801,10 @@ csched_dom_cntl(
     struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
+    /* Protect both get and put branches with the pluggable scheduler
+     * lock. Runq lock not needed anywhere in here. */
+    spin_lock_irqsave(&prv->lock, flags);
+
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
     {
         op->u.credit.weight = sdom->weight;
@@ -809,8 +814,6 @@ csched_dom_cntl(
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
 
-        spin_lock_irqsave(&prv->lock, flags);
-
         if ( op->u.credit.weight != 0 )
         {
             if ( !list_empty(&sdom->active_sdom_elem) )
@@ -824,9 +827,10 @@ csched_dom_cntl(
 
         if ( op->u.credit.cap != (uint16_t)~0U )
             sdom->cap = op->u.credit.cap;
 
-        spin_unlock_irqrestore(&prv->lock, flags);
     }
 
+    spin_unlock_irqrestore(&prv->lock, flags);
+
     return 0;
 }
 
diff -r 1452fb248cd5 xen/common/sched_credit2.c
--- a/xen/common/sched_credit2.c	Fri Dec 16 15:45:40 2011 +0100
+++ b/xen/common/sched_credit2.c	Fri Dec 16 17:49:46 2011 +0100
@@ -1384,6 +1384,10 @@ csched_dom_cntl(
     struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
+    /* Must hold csched_priv lock to read and update sdom,
+     * runq lock to update csvcs. */
+    spin_lock_irqsave(&prv->lock, flags);
+
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
     {
         op->u.credit2.weight = sdom->weight;
@@ -1397,10 +1401,6 @@ csched_dom_cntl(
             struct list_head *iter;
             int old_weight;
 
-            /* Must hold csched_priv lock to update sdom, runq lock to
-             * update csvcs. */
-            spin_lock_irqsave(&prv->lock, flags);
-
             old_weight = sdom->weight;
 
             sdom->weight = op->u.credit2.weight;
@@ -1411,22 +1411,23 @@ csched_dom_cntl(
                 struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem);
 
                 /* NB: Locking order is important here. Because we grab this lock here, we
-                 * must never lock csched_priv.lock if we're holding a runqueue
-                 * lock. */
-                vcpu_schedule_lock_irq(svc->vcpu);
+                 * must never lock csched_priv.lock if we're holding a runqueue lock.
+                 * Also, calling vcpu_schedule_lock() is enough, since IRQs have already
+                 * been disabled. */
+                vcpu_schedule_lock(svc->vcpu);
 
                 BUG_ON(svc->rqd != RQD(ops, svc->vcpu->processor));
 
                 svc->weight = sdom->weight;
                 update_max_weight(svc->rqd, svc->weight, old_weight);
 
-                vcpu_schedule_unlock_irq(svc->vcpu);
+                vcpu_schedule_unlock(svc->vcpu);
             }
-
-            spin_unlock_irqrestore(&prv->lock, flags);
         }
     }
 
+    spin_unlock_irqrestore(&prv->lock, flags);
+
     return 0;
 }
 
diff -r 1452fb248cd5 xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c	Fri Dec 16 15:45:40 2011 +0100
+++ b/xen/common/sched_sedf.c	Fri Dec 16 17:49:46 2011 +0100
@@ -61,6 +61,11 @@ struct sedf_dom_info {
     struct domain *domain;
 };
 
+struct sedf_priv_info {
+    /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
+    spinlock_t lock;
+};
+
 struct sedf_vcpu_info {
     struct vcpu *vcpu;
     struct list_head list;
@@ -115,6 +120,8 @@ struct sedf_cpu_info {
     s_time_t current_slice_expires;
 };
 
+#define SEDF_PRIV(_ops) \
+    ((struct sedf_priv_info *)((_ops)->sched_data))
 #define EDOM_INFO(d)   ((struct sedf_vcpu_info *)((d)->sched_priv))
 #define CPU_INFO(cpu)  \
     ((struct sedf_cpu_info *)per_cpu(schedule_data, cpu).sched_priv)
@@ -772,6 +779,31 @@ static struct task_slice sedf_do_extra_s
 }
 
 
+static int sedf_init(struct scheduler *ops)
+{
+    struct sedf_priv_info *prv;
+
+    prv = xzalloc(struct sedf_priv_info);
+    if ( prv == NULL )
+        return -ENOMEM;
+
+    ops->sched_data = prv;
+    spin_lock_init(&prv->lock);
+
+    return 0;
+}
+
+
+static void sedf_deinit(const struct scheduler *ops)
+{
+    struct sedf_priv_info *prv;
+
+    prv = SEDF_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+
 /* Main scheduling function
    Reasons for calling this function are:
    -timeslice for the current period used up
@@ -1320,22 +1352,15 @@ static void sedf_dump_cpu_state(const st
 
 
 /* Adjusts periods and slices of the domains accordingly to their weights. */
-static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_scheduler_op *cmd)
+static int sedf_adjust_weights(struct cpupool *c, int nr_cpus, int *sumw, s_time_t *sumt)
 {
     struct vcpu *p;
     struct domain *d;
-    unsigned int cpu, nr_cpus = cpumask_last(&cpu_online_map) + 1;
-    int *sumw = xzalloc_array(int, nr_cpus);
-    s_time_t *sumt = xzalloc_array(s_time_t, nr_cpus);
+    unsigned int cpu;
 
-    if ( !sumw || !sumt )
-    {
-        xfree(sumt);
-        xfree(sumw);
-        return -ENOMEM;
-    }
-
-    /* Sum across all weights. */
+    /* Sum across all weights. Notice that no runq locking is needed
+     * here: the caller holds sedf_priv_info.lock and we're not changing
+     * anything that is accessed during scheduling. */
     rcu_read_lock(&domlist_read_lock);
     for_each_domain_in_cpupool( d, c )
     {
@@ -1365,7 +1390,9 @@ static int sedf_adjust_weights(struct cp
     }
     rcu_read_unlock(&domlist_read_lock);
 
-    /* Adjust all slices (and periods) to the new weight. */
+    /* Adjust all slices (and periods) to the new weight. Unlike above, we
+     * need to take the runq lock for the various VCPUs: we're modifying
+     * slice and period, which are referenced during scheduling. */
     rcu_read_lock(&domlist_read_lock);
     for_each_domain_in_cpupool( d, c )
     {
@@ -1375,20 +1402,20 @@ static int sedf_adjust_weights(struct cp
                 continue;
 
             if ( EDOM_INFO(p)->weight )
             {
+                /* Interrupts already off */
+                vcpu_schedule_lock(p);
                 EDOM_INFO(p)->period_orig =
                     EDOM_INFO(p)->period  = WEIGHT_PERIOD;
                 EDOM_INFO(p)->slice_orig =
                     EDOM_INFO(p)->slice   =
                     (EDOM_INFO(p)->weight *
                      (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu])) / sumw[cpu];
+                vcpu_schedule_unlock(p);
             }
         }
     }
     rcu_read_unlock(&domlist_read_lock);
 
-    xfree(sumt);
-    xfree(sumw);
-
     return 0;
 }
 
@@ -1396,19 +1423,45 @@ static int sedf_adjust_weights(struct cp
 
 /* set or fetch domain scheduling parameters */
 static int sedf_adjust(const struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op)
 {
+    struct sedf_priv_info *prv = SEDF_PRIV(ops);
+    unsigned long flags;
+    unsigned int nr_cpus = cpumask_last(&cpu_online_map) + 1;
+    int *sumw = xzalloc_array(int, nr_cpus);
+    s_time_t *sumt = xzalloc_array(s_time_t, nr_cpus);
     struct vcpu *v;
-    int rc;
+    int rc = 0;
 
     PRINT(2,"sedf_adjust was called, domain-id %i new period %"PRIu64" "
           "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n",
           p->domain_id, op->u.sedf.period, op->u.sedf.slice, op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no");
 
+    /* Serialize against the pluggable scheduler lock to protect from
+     * concurrent updates. We need to take the runq lock for the VCPUs
+     * as well, since we are touching extraweight, weight, slice and
+     * period. As in sched_credit2.c, runq locks nest inside the
+     * pluggable scheduler lock. */
+    spin_lock_irqsave(&prv->lock, flags);
+
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo )
     {
+        /* These are used in sedf_adjust_weights() but have to be allocated in
+         * this function, as we need to avoid nesting xmem_pool_alloc's lock
+         * within our prv->lock. */
+        if ( !sumw || !sumt )
+        {
+            /* Check for errors here, the _getinfo branch doesn't care */
+            rc = -ENOMEM;
+            goto out;
+        }
+
         /* Check for sane parameters. */
         if ( !op->u.sedf.period && !op->u.sedf.weight )
-            return -EINVAL;
+        {
+            rc = -EINVAL;
+            goto out;
+        }
+
         if ( op->u.sedf.weight )
         {
             if ( (op->u.sedf.extratime & EXTRA_AWARE) &&
@@ -1417,59 +1470,78 @@ static int sedf_adjust(const struct sche
                 /* Weight-driven domains with extratime only. */
                 for_each_vcpu ( p, v )
                 {
+                    /* (Here and everywhere in the following) IRQs are already off,
+                     * hence vcpu_schedule_lock() is the one to use. */
+                    vcpu_schedule_lock(v);
                     EDOM_INFO(v)->extraweight = op->u.sedf.weight;
                     EDOM_INFO(v)->weight = 0;
                     EDOM_INFO(v)->slice = 0;
                     EDOM_INFO(v)->period = WEIGHT_PERIOD;
+                    vcpu_schedule_unlock(v);
                 }
             }
             else
             {
                 /* Weight-driven domains with real-time execution. */
-                for_each_vcpu ( p, v )
+                for_each_vcpu ( p, v ) {
+                    vcpu_schedule_lock(v);
                     EDOM_INFO(v)->weight = op->u.sedf.weight;
+                    vcpu_schedule_unlock(v);
+                }
             }
         }
         else
        {
+            /*
+             * Sanity checking: note that disabling extra weight requires
+             * that we set a non-zero slice.
+             */
+            if ( (op->u.sedf.period > PERIOD_MAX) ||
+                 (op->u.sedf.period < PERIOD_MIN) ||
+                 (op->u.sedf.slice  > op->u.sedf.period) ||
+                 (op->u.sedf.slice  < SLICE_MIN) )
+            {
+                rc = -EINVAL;
+                goto out;
+            }
+
             /* Time-driven domains. */
             for_each_vcpu ( p, v )
             {
-                /*
-                 * Sanity checking: note that disabling extra weight requires
-                 * that we set a non-zero slice.
-                 */
-                if ( (op->u.sedf.period > PERIOD_MAX) ||
-                     (op->u.sedf.period < PERIOD_MIN) ||
-                     (op->u.sedf.slice  > op->u.sedf.period) ||
-                     (op->u.sedf.slice  < SLICE_MIN) )
-                    return -EINVAL;
+                vcpu_schedule_lock(v);
                 EDOM_INFO(v)->weight = 0;
                 EDOM_INFO(v)->extraweight = 0;
                 EDOM_INFO(v)->period_orig =
                     EDOM_INFO(v)->period   = op->u.sedf.period;
                 EDOM_INFO(v)->slice_orig =
                     EDOM_INFO(v)->slice    = op->u.sedf.slice;
+                vcpu_schedule_unlock(v);
             }
         }
 
-        rc = sedf_adjust_weights(p->cpupool, op);
+        rc = sedf_adjust_weights(p->cpupool, nr_cpus, sumw, sumt);
         if ( rc )
-            return rc;
+            goto out;
 
         for_each_vcpu ( p, v )
        {
+            vcpu_schedule_lock(v);
             EDOM_INFO(v)->status  =
                 (EDOM_INFO(v)->status & ~EXTRA_AWARE) |
                 (op->u.sedf.extratime & EXTRA_AWARE);
             EDOM_INFO(v)->latency = op->u.sedf.latency;
             extraq_check(v);
+            vcpu_schedule_unlock(v);
         }
     }
     else if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
     {
         if ( p->vcpu[0] == NULL )
-            return -EINVAL;
+        {
+            rc = -EINVAL;
+            goto out;
+        }
+
         op->u.sedf.period    = EDOM_INFO(p->vcpu[0])->period;
         op->u.sedf.slice     = EDOM_INFO(p->vcpu[0])->slice;
         op->u.sedf.extratime = EDOM_INFO(p->vcpu[0])->status & EXTRA_AWARE;
@@ -1477,14 +1549,23 @@ static int sedf_adjust(const struct sche
         op->u.sedf.weight    = EDOM_INFO(p->vcpu[0])->weight;
     }
 
-    PRINT(2,"sedf_adjust_finished\n");
-    return 0;
+out:
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    xfree(sumt);
+    xfree(sumw);
+
+    PRINT(2,"sedf_adjust_finished with return code %d\n", rc);
+    return rc;
 }
 
+static struct sedf_priv_info _sedf_priv;
+
 const struct scheduler sched_sedf_def = {
-    .name     = "Simple EDF Scheduler",
-    .opt_name = "sedf",
-    .sched_id = XEN_SCHEDULER_SEDF,
+    .name       = "Simple EDF Scheduler",
+    .opt_name   = "sedf",
+    .sched_id   = XEN_SCHEDULER_SEDF,
+    .sched_data = &_sedf_priv,
 
     .init_domain    = sedf_init_domain,
     .destroy_domain = sedf_destroy_domain,
@@ -1498,6 +1579,9 @@ const struct scheduler sched_sedf_def =
     .alloc_domdata  = sedf_alloc_domdata,
     .free_domdata   = sedf_free_domdata,
 
+    .init           = sedf_init,
+    .deinit         = sedf_deinit,
+
     .do_schedule    = sedf_do_schedule,
     .pick_cpu       = sedf_pick_cpu,
     .dump_cpu_state = sedf_dump_cpu_state,
diff -r 1452fb248cd5 xen/common/schedule.c
--- a/xen/common/schedule.c	Fri Dec 16 15:45:40 2011 +0100
+++ b/xen/common/schedule.c	Fri Dec 16 17:49:46 2011 +0100
@@ -1005,7 +1005,6 @@ int sched_id(void)
 /* Adjust scheduling parameter for a given domain. */
 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
 {
-    struct vcpu *v;
     long ret;
 
     if ( (op->sched_id != DOM2OP(d)->sched_id) ||
@@ -1013,40 +1012,11 @@ long sched_adjust(struct domain *d, stru
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
         return -EINVAL;
 
-    /*
-     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
-     * we acquire the local schedule_lock to guard against concurrent updates.
-     *
-     * We only acquire the local schedule lock after we have paused all other
-     * VCPUs in this domain. There are two reasons for this:
-     * 1- We don't want to hold up interrupts as pausing a VCPU can
-     *    trigger a tlb shootdown.
-     * 2- Pausing other VCPUs involves briefly locking the schedule
-     *    lock of the CPU they are running on. This CPU could be the
-     *    same as ours.
-     */
-
-    for_each_vcpu ( d, v )
-    {
-        if ( v != current )
-            vcpu_pause(v);
-    }
-
-    if ( d == current->domain )
-        vcpu_schedule_lock_irq(current);
-
+    /* NB: the pluggable scheduler code needs to take care
+     * of locking by itself. */
     if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
 
-    if ( d == current->domain )
-        vcpu_schedule_unlock_irq(current);
-
-    for_each_vcpu ( d, v )
-    {
-        if ( v != current )
-            vcpu_unpause(v);
-    }
-
     return ret;
 }
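
The locking discipline this patch relies on is the same in all three schedulers: the per-scheduler private lock is taken first, and the per-CPU runqueue (schedule) locks nest strictly inside it, so the adjust paths and the scheduler proper cannot deadlock against each other. What follows is a minimal, self-contained userspace sketch of that nesting rule, for illustration only: it uses plain pthread spinlocks, and the names (sched_priv_lock, runq_lock, adjust_params, NR_CPUS, weight) are invented for the example rather than taken from the Xen sources.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

/* Outer lock: scheduler-wide, playing the role of csched_private.lock
 * or sedf_priv_info.lock in the patch. */
static pthread_spinlock_t sched_priv_lock;
/* Inner locks: one per runqueue, like the per-CPU schedule locks. */
static pthread_spinlock_t runq_lock[NR_CPUS];

static int weight[NR_CPUS];

static void adjust_params(int new_weight)
{
    /* Always take the scheduler-wide lock first: it serializes
     * concurrent parameter adjustments. */
    pthread_spin_lock(&sched_priv_lock);

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
    {
        /* Runqueue locks nest inside the outer lock; code holding a
         * runq lock must never try to take sched_priv_lock, or the
         * lock order would be inverted. */
        pthread_spin_lock(&runq_lock[cpu]);
        weight[cpu] = new_weight;
        pthread_spin_unlock(&runq_lock[cpu]);
    }

    pthread_spin_unlock(&sched_priv_lock);
}

int main(void)
{
    pthread_spin_init(&sched_priv_lock, PTHREAD_PROCESS_PRIVATE);
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        pthread_spin_init(&runq_lock[cpu], PTHREAD_PROCESS_PRIVATE);

    adjust_params(256);
    printf("weight[0] = %d\n", weight[0]);
    return 0;
}

(Compile with e.g. "gcc -O2 sketch.c -lpthread". In Xen itself the inner locks are taken via vcpu_schedule_lock(), and IRQs are already disabled by the outer spin_lock_irqsave(), which this sketch does not model.)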