diff -r f72d26c00002 xen/common/Makefile
--- a/xen/common/Makefile	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/common/Makefile	Mon Jun 15 23:24:59 2009 +0100
@@ -13,6 +13,7 @@
 obj-y += page_alloc.o
 obj-y += rangeset.o
 obj-y += sched_credit.o
+obj-y += sched_credit2.o
 obj-y += sched_sedf.o
 obj-y += schedule.o
 obj-y += shutdown.o
diff -r f72d26c00002 xen/common/sched_credit2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/sched_credit2.c	Mon Jun 15 23:24:59 2009 +0100
@@ -0,0 +1,1076 @@
+
+/****************************************************************************
+ * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
+ ****************************************************************************
+ *
+ *        File: common/sched_credit2.c
+ *      Author: George Dunlap
+ *
+ * Description: Credit-based SMP CPU scheduler
+ * Based on an earlier version by Emmanuel Ackaouy.
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+
+#if __i386__
+#define PRI_stime "lld"
+#else
+#define PRI_stime "ld"
+#endif
+
+#define iprintk(x...)
+#define d2printk(x...)
+//#define d2printk printk
+
+#define TRC_CSCHED2_TICK         TRC_SCHED_CLASS + 1
+#define TRC_CSCHED2_RUNQ_POS     TRC_SCHED_CLASS + 2
+#define TRC_CSCHED2_CREDIT_BURN  TRC_SCHED_CLASS + 3
+#define TRC_CSCHED2_CREDIT_ADD   TRC_SCHED_CLASS + 4
+#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS + 5
+
+/*
+ * Design:
+ *
+ * Credits represent a fixed amount of CPU time.  A VM accumulates credits
+ * relative to its weight, and consumes credits as it runs.
+ *
+ * Things that need to happen:
+ *  + Distributing credits
+ *  + Subtracting credits
+ *  + Changing priority based on credits
+ *
+ */
+
+/*
+ * CSCHED_STATS
+ *
+ * Manage very basic counters and stats.
+ *
+ * Useful for debugging live systems. The stats are displayed
+ * with runq dumps ('r' on the Xen console). */
+//#define CSCHED_STATS
+
+
+/*
+ * Basic constants
+ */
+#define CSCHED_DEFAULT_WEIGHT       256
+#define CSCHED_CREDITS_PER_MS       100
+#define CSCHED_MS_PER_TICK          10
+#define CSCHED_MS_PER_ACCT          100
+#define CSCHED_MIN_TIMER            MICROSECS(500)
+//#define CSCHED_MAX_SCHEDULE         MILLISECS(10)
+#define CSCHED_MAX_SCHEDULE         MILLISECS(2)
+#define CSCHED_DEFAULT_SLICE        MILLISECS(10)
+#define CSCHED_BOOST_TIME           MILLISECS(1)
+#define CSCHED_CREDIT_TO_STIME(_c)  ((_c)*(MILLISECS(1)/CSCHED_CREDITS_PER_MS))
+#define CSCHED_STIME_TO_CREDIT(_t)  ((_t)*CSCHED_CREDITS_PER_MS/MILLISECS(1))
+
+#define CSCHED_CREDITS_PER_ACCT     ( (CSCHED_MS_PER_ACCT) * CSCHED_CREDITS_PER_MS )
+
+#define CSCHED_IDLE_CREDIT          (-(1<<30))
+
+/*
+ * Flags
+ */
+#define __CSFLAG_unblock_boost 1
+#define CSFLAG_unblock_boost (1<<__CSFLAG_unblock_boost)
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_PCPU(_c)     \
+    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
+#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
+//#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+#define RUNQ(_cpu)          (&csched_priv.runq) /* GRQ */
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+    spinlock_t lock;
+    struct list_head sdom;
+    struct list_head svc;  /* List of all vcpus */
+    struct list_head runq; /* Global runqueue */
+    uint32_t ncpus;
+    cpumask_t idlers;      /* Used for initial placement. */
+
+    int global_weight;
+    int scale_factor;
+};
+
+/*
+ * Physical CPU
+ */
+struct csched_pcpu {
+    struct list_head runq;
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+    struct list_head global_elem; /* On the global vcpu list */
+    struct list_head sdom_elem;   /* On the domain vcpu list */
+    struct list_head runq_elem;   /* On the runqueue */
+
+    /* Up-pointers */
+    struct csched_dom *sdom;
+    struct vcpu *vcpu;
+
+    int credit;                   /* Written by csched_schedule() */
+    s_time_t start_time;          /* When we were scheduled (used for credit) */
+    //s_time_t boost_time;        /* How much boost time is left? */
+    unsigned flags;               /* 16 bits doesn't seem to play well with clear_bit() */
+
+};
+
+/*
+ * Domain
+ */
+struct csched_dom {
+    struct list_head vcpu;
+    struct list_head sdom_elem;
+    struct domain *dom;
+    uint16_t weight;
+    uint16_t nr_vcpus;
+};
+
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+
+/*
+ * Runqueue related code
+ */
+
+static /*inline*/ int
+__vcpu_on_runq(struct csched_vcpu *svc)
+{
+    return !list_empty(&svc->runq_elem);
+}
+
+static /*inline*/ struct csched_vcpu *
+__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct csched_vcpu, runq_elem);
+}
+
+static int
+__runq_insert(struct list_head *runq, struct csched_vcpu *svc, s_time_t now)
+{
+    struct list_head *iter;
+    int pos = 0;
+
+    d2printk("rqi d%dv%d\n",
+             svc->vcpu->domain->domain_id,
+             svc->vcpu->vcpu_id);
+
+    list_for_each( iter, runq )
+    {
+        struct csched_vcpu * iter_svc = __runq_elem(iter);
+
+        if ( svc->credit > iter_svc->credit )
+        {
+            d2printk(" p%d d%dv%d\n",
+                     pos,
+                     iter_svc->vcpu->domain->domain_id,
+                     iter_svc->vcpu->vcpu_id);
+            break;
+        }
+        pos++;
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+
+    return pos;
+}
+
+static void
+runq_insert(unsigned int cpu, struct csched_vcpu *svc, s_time_t now)
+{
+    struct list_head * runq = RUNQ(cpu);
+    int pos = 0;
+
+    ASSERT( spin_is_locked(&csched_priv.lock) ); /* GRQ */
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    /* FIXME: Check runqueue handles this cpu*/
+    //BUG_ON( cpu != svc->vcpu->processor );
+
+    pos = __runq_insert(runq, svc, now);
+
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned pos;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.pos = pos;
+        trace_var(TRC_CSCHED2_RUNQ_POS, 1,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+
+    return;
+}
+
+static inline void
+__runq_remove(struct csched_vcpu *svc)
+{
+    BUG_ON( !__vcpu_on_runq(svc) );
+    list_del_init(&svc->runq_elem);
+}
+
+void burn_credits(struct csched_vcpu *, s_time_t);
+
+/* Check to see if the item on the runqueue is higher priority than what's
+ * currently running; if so, wake up the processor */
+static /*inline*/ void
+runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now)
+{
+    int i, min=-1, credit_min=1<<30;
+    cpumask_t mask;
+    struct csched_vcpu *cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+
+    ASSERT(cur);
+    cpus_clear(mask);
+
+    d2printk("rqt d%dv%d cd%dv%d\n",
+             new->vcpu->domain->domain_id,
+             new->vcpu->vcpu_id,
+             cur->vcpu->domain->domain_id,
+             cur->vcpu->vcpu_id);
+
+    /* Find the cpu in this queue group that has the lowest credits */
+    /* FIXME: HACK! */
+#define HACK_START_CPU 0
+#define HACK_MAX_CPUS 2
+    for ( i=HACK_START_CPU; i<HACK_START_CPU+HACK_MAX_CPUS; i++ )
+    {
+        cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);
+
+        if ( new->credit > cur->credit && cur->credit < credit_min )
+        {
+            min = i;
+            credit_min = cur->credit;
+        }
+        {
+            struct {
+                unsigned dom:16,vcpu:16;
+                unsigned credit;
+            } d;
+            d.dom = cur->vcpu->domain->domain_id;
+            d.vcpu = cur->vcpu->vcpu_id;
+            d.credit = cur->credit;
+            trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
+                      sizeof(d),
+                      (unsigned char *)&d);
+        }
+    }
+
+    if ( min != -1 )
+    {
+        cpu_set(min, mask);
+    }
+
+    /* Send scheduler interrupts to designated CPUs */
+    if ( !cpus_empty(mask) )
+        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
+}
+
+static void
+runq_sort(/* runq, */s_time_t now)
+{
+    struct list_head *runq = RUNQ(0)/*GRQ*/, *elem, *n;
+    struct list_head temp;
+    //int flags;
+
+
+    /* GRQ */
+    //spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+    //spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( list_empty(runq) )
+        goto out_unlock;
+
+    /* FIXME: Insertion sort -- yuck! But it's quick-and-dirty... */
+
+    /* First, take everything off and add it to a temporary list */
+    temp.next = runq->next;
+    temp.next->prev = &temp;
+    temp.prev = runq->prev;
+    temp.prev->next = &temp;
+
+    runq->next = runq->prev = runq;
+
+    /* Now insert everything again */
+    list_for_each_safe ( elem, n, &temp )
+    {
+        struct csched_vcpu *svc_elem;
+
+        list_del_init(elem);
+
+        svc_elem = __runq_elem(elem);
+        __runq_insert(runq, svc_elem, now);
+    }
+
+out_unlock:
+    /* GRQ */
+    //spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+    //spin_unlock_irqrestore(&csched_priv.lock, flags);
+    ;
+}
+
+/*
+ * Credit-related code
+ */
+
+void burn_credits(struct csched_vcpu *svc, s_time_t now)
+{
+    s_time_t delta;
+
+    /* Assert svc is current */
+    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
+
+    if ( is_idle_vcpu(svc->vcpu) )
+    {
+        ASSERT(svc->credit == CSCHED_IDLE_CREDIT);
+        return;
+    }
+
+    delta = now - svc->start_time;
+
+    if ( delta > 0 ) {
+        /* This will round down; should we consider rounding up...? */
+        svc->credit -= CSCHED_STIME_TO_CREDIT(delta);
+        svc->start_time = now;
+        d2printk("b d%dv%d c%d\n",
+                 svc->vcpu->domain->domain_id,
+                 svc->vcpu->vcpu_id,
+                 svc->credit);
+    } else {
+        d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
+                 __func__, now, svc->start_time);
+    }
+
+    /* TRACE */
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned credit;
+            int delta;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.credit = svc->credit;
+        d.delta = delta;
+        trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+}
+
+/* Deposit credit if the highest runnable process has run out of credit */
+void credit_deposit(s_time_t now)
+{
+    struct list_head *iter;
+
+    list_for_each ( iter, &csched_priv.svc )
+    {
+        struct csched_vcpu *svc;
+
+        svc = list_entry(iter, struct csched_vcpu, global_elem);
+
+        ASSERT( ! is_idle_vcpu(svc->vcpu) );
+
+        svc->credit = CSCHED_STIME_TO_CREDIT(svc->sdom->weight
+                                             * csched_priv.scale_factor);
+
+        d2printk("d d%dv%d c%d\n",
+                 svc->vcpu->domain->domain_id,
+                 svc->vcpu->vcpu_id,
+                 svc->credit);
+
+        /* TRACE */
+        {
+            struct {
+                unsigned dom:16,vcpu:16;
+                unsigned credit;
+            } d;
+            d.dom = svc->vcpu->domain->domain_id;
+            d.vcpu = svc->vcpu->vcpu_id;
+            d.credit = svc->credit;
+            trace_var(TRC_CSCHED2_CREDIT_ADD, 1,
+                      sizeof(d),
+                      (unsigned char *)&d);
+        }
+
+        if ( svc->start_time < now )
+            svc->start_time = now;
+    }
+
+    runq_sort(/*runq,*/now);
+}
+
+/* Scale factor: multiply weight so that everyone can run every 50 ms */
+void update_scale_factor(void)
+{
+    if ( csched_priv.global_weight > 0 )
+        csched_priv.scale_factor = MILLISECS(50) / csched_priv.global_weight;
+    else
+        csched_priv.scale_factor = 1;
+
+    printk("%s: gw %d f %d\n",
+           __func__, csched_priv.global_weight, csched_priv.scale_factor);
+}
+
+/*
+ * Initialization code
+ */
+
+static int
+csched_pcpu_init(int cpu)
+{
+    struct csched_pcpu *spc;
+    unsigned long flags;
+
+    /* Allocate per-PCPU info */
+    spc = xmalloc(struct csched_pcpu);
+    if ( spc == NULL )
+        return -1;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Initialize/update system-wide config */
+    csched_priv.ncpus++;
+
+    INIT_LIST_HEAD(&spc->runq);
+    per_cpu(schedule_data, cpu).sched_priv = spc;
+
+    /* Start off idling... */
+    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
+    cpu_set(cpu, csched_priv.idlers);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    return 0;
+}
+
+#ifndef NDEBUG
+static /*inline*/ void
+__csched_vcpu_check(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+
+    BUG_ON( svc->vcpu != vc );
+    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
+    if ( sdom )
+    {
+        BUG_ON( is_idle_vcpu(vc) );
+        BUG_ON( sdom->dom != vc->domain );
+    }
+    else
+    {
+        BUG_ON( !is_idle_vcpu(vc) );
+    }
+}
+#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+#if 0
+static /*inline*/ int
+__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
+{
+    /*
+     * Don't pick up work that's in the peer's scheduling tail. Also only pick
+     * up work that's allowed to run on our CPU.
+     */
+    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+}
+#endif
+
+static int
+csched_cpu_pick(struct vcpu *vc)
+{
+    cpumask_t cpus;
+    cpumask_t idlers;
+    int cpu;
+
+    /*
+     * Pick from online CPUs in VCPU's affinity mask, giving a
+     * preference to its current processor if it's in there.
+     */
+    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    cpu = cpu_isset(vc->processor, cpus)
+            ? vc->processor
+            : cycle_cpu(vc->processor, cpus);
+    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
+
+    /*
+     * Try to find an idle processor within the above constraints.
+     *
+     * In multi-core and multi-threaded CPUs, not all idle execution
+     * vehicles are equal!
+     *
+     * We give preference to the idle execution vehicle with the most
+     * idling neighbours in its grouping. This distributes work across
+     * distinct cores first and guarantees we don't do something stupid
+     * like run two VCPUs on co-hyperthreads while there are idle cores
+     * or sockets.
+     */
+    idlers = csched_priv.idlers;
+    cpu_set(cpu, idlers);
+    cpus_and(cpus, cpus, idlers);
+    cpu_clear(cpu, cpus);
+
+    while ( !cpus_empty(cpus) )
+    {
+        cpumask_t cpu_idlers;
+        cpumask_t nxt_idlers;
+        int nxt;
+
+        nxt = cycle_cpu(cpu, cpus);
+
+        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
+        {
+            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
+            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
+            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
+        }
+        else
+        {
+            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
+            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
+            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
+        }
+
+        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
+        {
+            cpu = nxt;
+            cpu_clear(cpu, cpus);
+        }
+        else
+        {
+            cpus_andnot(cpus, cpus, nxt_idlers);
+        }
+    }
+
+    return cpu;
+}
+
+static int
+csched_vcpu_init(struct vcpu *vc)
+{
+    struct domain * const dom = vc->domain;
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+    struct csched_vcpu *svc;
+
+    printk("%s: Initializing d%dv%d\n",
+           __func__, dom->domain_id, vc->vcpu_id);
+
+    /* Allocate per-VCPU info */
+    svc = xmalloc(struct csched_vcpu);
+    if ( svc == NULL )
+        return -1;
+
+    INIT_LIST_HEAD(&svc->global_elem);
+    INIT_LIST_HEAD(&svc->sdom_elem);
+    INIT_LIST_HEAD(&svc->runq_elem);
+
+    svc->sdom = sdom;
+    svc->vcpu = vc;
+    svc->credit = 0; /* FIXME: maybe something else? */
+    svc->flags = 0U;
+    vc->sched_priv = svc;
+
+    if ( sdom ) {
+        list_add_tail(&svc->sdom_elem, &sdom->vcpu);
+        sdom->nr_vcpus++;
+        csched_priv.global_weight += sdom->weight;
+        update_scale_factor();
+    } else
+        printk("Strange, sdom NULL!\n");
+
+    if ( ! is_idle_vcpu(vc) )
+        list_add_tail(&svc->global_elem, &csched_priv.svc);
+    else
+        svc->credit = CSCHED_IDLE_CREDIT;
+
+    /* Allocate per-PCPU info */
+    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
+    {
+        if ( csched_pcpu_init(vc->processor) != 0 )
+            return -1;
+    }
+
+    CSCHED_VCPU_CHECK(vc);
+    return 0;
+}
+
+static void
+csched_vcpu_destroy(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+    unsigned long flags;
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( !list_empty(&svc->runq_elem) );
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Remove from sdom list */
+    list_del_init(&svc->global_elem);
+    list_del_init(&svc->sdom_elem);
+
+    sdom->nr_vcpus--;
+    csched_priv.global_weight -= sdom->weight;
+    update_scale_factor();
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_sleep(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( per_cpu(schedule_data, vc->processor).curr == vc )
+        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+    else if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+}
+
+static void
+csched_vcpu_wake(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+    s_time_t now = 0;
+    int flags;
+
+    d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    /* GRQ */
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+
+    /* Make sure svc priority mod happens before runq check */
+    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
+    {
+        goto out;
+    }
+    if ( unlikely(__vcpu_on_runq(svc)) )
+    {
+        /* If we've boosted someone that's already on a runqueue, prioritize
+         * it and inform the cpu in question. */
+        goto out;
+    }
+
+    now = NOW();
+
+    /* Put the VCPU on the runq */
+    runq_insert(cpu, svc, now);
+    runq_tickle(cpu, svc, now);
+
+out:
+    spin_unlock_irqrestore(&csched_priv.lock, flags); /* GRQ */
+    d2printk("w-\n");
+    return;
+}
+
+static int
+csched_dom_cntl(
+    struct domain *d,
+    struct xen_domctl_scheduler_op *op)
+{
+    struct csched_dom * const sdom = CSCHED_DOM(d);
+    unsigned long flags;
+
+    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
+    {
+        op->u.credit2.weight = sdom->weight;
+    }
+    else
+    {
+        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
+
+        if ( op->u.credit2.weight != 0 )
+        {
+            spin_lock_irqsave(&csched_priv.lock, flags);
+
+            csched_priv.global_weight -= sdom->weight * sdom->nr_vcpus;
+
+            sdom->weight = op->u.credit2.weight;
+
+            csched_priv.global_weight += sdom->weight * sdom->nr_vcpus;
+
+            update_scale_factor();
+
+            spin_unlock_irqrestore(&csched_priv.lock, flags);
+        }
+    }
+
+    return 0;
+}
+
+static int
+csched_dom_init(struct domain *dom)
+{
+    struct csched_dom *sdom;
+
+    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = xmalloc(struct csched_dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
+    /* Initialize credit and weight */
+    INIT_LIST_HEAD(&sdom->vcpu);
+    INIT_LIST_HEAD(&sdom->sdom_elem);
+    sdom->dom = dom;
+    sdom->weight = CSCHED_DEFAULT_WEIGHT;
+    sdom->nr_vcpus = 0;
+
+    dom->sched_priv = sdom;
+
+    list_add_tail(&sdom->sdom_elem, &csched_priv.sdom);
+
+    return 0;
+}
+
+static void
+csched_dom_destroy(struct domain *dom)
+{
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+
+    BUG_ON(!list_empty(&sdom->vcpu));
+
+    list_del_init(&sdom->sdom_elem);
+
+    xfree(CSCHED_DOM(dom));
+}
+
+#if 0
+static void csched_load_balance(int cpu)
+{
+    /* FIXME: Do something. */
+}
+#endif
+
+/* How long should we let this vcpu run for? */
+static s_time_t
+csched_runtime(struct csched_vcpu *snext, struct csched_pcpu *spc,
+               s_time_t now)
+{
+    s_time_t time = 0;
+
+    /* Basic timeslice */
+    time = CSCHED_CREDIT_TO_STIME(snext->credit);
+
+    /* Check limits */
+    if ( time < CSCHED_MIN_TIMER )
+        time = CSCHED_MIN_TIMER;
+    else if ( time > CSCHED_MAX_SCHEDULE )
+        time = CSCHED_MAX_SCHEDULE;
+
+    return time;
+}
+
+void __dump_execstate(void *unused);
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */
+static struct task_slice
+csched_schedule(s_time_t now)
+{
+    const int cpu = smp_processor_id();
+    struct list_head * const runq = RUNQ(cpu);
+    struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_vcpu *snext;
+    struct task_slice ret;
+    int flags;
+
+    CSCHED_VCPU_CHECK(current);
+
+    d2printk("sc p%d c d%dv%d now %"PRI_stime"\n",
+             cpu,
+             scurr->vcpu->domain->domain_id,
+             scurr->vcpu->vcpu_id,
+             now);
+
+
+    /* GRQ */
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Update credits */
+    burn_credits(scurr, now);
+
+    /*
+     * Select next runnable local VCPU (ie top of local runq)
+     * Insert will cause credits to be updated.
+     */
+    if ( vcpu_runnable(current) )
+        runq_insert(cpu, scurr, now);
+    else
+        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+
+    snext = __runq_elem(runq->next);
+
+    if ( snext->credit <= 0 && !is_idle_vcpu(snext->vcpu) )
+    {
+        /* If the next item has <= 0 credits, update credits and resort */
+        credit_deposit(now);
+        snext = __runq_elem(runq->next);
+    }
+
+    __runq_remove(snext);
+
+    /* HACK. Multiple cpus are sharing a runqueue; but due to the way
+     * things are set up, it's possible for a vcpu to be scheduled out on one
+     * cpu and put on the runqueue, and taken off by another cpu, before the first
+     * cpu has actually completed the context switch (indicated by is_running).
+     *
+     * So in general we just wait for is_running to be false, always checking
+     * to see if it should still be put on the runqueue (i.e., it may be
+     * paused).
+     *
+     * Even so, occasionally we get into a deadlock situation. I haven't found
+     * out who the other "hold-and-wait"-er is because they seem to have
+     * irqs disabled. In any case, if we spin for 65K times, we assume there's
+     * a deadlock and put the vcpu on the tail of the runqueue (yes, behind the
+     * idle vcpus). It will be re-ordered at most 10ms later when we do a
+     * runqueue sort. */
+    if ( snext != scurr && snext->vcpu->is_running )
+    {
+        int count = 0;
+        do {
+            BUG_ON(count < 0);
+            count++;
+            if ( (count & 0xffff) == 0 ) {
+                printk("p%d d%dv%d running on p%d, passed %d iterations!\n",
+                       cpu, snext->vcpu->domain->domain_id,
+                       snext->vcpu->vcpu_id,
+                       snext->vcpu->processor,
+                       count);
+                if ( vcpu_runnable(snext->vcpu) )
+                    list_add_tail(&snext->runq_elem, runq);
+
+            } else if ( vcpu_runnable(snext->vcpu) )
+                runq_insert(cpu, snext, now);
+
+            BUG_ON(list_empty(runq));
+
+            snext = __runq_elem(runq->next);
+            __runq_remove(snext);
+        } while ( snext != scurr && snext->vcpu->is_running );
+        //printk("done\n");
+    }
+    /* GRQ: Fast and loose! */
+    snext->vcpu->processor = cpu;
+
+    /* GRQ */
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    /*
+     * Update idlers mask if necessary. When we're idling, other CPUs
+     * will tickle us when they get extra work.
+     */
+    if ( is_idle_vcpu(snext->vcpu) )
+    {
+        if ( !cpu_isset(cpu, csched_priv.idlers) )
+            cpu_set(cpu, csched_priv.idlers);
+    }
+    else if ( cpu_isset(cpu, csched_priv.idlers) )
+    {
+        cpu_clear(cpu, csched_priv.idlers);
+    }
+
+    if ( !is_idle_vcpu(snext->vcpu) )
+        snext->start_time = now;
+    /*
+     * Return task to run next...
+     */
+    ret.time = csched_runtime(snext, spc, now);
+    ret.task = snext->vcpu;
+
+    CSCHED_VCPU_CHECK(ret.task);
+    return ret;
+}
+
+static void
+csched_dump_vcpu(struct csched_vcpu *svc)
+{
+    struct csched_dom * const sdom = svc->sdom;
+
+    printk("[%i.%i] flags=%x cpu=%i",
+           svc->vcpu->domain->domain_id,
+           svc->vcpu->vcpu_id,
+           svc->flags,
+           svc->vcpu->processor);
+
+    if ( sdom )
+    {
+        printk(" credit=%i [w=%u]", svc->credit, sdom->weight);
+    }
+
+    printk("\n");
+}
+
+static void
+csched_dump_pcpu(int cpu)
+{
+    struct list_head *runq, *iter;
+    struct csched_pcpu *spc;
+    struct csched_vcpu *svc;
+    int loop;
+    char cpustr[100];
+
+    spc = CSCHED_PCPU(cpu);
+    runq = &spc->runq;
+
+    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_sibling_map[cpu]);
+    printk(" sibling=%s, ", cpustr);
+    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_core_map[cpu]);
+    printk("core=%s\n", cpustr);
+
+    /* current VCPU */
+    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    if ( svc )
+    {
+        printk("\trun: ");
+        csched_dump_vcpu(svc);
+    }
+
+    loop = 0;
+    list_for_each( iter, runq )
+    {
+        svc = __runq_elem(iter);
+        if ( svc )
+        {
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+static void
+csched_dump(void)
+{
+    struct list_head *iter_sdom, *iter_svc;
+    int loop;
+    char idlers_buf[100];
+
+    printk("info:\n"
+           "\tncpus              = %u\n"
+           "\tdefault-weight     = %d\n"
+           "\tms per tick        = %dms\n"
+           "\tcredits per ms     = %d\n"
+           "\tms per acct        = %dms\n"
+           "\tglobal weight      = %d\n"
+           "\tscale factor       = %d\n",
+           csched_priv.ncpus,
+           CSCHED_DEFAULT_WEIGHT,
+           CSCHED_MS_PER_TICK,
+           CSCHED_CREDITS_PER_MS,
+           CSCHED_MS_PER_ACCT,
+           csched_priv.global_weight,
+           csched_priv.scale_factor);
+
+    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+    printk("idlers: %s\n", idlers_buf);
+
+    printk("active vcpus:\n");
+    loop = 0;
+    list_for_each( iter_sdom, &csched_priv.sdom )
+    {
+        struct csched_dom *sdom;
+        sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);
+
+        list_for_each( iter_svc, &sdom->vcpu )
+        {
+            struct csched_vcpu *svc;
+            svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem);
+
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+static void
+csched_init(void)
+{
+    spin_lock_init(&csched_priv.lock);
+    INIT_LIST_HEAD(&csched_priv.sdom);
+    INIT_LIST_HEAD(&csched_priv.svc);
+    INIT_LIST_HEAD(&csched_priv.runq); /* GRQ */
+
+    csched_priv.ncpus = 0;
+    cpus_clear(csched_priv.idlers);
+    csched_priv.global_weight = 0;
+    csched_priv.scale_factor = 1;
+}
+
+struct scheduler sched_credit2_def = {
+    .name           = "SMP Credit Scheduler rev2",
+    .opt_name       = "credit2",
+    .sched_id       = XEN_SCHEDULER_CREDIT2,
+
+    .init_domain    = csched_dom_init,
+    .destroy_domain = csched_dom_destroy,
+
+    .init_vcpu      = csched_vcpu_init,
+    .destroy_vcpu   = csched_vcpu_destroy,
+
+    .sleep          = csched_vcpu_sleep,
+    .wake           = csched_vcpu_wake,
+
+    .adjust         = csched_dom_cntl,
+
+    .pick_cpu       = csched_cpu_pick,
+    .do_schedule    = csched_schedule,
+
+    .dump_cpu_state = csched_dump_pcpu,
+    .dump_settings  = csched_dump,
+    .init           = csched_init,
+};
diff -r f72d26c00002 xen/common/schedule.c
--- a/xen/common/schedule.c	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/common/schedule.c	Mon Jun 15 23:24:59 2009 +0100
@@ -58,9 +58,11 @@
 
 extern struct scheduler sched_sedf_def;
 extern struct scheduler sched_credit_def;
+extern struct scheduler sched_credit2_def;
 static struct scheduler *schedulers[] = {
     &sched_sedf_def,
     &sched_credit_def,
+    &sched_credit2_def,
     NULL
 };
 
diff -r f72d26c00002 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/include/public/domctl.h	Mon Jun 15 23:24:59 2009 +0100
@@ -297,6 +297,7 @@
 /* Scheduler types. */
 #define XEN_SCHEDULER_SEDF     4
 #define XEN_SCHEDULER_CREDIT   5
+#define XEN_SCHEDULER_CREDIT2  6
 /* Set or get info? */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -315,6 +316,9 @@
             uint16_t weight;
             uint16_t cap;
         } credit;
+        struct xen_domctl_sched_credit2 {
+            uint16_t weight;
+        } credit2;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
diff -r f72d26c00002 xen/include/public/trace.h
--- a/xen/include/public/trace.h	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/include/public/trace.h	Mon Jun 15 23:24:59 2009 +0100
@@ -53,6 +53,7 @@
 #define TRC_HVM_HANDLER   0x00082000   /* various HVM handlers      */
 
 #define TRC_SCHED_MIN       0x00021000   /* Just runstate changes */
+#define TRC_SCHED_CLASS     0x00022000   /* Scheduler-specific     */
 #define TRC_SCHED_VERBOSE   0x00028000   /* More inclusive scheduling */
 
 /* Trace events per class */
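
For anyone wiring this up from the toolstack side: the new credit2 branch of
xen_domctl_scheduler_op is driven the same way as the existing credit one.
The sketch below is illustrative only and is not part of the patch; it mirrors
libxc's xc_sched_credit_domain_set() and assumes the usual do_domctl() helper
and xen_domctl wrapper fields, so the helper name and header path here are
assumptions rather than anything this patch adds.

    #include <string.h>
    #include "xc_private.h"   /* assumed: do_domctl(), struct xen_domctl (libxc) */

    /* Illustrative sketch, not part of this patch: set a domain's credit2
     * weight through the new XEN_SCHEDULER_CREDIT2 / credit2.weight fields. */
    static int set_credit2_weight(int xc_handle, uint32_t domid, uint16_t weight)
    {
        struct xen_domctl domctl;

        memset(&domctl, 0, sizeof(domctl));
        domctl.cmd = XEN_DOMCTL_scheduler_op;
        domctl.domain = (domid_t)domid;
        domctl.u.scheduler_op.sched_id = XEN_SCHEDULER_CREDIT2;
        domctl.u.scheduler_op.cmd = XEN_DOMCTL_SCHEDOP_putinfo;
        /* csched_dom_cntl() above treats weight == 0 as "leave unchanged". */
        domctl.u.scheduler_op.u.credit2.weight = weight;

        return do_domctl(xc_handle, &domctl);
    }

A matching XEN_DOMCTL_SCHEDOP_getinfo call simply reads op->u.credit2.weight
back, which is all csched_dom_cntl() implements so far.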