diff -r f72d26c00002 xen/common/Makefile
--- a/xen/common/Makefile	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/common/Makefile	Mon Jun 15 23:24:59 2009 +0100
@@ -13,6 +13,7 @@
 obj-y += page_alloc.o
 obj-y += rangeset.o
 obj-y += sched_credit.o
+obj-y += sched_credit2.o
 obj-y += sched_sedf.o
 obj-y += schedule.o
 obj-y += shutdown.o
diff -r f72d26c00002 xen/common/sched_credit2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/sched_credit2.c	Mon Jun 15 23:24:59 2009 +0100
@@ -0,0 +1,1076 @@
+
+/****************************************************************************
+ * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
+ ****************************************************************************
+ *
+ *        File: common/sched_credit2.c
+ *      Author: George Dunlap
+ *
+ * Description: Credit-based SMP CPU scheduler
+ * Based on an earlier version by Emmanuel Ackaouy.
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+
+#if __i386__
+#define PRI_stime "lld"
+#else
+#define PRI_stime "ld"
+#endif
+
+#define iprintk(x...)
+#define d2printk(x...)
+//#define d2printk printk
+
+#define TRC_CSCHED2_TICK         TRC_SCHED_CLASS + 1
+#define TRC_CSCHED2_RUNQ_POS     TRC_SCHED_CLASS + 2
+#define TRC_CSCHED2_CREDIT_BURN  TRC_SCHED_CLASS + 3
+#define TRC_CSCHED2_CREDIT_ADD   TRC_SCHED_CLASS + 4
+#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS + 5
+
+/*
+ * Design:
+ *
+ * Credits represent a fixed amount of CPU time.  A VM accumulates credits
+ * relative to its weight, and consumes credits as it runs.
+ *
+ * Things that need to happen:
+ *  + Distributing credits
+ *  + Subtracting credits
+ *  + Changing priority based on credits
+ *
+ */
+
+/*
+ * CSCHED_STATS
+ *
+ * Manage very basic counters and stats.
+ *
+ * Useful for debugging live systems. The stats are displayed
+ * with runq dumps ('r' on the Xen console). */
+//#define CSCHED_STATS
+
+
+/*
+ * Basic constants
+ */
+#define CSCHED_DEFAULT_WEIGHT       256
+#define CSCHED_CREDITS_PER_MS       100
+#define CSCHED_MS_PER_TICK          10
+#define CSCHED_MS_PER_ACCT          100
+#define CSCHED_MIN_TIMER            MICROSECS(500)
+//#define CSCHED_MAX_SCHEDULE         MILLISECS(10)
+#define CSCHED_MAX_SCHEDULE         MILLISECS(2)
+#define CSCHED_DEFAULT_SLICE        MILLISECS(10)
+#define CSCHED_BOOST_TIME           MILLISECS(1)
+#define CSCHED_CREDIT_TO_STIME(_c)  ((_c)*(MILLISECS(1)/CSCHED_CREDITS_PER_MS))
+#define CSCHED_STIME_TO_CREDIT(_t)  ((_t)*CSCHED_CREDITS_PER_MS/MILLISECS(1))
+
+#define CSCHED_CREDITS_PER_ACCT     ( (CSCHED_MS_PER_ACCT) * CSCHED_CREDITS_PER_MS )
+
+#define CSCHED_IDLE_CREDIT          (-(1<<30))
+
+/*
+ * Flags
+ */
+#define __CSFLAG_unblock_boost 1
+#define CSFLAG_unblock_boost (1<<__CSFLAG_unblock_boost)
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_PCPU(_c)     \
+    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
+#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
+//#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+#define RUNQ(_cpu)          (&csched_priv.runq) /* GRQ */
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+    spinlock_t lock;
+    struct list_head sdom;
+    struct list_head svc;  /* List of all vcpus */
+    struct list_head runq; /* Global runqueue */
+    uint32_t ncpus;
+    cpumask_t idlers;      /* Used for initial placement. */
+
+    int global_weight;
+    int scale_factor;
+};
+
+/*
+ * Physical CPU
+ */
+struct csched_pcpu {
+    struct list_head runq;
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+    struct list_head global_elem; /* On the global vcpu list */
+    struct list_head sdom_elem;   /* On the domain vcpu list */
+    struct list_head runq_elem;   /* On the runqueue */
+
+    /* Up-pointers */
+    struct csched_dom *sdom;
+    struct vcpu *vcpu;
+
+    int credit;                   /* Written by csched_schedule() */
+    s_time_t start_time;          /* When we were scheduled (used for credit) */
+    //s_time_t boost_time;        /* How much boost time is left? */
+    unsigned flags;               /* 16 bits doesn't seem to play well with clear_bit() */
+
+};
+
+/*
+ * Domain
+ */
+struct csched_dom {
+    struct list_head vcpu;
+    struct list_head sdom_elem;
+    struct domain *dom;
+    uint16_t weight;
+    uint16_t nr_vcpus;
+};
+
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+
+/*
+ * Runqueue related code
+ */
+
+static /*inline*/ int
+__vcpu_on_runq(struct csched_vcpu *svc)
+{
+    return !list_empty(&svc->runq_elem);
+}
+
+static /*inline*/ struct csched_vcpu *
+__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct csched_vcpu, runq_elem);
+}
+
+static int
+__runq_insert(struct list_head *runq, struct csched_vcpu *svc, s_time_t now)
+{
+    struct list_head *iter;
+    int pos = 0;
+
+    d2printk("rqi d%dv%d\n",
+             svc->vcpu->domain->domain_id,
+             svc->vcpu->vcpu_id);
+
+    list_for_each( iter, runq )
+    {
+        struct csched_vcpu * iter_svc = __runq_elem(iter);
+
+        if ( svc->credit > iter_svc->credit )
+        {
+            d2printk(" p%d d%dv%d\n",
+                     pos,
+                     iter_svc->vcpu->domain->domain_id,
+                     iter_svc->vcpu->vcpu_id);
+            break;
+        }
+        pos++;
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+
+    return pos;
+}
+
+static void
+runq_insert(unsigned int cpu, struct csched_vcpu *svc, s_time_t now)
+{
+    struct list_head * runq = RUNQ(cpu);
+    int pos = 0;
+
+    ASSERT( spin_is_locked(&csched_priv.lock) ); /* GRQ */
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    /* FIXME: Check runqueue handles this cpu*/
+    //BUG_ON( cpu != svc->vcpu->processor );
+
+    pos = __runq_insert(runq, svc, now);
+
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned pos;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.pos = pos;
+        trace_var(TRC_CSCHED2_RUNQ_POS, 1,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+
+    return;
+}
+
+static inline void
+__runq_remove(struct csched_vcpu *svc)
+{
+    BUG_ON( !__vcpu_on_runq(svc) );
+    list_del_init(&svc->runq_elem);
+}
+
+void burn_credits(struct csched_vcpu *, s_time_t);
+
+/* Check to see if the item on the runqueue is higher priority than what's
+ * currently running; if so, wake up the processor */
+static /*inline*/ void
+runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now)
+{
+    int i, min=-1, credit_min=1<<30;
+    cpumask_t mask;
+    struct csched_vcpu *cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+
+    ASSERT(cur);
+    cpus_clear(mask);
+
+    d2printk("rqt d%dv%d cd%dv%d\n",
+             new->vcpu->domain->domain_id,
+             new->vcpu->vcpu_id,
+             cur->vcpu->domain->domain_id,
+             cur->vcpu->vcpu_id);
+
+    /* Find the cpu in this queue group that has the lowest credits */
+    /* FIXME: HACK! */
+#define HACK_START_CPU 0
+#define HACK_MAX_CPUS 2
+    for ( i=HACK_START_CPU; i<HACK_START_CPU+HACK_MAX_CPUS; i++ )
+    {
+        cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);
+
+        if ( new->credit > cur->credit && cur->credit < credit_min )
+        {
+            min = i;
+            credit_min = cur->credit;
+        }
+        {
+            struct {
+                unsigned dom:16,vcpu:16;
+                unsigned credit;
+            } d;
+            d.dom = cur->vcpu->domain->domain_id;
+            d.vcpu = cur->vcpu->vcpu_id;
+            d.credit = cur->credit;
+            trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
+                      sizeof(d),
+                      (unsigned char *)&d);
+        }
+    }
+
+    if ( min != -1 )
+    {
+        cpu_set(min, mask);
+    }
+
+    /* Send scheduler interrupts to designated CPUs */
+    if ( !cpus_empty(mask) )
+        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
+}
+
+static void
+runq_sort(/* runq, */s_time_t now)
+{
+    struct list_head *runq = RUNQ(0)/*GRQ*/, *elem, *n;
+    struct list_head temp;
+    //int flags;
+
+
+    /* GRQ */
+    //spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+    //spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( list_empty(runq) )
+        goto out_unlock;
+
+    /* FIXME: Insertion sort -- yuck! But it's quick-and-dirty... */
+
+    /* First, take everything off and add it to a temporary list */
+    temp.next = runq->next;
+    temp.next->prev = &temp;
+    temp.prev = runq->prev;
+    temp.prev->next = &temp;
+
+    runq->next = runq->prev = runq;
+
+    /* Now insert everything again */
+    list_for_each_safe ( elem, n, &temp )
+    {
+        struct csched_vcpu *svc_elem;
+
+        list_del_init(elem);
+
+        svc_elem = __runq_elem(elem);
+        __runq_insert(runq, svc_elem, now);
+    }
+
+out_unlock:
+    /* GRQ */
+    //spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+    //spin_unlock_irqrestore(&csched_priv.lock, flags);
+    ;
+}
+
+/*
+ * Credit-related code
+ */
+
+void burn_credits(struct csched_vcpu *svc, s_time_t now)
+{
+    s_time_t delta;
+
+    /* Assert svc is current */
+    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
+
+    if ( is_idle_vcpu(svc->vcpu) )
+    {
+        ASSERT(svc->credit == CSCHED_IDLE_CREDIT);
+        return;
+    }
+
+    delta = now - svc->start_time;
+
+    if ( delta > 0 ) {
+        /* This will round down; should we consider rounding up...? */
+        svc->credit -= CSCHED_STIME_TO_CREDIT(delta);
+        svc->start_time = now;
+        d2printk("b d%dv%d c%d\n",
+                 svc->vcpu->domain->domain_id,
+                 svc->vcpu->vcpu_id,
+                 svc->credit);
+    } else {
+        d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
+                 __func__, now, svc->start_time);
+    }
+
+    /* TRACE */
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned credit;
+            int delta;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.credit = svc->credit;
+        d.delta = delta;
+        trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+}
+
+/* Deposit credit if the highest runnable process has run out of credit */
+void credit_deposit(s_time_t now)
+{
+    struct list_head *iter;
+
+    list_for_each ( iter, &csched_priv.svc )
+    {
+        struct csched_vcpu *svc;
+
+        svc = list_entry(iter, struct csched_vcpu, global_elem);
+
+        ASSERT( ! is_idle_vcpu(svc->vcpu) );
+
+        svc->credit = CSCHED_STIME_TO_CREDIT(svc->sdom->weight
+                                             * csched_priv.scale_factor);
+
+        d2printk("d d%dv%d c%d\n",
+                 svc->vcpu->domain->domain_id,
+                 svc->vcpu->vcpu_id,
+                 svc->credit);
+
+        /* TRACE */
+        {
+            struct {
+                unsigned dom:16,vcpu:16;
+                unsigned credit;
+            } d;
+            d.dom = svc->vcpu->domain->domain_id;
+            d.vcpu = svc->vcpu->vcpu_id;
+            d.credit = svc->credit;
+            trace_var(TRC_CSCHED2_CREDIT_ADD, 1,
+                      sizeof(d),
+                      (unsigned char *)&d);
+        }
+
+        if ( svc->start_time < now )
+            svc->start_time = now;
+    }
+
+    runq_sort(/*runq,*/now);
+}
+
+/* Scale factor: multiply weight so that everyone can run every 50 ms */
+void update_scale_factor(void)
+{
+    if ( csched_priv.global_weight > 0 )
+        csched_priv.scale_factor = MILLISECS(50) / csched_priv.global_weight;
+    else
+        csched_priv.scale_factor = 1;
+
+    printk("%s: gw %d f %d\n",
+           __func__, csched_priv.global_weight, csched_priv.scale_factor);
+}
+
+/*
+ * Initialization code
+ */
+
+static int
+csched_pcpu_init(int cpu)
+{
+    struct csched_pcpu *spc;
+    unsigned long flags;
+
+    /* Allocate per-PCPU info */
+    spc = xmalloc(struct csched_pcpu);
+    if ( spc == NULL )
+        return -1;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Initialize/update system-wide config */
+    csched_priv.ncpus++;
+
+    INIT_LIST_HEAD(&spc->runq);
+    per_cpu(schedule_data, cpu).sched_priv = spc;
+
+    /* Start off idling... */
+    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
+    cpu_set(cpu, csched_priv.idlers);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    return 0;
+}
+
+#ifndef NDEBUG
+static /*inline*/ void
+__csched_vcpu_check(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+
+    BUG_ON( svc->vcpu != vc );
+    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
+    if ( sdom )
+    {
+        BUG_ON( is_idle_vcpu(vc) );
+        BUG_ON( sdom->dom != vc->domain );
+    }
+    else
+    {
+        BUG_ON( !is_idle_vcpu(vc) );
+    }
+}
+#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+#if 0
+static /*inline*/ int
+__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
+{
+    /*
+     * Don't pick up work that's in the peer's scheduling tail. Also only pick
+     * up work that's allowed to run on our CPU.
+     */
+    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+}
+#endif
+
+static int
+csched_cpu_pick(struct vcpu *vc)
+{
+    cpumask_t cpus;
+    cpumask_t idlers;
+    int cpu;
+
+    /*
+     * Pick from online CPUs in VCPU's affinity mask, giving a
+     * preference to its current processor if it's in there.
+     */
+    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    cpu = cpu_isset(vc->processor, cpus)
+            ? vc->processor
+            : cycle_cpu(vc->processor, cpus);
+    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
+
+    /*
+     * Try to find an idle processor within the above constraints.
+     *
+     * In multi-core and multi-threaded CPUs, not all idle execution
+     * vehicles are equal!
+     *
+     * We give preference to the idle execution vehicle with the most
+     * idling neighbours in its grouping. This distributes work across
+     * distinct cores first and guarantees we don't do something stupid
+     * like run two VCPUs on co-hyperthreads while there are idle cores
+     * or sockets.
+     */
+    idlers = csched_priv.idlers;
+    cpu_set(cpu, idlers);
+    cpus_and(cpus, cpus, idlers);
+    cpu_clear(cpu, cpus);
+
+    while ( !cpus_empty(cpus) )
+    {
+        cpumask_t cpu_idlers;
+        cpumask_t nxt_idlers;
+        int nxt;
+
+        nxt = cycle_cpu(cpu, cpus);
+
+        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
+        {
+            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
+            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
+            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
+        }
+        else
+        {
+            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
+            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
+            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
+        }
+
+        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
+        {
+            cpu = nxt;
+            cpu_clear(cpu, cpus);
+        }
+        else
+        {
+            cpus_andnot(cpus, cpus, nxt_idlers);
+        }
+    }
+
+    return cpu;
+}
+
+static int
+csched_vcpu_init(struct vcpu *vc)
+{
+    struct domain * const dom = vc->domain;
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+    struct csched_vcpu *svc;
+
+    printk("%s: Initializing d%dv%d\n",
+           __func__, dom->domain_id, vc->vcpu_id);
+
+    /* Allocate per-VCPU info */
+    svc = xmalloc(struct csched_vcpu);
+    if ( svc == NULL )
+        return -1;
+
+    INIT_LIST_HEAD(&svc->global_elem);
+    INIT_LIST_HEAD(&svc->sdom_elem);
+    INIT_LIST_HEAD(&svc->runq_elem);
+
+    svc->sdom = sdom;
+    svc->vcpu = vc;
+    svc->credit = 0; /* FIXME: maybe something else? */
+    svc->flags = 0U;
+    vc->sched_priv = svc;
+
+    if ( sdom ) {
+        list_add_tail(&svc->sdom_elem, &sdom->vcpu);
+        sdom->nr_vcpus++;
+        csched_priv.global_weight += sdom->weight;
+        update_scale_factor();
+    } else
+        printk("Strange, sdom NULL!\n");
+
+    if ( ! is_idle_vcpu(vc) )
+        list_add_tail(&svc->global_elem, &csched_priv.svc);
+    else
+        svc->credit = CSCHED_IDLE_CREDIT;
+
+    /* Allocate per-PCPU info */
+    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
+    {
+        if ( csched_pcpu_init(vc->processor) != 0 )
+            return -1;
+    }
+
+    CSCHED_VCPU_CHECK(vc);
+    return 0;
+}
+
+static void
+csched_vcpu_destroy(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+    unsigned long flags;
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( !list_empty(&svc->runq_elem) );
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Remove from sdom list */
+    list_del_init(&svc->global_elem);
+    list_del_init(&svc->sdom_elem);
+
+    sdom->nr_vcpus--;
+    csched_priv.global_weight -= sdom->weight;
+    update_scale_factor();
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_sleep(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( per_cpu(schedule_data, vc->processor).curr == vc )
+        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+    else if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+}
+
+static void
+csched_vcpu_wake(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+    s_time_t now = 0;
+    int flags;
+
+    d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    /* GRQ */
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+
+    /* Make sure svc priority mod happens before runq check */
+    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
+    {
+        goto out;
+    }
+    if ( unlikely(__vcpu_on_runq(svc)) )
+    {
+        /* If we've boosted someone that's already on a runqueue, prioritize
+         * it and inform the cpu in question. */
+        goto out;
+    }
+
+    now = NOW();
+
+    /* Put the VCPU on the runq */
+    runq_insert(cpu, svc, now);
+    runq_tickle(cpu, svc, now);
+
+out:
+    spin_unlock_irqrestore(&csched_priv.lock, flags); /* GRQ */
+    d2printk("w-\n");
+    return;
+}
+
+static int
+csched_dom_cntl(
+    struct domain *d,
+    struct xen_domctl_scheduler_op *op)
+{
+    struct csched_dom * const sdom = CSCHED_DOM(d);
+    unsigned long flags;
+
+    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
+    {
+        op->u.credit2.weight = sdom->weight;
+    }
+    else
+    {
+        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
+
+        if ( op->u.credit2.weight != 0 )
+        {
+            spin_lock_irqsave(&csched_priv.lock, flags);
+
+            csched_priv.global_weight -= sdom->weight * sdom->nr_vcpus;
+
+            sdom->weight = op->u.credit2.weight;
+
+            csched_priv.global_weight += sdom->weight * sdom->nr_vcpus;
+
+            update_scale_factor();
+
+            spin_unlock_irqrestore(&csched_priv.lock, flags);
+        }
+    }
+
+    return 0;
+}
+
+static int
+csched_dom_init(struct domain *dom)
+{
+    struct csched_dom *sdom;
+
+    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = xmalloc(struct csched_dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
+    /* Initialize credit and weight */
+    INIT_LIST_HEAD(&sdom->vcpu);
+    INIT_LIST_HEAD(&sdom->sdom_elem);
+    sdom->dom = dom;
+    sdom->weight = CSCHED_DEFAULT_WEIGHT;
+    sdom->nr_vcpus = 0;
+
+    dom->sched_priv = sdom;
+
+    list_add_tail(&sdom->sdom_elem, &csched_priv.sdom);
+
+    return 0;
+}
+
+static void
+csched_dom_destroy(struct domain *dom)
+{
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+
+    BUG_ON(!list_empty(&sdom->vcpu));
+
+    list_del_init(&sdom->sdom_elem);
+
+    xfree(CSCHED_DOM(dom));
+}
+
+#if 0
+static void csched_load_balance(int cpu)
+{
+    /* FIXME: Do something. */
+}
+#endif
+
+/* How long should we let this vcpu run for? */
+static s_time_t
+csched_runtime(struct csched_vcpu *snext, struct csched_pcpu *spc,
+               s_time_t now)
+{
+    s_time_t time = 0;
+
+    /* Basic timeslice */
+    time = CSCHED_CREDIT_TO_STIME(snext->credit);
+
+    /* Check limits */
+    if ( time < CSCHED_MIN_TIMER )
+        time = CSCHED_MIN_TIMER;
+    else if ( time > CSCHED_MAX_SCHEDULE )
+        time = CSCHED_MAX_SCHEDULE;
+
+    return time;
+}
+
+void __dump_execstate(void *unused);
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */
+static struct task_slice
+csched_schedule(s_time_t now)
+{
+    const int cpu = smp_processor_id();
+    struct list_head * const runq = RUNQ(cpu);
+    struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_vcpu *snext;
+    struct task_slice ret;
+    int flags;
+
+    CSCHED_VCPU_CHECK(current);
+
+    d2printk("sc p%d c d%dv%d now %"PRI_stime"\n",
+             cpu,
+             scurr->vcpu->domain->domain_id,
+             scurr->vcpu->vcpu_id,
+             now);
+
+
+    /* GRQ */
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Update credits */
+    burn_credits(scurr, now);
+
+    /*
+     * Select next runnable local VCPU (ie top of local runq)
+     * Insert will cause credits to be updated.
+     */
+    if ( vcpu_runnable(current) )
+        runq_insert(cpu, scurr, now);
+    else
+        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+
+    snext = __runq_elem(runq->next);
+
+    if ( snext->credit <= 0 && !is_idle_vcpu(snext->vcpu) )
+    {
+        /* If the next item has <= 0 credits, update credits and resort */
+        credit_deposit(now);
+        snext = __runq_elem(runq->next);
+    }
+
+    __runq_remove(snext);
+
+    /* HACK. Multiple cpus are sharing a runqueue; but due to the way
+     * things are set up, it's possible for a vcpu to be scheduled out on one
+     * cpu and put on the runqueue, and taken off by another cpu, before the first
+     * cpu has actually completed the context switch (indicated by is_running).
+     *
+     * So in general we just wait for is_running to be false, always checking
+     * to see if it should still be put on the runqueue (i.e., it may be
+     * paused).
+     *
+     * Even so, occasionally we get into a deadlock situation. I haven't found
+     * out who the other "hold-and-wait"-er is because they seem to have
+     * irqs disabled. In any case, if we spin for 65K times, we assume there's
+     * a deadlock and put the vcpu on the tail of the runqueue (yes, behind the
+     * idle vcpus). It will be re-ordered at most 10ms later when we do a
+     * runqueue sort. */
+    if ( snext != scurr && snext->vcpu->is_running )
+    {
+        int count = 0;
+        do {
+            BUG_ON(count < 0);
+            count++;
+            if ( (count & 0xffff) == 0 ) {
+                printk("p%d d%dv%d running on p%d, passed %d iterations!\n",
+                       cpu, snext->vcpu->domain->domain_id,
+                       snext->vcpu->vcpu_id,
+                       snext->vcpu->processor,
+                       count);
+                if ( vcpu_runnable(snext->vcpu) )
+                    list_add_tail(&snext->runq_elem, runq);
+
+            } else if ( vcpu_runnable(snext->vcpu) )
+                runq_insert(cpu, snext, now);
+
+            BUG_ON(list_empty(runq));
+
+            snext = __runq_elem(runq->next);
+            __runq_remove(snext);
+        } while ( snext != scurr && snext->vcpu->is_running );
+        //printk("done\n");
+    }
+    /* GRQ: Fast and loose! */
+    snext->vcpu->processor = cpu;
+
+    /* GRQ */
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    /*
+     * Update idlers mask if necessary. When we're idling, other CPUs
+     * will tickle us when they get extra work.
+     */
+    if ( is_idle_vcpu(snext->vcpu) )
+    {
+        if ( !cpu_isset(cpu, csched_priv.idlers) )
+            cpu_set(cpu, csched_priv.idlers);
+    }
+    else if ( cpu_isset(cpu, csched_priv.idlers) )
+    {
+        cpu_clear(cpu, csched_priv.idlers);
+    }
+
+    if ( !is_idle_vcpu(snext->vcpu) )
+        snext->start_time = now;
+    /*
+     * Return task to run next...
+     */
+    ret.time = csched_runtime(snext, spc, now);
+    ret.task = snext->vcpu;
+
+    CSCHED_VCPU_CHECK(ret.task);
+    return ret;
+}
+
+static void
+csched_dump_vcpu(struct csched_vcpu *svc)
+{
+    struct csched_dom * const sdom = svc->sdom;
+
+    printk("[%i.%i] flags=%x cpu=%i",
+           svc->vcpu->domain->domain_id,
+           svc->vcpu->vcpu_id,
+           svc->flags,
+           svc->vcpu->processor);
+
+    if ( sdom )
+    {
+        printk(" credit=%i [w=%u]", svc->credit, sdom->weight);
+    }
+
+    printk("\n");
+}
+
+static void
+csched_dump_pcpu(int cpu)
+{
+    struct list_head *runq, *iter;
+    struct csched_pcpu *spc;
+    struct csched_vcpu *svc;
+    int loop;
+    char cpustr[100];
+
+    spc = CSCHED_PCPU(cpu);
+    runq = &spc->runq;
+
+    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_sibling_map[cpu]);
+    printk(" sibling=%s, ", cpustr);
+    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_core_map[cpu]);
+    printk("core=%s\n", cpustr);
+
+    /* current VCPU */
+    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    if ( svc )
+    {
+        printk("\trun: ");
+        csched_dump_vcpu(svc);
+    }
+
+    loop = 0;
+    list_for_each( iter, runq )
+    {
+        svc = __runq_elem(iter);
+        if ( svc )
+        {
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+static void
+csched_dump(void)
+{
+    struct list_head *iter_sdom, *iter_svc;
+    int loop;
+    char idlers_buf[100];
+
+    printk("info:\n"
+           "\tncpus              = %u\n"
+           "\tdefault-weight     = %d\n"
+           "\tms per tick        = %dms\n"
+           "\tcredits per ms     = %d\n"
+           "\tms per acct        = %dms\n"
+           "\tglobal weight      = %d\n"
+           "\tscale factor       = %d\n",
+           csched_priv.ncpus,
+           CSCHED_DEFAULT_WEIGHT,
+           CSCHED_MS_PER_TICK,
+           CSCHED_CREDITS_PER_MS,
+           CSCHED_MS_PER_ACCT,
+           csched_priv.global_weight,
+           csched_priv.scale_factor);
+
+    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+    printk("idlers: %s\n", idlers_buf);
+
+    printk("active vcpus:\n");
+    loop = 0;
+    list_for_each( iter_sdom, &csched_priv.sdom )
+    {
+        struct csched_dom *sdom;
+        sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);
+
+        list_for_each( iter_svc, &sdom->vcpu )
+        {
+            struct csched_vcpu *svc;
+            svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem);
+
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+static void
+csched_init(void)
+{
+    spin_lock_init(&csched_priv.lock);
+    INIT_LIST_HEAD(&csched_priv.sdom);
+    INIT_LIST_HEAD(&csched_priv.svc);
+    INIT_LIST_HEAD(&csched_priv.runq); /* GRQ */
+
+    csched_priv.ncpus = 0;
+    cpus_clear(csched_priv.idlers);
+    csched_priv.global_weight = 0;
+    csched_priv.scale_factor = 1;
+}
+
+struct scheduler sched_credit2_def = {
+    .name           = "SMP Credit Scheduler rev2",
+    .opt_name       = "credit2",
+    .sched_id       = XEN_SCHEDULER_CREDIT2,
+
+    .init_domain    = csched_dom_init,
+    .destroy_domain = csched_dom_destroy,
+
+    .init_vcpu      = csched_vcpu_init,
+    .destroy_vcpu   = csched_vcpu_destroy,
+
+    .sleep          = csched_vcpu_sleep,
+    .wake           = csched_vcpu_wake,
+
+    .adjust         = csched_dom_cntl,
+
+    .pick_cpu       = csched_cpu_pick,
+    .do_schedule    = csched_schedule,
+
+    .dump_cpu_state = csched_dump_pcpu,
+    .dump_settings  = csched_dump,
+    .init           = csched_init,
+};
diff -r f72d26c00002 xen/common/schedule.c
--- a/xen/common/schedule.c	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/common/schedule.c	Mon Jun 15 23:24:59 2009 +0100
@@ -58,9 +58,11 @@
 
 extern struct scheduler sched_sedf_def;
 extern struct scheduler sched_credit_def;
+extern struct scheduler sched_credit2_def;
 static struct scheduler *schedulers[] = {
     &sched_sedf_def,
     &sched_credit_def,
+    &sched_credit2_def,
     NULL
 };
 
diff -r f72d26c00002 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/include/public/domctl.h	Mon Jun 15 23:24:59 2009 +0100
@@ -297,6 +297,7 @@
 /* Scheduler types. */
 #define XEN_SCHEDULER_SEDF     4
 #define XEN_SCHEDULER_CREDIT   5
+#define XEN_SCHEDULER_CREDIT2  6
 /* Set or get info? */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -315,6 +316,9 @@
             uint16_t weight;
             uint16_t cap;
         } credit;
+        struct xen_domctl_sched_credit2 {
+            uint16_t weight;
+        } credit2;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
diff -r f72d26c00002 xen/include/public/trace.h
--- a/xen/include/public/trace.h	Tue Jun 02 11:50:16 2009 +0100
+++ b/xen/include/public/trace.h	Mon Jun 15 23:24:59 2009 +0100
@@ -53,6 +53,7 @@
 #define TRC_HVM_HANDLER   0x00082000   /* various HVM handlers      */
 
 #define TRC_SCHED_MIN       0x00021000   /* Just runstate changes */
+#define TRC_SCHED_CLASS     0x00022000   /* Scheduler-specific     */
 #define TRC_SCHED_VERBOSE   0x00028000   /* More inclusive scheduling */
 
 /* Trace events per class */
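
For anyone wiring this up from the toolstack side: the new credit2 branch of
xen_domctl_scheduler_op is driven the same way as the existing credit one.
The sketch below is illustrative only and is not part of the patch; it mirrors
libxc's xc_sched_credit_domain_set() and assumes the usual do_domctl() helper
and xen_domctl wrapper fields, so the helper name and header path here are
assumptions rather than anything this patch adds.

    #include <string.h>
    #include "xc_private.h"   /* assumed: do_domctl(), struct xen_domctl (libxc) */

    /* Illustrative sketch, not part of this patch: set a domain's credit2
     * weight through the new XEN_SCHEDULER_CREDIT2 / credit2.weight fields. */
    static int set_credit2_weight(int xc_handle, uint32_t domid, uint16_t weight)
    {
        struct xen_domctl domctl;

        memset(&domctl, 0, sizeof(domctl));
        domctl.cmd = XEN_DOMCTL_scheduler_op;
        domctl.domain = (domid_t)domid;
        domctl.u.scheduler_op.sched_id = XEN_SCHEDULER_CREDIT2;
        domctl.u.scheduler_op.cmd = XEN_DOMCTL_SCHEDOP_putinfo;
        /* csched_dom_cntl() above treats weight == 0 as "leave unchanged". */
        domctl.u.scheduler_op.u.credit2.weight = weight;

        return do_domctl(xc_handle, &domctl);
    }

A matching XEN_DOMCTL_SCHEDOP_getinfo call simply reads op->u.credit2.weight
back, which is all csched_dom_cntl() implements so far.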