[Xen-changelog] New weighted fair-share CPU scheduler w/ automatic SMP load balancing

# HG changeset patch
# User ack@xxxxxxxxxxxxxxxxxxxxxxx
# Node ID e539abd27a0f2b1a64b4d129a10748d50c93e6fb
# Parent  b6937b93141961b67dc642581266e6fc2015bc91
New weighted fair-share CPU scheduler w/ automatic SMP load balancing
Signed-off-by: Emmanuel Ackaouy <ack@xxxxxxxxxxxxx>
---
 tools/libxc/Makefile                      |    1 
 tools/libxc/xc_csched.c                   |   50 +
 tools/libxc/xenctrl.h                     |    8 
 tools/python/xen/lowlevel/xc/xc.c         |   61 +
 tools/python/xen/xend/XendDomain.py       |   22 
 tools/python/xen/xend/server/SrvDomain.py |   14 
 tools/python/xen/xm/main.py               |   45 +
 xen/common/Makefile                       |    1 
 xen/common/sched_credit.c                 | 1233 ++++++++++++++++++++++++++++++
 xen/common/schedule.c                     |    5 
 xen/include/public/sched_ctl.h            |    5 
 xen/include/xen/sched-if.h                |    2 
 xen/include/xen/softirq.h                 |   13 
 13 files changed, 1460 insertions(+)

diff -r b6937b931419 -r e539abd27a0f tools/libxc/Makefile
--- a/tools/libxc/Makefile      Fri May 26 09:44:29 2006 +0100
+++ b/tools/libxc/Makefile      Fri May 26 11:14:36 2006 +0100
@@ -20,6 +20,7 @@ SRCS       += xc_physdev.c
 SRCS       += xc_physdev.c
 SRCS       += xc_private.c
 SRCS       += xc_sedf.c
+SRCS       += xc_csched.c
 SRCS       += xc_tbuf.c
 
 ifeq ($(patsubst x86%,x86,$(XEN_TARGET_ARCH)),x86)
diff -r b6937b931419 -r e539abd27a0f tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri May 26 09:44:29 2006 +0100
+++ b/tools/libxc/xenctrl.h     Fri May 26 11:14:36 2006 +0100
@@ -354,6 +354,14 @@ int xc_sedf_domain_get(int xc_handle,
                        uint64_t *latency, uint16_t *extratime,
                        uint16_t *weight);
 
+int xc_csched_domain_set(int xc_handle,
+                         uint32_t domid,
+                         struct csched_domain *sdom);
+
+int xc_csched_domain_get(int xc_handle,
+                         uint32_t domid,
+                         struct csched_domain *sdom);
+
 typedef evtchn_status_t xc_evtchn_status_t;
 
 /*
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri May 26 11:14:36 2006 +0100
@@ -716,6 +716,49 @@ static PyObject *pyxc_sedf_domain_get(Xc
                          "weight",    weight);
 }
 
+static PyObject *pyxc_csched_domain_set(XcObject *self,
+                                        PyObject *args,
+                                        PyObject *kwds)
+{
+    uint32_t domid;
+    uint16_t weight;
+    uint16_t cap;
+    static char *kwd_list[] = { "dom", "weight", "cap", NULL };
+    static char kwd_type[] = "I|HH";
+    struct csched_domain sdom;
+    
+    weight = 0;
+    cap = (uint16_t)~0U;
+    if( !PyArg_ParseTupleAndKeywords(args, kwds, kwd_type, kwd_list, 
+                                     &domid, &weight, &cap) )
+        return NULL;
+
+    sdom.weight = weight;
+    sdom.cap = cap;
+
+    if ( xc_csched_domain_set(self->xc_handle, domid, &sdom) != 0 )
+        return PyErr_SetFromErrno(xc_error);
+
+    Py_INCREF(zero);
+    return zero;
+}
+
+static PyObject *pyxc_csched_domain_get(XcObject *self, PyObject *args)
+{
+    uint32_t domid;
+    struct csched_domain sdom;
+    
+    if( !PyArg_ParseTuple(args, "I", &domid) )
+        return NULL;
+    
+    if ( xc_csched_domain_get(self->xc_handle, domid, &sdom) != 0 )
+        return PyErr_SetFromErrno(xc_error);
+
+    return Py_BuildValue("{s:H,s:H}",
+                         "weight",  sdom.weight,
+                         "cap",     sdom.cap);
+}
+
 static PyObject *pyxc_domain_setmaxmem(XcObject *self, PyObject *args)
 {
     uint32_t dom;
@@ -1040,6 +1083,24 @@ static PyMethodDef pyxc_methods[] = {
       " slice     [long]: CPU reservation per period\n"
       " latency   [long]: domain's wakeup latency hint\n"
       " extratime [int]:  domain aware of extratime?\n"},
+    
+    { "csched_domain_set", (PyCFunction)pyxc_csched_domain_set,
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set the scheduling parameters for a domain when running with the\n"
+      "SMP credit scheduler.\n"
+      " dom       [int]:   domain id to set\n"
+      " weight    [short]: new scheduling weight (omit to leave unchanged)\n"
+      " cap       [short]: new scheduling cap (omit to leave unchanged)\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "csched_domain_get", (PyCFunction)pyxc_csched_domain_get,
+      METH_VARARGS, "\n"
+      "Get the scheduling parameters for a domain when running with the\n"
+      "SMP credit scheduler.\n"
+      " domid     [int]:   domain id to get\n"
+      "Returns:   [dict]\n"
+      " weight    [short]: domain's scheduling weight\n"
+      " cap       [short]: domain's scheduling cap\n"},
 
     { "evtchn_alloc_unbound", 
       (PyCFunction)pyxc_evtchn_alloc_unbound,
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/xend/XendDomain.py       Fri May 26 11:14:36 2006 +0100
@@ -522,6 +522,28 @@ class XendDomain:
         except Exception, ex:
             raise XendError(str(ex))
 
+    def domain_csched_get(self, domid):
+        """Get credit scheduler parameters for a domain.
+        """
+        dominfo = self.domain_lookup_by_name_or_id_nr(domid)
+        if not dominfo:
+            raise XendInvalidDomain(str(domid))
+        try:
+            return xc.csched_domain_get(dominfo.getDomid())
+        except Exception, ex:
+            raise XendError(str(ex))
+    
+    def domain_csched_set(self, domid, weight, cap):
+        """Set credit scheduler parameters for a domain.
+        """
+        dominfo = self.domain_lookup_by_name_or_id_nr(domid)
+        if not dominfo:
+            raise XendInvalidDomain(str(domid))
+        try:
+            return xc.csched_domain_set(dominfo.getDomid(), weight, cap)
+        except Exception, ex:
+            raise XendError(str(ex))
+
     def domain_maxmem_set(self, domid, mem):
         """Set the memory limit for a domain.
 
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/xend/server/SrvDomain.py
--- a/tools/python/xen/xend/server/SrvDomain.py Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/xend/server/SrvDomain.py Fri May 26 11:14:36 2006 +0100
@@ -129,6 +129,20 @@ class SrvDomain(SrvDir):
                     ['latency', 'int'],
                     ['extratime', 'int'],
                     ['weight', 'int']])
+        val = fn(req.args, {'dom': self.dom.domid})
+        return val
+    
+    def op_domain_csched_get(self, _, req):
+        fn = FormFn(self.xd.domain_csched_get,
+                    [['dom', 'int']])
+        val = fn(req.args, {'dom': self.dom.domid})
+        return val
+
+
+    def op_domain_csched_set(self, _, req):
+        fn = FormFn(self.xd.domain_csched_set,
+                    [['dom', 'int'],
+                     ['weight', 'int'], ['cap', 'int']])  # domain_csched_set needs cap too
         val = fn(req.args, {'dom': self.dom.domid})
         return val
 
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/xm/main.py       Fri May 26 11:14:36 2006 +0100
@@ -99,6 +99,7 @@ sched_sedf_help = "sched-sedf [DOM] [OPT
                                    specifies another way of setting a domain's\n\
                                     cpu period/slice."
 
+csched_help = "csched                           Set or get credit scheduler parameters"
 block_attach_help = """block-attach <DomId> <BackDev> <FrontDev> <Mode>
                 [BackDomId]         Create a new virtual block device"""
 block_detach_help = """block-detach  <DomId> <DevId>    Destroy a domain's virtual block device,
@@ -174,6 +175,7 @@ host_commands = [
     ]
 
 scheduler_commands = [
+    "csched",
     "sched-bvt",
     "sched-bvt-ctxallow",
     "sched-sedf",
@@ -735,6 +737,48 @@ def xm_sched_sedf(args):
         else:
             print_sedf(sedf_info)
 
+def xm_csched(args):
+    """xm csched: show (only -d given) or set a domain's credit weight/cap."""
+    usage_msg = """Csched:     Set or get credit scheduler parameters
+ Usage:
+
+        csched -d domain [-w weight] [-c cap]
+    """
+    try:
+        opts, args = getopt.getopt(args[0:], "d:w:c:",
+            ["domain=", "weight=", "cap="])
+    except getopt.GetoptError:
+        # print help information and exit:
+        print usage_msg
+        sys.exit(1)
+
+    domain = weight = cap = None
+
+    # Accept both the short and the long spelling of each option
+    for o, a in opts:
+        if o in ("-d", "--domain"):
+            domain = a
+        elif o in ("-w", "--weight"):
+            weight = int(a)
+        elif o in ("-c", "--cap"):
+            cap = int(a)
+
+    if domain is None:
+        # place holder for system-wide scheduler parameters
+        print usage_msg
+        sys.exit(1)
+
+    if weight is None and cap is None:
+        print server.xend.domain.csched_get(domain)
+    else:
+        if weight is None:
+            weight = int(0)     # 0: scheduler keeps the current weight
+        if cap is None:
+            cap = int(~0)       # all-ones sentinel: keep the current cap
+
+        err = server.xend.domain.csched_set(domain, weight, cap)
+        if err != 0:
+            print err
 
 def xm_info(args):
     arg_check(args, "info", 0)
@@ -1032,6 +1076,7 @@ commands = {
     "sched-bvt": xm_sched_bvt,
     "sched-bvt-ctxallow": xm_sched_bvt_ctxallow,
     "sched-sedf": xm_sched_sedf,
+    "csched": xm_csched,
     # block
     "block-attach": xm_block_attach,
     "block-detach": xm_block_detach,
diff -r b6937b931419 -r e539abd27a0f xen/common/Makefile
--- a/xen/common/Makefile       Fri May 26 09:44:29 2006 +0100
+++ b/xen/common/Makefile       Fri May 26 11:14:36 2006 +0100
@@ -13,6 +13,7 @@ obj-y += page_alloc.o
 obj-y += page_alloc.o
 obj-y += rangeset.o
 obj-y += sched_bvt.o
+obj-y += sched_credit.o
 obj-y += sched_sedf.o
 obj-y += schedule.o
 obj-y += softirq.o
diff -r b6937b931419 -r e539abd27a0f xen/common/schedule.c
--- a/xen/common/schedule.c     Fri May 26 09:44:29 2006 +0100
+++ b/xen/common/schedule.c     Fri May 26 11:14:36 2006 +0100
@@ -50,9 +50,11 @@ struct schedule_data schedule_data[NR_CP
 
 extern struct scheduler sched_bvt_def;
 extern struct scheduler sched_sedf_def;
+extern struct scheduler sched_credit_def;
 static struct scheduler *schedulers[] = { 
     &sched_bvt_def,
     &sched_sedf_def,
+    &sched_credit_def,
     NULL
 };
 
@@ -639,6 +641,8 @@ static void t_timer_fn(void *unused)
 
     page_scrub_schedule_work();
 
+    SCHED_OP(tick, cpu);
+
     set_timer(&t_timer[cpu], NOW() + MILLISECS(10));
 }
 
@@ -681,6 +685,7 @@ void __init scheduler_init(void)
         printk("Could not find scheduler: %s\n", opt_sched);
 
     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
+    SCHED_OP(init);
 
     if ( idle_vcpu[0] != NULL )
     {
diff -r b6937b931419 -r e539abd27a0f xen/include/public/sched_ctl.h
--- a/xen/include/public/sched_ctl.h    Fri May 26 09:44:29 2006 +0100
+++ b/xen/include/public/sched_ctl.h    Fri May 26 11:14:36 2006 +0100
@@ -10,6 +10,7 @@
 /* Scheduler types. */
 #define SCHED_BVT      0
 #define SCHED_SEDF     4
+#define SCHED_CREDIT   5
 
 /* Set or get info? */
 #define SCHED_INFO_PUT 0
@@ -48,6 +49,10 @@ struct sched_adjdom_cmd {
             uint32_t extratime;
             uint32_t weight;
         } sedf;
+        struct csched_domain {
+            uint16_t weight;
+            uint16_t cap;
+        } credit;
     } u;
 };
 
diff -r b6937b931419 -r e539abd27a0f xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Fri May 26 09:44:29 2006 +0100
+++ b/xen/include/xen/sched-if.h        Fri May 26 11:14:36 2006 +0100
@@ -58,6 +58,8 @@ struct scheduler {
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
 
+    void         (*init)           (void);
+    void         (*tick)           (unsigned int cpu);
     int          (*alloc_task)     (struct vcpu *);
     void         (*add_task)       (struct vcpu *);
     void         (*free_task)      (struct domain *);
diff -r b6937b931419 -r e539abd27a0f xen/include/xen/softirq.h
--- a/xen/include/xen/softirq.h Fri May 26 09:44:29 2006 +0100
+++ b/xen/include/xen/softirq.h Fri May 26 11:14:36 2006 +0100
@@ -26,6 +26,19 @@ asmlinkage void do_softirq(void);
 asmlinkage void do_softirq(void);
 extern void open_softirq(int nr, softirq_handler handler);
 
+static inline void cpumask_raise_softirq(cpumask_t mask, unsigned int nr)
+{
+    int cpu;
+    /* Set softirq nr pending on each CPU in mask (mask is a by-value copy) */
+    for_each_cpu_mask(cpu, mask)
+    {
+        if ( test_and_set_bit(nr, &softirq_pending(cpu)) )
+            cpu_clear(cpu, mask);   /* was already pending: drop this CPU */
+    }
+    /* Event-check only the CPUs on which nr was newly set */
+    smp_send_event_check_mask(mask);
+}
+
 static inline void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
 {
     if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
diff -r b6937b931419 -r e539abd27a0f tools/libxc/xc_csched.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_csched.c   Fri May 26 11:14:36 2006 +0100
@@ -0,0 +1,50 @@
+/****************************************************************************
+ * (C) 2006 - Emmanuel Ackaouy - XenSource Inc.
+ ****************************************************************************
+ *
+ *        File: xc_csched.c
+ *      Author: Emmanuel Ackaouy
+ *
+ * Description: XC Interface to the credit scheduler
+ *
+ */
+#include "xc_private.h"
+
+
+int
+xc_csched_domain_set(
+    int xc_handle,
+    uint32_t domid,
+    struct csched_domain *sdom)
+{
+    DECLARE_DOM0_OP;
+
+    op.cmd = DOM0_ADJUSTDOM;    
+    op.u.adjustdom.domain = (domid_t) domid;
+    op.u.adjustdom.sched_id = SCHED_CREDIT;
+    op.u.adjustdom.direction = SCHED_INFO_PUT;
+    op.u.adjustdom.u.credit = *sdom;
+
+    return do_dom0_op(xc_handle, &op);
+}
+
+int
+xc_csched_domain_get(
+    int xc_handle,
+    uint32_t domid,
+    struct csched_domain *sdom)
+{
+    DECLARE_DOM0_OP;
+    int err;
+
+    op.cmd = DOM0_ADJUSTDOM;    
+    op.u.adjustdom.domain = (domid_t) domid;
+    op.u.adjustdom.sched_id = SCHED_CREDIT;
+    op.u.adjustdom.direction = SCHED_INFO_GET;
+
+    err = do_dom0_op(xc_handle, &op);
+    if ( err == 0 )
+        *sdom = op.u.adjustdom.u.credit;
+
+    return err;
+}
diff -r b6937b931419 -r e539abd27a0f xen/common/sched_credit.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/sched_credit.c Fri May 26 11:14:36 2006 +0100
@@ -0,0 +1,1233 @@
+/****************************************************************************
+ * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
+ ****************************************************************************
+ *
+ *        File: common/sched_credit.c
+ *      Author: Emmanuel Ackaouy
+ *
+ * Description: Credit-based SMP CPU scheduler
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+
+
+/*
+ * CSCHED_STATS
+ *
+ * Manage very basic counters and stats.
+ *
+ * Useful for debugging live systems. The stats are displayed
+ * with runq dumps ('r' on the Xen console).
+ */
+#define CSCHED_STATS
+
+
+/*
+ * Basic constants
+ */
+#define CSCHED_TICK             10      /* milliseconds */
+#define CSCHED_TSLICE           30      /* milliseconds */
+#define CSCHED_ACCT_NTICKS      3
+#define CSCHED_ACCT_PERIOD      (CSCHED_ACCT_NTICKS * CSCHED_TICK)
+#define CSCHED_DEFAULT_WEIGHT   256
+
+
+/*
+ * Priorities
+ */
+#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
+#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
+#define CSCHED_PRI_IDLE         -64     /* idle */
+#define CSCHED_PRI_TS_PARKED    -65     /* time-share w/ capped credits */
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_PCPU(_c)     ((struct csched_pcpu *)schedule_data[_c].sched_priv)
+#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
+#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+
+
+/*
+ * Stats
+ */
+#ifdef CSCHED_STATS
+
+#define CSCHED_STAT(_X)         (csched_priv.stats._X)
+#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
+#define CSCHED_STAT_PRINTK(_X)                                  \
+    do                                                          \
+    {                                                           \
+        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));  \
+    } while ( 0 );
+
+#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
+    _MACRO(vcpu_alloc)                      \
+    _MACRO(vcpu_add)                        \
+    _MACRO(vcpu_sleep)                      \
+    _MACRO(vcpu_wake_running)               \
+    _MACRO(vcpu_wake_onrunq)                \
+    _MACRO(vcpu_wake_runnable)              \
+    _MACRO(vcpu_wake_not_runnable)          \
+    _MACRO(dom_free)                        \
+    _MACRO(schedule)                        \
+    _MACRO(tickle_local_idler)              \
+    _MACRO(tickle_local_over)               \
+    _MACRO(tickle_local_under)              \
+    _MACRO(tickle_local_other)              \
+    _MACRO(acct_run)                        \
+    _MACRO(acct_no_work)                    \
+    _MACRO(acct_balance)                    \
+    _MACRO(acct_reorder)                    \
+    _MACRO(acct_min_credit)                 \
+    _MACRO(acct_vcpu_active)                \
+    _MACRO(acct_vcpu_idle)                  \
+    _MACRO(acct_vcpu_credit_min)
+
+#define CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
+    _MACRO(vcpu_migrate)                                \
+    _MACRO(load_balance_idle)                           \
+    _MACRO(load_balance_over)                           \
+    _MACRO(load_balance_other)                          \
+    _MACRO(steal_trylock_failed)                        \
+    _MACRO(steal_peer_down)                             \
+    _MACRO(steal_peer_idle)                             \
+    _MACRO(steal_peer_running)                          \
+    _MACRO(steal_peer_pinned)                           \
+    _MACRO(tickle_idlers_none)                          \
+    _MACRO(tickle_idlers_some)
+
+#ifndef NDEBUG
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
+    _MACRO(vcpu_check)
+#else
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
+#endif
+
+#define CSCHED_STATS_EXPAND(_MACRO)                 \
+    CSCHED_STATS_EXPAND_SCHED(_MACRO)               \
+    CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
+    CSCHED_STATS_EXPAND_CHECKS(_MACRO)
+
+#define CSCHED_STATS_RESET()                                        \
+    do                                                              \
+    {                                                               \
+        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
+    } while ( 0 )
+
+#define CSCHED_STATS_DEFINE()                   \
+    struct                                      \
+    {                                           \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
+    } stats
+
+#define CSCHED_STATS_PRINTK()                   \
+    do                                          \
+    {                                           \
+        printk("stats:\n");                     \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
+    } while ( 0 )
+
+#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)
+
+#else /* CSCHED_STATS */
+
+#define CSCHED_STATS_RESET()    do {} while ( 0 )
+#define CSCHED_STATS_DEFINE()   /* used as a struct member: must expand to nothing */
+#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
+#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )
+
+#endif /* CSCHED_STATS */
+
+
+/*
+ * Physical CPU
+ */
+struct csched_pcpu {
+    struct list_head runq;      /* queued csched_vcpus, linked by runq_elem */
+    uint32_t runq_sort_last;    /* csched_priv.runq_sort seen at last sort */
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+    struct list_head runq_elem;         /* runq link; empty when not queued */
+    struct list_head active_vcpu_elem;  /* link on sdom->active_vcpu */
+    struct csched_dom *sdom;            /* owner; NULL for idle VCPUs */
+    struct vcpu *vcpu;                  /* the generic VCPU this wraps */
+    atomic_t credit;                    /* debited by csched_vcpu_acct() */
+    int credit_last;
+    uint32_t credit_incr;
+    uint32_t state_active;              /* # of idle -> active changes */
+    uint32_t state_idle;                /* # of active -> idle changes */
+    int16_t pri;                        /* one of CSCHED_PRI_* */
+};
+
+/*
+ * Domain
+ */
+struct csched_dom {
+    struct list_head active_vcpu;       /* this domain's active VCPUs */
+    struct list_head active_sdom_elem;  /* link on csched_priv.active_sdom */
+    struct domain *dom;                 /* the generic domain this wraps */
+    uint16_t active_vcpu_count;         /* # of entries on active_vcpu */
+    uint16_t weight;                    /* starts at CSCHED_DEFAULT_WEIGHT */
+    uint16_t cap;                       /* starts at 0 */
+};
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+    spinlock_t lock;                /* held while updating active lists/weight */
+    struct list_head active_sdom;   /* domains with at least one active VCPU */
+    uint32_t ncpus;                 /* highest initialised CPU number + 1 */
+    unsigned int master;
+    cpumask_t idlers;               /* idle CPUs, tickled when work is queued */
+    uint32_t weight;                /* sum of active domains' weights */
+    uint32_t credit;                /* CSCHED_ACCT_PERIOD per initialised CPU */
+    int credit_balance;
+    uint32_t runq_sort;             /* epoch; runqs re-sort when it changes */
+    CSCHED_STATS_DEFINE();
+};
+
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+
+
+static inline int
+__vcpu_on_runq(struct csched_vcpu *svc)
+{
+    return !list_empty(&svc->runq_elem);
+}
+
+static inline struct csched_vcpu *
+__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct csched_vcpu, runq_elem);
+}
+
+static inline void
+__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *iter;
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    BUG_ON( cpu != svc->vcpu->processor );
+
+    list_for_each( iter, runq )
+    {
+        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
+        if ( svc->pri > iter_svc->pri )
+            break;
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+}
+
+static inline void
+__runq_remove(struct csched_vcpu *svc)
+{
+    BUG_ON( !__vcpu_on_runq(svc) );
+    list_del_init(&svc->runq_elem);
+}
+
+static inline void
+__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
+{
+    struct csched_vcpu * const cur = CSCHED_VCPU(schedule_data[cpu].curr);
+    cpumask_t mask;
+
+    ASSERT(cur);
+    cpus_clear(mask);
+
+    /* If strictly higher priority than current VCPU, signal the CPU */
+    if ( new->pri > cur->pri )
+    {
+        if ( cur->pri == CSCHED_PRI_IDLE )
+            CSCHED_STAT_CRANK(tickle_local_idler);
+        else if ( cur->pri == CSCHED_PRI_TS_OVER )
+            CSCHED_STAT_CRANK(tickle_local_over);
+        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
+            CSCHED_STAT_CRANK(tickle_local_under);
+        else
+            CSCHED_STAT_CRANK(tickle_local_other);
+
+        cpu_set(cpu, mask);
+    }
+
+    /*
+     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
+     * let them know there is runnable work in the system...
+     */
+    if ( cur->pri > CSCHED_PRI_IDLE )
+    {
+        if ( cpus_empty(csched_priv.idlers) )
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_none);
+        }
+        else
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_some);
+            cpus_or(mask, mask, csched_priv.idlers);
+        }
+    }
+
+    /* Send scheduler interrupts to designated CPUs */
+    if ( !cpus_empty(mask) )
+        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
+}
+
+static void
+csched_pcpu_init(int cpu)
+{
+    struct csched_pcpu *spc;
+    unsigned long flags;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Initialize/update system-wide config */
+    csched_priv.credit += CSCHED_ACCT_PERIOD;
+    if ( csched_priv.ncpus <= cpu )
+        csched_priv.ncpus = cpu + 1;
+    if ( csched_priv.master >= csched_priv.ncpus )
+        csched_priv.master = cpu;
+
+    /* Allocate per-PCPU info */
+    spc = xmalloc(struct csched_pcpu);
+    BUG_ON( spc == NULL );
+    INIT_LIST_HEAD(&spc->runq);
+    spc->runq_sort_last = csched_priv.runq_sort;
+    schedule_data[cpu].sched_priv = spc;
+
+    /* Start off idling... */
+    BUG_ON( !is_idle_vcpu(schedule_data[cpu].curr) );
+    cpu_set(cpu, csched_priv.idlers);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+}
+
+#ifndef NDEBUG
+static inline void
+__csched_vcpu_check(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+
+    BUG_ON( svc->vcpu != vc );
+    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
+    if ( sdom )
+    {
+        BUG_ON( is_idle_vcpu(vc) );
+        BUG_ON( sdom->dom != vc->domain );
+    }
+    else
+    {
+        BUG_ON( !is_idle_vcpu(vc) );
+    }
+
+    CSCHED_STAT_CRANK(vcpu_check);
+}
+#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+static inline int
+__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
+{
+    /*
+     * Don't pick up work that's in the peer's scheduling tail. Also only pick
+     * up work that's allowed to run on our CPU.
+     */
+    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
+    {
+        CSCHED_STAT_CRANK(steal_peer_running);
+        return 0;
+    }
+
+    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
+    {
+        CSCHED_STAT_CRANK(steal_peer_pinned);
+        return 0;
+    }
+
+    return 1;
+}
+
+static void
+csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
+{
+    struct csched_dom * const sdom = svc->sdom;
+    unsigned long flags;
+
+    /* Update credits */
+    atomic_sub(credit_dec, &svc->credit);
+
+    /* Put this VCPU and domain back on the active list if it was idling */
+    if ( list_empty(&svc->active_vcpu_elem) )
+    {
+        spin_lock_irqsave(&csched_priv.lock, flags);
+
+        if ( list_empty(&svc->active_vcpu_elem) )
+        {
+            CSCHED_STAT_CRANK(acct_vcpu_active);
+            svc->state_active++;
+
+            sdom->active_vcpu_count++;
+            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
+            if ( list_empty(&sdom->active_sdom_elem) )
+            {
+                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+                csched_priv.weight += sdom->weight;
+            }
+        }
+
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+    }
+}
+
+static inline void
+__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
+{
+    struct csched_dom * const sdom = svc->sdom;
+
+    BUG_ON( list_empty(&svc->active_vcpu_elem) );
+
+    CSCHED_STAT_CRANK(acct_vcpu_idle);
+    svc->state_idle++;
+
+    sdom->active_vcpu_count--;
+    list_del_init(&svc->active_vcpu_elem);
+    if ( list_empty(&sdom->active_vcpu) )
+    {
+        BUG_ON( csched_priv.weight < sdom->weight );
+        list_del_init(&sdom->active_sdom_elem);
+        csched_priv.weight -= sdom->weight;
+    }
+
+    atomic_set(&svc->credit, 0);
+}
+
+static int
+csched_vcpu_alloc(struct vcpu *vc)
+{
+    struct domain * const dom = vc->domain;
+    struct csched_dom *sdom;
+    struct csched_vcpu *svc;
+    int16_t pri;
+
+    CSCHED_STAT_CRANK(vcpu_alloc);
+
+    /* Allocate, if appropriate, per-domain info */
+    if ( is_idle_vcpu(vc) )
+    {
+        sdom = NULL;
+        pri = CSCHED_PRI_IDLE;
+    }
+    else if ( CSCHED_DOM(dom) )
+    {
+        sdom = CSCHED_DOM(dom);
+        pri = CSCHED_PRI_TS_UNDER;
+    }
+    else 
+    {
+        sdom = xmalloc(struct csched_dom);
+        if ( !sdom )
+            return -1;
+
+        /* Initialize credit and weight */
+        INIT_LIST_HEAD(&sdom->active_vcpu);
+        sdom->active_vcpu_count = 0;
+        INIT_LIST_HEAD(&sdom->active_sdom_elem);
+        sdom->dom = dom;
+        sdom->weight = CSCHED_DEFAULT_WEIGHT;
+        sdom->cap = 0U;
+        dom->sched_priv = sdom;
+        pri = CSCHED_PRI_TS_UNDER;
+    }
+
+    /* Allocate per-VCPU info */
+    svc = xmalloc(struct csched_vcpu);
+    if ( !svc )
+        return -1;
+
+    INIT_LIST_HEAD(&svc->runq_elem);
+    INIT_LIST_HEAD(&svc->active_vcpu_elem);
+    svc->sdom = sdom;
+    svc->vcpu = vc;
+    atomic_set(&svc->credit, 0);
+    svc->credit_last = 0;
+    svc->credit_incr = 0U;
+    svc->state_active = 0U;
+    svc->state_idle = 0U;
+    svc->pri = pri;
+    vc->sched_priv = svc;
+
+    CSCHED_VCPU_CHECK(vc);
+
+    /* Attach fair-share VCPUs to the accounting list */
+    if ( likely(sdom != NULL) )
+        csched_vcpu_acct(svc, 0);
+
+    return 0;
+}
+
+static void
+csched_vcpu_add(struct vcpu *vc) 
+{
+    CSCHED_STAT_CRANK(vcpu_add);
+
+    /* Allocate per-PCPU info */
+    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
+        csched_pcpu_init(vc->processor);
+
+    CSCHED_VCPU_CHECK(vc);
+}
+
+static void
+csched_vcpu_free(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+    unsigned long flags;
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( !list_empty(&svc->runq_elem) );
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_idle_locked(svc);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_sleep(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+
+    CSCHED_STAT_CRANK(vcpu_sleep);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( schedule_data[vc->processor].curr == vc )
+        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+    else if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+}
+
+static void
+csched_vcpu_wake(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( unlikely(schedule_data[cpu].curr == vc) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_running);
+        return;
+    }
+    if ( unlikely(__vcpu_on_runq(svc)) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
+        return;
+    }
+
+    if ( likely(vcpu_runnable(vc)) )
+        CSCHED_STAT_CRANK(vcpu_wake_runnable);
+    else
+        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
+
+    /* Put the VCPU on the runq and tickle CPUs */
+    __runq_insert(cpu, svc);
+    __runq_tickle(cpu, svc);
+}
+
+static int
+csched_vcpu_set_affinity(struct vcpu *vc, cpumask_t *affinity)
+{
+    unsigned long flags;
+    int lcpu;
+
+    if ( vc == current )
+    {
+        /* No locking needed but also can't move on the spot... */
+        if ( !cpu_isset(vc->processor, *affinity) )
+            return -EBUSY;
+
+        vc->cpu_affinity = *affinity;
+    }
+    else
+    {
+        /* Pause, modify, and unpause. */
+        vcpu_pause(vc);
+
+        vc->cpu_affinity = *affinity;
+        if ( !cpu_isset(vc->processor, vc->cpu_affinity) )
+        {
+            /*
+             * We must grab the scheduler lock for the CPU currently owning
+             * this VCPU before changing its ownership.
+             */
+            vcpu_schedule_lock_irqsave(vc, flags);
+            lcpu = vc->processor;
+
+            vc->processor = first_cpu(vc->cpu_affinity);
+
+            spin_unlock_irqrestore(&schedule_data[lcpu].schedule_lock, flags);
+        }
+
+        vcpu_unpause(vc);
+    }
+
+    return 0;
+}
+
+/* Get (SCHED_INFO_GET) or set (SCHED_INFO_PUT) a domain's weight and cap. */
+static int
+csched_dom_cntl(struct domain *d, struct sched_adjdom_cmd *cmd)
+{
+    struct csched_dom * const sdom = CSCHED_DOM(d);
+    unsigned long flags;
+
+    if ( cmd->direction == SCHED_INFO_GET )
+    {
+        cmd->u.credit.weight = sdom->weight;
+        cmd->u.credit.cap = sdom->cap;
+    }
+    else
+    {
+        ASSERT( cmd->direction == SCHED_INFO_PUT );
+        spin_lock_irqsave(&csched_priv.lock, flags);
+
+        /* A weight of 0 means "leave the weight unchanged" */
+        if ( cmd->u.credit.weight != 0 )
+        {
+            /* csched_priv.weight only sums domains on the active list */
+            if ( !list_empty(&sdom->active_sdom_elem) )
+                csched_priv.weight += cmd->u.credit.weight - sdom->weight;
+            sdom->weight = cmd->u.credit.weight;
+        }
+
+        /* A cap of (uint16_t)~0U means "leave the cap unchanged" */
+        if ( cmd->u.credit.cap != (uint16_t)~0U )
+            sdom->cap = cmd->u.credit.cap;
+
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+    }
+
+    return 0;
+}
+
+/*
+ * Tear down the scheduler state hanging off a domain: call
+ * csched_vcpu_free() on every populated VCPU slot, then free the
+ * per-domain structure itself.
+ */
+static void
+csched_dom_free(struct domain *dom)
+{
+    struct csched_dom * const sdom = CSCHED_DOM(dom);
+    int vcpu_id;
+
+    CSCHED_STAT_CRANK(dom_free);
+
+    for ( vcpu_id = 0; vcpu_id < MAX_VIRT_CPUS; vcpu_id++ )
+    {
+        /* Empty slots are skipped. */
+        if ( !dom->vcpu[vcpu_id] )
+            continue;
+
+        csched_vcpu_free(dom->vcpu[vcpu_id]);
+    }
+
+    xfree(sdom);
+}
+
+/*
+ * Single-pass, O(n) re-sort of a CPU's runq.
+ *
+ * The pass only distinguishes UNDER entries from everything else. It walks
+ * the runq once and pulls every UNDER entry that sits behind a non-UNDER
+ * entry up to just after the most recently seen UNDER entry. Remembering
+ * that entry makes each move O(1) and keeps the UNDER entries in their
+ * original relative order.
+ *
+ * The pass is skipped when this CPU has already sorted for the current
+ * value of csched_priv.runq_sort.
+ */
+static void
+csched_runq_sort(unsigned int cpu)
+{
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+    struct list_head * const runq = &spc->runq;
+    struct list_head *pos, *following, *under_tail;
+    unsigned long flags;
+    const int epoch = csched_priv.runq_sort;
+
+    /* Already sorted for this epoch: nothing to do. */
+    if ( epoch == spc->runq_sort_last )
+        return;
+
+    spc->runq_sort_last = epoch;
+
+    spin_lock_irqsave(&schedule_data[cpu].schedule_lock, flags);
+
+    /* Until an UNDER is seen, the insertion point is the list head. */
+    under_tail = runq;
+
+    for ( pos = runq->next; pos != runq; pos = following )
+    {
+        /* Grab the successor first: pos may be relinked below. */
+        following = pos->next;
+
+        if ( __runq_elem(pos)->pri != CSCHED_PRI_TS_UNDER )
+            continue;
+
+        /* Move up only if something else sits between the UNDERs. */
+        if ( pos->prev != under_tail )
+        {
+            list_del(pos);
+            list_add(pos, under_tail);
+        }
+
+        under_tail = pos;
+    }
+
+    spin_unlock_irqrestore(&schedule_data[cpu].schedule_lock, flags);
+}
+
+/*
+ * Credit accounting pass.
+ *
+ * With csched_priv.lock held, walk every domain on csched_priv.active_sdom
+ * and:
+ *   - compute the domain's fair share of the credit pool from its weight
+ *     relative to csched_priv.weight;
+ *   - clip that share to the domain's peak (and cap, when one is set),
+ *     handing any clipped credit on to the domains still to be visited;
+ *   - split the result evenly (rounding up) across the domain's active
+ *     VCPUs, add it to each VCPU's credit and recompute its priority.
+ *
+ * The sum of the resulting per-VCPU credits is stored as the new
+ * csched_priv.credit_balance. Finally csched_priv.runq_sort is bumped so
+ * that csched_runq_sort() re-sorts each CPU's runq.
+ */
+static void
+csched_acct(void)
+{
+    unsigned long flags;
+    struct list_head *iter_vcpu, *next_vcpu;
+    struct list_head *iter_sdom, *next_sdom;
+    struct csched_vcpu *svc;
+    struct csched_dom *sdom;
+    uint32_t credit_total;
+    uint32_t weight_total;
+    uint32_t weight_left;
+    uint32_t credit_fair;
+    uint32_t credit_peak;
+    int credit_balance;
+    int credit_xtra;
+    int credit;
+
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    weight_total = csched_priv.weight;
+    credit_total = csched_priv.credit;
+
+    /*
+     * Converge balance towards 0 when it drops negative: subtracting the
+     * negative balance enlarges the pool handed out in this pass.
+     */
+    if ( csched_priv.credit_balance < 0 )
+    {
+        credit_total -= csched_priv.credit_balance;
+        CSCHED_STAT_CRANK(acct_balance);
+    }
+
+    /* No weight to share credits by: reset the balance and bail out. */
+    if ( unlikely(weight_total == 0) )
+    {
+        csched_priv.credit_balance = 0;
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        CSCHED_STAT_CRANK(acct_no_work);
+        return;
+    }
+
+    CSCHED_STAT_CRANK(acct_run);
+
+    weight_left = weight_total;
+    credit_balance = 0;
+    credit_xtra = 0;
+
+    /* _safe iteration: domains may be moved within the list below. */
+    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    {
+        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
+
+        BUG_ON( is_idle_domain(sdom->dom) );
+        BUG_ON( sdom->active_vcpu_count == 0 );
+        BUG_ON( sdom->weight == 0 );
+        BUG_ON( sdom->weight > weight_left );
+
+        /* weight_left now covers only the domains not yet visited. */
+        weight_left -= sdom->weight;
+
+        /*
+         * A domain's fair share is computed using its weight in competition
+         * with that of all other active domains.
+         *
+         * At most, a domain can use credits to run all its active VCPUs
+         * for one full accounting period. We allow a domain to earn more
+         * only when the system-wide credit balance is negative.
+         */
+        credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
+        if ( csched_priv.credit_balance < 0 )
+        {
+            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+                             (weight_total - 1)
+                           ) / weight_total;
+        }
+        if ( sdom->cap != 0U )
+        {
+            /*
+             * A cap of 0 means "uncapped". Otherwise scale one accounting
+             * period by cap/100, rounding up.
+             */
+            uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
+            if ( credit_cap < credit_peak )
+                credit_peak = credit_cap;
+        }
+
+        /* Weighted share of the pool, rounded up. */
+        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
+                      ) / weight_total;
+
+        if ( credit_fair < credit_peak )
+        {
+            /* This domain could have used more than its fair share. */
+            credit_xtra = 1;
+        }
+        else
+        {
+            if ( weight_left != 0U )
+            {
+                /* Give other domains a chance at unused credits */
+                credit_total += ( ( ( credit_fair - credit_peak
+                                    ) * weight_total
+                                  ) + ( weight_left - 1 )
+                                ) / weight_left;
+            }
+
+            if ( credit_xtra )
+            {
+                /*
+                 * Lazily keep domains with extra credits at the head of
+                 * the queue to give others a chance at them in future
+                 * accounting periods.
+                 */
+                CSCHED_STAT_CRANK(acct_reorder);
+                list_del(&sdom->active_sdom_elem);
+                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+            }
+
+            credit_fair = credit_peak;
+        }
+
+        /* Compute fair share per VCPU */
+        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
+                      ) / sdom->active_vcpu_count;
+
+
+        /* _safe iteration: the idle helper is called on entries below. */
+        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
+        {
+            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
+            BUG_ON( sdom != svc->sdom );
+
+            /* Increment credit */
+            atomic_add(credit_fair, &svc->credit);
+            credit = atomic_read(&svc->credit);
+
+            /*
+             * Recompute priority or, if VCPU is idling, remove it from
+             * the active list.
+             */
+            if ( credit < 0 )
+            {
+                /* Out of credit: OVER if uncapped, PARKED if capped. */
+                if ( sdom->cap == 0U )
+                    svc->pri = CSCHED_PRI_TS_OVER;
+                else
+                    svc->pri = CSCHED_PRI_TS_PARKED;
+
+                /* Debt is clamped to one time slice. */
+                if ( credit < -CSCHED_TSLICE )
+                {
+                    CSCHED_STAT_CRANK(acct_min_credit);
+                    credit = -CSCHED_TSLICE;
+                    atomic_set(&svc->credit, credit);
+                }
+            }
+            else
+            {
+                svc->pri = CSCHED_PRI_TS_UNDER;
+
+                /* More than a time slice saved up: treat VCPU as idling. */
+                if ( credit > CSCHED_TSLICE )
+                    __csched_vcpu_acct_idle_locked(svc);
+            }
+
+            /* Recorded for csched_dump_vcpu(). */
+            svc->credit_last = credit;
+            svc->credit_incr = credit_fair;
+            credit_balance += credit;
+        }
+    }
+
+    csched_priv.credit_balance = credit_balance;
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    /* Inform each CPU that its runq needs to be sorted */
+    csched_priv.runq_sort++;
+}
+
+/*
+ * Per-CPU scheduler tick handler.
+ *
+ * Does three things in order: charges the running VCPU for the tick, runs
+ * the accounting pass if this CPU is the accounting master and it is due,
+ * and re-sorts the local runq if required.
+ */
+static void
+csched_tick(unsigned int cpu)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(current);
+
+    /*
+     * Charge the running VCPU. VCPUs with no scheduler domain attached
+     * (such as the idle tasks) are not credit scheduled and are skipped.
+     */
+    if ( likely(svc->sdom != NULL) )
+        csched_vcpu_acct(svc, CSCHED_TICK);
+
+    /*
+     * Accounting is currently always performed by the master CPU, once
+     * every CSCHED_ACCT_NTICKS ticks. The duty could eventually be
+     * distributed or cycled between CPUs.
+     */
+    if ( csched_priv.master == cpu )
+    {
+        if ( (schedule_data[cpu].tick % CSCHED_ACCT_NTICKS) == 0 )
+            csched_acct();
+    }
+
+    /*
+     * Every physical CPU re-sorts its own runq after the accounting master
+     * has changed priorities. The sort is O(n) and csched_runq_sort()
+     * returns early when no new accounting pass has happened.
+     */
+    csched_runq_sort(cpu);
+}
+
+/*
+ * Try to take a VCPU off a peer PCPU's runq.
+ *
+ * spc: the peer's per-CPU scheduler state; its runq is scanned.
+ * cpu: the PCPU doing the stealing; a stolen VCPU is reassigned to it.
+ * pri: priority of the best local candidate; only strictly higher
+ *      priority work is worth stealing.
+ *
+ * Returns the stolen VCPU, already removed from the peer's runq, or NULL
+ * if nothing suitable was found.
+ */
+static struct csched_vcpu *
+csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
+{
+    struct list_head *pos;
+    struct csched_vcpu *cand;
+
+    list_for_each( pos, &spc->runq )
+    {
+        cand = __runq_elem(pos);
+
+        /*
+         * Once an entry is idle priority or no better than what we already
+         * have, the scan stops: this peer has nothing to offer.
+         */
+        if ( cand->pri <= CSCHED_PRI_IDLE || cand->pri <= pri )
+        {
+            CSCHED_STAT_CRANK(steal_peer_idle);
+            return NULL;
+        }
+
+        BUG_ON( is_idle_vcpu(cand->vcpu) );
+
+        /* Skip candidates that cannot be stolen onto our PCPU. */
+        if ( !__csched_vcpu_is_stealable(cpu, cand->vcpu) )
+            continue;
+
+        /* Found one: detach it and claim it for this PCPU. */
+        __runq_remove(cand);
+        cand->vcpu->processor = cpu;
+
+        return cand;
+    }
+
+    return NULL;
+}
+
+/*
+ * Look on the other PCPUs for work more important than snext.
+ *
+ * cpu:   the local PCPU; snext must currently be assigned to it.
+ * snext: the best local candidate, still on the local runq.
+ *
+ * Peers are visited in increasing CPU order starting with our neighbour
+ * and wrapping around. Returns a VCPU stolen from a peer, or, when no
+ * peer yields one, snext itself after removing it from the local runq.
+ */
+static struct csched_vcpu *
+csched_load_balance(int cpu, struct csched_vcpu *snext)
+{
+    struct csched_pcpu *peer_spc;
+    struct csched_vcpu *stolen;
+    int peer_cpu;
+
+    /* Statistics: classify why we are load balancing. */
+    switch ( snext->pri )
+    {
+    case CSCHED_PRI_IDLE:
+        CSCHED_STAT_CRANK(load_balance_idle);
+        break;
+    case CSCHED_PRI_TS_OVER:
+        CSCHED_STAT_CRANK(load_balance_over);
+        break;
+    default:
+        CSCHED_STAT_CRANK(load_balance_other);
+        break;
+    }
+
+    BUG_ON( cpu != snext->vcpu->processor );
+
+    for ( peer_cpu = (cpu + 1) % csched_priv.ncpus;
+          peer_cpu != cpu;
+          peer_cpu = (peer_cpu + 1) % csched_priv.ncpus )
+    {
+        BUG_ON( peer_cpu >= csched_priv.ncpus );
+        BUG_ON( peer_cpu == cpu );
+
+        /*
+         * Only try the peer's scheduler lock, never spin on it: the peer
+         * may itself be load balancing and trying to take our lock, and
+         * spinning could then deadlock.
+         */
+        if ( !spin_trylock(&schedule_data[peer_cpu].schedule_lock) )
+        {
+            CSCHED_STAT_CRANK(steal_trylock_failed);
+            continue;
+        }
+
+        peer_spc = CSCHED_PCPU(peer_cpu);
+        if ( likely(peer_spc != NULL) )
+        {
+            stolen = csched_runq_steal(peer_spc, cpu, snext->pri);
+        }
+        else
+        {
+            /* Peer has no scheduler state to steal from. */
+            CSCHED_STAT_CRANK(steal_peer_down);
+            stolen = NULL;
+        }
+
+        spin_unlock(&schedule_data[peer_cpu].schedule_lock);
+
+        if ( stolen != NULL )
+        {
+            CSCHED_STAT_CRANK(vcpu_migrate);
+            return stolen;
+        }
+    }
+
+    /* No peer had more important work: run the local candidate. */
+    __runq_remove(snext);
+    return snext;
+}
+
+/*
+ * Pick the next VCPU to run on this PCPU.
+ *
+ * This sits on the critical path and is kept simple and fast for the
+ * common case. Returns the chosen VCPU together with a time slice of
+ * CSCHED_TSLICE milliseconds.
+ */
+static struct task_slice
+csched_schedule(s_time_t now)
+{
+    const int cpu = smp_processor_id();
+    struct list_head * const runq = RUNQ(cpu);
+    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_vcpu *snext;
+    struct task_slice ret;
+
+    CSCHED_STAT_CRANK(schedule);
+    CSCHED_VCPU_CHECK(current);
+
+    /*
+     * A still-runnable current VCPU goes back on the local runq so that
+     * it competes with everything else; the head of the runq is then the
+     * best local candidate.
+     */
+    if ( vcpu_runnable(current) )
+    {
+        __runq_insert(cpu, scurr);
+    }
+    else
+    {
+        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+    }
+
+    snext = __runq_elem(runq->next);
+
+    /*
+     * SMP load balance: when the best local candidate has already used up
+     * its credits (or is lower priority still), check the other PCPUs for
+     * more urgent work. csched_load_balance() hands back snext, already
+     * taken off the runq, when it finds none.
+     */
+    if ( snext->pri <= CSCHED_PRI_TS_OVER )
+        snext = csched_load_balance(cpu, snext);
+    else
+        __runq_remove(snext);
+
+    /*
+     * Keep the idlers mask in step with what we are about to run. While
+     * we idle, other CPUs tickle us when they get extra work.
+     */
+    if ( snext->pri != CSCHED_PRI_IDLE )
+    {
+        if ( cpu_isset(cpu, csched_priv.idlers) )
+            cpu_clear(cpu, csched_priv.idlers);
+    }
+    else if ( !cpu_isset(cpu, csched_priv.idlers) )
+    {
+        cpu_set(cpu, csched_priv.idlers);
+    }
+
+    /* Hand the chosen VCPU back with a full time slice. */
+    ret.task = snext->vcpu;
+    ret.time = MILLISECS(CSCHED_TSLICE);
+
+    CSCHED_VCPU_CHECK(ret.task);
+    BUG_ON( !vcpu_runnable(ret.task) );
+
+    return ret;
+}
+
+/*
+ * Print a one-line summary of a VCPU's scheduler state.
+ *
+ * Domain and VCPU ids, priority and processor are always printed. The
+ * credit figures, state counters and domain weight are appended only when
+ * the VCPU has a scheduler domain attached.
+ */
+static void
+csched_dump_vcpu(struct csched_vcpu *svc)
+{
+    struct vcpu * const vc = svc->vcpu;
+    struct csched_dom * const sdom = svc->sdom;
+
+    printk("[%i.%i] pri=%i cpu=%i",
+           vc->domain->domain_id,
+           vc->vcpu_id,
+           svc->pri,
+           vc->processor);
+
+    if ( sdom != NULL )
+        printk(" credit=%i (%d+%u) {a=%u i=%u w=%u}",
+               atomic_read(&svc->credit),
+               svc->credit_last,
+               svc->credit_incr,
+               svc->state_active,
+               svc->state_idle,
+               sdom->weight);
+
+    printk("\n");
+}
+
+/*
+ * Print the scheduler state of one PCPU: its tick count and last sort
+ * epoch, the VCPU it is currently running, and every entry on its runq
+ * numbered from 1.
+ */
+static void
+csched_dump_pcpu(int cpu)
+{
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+    struct list_head *pos;
+    struct csched_vcpu *svc;
+    int position = 0;
+
+    printk(" tick=%lu, sort=%d\n",
+           schedule_data[cpu].tick,
+           spc->runq_sort_last);
+
+    /* The VCPU currently on the CPU, if it has scheduler state. */
+    svc = CSCHED_VCPU(schedule_data[cpu].curr);
+    if ( svc != NULL )
+    {
+        printk("\trun: ");
+        csched_dump_vcpu(svc);
+    }
+
+    /* Everything queued behind it. */
+    list_for_each( pos, &spc->runq )
+    {
+        svc = __runq_elem(pos);
+        if ( svc == NULL )
+            continue;
+
+        position++;
+        printk("\t%3d: ", position);
+        csched_dump_vcpu(svc);
+    }
+}
+
+/*
+ * Print the scheduler-wide state: global parameters and counters, the
+ * first word of the idlers mask, the statistics, and every active VCPU of
+ * every active domain numbered from 1.
+ */
+static void
+csched_dump(void)
+{
+    struct list_head *dom_pos, *vcpu_pos;
+    struct csched_dom *sdom;
+    struct csched_vcpu *svc;
+    int position = 0;
+
+    printk("info:\n"
+           "\tncpus              = %u\n"
+           "\tmaster             = %u\n"
+           "\tcredit             = %u\n"
+           "\tcredit balance     = %d\n"
+           "\tweight             = %u\n"
+           "\trunq_sort          = %u\n"
+           "\ttick               = %dms\n"
+           "\ttslice             = %dms\n"
+           "\taccounting period  = %dms\n"
+           "\tdefault-weight     = %d\n",
+           csched_priv.ncpus,
+           csched_priv.master,
+           csched_priv.credit,
+           csched_priv.credit_balance,
+           csched_priv.weight,
+           csched_priv.runq_sort,
+           CSCHED_TICK,
+           CSCHED_TSLICE,
+           CSCHED_ACCT_PERIOD,
+           CSCHED_DEFAULT_WEIGHT);
+
+    /* Only the first word of the mask is shown. */
+    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
+
+    CSCHED_STATS_PRINTK();
+
+    printk("active vcpus:\n");
+
+    /* The numbering runs on across domains rather than restarting. */
+    list_for_each( dom_pos, &csched_priv.active_sdom )
+    {
+        sdom = list_entry(dom_pos, struct csched_dom, active_sdom_elem);
+
+        list_for_each( vcpu_pos, &sdom->active_vcpu )
+        {
+            svc = list_entry(vcpu_pos, struct csched_vcpu, active_vcpu_elem);
+
+            position++;
+            printk("\t%3d: ", position);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+/*
+ * Bring the scheduler-wide private state to its initial values: no CPUs,
+ * no master, no idlers, an empty active-domain list and zeroed weight and
+ * credit totals.
+ */
+static void
+csched_init(void)
+{
+    /* Lock and list first, then the scalar state they protect. */
+    spin_lock_init(&csched_priv.lock);
+    INIT_LIST_HEAD(&csched_priv.active_sdom);
+
+    /* CPU bookkeeping: UINT_MAX marks "no master chosen yet". */
+    csched_priv.ncpus = 0;
+    csched_priv.master = UINT_MAX;
+    cpus_clear(csched_priv.idlers);
+
+    /* Accounting totals. */
+    csched_priv.credit = 0U;
+    csched_priv.credit_balance = 0;
+    csched_priv.weight = 0U;
+    csched_priv.runq_sort = 0U;
+
+    CSCHED_STATS_RESET();
+}
+
+
+/*
+ * Scheduler descriptor for the credit scheduler: identification fields
+ * followed by the hooks implemented in this file.
+ */
+struct scheduler sched_credit_def = {
+    /* Identification */
+    .name           = "SMP Credit Scheduler",
+    .opt_name       = "credit",
+    .sched_id       = SCHED_CREDIT,
+
+    /* One-time setup */
+    .init           = csched_init,
+
+    /* VCPU / domain lifetime and control */
+    .alloc_task     = csched_vcpu_alloc,
+    .add_task       = csched_vcpu_add,
+    .free_task      = csched_dom_free,
+    .adjdom         = csched_dom_cntl,
+    .set_affinity   = csched_vcpu_set_affinity,
+
+    /* Run-state transitions */
+    .sleep          = csched_vcpu_sleep,
+    .wake           = csched_vcpu_wake,
+
+    /* Periodic work and scheduling decisions */
+    .tick           = csched_tick,
+    .do_schedule    = csched_schedule,
+
+    /* Debug output */
+    .dump_cpu_state = csched_dump_pcpu,
+    .dump_settings  = csched_dump,
+};

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
WARNING - OLD ARCHIVES

xen-changelog

[Xen-changelog] New weighted fair-share CPU scheduler w/ automatic SMP l