WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] New weighted fair-share CPU scheduler w/ automatic SMP load balancing

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] New weighted fair-share CPU scheduler w/ automatic SMP load balancing
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Fri, 26 May 2006 12:42:10 +0000
Delivery-date: Fri, 26 May 2006 05:43:42 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User ack@xxxxxxxxxxxxxxxxxxxxxxx
# Node ID e539abd27a0f2b1a64b4d129a10748d50c93e6fb
# Parent  b6937b93141961b67dc642581266e6fc2015bc91
New weighted fair-share CPU scheduler w/ automatic SMP load balancing
Signed-off-by: Emmanuel Ackaouy <ack@xxxxxxxxxxxxx>
---
 tools/libxc/Makefile                      |    1 
 tools/libxc/xc_csched.c                   |   50 +
 tools/libxc/xenctrl.h                     |    8 
 tools/python/xen/lowlevel/xc/xc.c         |   61 +
 tools/python/xen/xend/XendDomain.py       |   22 
 tools/python/xen/xend/server/SrvDomain.py |   14 
 tools/python/xen/xm/main.py               |   45 +
 xen/common/Makefile                       |    1 
 xen/common/sched_credit.c                 | 1233 ++++++++++++++++++++++++++++++
 xen/common/schedule.c                     |    5 
 xen/include/public/sched_ctl.h            |    5 
 xen/include/xen/sched-if.h                |    2 
 xen/include/xen/softirq.h                 |   13 
 13 files changed, 1460 insertions(+)

diff -r b6937b931419 -r e539abd27a0f tools/libxc/Makefile
--- a/tools/libxc/Makefile      Fri May 26 09:44:29 2006 +0100
+++ b/tools/libxc/Makefile      Fri May 26 11:14:36 2006 +0100
@@ -20,6 +20,7 @@ SRCS       += xc_physdev.c
 SRCS       += xc_physdev.c
 SRCS       += xc_private.c
 SRCS       += xc_sedf.c
+SRCS       += xc_csched.c
 SRCS       += xc_tbuf.c
 
 ifeq ($(patsubst x86%,x86,$(XEN_TARGET_ARCH)),x86)
diff -r b6937b931419 -r e539abd27a0f tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri May 26 09:44:29 2006 +0100
+++ b/tools/libxc/xenctrl.h     Fri May 26 11:14:36 2006 +0100
@@ -354,6 +354,14 @@ int xc_sedf_domain_get(int xc_handle,
                        uint64_t *latency, uint16_t *extratime,
                        uint16_t *weight);
 
+int xc_csched_domain_set(int xc_handle,
+                         uint32_t domid,
+                         struct csched_domain *sdom);
+
+int xc_csched_domain_get(int xc_handle,
+                         uint32_t domid,
+                         struct csched_domain *sdom);
+
 typedef evtchn_status_t xc_evtchn_status_t;
 
 /*
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri May 26 11:14:36 2006 +0100
@@ -716,6 +716,49 @@ static PyObject *pyxc_sedf_domain_get(Xc
                          "weight",    weight);
 }
 
+/*
+ * Set a domain's credit weight/cap; omitted keywords pass 0 and 0xffff down.
+ */
+static PyObject *pyxc_csched_domain_set(XcObject *self,
+                                        PyObject *args,
+                                        PyObject *kwds)
+{
+    static char *kwd_list[] = { "dom", "weight", "cap", NULL };
+    static char kwd_type[] = "I|HH";
+    struct csched_domain sdom;
+    uint32_t domid;
+    uint16_t weight = 0;
+    uint16_t cap = (uint16_t)~0U;
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, kwd_type, kwd_list,
+                                      &domid, &weight, &cap) )
+        return NULL;
+
+    sdom.weight = weight;
+    sdom.cap = cap;
+    if ( xc_csched_domain_set(self->xc_handle, domid, &sdom) != 0 )
+        return PyErr_SetFromErrno(xc_error);
+
+    Py_INCREF(zero);
+    return zero;
+}
+
+/* Return {"weight": ..., "cap": ...} for the domain id given in args. */
+static PyObject *pyxc_csched_domain_get(XcObject *self, PyObject *args)
+{
+    struct csched_domain params;
+    uint32_t dom;
+
+    if ( !PyArg_ParseTuple(args, "I", &dom) )
+        return NULL;
+
+    if ( xc_csched_domain_get(self->xc_handle, dom, &params) != 0 )
+        return PyErr_SetFromErrno(xc_error);
+
+    return Py_BuildValue("{s:H,s:H}", "weight", params.weight,
+                         "cap", params.cap);
+}
+
 static PyObject *pyxc_domain_setmaxmem(XcObject *self, PyObject *args)
 {
     uint32_t dom;
@@ -1040,6 +1083,24 @@ static PyMethodDef pyxc_methods[] = {
       " slice     [long]: CPU reservation per period\n"
       " latency   [long]: domain's wakeup latency hint\n"
       " extratime [int]:  domain aware of extratime?\n"},
+    
+    { "csched_domain_set",
+      (PyCFunction)pyxc_csched_domain_set,
+      METH_KEYWORDS, "\n"
+      "Set the scheduling parameters for a domain when running with the\n"
+      "SMP credit scheduler.\n"
+      " domid     [int]:   domain id to set\n"
+      " weight    [short]: domain's scheduling weight\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "csched_domain_get",
+      (PyCFunction)pyxc_csched_domain_get,
+      METH_VARARGS, "\n"
+      "Get the scheduling parameters for a domain when running with the\n"
+      "SMP credit scheduler.\n"
+      " domid     [int]:   domain id to get\n"
+      "Returns:   [dict]\n"
+      " weight    [short]: domain's scheduling weight\n"},
 
     { "evtchn_alloc_unbound", 
       (PyCFunction)pyxc_evtchn_alloc_unbound,
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/xend/XendDomain.py       Fri May 26 11:14:36 2006 +0100
@@ -522,6 +522,28 @@ class XendDomain:
         except Exception, ex:
             raise XendError(str(ex))
 
+    def domain_csched_get(self, domid):
+        """Resolve domid via domain_lookup_by_name_or_id_nr() and return
+        what xc.csched_domain_get() reports for that domain."""
+        info = self.domain_lookup_by_name_or_id_nr(domid)
+        if not info:
+            raise XendInvalidDomain(str(domid))
+        try:
+            return xc.csched_domain_get(info.getDomid())
+        except Exception, err:
+            raise XendError(str(err))
+    
+    def domain_csched_set(self, domid, weight, cap):
+        """Resolve domid via domain_lookup_by_name_or_id_nr() and apply
+        weight and cap through xc.csched_domain_set()."""
+        info = self.domain_lookup_by_name_or_id_nr(domid)
+        if not info:
+            raise XendInvalidDomain(str(domid))
+        try:
+            return xc.csched_domain_set(info.getDomid(), weight, cap)
+        except Exception, err:
+            raise XendError(str(err))
+
     def domain_maxmem_set(self, domid, mem):
         """Set the memory limit for a domain.
 
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/xend/server/SrvDomain.py
--- a/tools/python/xen/xend/server/SrvDomain.py Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/xend/server/SrvDomain.py Fri May 26 11:14:36 2006 +0100
@@ -129,6 +129,20 @@ class SrvDomain(SrvDir):
                     ['latency', 'int'],
                     ['extratime', 'int'],
                     ['weight', 'int']])
+        val = fn(req.args, {'dom': self.dom.domid})
+        return val
+    
+    def op_domain_csched_get(self, _, req):
+        """Dispatch the request arguments to xd.domain_csched_get()."""
+        handler = FormFn(self.xd.domain_csched_get, [['dom', 'int']])
+        result = handler(req.args, {'dom': self.dom.domid})
+        return result
+
+
+    def op_domain_csched_set(self, _, req):
+        # 'cap' must be declared too: domain_csched_set(domid, weight, cap).
+        fn = FormFn(self.xd.domain_csched_set,
+                    [['dom', 'int'], ['weight', 'int'], ['cap', 'int']])
         val = fn(req.args, {'dom': self.dom.domid})
         return val
 
diff -r b6937b931419 -r e539abd27a0f tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Fri May 26 09:44:29 2006 +0100
+++ b/tools/python/xen/xm/main.py       Fri May 26 11:14:36 2006 +0100
@@ -99,6 +99,7 @@ sched_sedf_help = "sched-sedf [DOM] [OPT
                                     specifies another way of setting a domain's\n\
                                     cpu period/slice."
 
+csched_help = "csched                           Set or get credit scheduler parameters"
 block_attach_help = """block-attach <DomId> <BackDev> <FrontDev> <Mode>
                 [BackDomId]         Create a new virtual block device"""
 block_detach_help = """block-detach  <DomId> <DevId>    Destroy a domain's virtual block device,
@@ -174,6 +175,7 @@ host_commands = [
     ]
 
 scheduler_commands = [
+    "csched",
     "sched-bvt",
     "sched-bvt-ctxallow",
     "sched-sedf",
@@ -735,6 +737,48 @@ def xm_sched_sedf(args):
         else:
             print_sedf(sedf_info)
 
+def xm_csched(args):
+    # Get (-d only) or set (-w and/or -c) a domain's credit scheduler
+    # parameters; long options --domain/--weight/--cap are equivalent.
+    usage_msg = """Csched:     Set or get credit scheduler parameters
+ Usage:
+
+        csched -d domain [-w weight] [-c cap]
+    """
+    try:
+        opts, args = getopt.getopt(args[0:], "d:w:c:",
+            ["domain=", "weight=", "cap="])
+    except getopt.GetoptError:
+        # print help information and exit:
+        print usage_msg
+        sys.exit(1)
+
+    domain = weight = cap = None
+
+    for o, a in opts:
+        if o in ("-d", "--domain"):
+            domain = a
+        elif o in ("-w", "--weight"):
+            weight = int(a)
+        elif o in ("-c", "--cap"):
+            cap = int(a)
+
+    if domain is None:
+        # place holder for system-wide scheduler parameters
+        print usage_msg
+        sys.exit(1)
+
+    if weight is None and cap is None:
+        print server.xend.domain.csched_get(domain)
+    else:
+        if weight is None:
+            weight = int(0)
+        if cap is None:
+            cap = int(~0)
+
+        err = server.xend.domain.csched_set(domain, weight, cap)
+        if err != 0:
+            print err
 
 def xm_info(args):
     arg_check(args, "info", 0)
@@ -1032,6 +1076,7 @@ commands = {
     "sched-bvt": xm_sched_bvt,
     "sched-bvt-ctxallow": xm_sched_bvt_ctxallow,
     "sched-sedf": xm_sched_sedf,
+    "csched": xm_csched,
     # block
     "block-attach": xm_block_attach,
     "block-detach": xm_block_detach,
diff -r b6937b931419 -r e539abd27a0f xen/common/Makefile
--- a/xen/common/Makefile       Fri May 26 09:44:29 2006 +0100
+++ b/xen/common/Makefile       Fri May 26 11:14:36 2006 +0100
@@ -13,6 +13,7 @@ obj-y += page_alloc.o
 obj-y += page_alloc.o
 obj-y += rangeset.o
 obj-y += sched_bvt.o
+obj-y += sched_credit.o
 obj-y += sched_sedf.o
 obj-y += schedule.o
 obj-y += softirq.o
diff -r b6937b931419 -r e539abd27a0f xen/common/schedule.c
--- a/xen/common/schedule.c     Fri May 26 09:44:29 2006 +0100
+++ b/xen/common/schedule.c     Fri May 26 11:14:36 2006 +0100
@@ -50,9 +50,11 @@ struct schedule_data schedule_data[NR_CP
 
 extern struct scheduler sched_bvt_def;
 extern struct scheduler sched_sedf_def;
+extern struct scheduler sched_credit_def;
 static struct scheduler *schedulers[] = { 
     &sched_bvt_def,
     &sched_sedf_def,
+    &sched_credit_def,
     NULL
 };
 
@@ -639,6 +641,8 @@ static void t_timer_fn(void *unused)
 
     page_scrub_schedule_work();
 
+    SCHED_OP(tick, cpu);
+
     set_timer(&t_timer[cpu], NOW() + MILLISECS(10));
 }
 
@@ -681,6 +685,7 @@ void __init scheduler_init(void)
         printk("Could not find scheduler: %s\n", opt_sched);
 
     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
+    SCHED_OP(init);
 
     if ( idle_vcpu[0] != NULL )
     {
diff -r b6937b931419 -r e539abd27a0f xen/include/public/sched_ctl.h
--- a/xen/include/public/sched_ctl.h    Fri May 26 09:44:29 2006 +0100
+++ b/xen/include/public/sched_ctl.h    Fri May 26 11:14:36 2006 +0100
@@ -10,6 +10,7 @@
 /* Scheduler types. */
 #define SCHED_BVT      0
 #define SCHED_SEDF     4
+#define SCHED_CREDIT   5
 
 /* Set or get info? */
 #define SCHED_INFO_PUT 0
@@ -48,6 +49,10 @@ struct sched_adjdom_cmd {
             uint32_t extratime;
             uint32_t weight;
         } sedf;
+        struct csched_domain {
+            uint16_t weight;
+            uint16_t cap;
+        } credit;
     } u;
 };
 
diff -r b6937b931419 -r e539abd27a0f xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Fri May 26 09:44:29 2006 +0100
+++ b/xen/include/xen/sched-if.h        Fri May 26 11:14:36 2006 +0100
@@ -58,6 +58,8 @@ struct scheduler {
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
 
+    void         (*init)           (void);
+    void         (*tick)           (unsigned int cpu);
     int          (*alloc_task)     (struct vcpu *);
     void         (*add_task)       (struct vcpu *);
     void         (*free_task)      (struct domain *);
diff -r b6937b931419 -r e539abd27a0f xen/include/xen/softirq.h
--- a/xen/include/xen/softirq.h Fri May 26 09:44:29 2006 +0100
+++ b/xen/include/xen/softirq.h Fri May 26 11:14:36 2006 +0100
@@ -26,6 +26,19 @@ asmlinkage void do_softirq(void);
 asmlinkage void do_softirq(void);
 extern void open_softirq(int nr, softirq_handler handler);
 
+/* Set softirq nr pending on each CPU in mask, then pass on to
+ * smp_send_event_check_mask() only the CPUs where it was not already set. */
+static inline void cpumask_raise_softirq(cpumask_t mask, unsigned int nr)
+{
+    int cpu;
+
+    for_each_cpu_mask(cpu, mask)
+        if ( test_and_set_bit(nr, &softirq_pending(cpu)) )
+            cpu_clear(cpu, mask);
+
+    smp_send_event_check_mask(mask);
+}
+
 static inline void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
 {
     if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
diff -r b6937b931419 -r e539abd27a0f tools/libxc/xc_csched.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_csched.c   Fri May 26 11:14:36 2006 +0100
@@ -0,0 +1,50 @@
+/****************************************************************************
+ * (C) 2006 - Emmanuel Ackaouy - XenSource Inc.
+ ****************************************************************************
+ *
+ *        File: xc_csched.c
+ *      Author: Emmanuel Ackaouy
+ *
+ * Description: XC Interface to the credit scheduler
+ *
+ */
+#include "xc_private.h"
+
+
+/* Push the weight/cap in *sdom to domain domid via a DOM0_ADJUSTDOM op
+ * (SCHED_CREDIT, SCHED_INFO_PUT); returns do_dom0_op()'s result. */
+int
+xc_csched_domain_set(int xc_handle, uint32_t domid,
+                     struct csched_domain *sdom)
+{
+    DECLARE_DOM0_OP;
+
+    op.cmd = DOM0_ADJUSTDOM;
+    op.u.adjustdom.sched_id = SCHED_CREDIT;
+    op.u.adjustdom.direction = SCHED_INFO_PUT;
+    op.u.adjustdom.domain = (domid_t) domid;
+    op.u.adjustdom.u.credit = *sdom;
+
+    return do_dom0_op(xc_handle, &op);
+}
+
+/* Fetch domain domid's weight/cap via a DOM0_ADJUSTDOM op (SCHED_CREDIT,
+ * SCHED_INFO_GET); *sdom is written only when do_dom0_op() returns 0. */
+int
+xc_csched_domain_get(int xc_handle, uint32_t domid,
+                     struct csched_domain *sdom)
+{
+    DECLARE_DOM0_OP;
+    int rc;
+
+    op.cmd = DOM0_ADJUSTDOM;
+    op.u.adjustdom.sched_id = SCHED_CREDIT;
+    op.u.adjustdom.direction = SCHED_INFO_GET;
+    op.u.adjustdom.domain = (domid_t) domid;
+
+    rc = do_dom0_op(xc_handle, &op);
+    if ( rc == 0 )
+        *sdom = op.u.adjustdom.u.credit;
+
+    return rc;
+}
diff -r b6937b931419 -r e539abd27a0f xen/common/sched_credit.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/sched_credit.c Fri May 26 11:14:36 2006 +0100
@@ -0,0 +1,1233 @@
+/****************************************************************************
+ * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
+ ****************************************************************************
+ *
+ *        File: common/sched_credit.c
+ *      Author: Emmanuel Ackaouy
+ *
+ * Description: Credit-based SMP CPU scheduler
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+
+
+/*
+ * CSCHED_STATS
+ *
+ * Manage very basic counters and stats.
+ *
+ * Useful for debugging live systems. The stats are displayed
+ * with runq dumps ('r' on the Xen console).
+ */
+#define CSCHED_STATS
+
+
+/*
+ * Basic constants
+ */
+#define CSCHED_TICK             10      /* milliseconds */
+#define CSCHED_TSLICE           30      /* milliseconds */
+#define CSCHED_ACCT_NTICKS      3       /* ticks per accounting period */
+#define CSCHED_ACCT_PERIOD      (CSCHED_ACCT_NTICKS * CSCHED_TICK) /* ms */
+#define CSCHED_DEFAULT_WEIGHT   256     /* weight of a newly created domain */
+
+
+/*
+ * Priorities: __runq_insert() queues numerically greater values first.
+ */
+#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
+#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
+#define CSCHED_PRI_IDLE         -64     /* idle */
+#define CSCHED_PRI_TS_PARKED    -65     /* time-share w/ capped credits */
+
+
+/*
+ * Accessors for the scheduler-private data of a PCPU, VCPU and domain
+ */
+#define CSCHED_PCPU(_c)     ((struct csched_pcpu *)schedule_data[_c].sched_priv)
+#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
+#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+
+
+/*
+ * Stats: the X-macro lists below are expanded with CSCHED_STAT_DEFINE/_PRINTK
+ */
+#ifdef CSCHED_STATS
+
+#define CSCHED_STAT(_X)         (csched_priv.stats._X)
+#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
+#define CSCHED_STAT_PRINTK(_X)                                  \
+    do                                                          \
+    {                                                           \
+        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));  \
+    } while ( 0 );  /* ';' needed: CSCHED_STATS_EXPAND adds no separators */
+
+#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
+    _MACRO(vcpu_alloc)                      \
+    _MACRO(vcpu_add)                        \
+    _MACRO(vcpu_sleep)                      \
+    _MACRO(vcpu_wake_running)               \
+    _MACRO(vcpu_wake_onrunq)                \
+    _MACRO(vcpu_wake_runnable)              \
+    _MACRO(vcpu_wake_not_runnable)          \
+    _MACRO(dom_free)                        \
+    _MACRO(schedule)                        \
+    _MACRO(tickle_local_idler)              \
+    _MACRO(tickle_local_over)               \
+    _MACRO(tickle_local_under)              \
+    _MACRO(tickle_local_other)              \
+    _MACRO(acct_run)                        \
+    _MACRO(acct_no_work)                    \
+    _MACRO(acct_balance)                    \
+    _MACRO(acct_reorder)                    \
+    _MACRO(acct_min_credit)                 \
+    _MACRO(acct_vcpu_active)                \
+    _MACRO(acct_vcpu_idle)                  \
+    _MACRO(acct_vcpu_credit_min)
+
+#define CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
+    _MACRO(vcpu_migrate)                                \
+    _MACRO(load_balance_idle)                           \
+    _MACRO(load_balance_over)                           \
+    _MACRO(load_balance_other)                          \
+    _MACRO(steal_trylock_failed)                        \
+    _MACRO(steal_peer_down)                             \
+    _MACRO(steal_peer_idle)                             \
+    _MACRO(steal_peer_running)                          \
+    _MACRO(steal_peer_pinned)                           \
+    _MACRO(tickle_idlers_none)                          \
+    _MACRO(tickle_idlers_some)
+
+#ifndef NDEBUG
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
+    _MACRO(vcpu_check)
+#else
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
+#endif
+
+#define CSCHED_STATS_EXPAND(_MACRO)                 \
+    CSCHED_STATS_EXPAND_SCHED(_MACRO)               \
+    CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
+    CSCHED_STATS_EXPAND_CHECKS(_MACRO)
+
+#define CSCHED_STATS_RESET()                                        \
+    do                                                              \
+    {                                                               \
+        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
+    } while ( 0 )
+
+#define CSCHED_STATS_DEFINE()                   \
+    struct                                      \
+    {                                           \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
+    } stats
+
+#define CSCHED_STATS_PRINTK()                   \
+    do                                          \
+    {                                           \
+        printk("stats:\n");                     \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
+    } while ( 0 )
+
+#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)
+
+#else /* CSCHED_STATS */
+
+#define CSCHED_STATS_RESET()    do {} while ( 0 )
+#define CSCHED_STATS_DEFINE()   /* expands inside a struct: must be empty */
+#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
+#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )
+
+#endif /* CSCHED_STATS */
+
+
+/*
+ * Physical CPU
+ */
+struct csched_pcpu {
+    struct list_head runq;      /* csched_vcpus linked via runq_elem */
+    uint32_t runq_sort_last;    /* csched_priv.runq_sort value last acted on */
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+    struct list_head runq_elem;         /* link in a PCPU's runq */
+    struct list_head active_vcpu_elem;  /* link in sdom->active_vcpu */
+    struct csched_dom *sdom;            /* owning domain; NULL if idle VCPU */
+    struct vcpu *vcpu;                  /* back-pointer to the generic VCPU */
+    atomic_t credit;
+    int credit_last;
+    uint32_t credit_incr;
+    uint32_t state_active;              /* times put on the active list */
+    uint32_t state_idle;                /* times taken off the active list */
+    int16_t pri;                        /* one of CSCHED_PRI_* */
+};
+
+/*
+ * Domain
+ */
+struct csched_dom {
+    struct list_head active_vcpu;       /* this domain's active csched_vcpus */
+    struct list_head active_sdom_elem;  /* link in csched_priv.active_sdom */
+    struct domain *dom;                 /* back-pointer to the generic domain */
+    uint16_t active_vcpu_count;         /* number of entries on active_vcpu */
+    uint16_t weight;                    /* starts at CSCHED_DEFAULT_WEIGHT */
+    uint16_t cap;                       /* starts at 0 */
+};
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+    spinlock_t lock;                /* taken around active-list/weight updates */
+    struct list_head active_sdom;   /* csched_doms that have active VCPUs */
+    uint32_t ncpus;                 /* highest initialised CPU id + 1 */
+    unsigned int master;
+    cpumask_t idlers;               /* each CPU is added at csched_pcpu_init() */
+    uint32_t weight;                /* sum of weights of domains on active_sdom */
+    uint32_t credit;                /* CSCHED_ACCT_PERIOD per initialised CPU */
+    int credit_balance;
+    uint32_t runq_sort;             /* compared to runq_sort_last per PCPU */
+    CSCHED_STATS_DEFINE();
+};
+
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+
+
+/* Non-zero iff svc's runq_elem is currently linked into a list. */
+static inline int __vcpu_on_runq(struct csched_vcpu *svc)
+{
+    return !list_empty(&svc->runq_elem);
+}
+
+/* Map a runq list node back to the csched_vcpu that embeds it. */
+static inline struct csched_vcpu *__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct csched_vcpu, runq_elem);
+}
+
+/*
+ * Queue svc on cpu's runq in descending priority order: svc is placed just
+ * before the first entry of strictly lower priority, or at the tail.
+ */
+static inline void __runq_insert(unsigned int cpu, struct csched_vcpu *svc)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *pos;
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    BUG_ON( cpu != svc->vcpu->processor );
+
+    list_for_each( pos, runq )
+        if ( svc->pri > __runq_elem(pos)->pri )
+            break;
+
+    list_add_tail(&svc->runq_elem, pos);
+}
+
+/* Unlink svc from its runq; BUGs if svc is not queued. */
+static inline void __runq_remove(struct csched_vcpu *svc)
+{
+    BUG_ON( !__vcpu_on_runq(svc) );
+    list_del_init(&svc->runq_elem);
+}
+
+/* Raise SCHEDULE_SOFTIRQ on cpu and/or the idlers once new is queued on cpu. */
+static inline void __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
+{
+    struct csched_vcpu * const cur = CSCHED_VCPU(schedule_data[cpu].curr);
+    cpumask_t mask;
+
+    ASSERT(cur);
+    cpus_clear(mask);
+
+    /* If strictly higher priority than current VCPU, signal the CPU */
+    if ( new->pri > cur->pri )
+    {
+        if ( cur->pri == CSCHED_PRI_IDLE )
+            CSCHED_STAT_CRANK(tickle_local_idler);
+        else if ( cur->pri == CSCHED_PRI_TS_OVER )
+            CSCHED_STAT_CRANK(tickle_local_over);
+        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
+            CSCHED_STAT_CRANK(tickle_local_under);
+        else
+            CSCHED_STAT_CRANK(tickle_local_other);
+
+        cpu_set(cpu, mask);
+    }
+
+    /*
+     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
+     * let them know there is runnable work in the system...
+     */
+    if ( cur->pri > CSCHED_PRI_IDLE )
+    {
+        if ( cpus_empty(csched_priv.idlers) )
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_none);
+        }
+        else
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_some);
+            cpus_or(mask, mask, csched_priv.idlers);
+        }
+    }
+
+    /* Send scheduler interrupts to designated CPUs */
+    if ( !cpus_empty(mask) )
+        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
+}
+
+/* Set up cpu's runq and account it in the system-wide scheduler state. */
+static void csched_pcpu_init(int cpu)
+{
+    struct csched_pcpu *spc;
+    unsigned long flags;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Initialize/update system-wide config */
+    csched_priv.credit += CSCHED_ACCT_PERIOD;
+    if ( csched_priv.ncpus <= cpu )
+        csched_priv.ncpus = cpu + 1;
+    if ( csched_priv.master >= csched_priv.ncpus )
+        csched_priv.master = cpu;
+
+    /* Allocate per-PCPU info */
+    spc = xmalloc(struct csched_pcpu);
+    BUG_ON( spc == NULL );  /* allocation failure is treated as fatal */
+    INIT_LIST_HEAD(&spc->runq);
+    spc->runq_sort_last = csched_priv.runq_sort;
+    schedule_data[cpu].sched_priv = spc;
+
+    /* Start off idling... */
+    BUG_ON( !is_idle_vcpu(schedule_data[cpu].curr) );
+    cpu_set(cpu, csched_priv.idlers);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+}
+
+#ifndef NDEBUG
+/* Debug-build consistency checks on vc's scheduler-private pointers. */
+static inline void __csched_vcpu_check(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+
+    BUG_ON( svc->vcpu != vc );
+    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
+    if ( sdom )
+    {
+        BUG_ON( is_idle_vcpu(vc) );
+        BUG_ON( sdom->dom != vc->domain );
+    }
+    else
+    {
+        BUG_ON( !is_idle_vcpu(vc) );
+    }
+
+    CSCHED_STAT_CRANK(vcpu_check);
+}
+#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+/* 1 if vc is not flagged running and local_cpu is in its affinity, else 0. */
+static inline int __csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
+{
+    /*
+     * Don't pick up work that's in the peer's scheduling tail. Also only pick
+     * up work that's allowed to run on our CPU.
+     */
+    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
+    {
+        CSCHED_STAT_CRANK(steal_peer_running);
+        return 0;
+    }
+
+    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
+    {
+        CSCHED_STAT_CRANK(steal_peer_pinned);
+        return 0;
+    }
+
+    return 1;
+}
+
+/* Debit credit_dec from svc; put it back on the active lists if needed. */
+static void csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
+{
+    struct csched_dom * const sdom = svc->sdom;
+    unsigned long flags;
+
+    /* Update credits */
+    atomic_sub(credit_dec, &svc->credit);
+
+    /* Put this VCPU and domain back on the active list if it was idling */
+    if ( list_empty(&svc->active_vcpu_elem) )   /* unlocked first look... */
+    {
+        spin_lock_irqsave(&csched_priv.lock, flags);
+
+        if ( list_empty(&svc->active_vcpu_elem) )   /* ...re-checked locked */
+        {
+            CSCHED_STAT_CRANK(acct_vcpu_active);
+            svc->state_active++;
+
+            sdom->active_vcpu_count++;
+            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
+            if ( list_empty(&sdom->active_sdom_elem) )
+            {
+                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+                csched_priv.weight += sdom->weight;
+            }
+        }
+
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+    }
+}
+
+/* Take svc off the active list and zero its credit (csched_vcpu_free() locks). */
+static inline void __csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
+{
+    struct csched_dom * const sdom = svc->sdom;
+
+    BUG_ON( list_empty(&svc->active_vcpu_elem) );
+
+    CSCHED_STAT_CRANK(acct_vcpu_idle);
+    svc->state_idle++;
+
+    sdom->active_vcpu_count--;
+    list_del_init(&svc->active_vcpu_elem);
+    if ( list_empty(&sdom->active_vcpu) )   /* last active VCPU of the domain */
+    {
+        BUG_ON( csched_priv.weight < sdom->weight );
+        list_del_init(&sdom->active_sdom_elem);
+        csched_priv.weight -= sdom->weight;
+    }
+
+    atomic_set(&svc->credit, 0);
+}
+
+/* Allocate vc's (and, on first use, its domain's) scheduler state; 0 or -1. */
+static int csched_vcpu_alloc(struct vcpu *vc)
+{
+    struct domain * const dom = vc->domain;
+    struct csched_dom *sdom;
+    struct csched_vcpu *svc;
+    int16_t pri;
+
+    CSCHED_STAT_CRANK(vcpu_alloc);
+
+    /* Allocate, if appropriate, per-domain info */
+    if ( is_idle_vcpu(vc) )
+    {
+        sdom = NULL;
+        pri = CSCHED_PRI_IDLE;
+    }
+    else if ( CSCHED_DOM(dom) )
+    {
+        sdom = CSCHED_DOM(dom);
+        pri = CSCHED_PRI_TS_UNDER;
+    }
+    else 
+    {
+        sdom = xmalloc(struct csched_dom);
+        if ( !sdom )
+            return -1;
+
+        /* Initialize credit and weight */
+        INIT_LIST_HEAD(&sdom->active_vcpu);
+        sdom->active_vcpu_count = 0;
+        INIT_LIST_HEAD(&sdom->active_sdom_elem);
+        sdom->dom = dom;
+        sdom->weight = CSCHED_DEFAULT_WEIGHT;
+        sdom->cap = 0U;
+        dom->sched_priv = sdom;
+        pri = CSCHED_PRI_TS_UNDER;
+    }
+
+    /* Allocate per-VCPU info */
+    svc = xmalloc(struct csched_vcpu);
+    if ( !svc )
+        return -1;  /* NOTE(review): an sdom made above stays attached to dom */
+
+    INIT_LIST_HEAD(&svc->runq_elem);
+    INIT_LIST_HEAD(&svc->active_vcpu_elem);
+    svc->sdom = sdom;
+    svc->vcpu = vc;
+    atomic_set(&svc->credit, 0);
+    svc->credit_last = 0;
+    svc->credit_incr = 0U;
+    svc->state_active = 0U;
+    svc->state_idle = 0U;
+    svc->pri = pri;
+    vc->sched_priv = svc;
+
+    CSCHED_VCPU_CHECK(vc);
+
+    /* Attach fair-share VCPUs to the accounting list */
+    if ( likely(sdom != NULL) )
+        csched_vcpu_acct(svc, 0);
+
+    return 0;
+}
+
+/* Initialise per-PCPU scheduler state for vc's CPU if not yet done. */
+static void csched_vcpu_add(struct vcpu *vc)
+{
+    CSCHED_STAT_CRANK(vcpu_add);
+
+    /* Allocate per-PCPU info */
+    if ( unlikely(CSCHED_PCPU(vc->processor) == NULL) )
+        csched_pcpu_init(vc->processor);
+
+    CSCHED_VCPU_CHECK(vc);
+}
+
+/* Drop vc from the active list if it is on it, then free its private data. */
+static void csched_vcpu_free(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+    unsigned long flags;
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( __vcpu_on_runq(svc) );
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_idle_locked(svc);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    xfree(svc);
+}
+
+/* Reschedule vc's CPU if vc is running there; otherwise dequeue vc. */
+static void csched_vcpu_sleep(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+
+    CSCHED_STAT_CRANK(vcpu_sleep);
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( schedule_data[cpu].curr == vc )
+        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+    else if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+}
+
+/* Queue vc on its CPU's runq and tickle CPUs, unless running or queued. */
+static void csched_vcpu_wake(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( unlikely(schedule_data[cpu].curr == vc) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_running);
+        return;
+    }
+    if ( unlikely(__vcpu_on_runq(svc)) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
+        return;
+    }
+
+    if ( likely(vcpu_runnable(vc)) )    /* stats only: queued either way */
+        CSCHED_STAT_CRANK(vcpu_wake_runnable);
+    else
+        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
+
+    /* Put the VCPU on the runq and tickle CPUs */
+    __runq_insert(cpu, svc);
+    __runq_tickle(cpu, svc);
+}
+
+/* Set vc's CPU affinity; -EBUSY if vc is current and would have to move. */
+static int csched_vcpu_set_affinity(struct vcpu *vc, cpumask_t *affinity)
+{
+    unsigned long flags;
+    int lcpu;
+
+    if ( vc == current )
+    {
+        /* No locking needed but also can't move on the spot... */
+        if ( !cpu_isset(vc->processor, *affinity) )
+            return -EBUSY;
+
+        vc->cpu_affinity = *affinity;
+    }
+    else
+    {
+        /* Pause, modify, and unpause. */
+        vcpu_pause(vc);
+
+        vc->cpu_affinity = *affinity;
+        if ( !cpu_isset(vc->processor, vc->cpu_affinity) )
+        {
+            /*
+             * We must grab the scheduler lock for the CPU currently owning
+             * this VCPU before changing its ownership.
+             */
+            vcpu_schedule_lock_irqsave(vc, flags);
+            lcpu = vc->processor;   /* remember whose lock we hold */
+
+            vc->processor = first_cpu(vc->cpu_affinity);
+
+            spin_unlock_irqrestore(&schedule_data[lcpu].schedule_lock, flags);
+        }
+
+        vcpu_unpause(vc);
+    }
+
+    return 0;
+}
+
+/*
+ * Get or set the credit scheduler parameters (weight, cap) of a domain.
+ *
+ * SCHED_INFO_GET: copy the domain's current weight and cap into cmd.
+ * SCHED_INFO_PUT: install new values under csched_priv.lock. A weight of 0
+ *                 or a cap of (uint16_t)~0U leaves that parameter unchanged.
+ *
+ * Always returns 0.
+ */
+static int
+csched_dom_cntl(
+    struct domain *d,
+    struct sched_adjdom_cmd *cmd)
+{
+    struct csched_dom * const sdom = CSCHED_DOM(d);
+    unsigned long flags;
+
+    if ( cmd->direction == SCHED_INFO_GET )
+    {
+        cmd->u.credit.weight = sdom->weight;
+        cmd->u.credit.cap = sdom->cap;
+    }
+    else
+    {
+        ASSERT( cmd->direction == SCHED_INFO_PUT );
+
+        spin_lock_irqsave(&csched_priv.lock, flags);
+
+        if ( cmd->u.credit.weight != 0 )
+        {
+            /*
+             * csched_acct() uses csched_priv.weight as the total weight of
+             * the domains on the active list, so the total may only be
+             * adjusted for a domain that is currently on that list.
+             * Adjusting it for an inactive domain would corrupt the total.
+             *
+             * NOTE(review): relies on active_sdom_elem being self-linked
+             * (INIT_LIST_HEAD/list_del_init) while the domain is inactive --
+             * confirm against the activation/idle paths.
+             */
+            if ( !list_empty(&sdom->active_sdom_elem) )
+            {
+                csched_priv.weight -= sdom->weight;
+                csched_priv.weight += cmd->u.credit.weight;
+            }
+            sdom->weight = cmd->u.credit.weight;
+        }
+
+        if ( cmd->u.credit.cap != (uint16_t)~0U )
+            sdom->cap = cmd->u.credit.cap;
+
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+    }
+
+    return 0;
+}
+
+/*
+ * Release the scheduler state of a domain: first the per-VCPU state of
+ * every VCPU slot that is populated, then the per-domain structure itself.
+ */
+static void
+csched_dom_free(struct domain *dom)
+{
+    struct csched_dom * const sdom = CSCHED_DOM(dom);
+    int slot;
+
+    CSCHED_STAT_CRANK(dom_free);
+
+    for ( slot = 0; slot < MAX_VIRT_CPUS; slot++ )
+    {
+        /* Empty VCPU slots have nothing to free. */
+        if ( !dom->vcpu[slot] )
+            continue;
+
+        csched_vcpu_free(dom->vcpu[slot]);
+    }
+
+    xfree(sdom);
+}
+
+/*
+ * Single-pass, O(n) re-sort of one CPU's runq.
+ *
+ * Time-share VCPUs only ever carry one of two priorities, UNDER or OVER.
+ * The runq is walked once and every UNDER entry found behind an OVER entry
+ * is moved forward, right after the most recent UNDER entry seen so far.
+ * Tracking that entry makes each move O(1).
+ *
+ * The sort is skipped when this CPU has already sorted for the current
+ * value of csched_priv.runq_sort.
+ */
+static void
+csched_runq_sort(unsigned int cpu)
+{
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+    struct list_head *head, *pos, *following, *under_tail;
+    struct csched_vcpu *svc;
+    unsigned long flags;
+    int epoch;
+
+    epoch = csched_priv.runq_sort;
+    if ( epoch == spc->runq_sort_last )
+        return;
+
+    spc->runq_sort_last = epoch;
+
+    spin_lock_irqsave(&schedule_data[cpu].schedule_lock, flags);
+
+    head = &spc->runq;
+    under_tail = head;
+
+    for ( pos = head->next; pos != head; pos = following )
+    {
+        /* Grab the successor first: pos may be relinked below. */
+        following = pos->next;
+        svc = __runq_elem(pos);
+
+        if ( svc->pri != CSCHED_PRI_TS_UNDER )
+            continue;
+
+        /* Move up only if it is not already right behind the last UNDER. */
+        if ( pos->prev != under_tail )
+        {
+            list_del(pos);
+            list_add(pos, under_tail);
+        }
+        under_tail = pos;
+    }
+
+    spin_unlock_irqrestore(&schedule_data[cpu].schedule_lock, flags);
+}
+
+/*
+ * Periodic credit accounting.
+ *
+ * Under csched_priv.lock, walk every active domain and:
+ *   - compute the domain's fair share of the system credit from its weight,
+ *     limited by a peak (its active VCPUs running for one full accounting
+ *     period, plus a share of any negative system balance) and by its cap;
+ *   - split that share evenly across the domain's active VCPUs and add it
+ *     to each VCPU's credit;
+ *   - recompute each VCPU's priority (UNDER when credit >= 0, otherwise
+ *     OVER, or PARKED for capped domains), clamp credit at -CSCHED_TSLICE,
+ *     and hand VCPUs holding more than CSCHED_TSLICE credits to
+ *     __csched_vcpu_acct_idle_locked().
+ *
+ * The sum of the resulting VCPU credits becomes the new system-wide credit
+ * balance. Finally csched_priv.runq_sort is bumped so that each CPU
+ * re-sorts its runq.
+ */
+static void
+csched_acct(void)
+{
+    unsigned long flags;
+    struct list_head *iter_vcpu, *next_vcpu;
+    struct list_head *iter_sdom, *next_sdom;
+    struct csched_vcpu *svc;
+    struct csched_dom *sdom;
+    uint32_t credit_total;
+    uint32_t weight_total;
+    uint32_t weight_left;
+    uint32_t credit_fair;
+    uint32_t credit_peak;
+    int credit_balance;
+    int credit_xtra;
+    int credit;
+
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    weight_total = csched_priv.weight;
+    credit_total = csched_priv.credit;
+
+    /* Converge balance towards 0 when it drops negative */
+    if ( csched_priv.credit_balance < 0 )
+    {
+        credit_total -= csched_priv.credit_balance;
+        CSCHED_STAT_CRANK(acct_balance);
+    }
+
+    /* No weight to share credits between: nothing to account. */
+    if ( unlikely(weight_total == 0) )
+    {
+        csched_priv.credit_balance = 0;
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        CSCHED_STAT_CRANK(acct_no_work);
+        return;
+    }
+
+    CSCHED_STAT_CRANK(acct_run);
+
+    weight_left = weight_total;
+    credit_balance = 0;
+    credit_xtra = 0;
+
+    /* _safe iteration: the current domain may be relinked or removed. */
+    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    {
+        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
+
+        BUG_ON( is_idle_domain(sdom->dom) );
+        BUG_ON( sdom->active_vcpu_count == 0 );
+        BUG_ON( sdom->weight == 0 );
+        BUG_ON( sdom->weight > weight_left );
+
+        weight_left -= sdom->weight;
+
+        /*
+         * A domain's fair share is computed using its weight in competition
+         * with that of all other active domains.
+         *
+         * At most, a domain can use credits to run all its active VCPUs
+         * for one full accounting period. We allow a domain to earn more
+         * only when the system-wide credit balance is negative.
+         */
+        credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
+        if ( csched_priv.credit_balance < 0 )
+        {
+            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+                             (weight_total - 1)
+                           ) / weight_total;
+        }
+        if ( sdom->cap != 0U )
+        {
+            /* Cap is a percentage of one accounting period, rounded up. */
+            uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
+            if ( credit_cap < credit_peak )
+                credit_peak = credit_cap;
+        }
+
+        /* Weighted share of the total, rounded up. */
+        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
+                      ) / weight_total;
+
+        if ( credit_fair < credit_peak )
+        {
+            credit_xtra = 1;
+        }
+        else
+        {
+            if ( weight_left != 0U )
+            {
+                /* Give other domains a chance at unused credits */
+                credit_total += ( ( ( credit_fair - credit_peak
+                                    ) * weight_total
+                                  ) + ( weight_left - 1 )
+                                ) / weight_left;
+            }
+
+            if ( credit_xtra )
+            {
+                /*
+                 * Lazily keep domains with extra credits at the head of
+                 * the queue to give others a chance at them in future
+                 * accounting periods.
+                 */
+                CSCHED_STAT_CRANK(acct_reorder);
+                list_del(&sdom->active_sdom_elem);
+                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+            }
+
+            credit_fair = credit_peak;
+        }
+
+        /* Compute fair share per VCPU */
+        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
+                      ) / sdom->active_vcpu_count;
+
+
+        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
+        {
+            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
+            BUG_ON( sdom != svc->sdom );
+
+            /* Increment credit */
+            atomic_add(credit_fair, &svc->credit);
+            credit = atomic_read(&svc->credit);
+
+            /*
+             * Recompute priority or, if VCPU is idling, remove it from
+             * the active list.
+             */
+            if ( credit < 0 )
+            {
+                if ( sdom->cap == 0U )
+                    svc->pri = CSCHED_PRI_TS_OVER;
+                else
+                    svc->pri = CSCHED_PRI_TS_PARKED;
+
+                /* Never let a VCPU owe more than one time slice. */
+                if ( credit < -CSCHED_TSLICE )
+                {
+                    CSCHED_STAT_CRANK(acct_min_credit);
+                    credit = -CSCHED_TSLICE;
+                    atomic_set(&svc->credit, credit);
+                }
+            }
+            else
+            {
+                svc->pri = CSCHED_PRI_TS_UNDER;
+
+                if ( credit > CSCHED_TSLICE )
+                    __csched_vcpu_acct_idle_locked(svc);
+            }
+
+            svc->credit_last = credit;
+            svc->credit_incr = credit_fair;
+            credit_balance += credit;
+        }
+    }
+
+    csched_priv.credit_balance = credit_balance;
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    /* Inform each CPU that its runq needs to be sorted */
+    csched_priv.runq_sort++;
+}
+
+/*
+ * Per-CPU periodic tick handler.
+ *
+ * Performs, in order: tick accounting for the VCPU running on this CPU,
+ * the system-wide accounting pass when this CPU is the master and the tick
+ * count hits a multiple of CSCHED_ACCT_NTICKS, and a runq re-sort check.
+ */
+static void
+csched_tick(unsigned int cpu)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(current);
+    struct csched_dom * const sdom = svc->sdom;
+
+    /*
+     * Charge the running VCPU.
+     *
+     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled;
+     * they have no scheduler domain and are skipped.
+     */
+    if ( likely(sdom != NULL) )
+        csched_vcpu_acct(svc, CSCHED_TICK);
+
+    /*
+     * Accounting duty.
+     *
+     * Note: Currently, this is always done by the master boot CPU.
+     * Eventually, we could distribute or at the very least cycle the duty.
+     */
+    if ( csched_priv.master == cpu )
+    {
+        if ( (schedule_data[cpu].tick % CSCHED_ACCT_NTICKS) == 0 )
+            csched_acct();
+    }
+
+    /*
+     * Re-sort the runq if needed.
+     *
+     * Every physical CPU resorts its runq after the accounting master has
+     * modified priorities. This is a special O(n) sort and runs at most
+     * once per accounting period (currently 30 milliseconds).
+     */
+    csched_runq_sort(cpu);
+}
+
+/*
+ * Try to take one VCPU off a peer CPU's runq on behalf of `cpu`.
+ *
+ * The peer runq is scanned from the head. The scan stops at the first
+ * entry whose priority is not above both CSCHED_PRI_IDLE and `pri`. The
+ * first stealable entry found before that point is removed from the peer
+ * runq, re-homed to `cpu` and returned. Returns NULL when nothing was
+ * taken.
+ */
+static struct csched_vcpu *
+csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
+{
+    struct list_head *pos;
+    struct csched_vcpu *cand;
+    struct vcpu *cand_vc;
+
+    list_for_each( pos, &spc->runq )
+    {
+        cand = __runq_elem(pos);
+
+        /*
+         * If next available VCPU here is not of higher priority than ours,
+         * this PCPU is useless to us.
+         */
+        if ( cand->pri <= CSCHED_PRI_IDLE || cand->pri <= pri )
+        {
+            CSCHED_STAT_CRANK(steal_peer_idle);
+            return NULL;
+        }
+
+        /* Is this VCPU runnable on our PCPU? */
+        cand_vc = cand->vcpu;
+        BUG_ON( is_idle_vcpu(cand_vc) );
+
+        if ( !__csched_vcpu_is_stealable(cpu, cand_vc) )
+            continue;
+
+        /* We got a candidate. Grab it! */
+        __runq_remove(cand);
+        cand_vc->processor = cpu;
+
+        return cand;
+    }
+
+    return NULL;
+}
+
+/*
+ * Look on the other CPUs for work more urgent than `snext`.
+ *
+ * Peers are visited in increasing CPU order starting with our neighbour,
+ * wrapping around at csched_priv.ncpus. The first VCPU that can be stolen
+ * is returned. If none is found, `snext` is removed from the local runq
+ * and returned instead.
+ */
+static struct csched_vcpu *
+csched_load_balance(int cpu, struct csched_vcpu *snext)
+{
+    struct csched_pcpu *peer_spc;
+    struct csched_vcpu *stolen;
+    int peer_cpu;
+
+    if ( snext->pri == CSCHED_PRI_IDLE )
+    {
+        CSCHED_STAT_CRANK(load_balance_idle);
+    }
+    else if ( snext->pri == CSCHED_PRI_TS_OVER )
+    {
+        CSCHED_STAT_CRANK(load_balance_over);
+    }
+    else
+    {
+        CSCHED_STAT_CRANK(load_balance_other);
+    }
+
+    BUG_ON( cpu != snext->vcpu->processor );
+
+    /* For each PCPU in the system starting with our neighbour... */
+    for ( peer_cpu = (cpu + 1) % csched_priv.ncpus;
+          peer_cpu != cpu;
+          peer_cpu = (peer_cpu + 1) % csched_priv.ncpus )
+    {
+        BUG_ON( peer_cpu >= csched_priv.ncpus );
+        BUG_ON( peer_cpu == cpu );
+
+        /*
+         * Get ahold of the scheduler lock for this peer CPU.
+         *
+         * Note: We don't spin on this lock but simply try it. Spinning could
+         * cause a deadlock if the peer CPU is also load balancing and trying
+         * to lock this CPU.
+         */
+        if ( !spin_trylock(&schedule_data[peer_cpu].schedule_lock) )
+        {
+            CSCHED_STAT_CRANK(steal_trylock_failed);
+            continue;
+        }
+
+        peer_spc = CSCHED_PCPU(peer_cpu);
+        if ( unlikely(peer_spc == NULL) )
+        {
+            /* Peer has no scheduler state: nothing to steal from it. */
+            CSCHED_STAT_CRANK(steal_peer_down);
+            stolen = NULL;
+        }
+        else
+        {
+            stolen = csched_runq_steal(peer_spc, cpu, snext->pri);
+        }
+
+        spin_unlock(&schedule_data[peer_cpu].schedule_lock);
+
+        /* Got one! */
+        if ( stolen )
+        {
+            CSCHED_STAT_CRANK(vcpu_migrate);
+            return stolen;
+        }
+    }
+
+    /* Failed to find more important work */
+    __runq_remove(snext);
+    return snext;
+}
+
+/*
+ * Pick the VCPU to run next on this CPU.
+ *
+ * This sits on the critical path and is kept simple and fast for the
+ * common case: take the head of the local runq, and only go looking at
+ * other CPUs when that head has already used up its credits.
+ */
+static struct task_slice
+csched_schedule(s_time_t now)
+{
+    const int cpu = smp_processor_id();
+    struct list_head * const runq = RUNQ(cpu);
+    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_vcpu *snext;
+    struct task_slice ret;
+
+    CSCHED_STAT_CRANK(schedule);
+    CSCHED_VCPU_CHECK(current);
+
+    /*
+     * Select next runnable local VCPU (ie top of local runq). A current
+     * VCPU that can still run goes back on the runq first.
+     */
+    if ( !vcpu_runnable(current) )
+    {
+        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+    }
+    else
+    {
+        __runq_insert(cpu, scurr);
+    }
+
+    snext = __runq_elem(runq->next);
+
+    /*
+     * SMP Load balance:
+     *
+     * If the next highest priority local runnable VCPU has already eaten
+     * through its credits, look on other PCPUs to see if we have more
+     * urgent work... If not, csched_load_balance() will return snext, but
+     * already removed from the runq.
+     */
+    if ( snext->pri <= CSCHED_PRI_TS_OVER )
+    {
+        snext = csched_load_balance(cpu, snext);
+    }
+    else
+    {
+        __runq_remove(snext);
+    }
+
+    /*
+     * Update idlers mask if necessary. When we're idling, other CPUs
+     * will tickle us when they get extra work.
+     */
+    if ( snext->pri != CSCHED_PRI_IDLE )
+    {
+        if ( cpu_isset(cpu, csched_priv.idlers) )
+            cpu_clear(cpu, csched_priv.idlers);
+    }
+    else if ( !cpu_isset(cpu, csched_priv.idlers) )
+    {
+        cpu_set(cpu, csched_priv.idlers);
+    }
+
+    /*
+     * Return task to run next...
+     */
+    ret.task = snext->vcpu;
+    ret.time = MILLISECS(CSCHED_TSLICE);
+
+    CSCHED_VCPU_CHECK(ret.task);
+    BUG_ON( !vcpu_runnable(ret.task) );
+
+    return ret;
+}
+
+/*
+ * Print a one-line summary of a VCPU: "[domain.vcpu]", priority and
+ * processor, followed by credit details when the VCPU has a scheduler
+ * domain attached.
+ */
+static void
+csched_dump_vcpu(struct csched_vcpu *svc)
+{
+    struct csched_dom * const sdom = svc->sdom;
+    struct vcpu * const vc = svc->vcpu;
+
+    printk("[%i.%i] pri=%i cpu=%i",
+            vc->domain->domain_id,
+            vc->vcpu_id,
+            svc->pri,
+            vc->processor);
+
+    /* Credit details only exist for VCPUs with a scheduler domain. */
+    if ( sdom != NULL )
+    {
+        printk(" credit=%i (%d+%u) {a=%u i=%u w=%u}",
+            atomic_read(&svc->credit),
+            svc->credit_last,
+            svc->credit_incr,
+            svc->state_active,
+            svc->state_idle,
+            sdom->weight);
+    }
+
+    printk("\n");
+}
+
+/*
+ * Print the scheduler state of one physical CPU: its tick count and last
+ * sort epoch, the VCPU it is currently running, and then every VCPU on
+ * its runq, numbered from 1.
+ */
+static void
+csched_dump_pcpu(int cpu)
+{
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+    struct list_head *pos;
+    struct csched_vcpu *svc;
+    int index = 0;
+
+    printk(" tick=%lu, sort=%d\n",
+            schedule_data[cpu].tick,
+            spc->runq_sort_last);
+
+    /* current VCPU */
+    svc = CSCHED_VCPU(schedule_data[cpu].curr);
+    if ( svc )
+    {
+        printk("\trun: ");
+        csched_dump_vcpu(svc);
+    }
+
+    /* queued VCPUs, in runq order */
+    list_for_each( pos, &spc->runq )
+    {
+        svc = __runq_elem(pos);
+        if ( !svc )
+            continue;
+
+        printk("\t%3d: ", ++index);
+        csched_dump_vcpu(svc);
+    }
+}
+
+/*
+ * Print the global scheduler state: the csched_priv settings and the
+ * compile-time tunables, the first word of the idlers mask, the statistics
+ * counters, and finally every active VCPU of every active domain.
+ */
+static void
+csched_dump(void)
+{
+    struct list_head *dom_pos, *vcpu_pos;
+    struct csched_dom *sdom;
+    struct csched_vcpu *svc;
+    int index;
+
+    printk("info:\n"
+           "\tncpus              = %u\n"
+           "\tmaster             = %u\n"
+           "\tcredit             = %u\n"
+           "\tcredit balance     = %d\n"
+           "\tweight             = %u\n"
+           "\trunq_sort          = %u\n"
+           "\ttick               = %dms\n"
+           "\ttslice             = %dms\n"
+           "\taccounting period  = %dms\n"
+           "\tdefault-weight     = %d\n",
+           csched_priv.ncpus,
+           csched_priv.master,
+           csched_priv.credit,
+           csched_priv.credit_balance,
+           csched_priv.weight,
+           csched_priv.runq_sort,
+           CSCHED_TICK,
+           CSCHED_TSLICE,
+           CSCHED_ACCT_PERIOD,
+           CSCHED_DEFAULT_WEIGHT);
+
+    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
+
+    CSCHED_STATS_PRINTK();
+
+    printk("active vcpus:\n");
+
+    /* One running counter across all domains. */
+    index = 0;
+    list_for_each( dom_pos, &csched_priv.active_sdom )
+    {
+        sdom = list_entry(dom_pos, struct csched_dom, active_sdom_elem);
+
+        list_for_each( vcpu_pos, &sdom->active_vcpu )
+        {
+            svc = list_entry(vcpu_pos, struct csched_vcpu, active_vcpu_elem);
+
+            printk("\t%3d: ", ++index);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+/*
+ * Reset the global scheduler state (csched_priv) to its initial values:
+ * no CPUs, no master, empty active-domain list, empty idlers mask and
+ * zeroed counters.
+ */
+static void
+csched_init(void)
+{
+    /* Lock and list head first, then the plain fields. */
+    spin_lock_init(&csched_priv.lock);
+    INIT_LIST_HEAD(&csched_priv.active_sdom);
+    cpus_clear(csched_priv.idlers);
+
+    csched_priv.ncpus = 0;
+    csched_priv.master = UINT_MAX;
+
+    csched_priv.weight = 0U;
+    csched_priv.credit = 0U;
+    csched_priv.credit_balance = 0;
+    csched_priv.runq_sort = 0U;
+
+    CSCHED_STATS_RESET();
+}
+
+
+/* Hook table binding the credit scheduler's entry points. */
+struct scheduler sched_credit_def = {
+    /* Identification */
+    .name = "SMP Credit Scheduler",
+    .opt_name = "credit",
+    .sched_id = SCHED_CREDIT,
+
+    /* Global setup */
+    .init = csched_init,
+
+    /* Per-VCPU and per-domain lifecycle */
+    .alloc_task = csched_vcpu_alloc,
+    .add_task = csched_vcpu_add,
+    .free_task = csched_dom_free,
+
+    /* Run-state changes and placement */
+    .sleep = csched_vcpu_sleep,
+    .wake = csched_vcpu_wake,
+    .set_affinity = csched_vcpu_set_affinity,
+
+    /* Parameter control */
+    .adjdom = csched_dom_cntl,
+
+    /* Periodic work and scheduling decision */
+    .tick = csched_tick,
+    .do_schedule = csched_schedule,
+
+    /* Debug output */
+    .dump_cpu_state = csched_dump_pcpu,
+    .dump_settings = csched_dump,
+};

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] New weighted fair-share CPU scheduler w/ automatic SMP load balancing, Xen patchbot-unstable <=