[Xen-changelog] [xen-unstable] cpupools [1/6]: hypervisor changes

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] cpupools [1/6]: hypervisor changes
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 22 Apr 2010 04:30:25 -0700
Delivery-date: Thu, 22 Apr 2010 04:31:31 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1271850483 -3600
# Node ID de94884a669cb23a58c50555c3d9e7f41b847d2d
# Parent  c7d7797656dfe6c41ab59954a7bba1a2748c6836
cpupools [1/6]: hypervisor changes

Signed-off-by: Juergen Gross <juergen.gross@xxxxxxxxxxxxxx>
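
This patch adds the core cpupool machinery to the hypervisor: a new
xen/common/cpupool.c managing pool creation, destruction and cpu
assignment, a XEN_DOMCTL_cpupool_op domctl to drive it, and a conversion
of the credit, credit2 and sedf schedulers from global private data to
per-instance data, so that every pool can run its own scheduler.

As a caller-side illustration only (the toolstack plumbing is not part of
this hypervisor patch; do_domctl() below is a hypothetical stand-in for
whatever hypercall wrapper the caller uses, and interface-version setup is
elided), creating a pool, assigning a free cpu and moving a domain into it
could look like:

    /* Hypothetical sketch, not code from this series; error paths kept
     * minimal. Uses only fields of struct xen_domctl_cpupool_op that the
     * patch below introduces. */
    static int pool_setup(domid_t domid)
    {
        struct xen_domctl domctl = { .cmd = XEN_DOMCTL_cpupool_op };

        /* Create a pool; PAR_ANY lets Xen pick the next free poolid,
         * which is copied back into cpupool_id on success. */
        domctl.u.cpupool_op.op = XEN_DOMCTL_CPUPOOL_OP_CREATE;
        domctl.u.cpupool_op.cpupool_id = XEN_DOMCTL_CPUPOOL_PAR_ANY;
        domctl.u.cpupool_op.sched_id = XEN_SCHEDULER_CREDIT;
        if ( do_domctl(&domctl) )
            return -1;

        /* Assign any currently unassigned physical cpu to the pool
         * (reuses the cpupool_id written back by the CREATE above). */
        domctl.u.cpupool_op.op = XEN_DOMCTL_CPUPOOL_OP_ADDCPU;
        domctl.u.cpupool_op.cpu = XEN_DOMCTL_CPUPOOL_PAR_ANY;
        if ( do_domctl(&domctl) )
            return -1;

        /* Move an existing domain out of its current pool into ours. */
        domctl.u.cpupool_op.op = XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN;
        domctl.u.cpupool_op.domid = domid;
        return do_domctl(&domctl);
    }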
---
 xen/arch/x86/domain_build.c |   11 
 xen/arch/x86/setup.c        |    6 
 xen/arch/x86/smpboot.c      |   18 -
 xen/common/Makefile         |    1 
 xen/common/cpupool.c        |  604 ++++++++++++++++++++++++++++++++++++++++++++
 xen/common/domain.c         |    8 
 xen/common/domctl.c         |   21 +
 xen/common/sched_credit.c   |  473 ++++++++++++++++++++++------------
 xen/common/sched_credit2.c  |  337 +++++++++++++++---------
 xen/common/sched_sedf.c     |  170 ++++++++----
 xen/common/schedule.c       |  324 +++++++++++++++++++----
 xen/include/public/domctl.h |   31 ++
 xen/include/xen/sched-if.h  |   60 +++-
 xen/include/xen/sched.h     |   22 +
 14 files changed, 1638 insertions(+), 448 deletions(-)
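
Most of the scheduler churn below is one recurring refactoring: each
scheduler's former file-scope singleton (csched_priv and friends) becomes
per-instance state hung off struct scheduler, and every hook gains an ops
argument. Distilled to a minimal sketch (illustration only, not code from
this patch):

    struct csched_private {
        spinlock_t lock;
        cpumask_t idlers;
        /* ... remaining per-scheduler bookkeeping ... */
    };

    #define CSCHED_PRIV(ops) ((struct csched_private *)((ops)->sched_data))

    /* One private struct per scheduler instance, i.e. per cpupool. */
    static int csched_init(struct scheduler *ops, int pool0)
    {
        struct csched_private *prv = xmalloc(struct csched_private);

        if ( prv == NULL )
            return 1;
        memset(prv, 0, sizeof(*prv));
        spin_lock_init(&prv->lock);
        ops->sched_data = prv;
        return 0;
    }

Every former reference to the global (e.g. csched_priv.idlers) then
becomes CSCHED_PRIV(ops)->idlers inside the hooks.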

diff -r c7d7797656df -r de94884a669c xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/arch/x86/domain_build.c       Wed Apr 21 12:48:03 2010 +0100
@@ -9,6 +9,7 @@
 #include <xen/lib.h>
 #include <xen/ctype.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/smp.h>
 #include <xen/delay.h>
 #include <xen/event.h>
@@ -84,7 +85,7 @@ struct vcpu *__init alloc_dom0_vcpu0(voi
 struct vcpu *__init alloc_dom0_vcpu0(void)
 {
     if ( opt_dom0_max_vcpus == 0 )
-        opt_dom0_max_vcpus = num_online_cpus();
+        opt_dom0_max_vcpus = num_cpupool_cpus(cpupool0);
     if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
         opt_dom0_max_vcpus = MAX_VIRT_CPUS;
 
@@ -277,7 +278,7 @@ int __init construct_dom0(
     unsigned long _initrd_start, unsigned long initrd_len,
     char *cmdline)
 {
-    int i, rc, compatible, compat32, order, machine;
+    int i, cpu, rc, compatible, compat32, order, machine;
     struct cpu_user_regs *regs;
     unsigned long pfn, mfn;
     unsigned long nr_pages;
@@ -776,8 +777,12 @@ int __init construct_dom0(
 
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
+    cpu = first_cpu(cpupool0->cpu_valid);
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i % num_online_cpus());
+    {
+        cpu = cycle_cpu(cpu, cpupool0->cpu_valid);
+        (void)alloc_vcpu(d, i, cpu);
+    }
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
diff -r c7d7797656df -r de94884a669c xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/arch/x86/setup.c      Wed Apr 21 12:48:03 2010 +0100
@@ -2,6 +2,7 @@
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/serial.h>
 #include <xen/softirq.h>
@@ -1093,6 +1094,11 @@ void __init __start_xen(unsigned long mb
     if ( !tboot_protect_mem_regions() )
         panic("Could not protect TXT memory regions\n");
 
+    /* Create initial cpupool 0. */
+    cpupool0 = cpupool_create(0, NULL);
+    if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) )
+        panic("Error creating cpupool 0\n");
+
     /* Create initial domain 0. */
     dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
     if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) )
diff -r c7d7797656df -r de94884a669c xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/arch/x86/smpboot.c    Wed Apr 21 12:48:03 2010 +0100
@@ -39,6 +39,7 @@
 #include <xen/mm.h>
 #include <xen/domain.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/irq.h>
 #include <xen/delay.h>
 #include <xen/softirq.h>
@@ -1296,10 +1297,11 @@ int __cpu_disable(void)
        remove_siblinginfo(cpu);
 
        /* It's now safe to remove this processor from the online map */
+       cpu_clear(cpu, cpupool0->cpu_valid);
        cpu_clear(cpu, cpu_online_map);
        fixup_irqs();
 
-       cpu_disable_scheduler();
+       cpu_disable_scheduler(cpu);
 
        return 0;
 }
@@ -1336,11 +1338,6 @@ int cpu_down(unsigned int cpu)
        if (!spin_trylock(&cpu_add_remove_lock))
                return -EBUSY;
 
-       if (num_online_cpus() == 1) {
-               err = -EBUSY;
-               goto out;
-       }
-
        /* Can not offline BSP */
        if (cpu == 0) {
                err = -EINVAL;
@@ -1352,13 +1349,19 @@ int cpu_down(unsigned int cpu)
                goto out;
        }
 
+       err = cpupool_cpu_remove(cpu);
+       if (err)
+               goto out;
+
        printk("Prepare to bring CPU%d down...\n", cpu);
 
        cpufreq_del_cpu(cpu);
 
        err = stop_machine_run(take_cpu_down, NULL, cpu);
-       if (err < 0)
+       if (err < 0) {
+               cpupool_cpu_add(cpu);
                goto out;
+       }
 
        __cpu_die(cpu);
 
@@ -1559,6 +1562,7 @@ int __devinit __cpu_up(unsigned int cpu)
                process_pending_softirqs();
        }
 
+       cpupool_cpu_add(cpu);
        cpufreq_add_cpu(cpu);
        return 0;
 }
diff -r c7d7797656df -r de94884a669c xen/common/Makefile
--- a/xen/common/Makefile       Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/common/Makefile       Wed Apr 21 12:48:03 2010 +0100
@@ -1,5 +1,6 @@ obj-y += bitmap.o
 obj-y += bitmap.o
 obj-y += cpu.o
+obj-y += cpupool.o
 obj-y += domctl.o
 obj-y += domain.o
 obj-y += event_channel.o
diff -r c7d7797656df -r de94884a669c xen/common/cpupool.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/cpupool.c      Wed Apr 21 12:48:03 2010 +0100
@@ -0,0 +1,604 @@
+/******************************************************************************
+ * cpupool.c
+ * 
+ * Generic cpupool-handling functions.
+ *
+ * Cpupools provide configurable scheduling domains. Each cpupool runs
+ * its own scheduler on a dedicated set of physical cpus. A domain is
+ * bound to exactly one cpupool at any time, but it can be moved to
+ * another cpupool.
+ *
+ * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
+ */
+
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/cpumask.h>
+#include <xen/percpu.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+
+#define for_each_cpupool(ptr)    \
+    for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
+
+struct cpupool *cpupool0;                /* Initial cpupool with Dom0 */
+cpumask_t cpupool_free_cpus;             /* cpus not in any cpupool */
+
+static struct cpupool *cpupool_list;     /* linked list, sorted by poolid */
+
+static int cpupool0_max_cpus;
+integer_param("pool0_max_cpus", cpupool0_max_cpus);
+
+static int cpupool_moving_cpu = -1;
+static struct cpupool *cpupool_cpu_moving = NULL;
+static cpumask_t cpupool_locked_cpus = CPU_MASK_NONE;
+
+/* cpupool lock: be careful, this lock is sometimes released on a
+ *               different cpu than the one on which it was obtained!
+ */
+static DEFINE_SPINLOCK(cpupool_lock);
+
+DEFINE_PER_CPU(struct cpupool *, cpupool);
+
+static struct cpupool *alloc_cpupool_struct(void)
+{
+    return xmalloc(struct cpupool);
+}
+
+static void free_cpupool_struct(struct cpupool *c)
+{
+    xfree(c);
+}
+
+/*
+ * find a cpupool by its id. to be called with cpupool lock held.
+ * if exact is not specified, the first cpupool with an id larger than or
+ * equal to the searched id is returned.
+ * returns NULL if not found.
+ */
+static struct cpupool *cpupool_find_by_id(int id, int exact)
+{
+    struct cpupool **q;
+
+    for_each_cpupool(q)
+    {
+        if ( (*q)->cpupool_id == id )
+            return *q;
+        if ( (*q)->cpupool_id > id )
+            break;
+    }
+    return exact ? NULL : *q;
+}
+
+/*
+ * create a new cpupool with specified poolid and scheduler
+ * returns a pointer to the new cpupool structure on success, NULL otherwise
+ * possible failures:
+ * - no memory
+ * - poolid already used
+ * - unknown scheduler
+ */
+struct cpupool *cpupool_create(int poolid, char *sched)
+{
+    struct cpupool *c;
+    struct cpupool **q;
+    int last = 0;
+
+    if ( (c = alloc_cpupool_struct()) == NULL )
+        return NULL;
+    memset(c, 0, sizeof(*c));
+
+    printk(XENLOG_DEBUG "cpupool_create(pool=%d,sched=%s)\n", poolid, sched);
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+    {
+        last = (*q)->cpupool_id;
+        if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
+            break;
+    }
+    if ( *q != NULL )
+    {
+        if ( (*q)->cpupool_id == poolid )
+        {
+            spin_unlock(&cpupool_lock);
+            free_cpupool_struct(c);
+            return NULL;
+        }
+        c->next = *q;
+    }
+    *q = c;
+    c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
+    if ( schedule_init_global(sched, &(c->sched)) )
+    {
+        spin_unlock(&cpupool_lock);
+        cpupool_destroy(c);
+        return NULL;
+    }
+    spin_unlock(&cpupool_lock);
+
+    printk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id,
+        c->sched.name, c->sched.opt_name);
+
+    return c;
+}
+/*
+ * destroys the given cpupool
+ * returns 0 on success, 1 otherwise
+ * possible failures:
+ * - pool still in use
+ * - cpus still assigned to pool
+ * - pool not in list
+ */
+int cpupool_destroy(struct cpupool *c)
+{
+    struct cpupool **q;
+
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+        if ( *q == c )
+            break;
+    if ( (*q != c) || (c->n_dom != 0) || cpus_weight(c->cpu_valid) )
+    {
+        spin_unlock(&cpupool_lock);
+        return 1;
+    }
+    *q = c->next;
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_destroy(pool=%d)\n", c->cpupool_id);
+    schedule_deinit_global(&(c->sched));
+    free_cpupool_struct(c);
+    return 0;
+}
+
+/*
+ * assign a specific cpu to a cpupool
+ * cpupool_lock must be held
+ */
+static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
+    if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
+        return -EBUSY;
+    per_cpu(cpupool, cpu) = c;
+    schedule_cpu_switch(cpu, c);
+    cpu_clear(cpu, cpupool_free_cpus);
+    if (cpupool_moving_cpu == cpu)
+    {
+        cpupool_moving_cpu = -1;
+        cpupool_cpu_moving = NULL;
+    }
+    cpu_set(cpu, c->cpu_valid);
+    return 0;
+}
+
+/*
+ * assign free physical cpus to a cpupool
+ * cpus assigned are unused cpus with lowest possible ids
+ * returns the number of cpus assigned
+ */
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu)
+{
+    int i;
+    int n;
+
+    n = 0;
+    spin_lock(&cpupool_lock);
+    for_each_cpu_mask(i, cpupool_free_cpus)
+    {
+        if ( cpupool_assign_cpu_locked(c, i) == 0 )
+            n++;
+        if ( n == ncpu )
+            break;
+    }
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_assign_ncpu(pool=%d,ncpu=%d) rc %d\n",
+        c->cpupool_id, ncpu, n);
+    return n;
+}
+
+static long cpupool_unassign_cpu_helper(void *info)
+{
+    struct cpupool *c = (struct cpupool *)info;
+    int cpu = cpupool_moving_cpu;
+    long ret;
+    int cpupool_id = c->cpupool_id;
+
+    ret = cpu_disable_scheduler(cpu);
+    cpu_set(cpu, cpupool_free_cpus);
+    if ( !ret )
+    {
+        schedule_cpu_switch(cpu, NULL);
+        per_cpu(cpupool, cpu) = NULL;
+        cpupool_moving_cpu = -1;
+        cpupool_cpu_moving = NULL;
+    }
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %ld\n",
+        cpupool_id, cpu, ret);
+    return ret;
+}
+
+/*
+ * unassign a specific cpu from a cpupool
+ * we must be sure not to run on the cpu to be unassigned! to achieve this,
+ * the main functionality is performed via continue_hypercall_on_cpu on
+ * another cpu.
+ * if the cpu to be removed is the last one of the cpupool, no active domain
+ * may remain bound to the cpupool. dying domains are moved to cpupool0 as
+ * they might be zombies.
+ * possible failures:
+ * - last cpu and still active domains in cpupool
+ * - cpu just being unplugged
+ */
+int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
+{
+    int work_cpu;
+    int ret;
+    struct domain *d;
+    int cpupool_id = c->cpupool_id;
+
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
+        cpupool_id, cpu);
+    spin_lock(&cpupool_lock);
+    ret = -EBUSY;
+    if ( (cpupool_moving_cpu != -1) && (cpu != cpupool_moving_cpu) )
+        goto out;
+    if ( cpu_isset(cpu, cpupool_locked_cpus) )
+        goto out;
+
+    ret = 0;
+    if ( !cpu_isset(cpu, c->cpu_valid) && (cpu != cpupool_moving_cpu) )
+        goto out;
+
+    if ( (c->n_dom > 0) && (cpus_weight(c->cpu_valid) == 1) &&
+         (cpu != cpupool_moving_cpu) )
+    {
+        for_each_domain(d)
+        {
+            if ( d->cpupool != c )
+                continue;
+            if ( !d->is_dying )
+            {
+                ret = -EBUSY;
+                break;
+            }
+            c->n_dom--;
+            ret = sched_move_domain(d, cpupool0);
+            if ( ret )
+            {
+                c->n_dom++;
+                break;
+            }
+            cpupool0->n_dom++;
+        }
+        if ( ret )
+            goto out;
+    }
+    cpupool_moving_cpu = cpu;
+    cpupool_cpu_moving = c;
+    cpu_clear(cpu, c->cpu_valid);
+    work_cpu = smp_processor_id();
+    if ( work_cpu == cpu )
+    {
+        work_cpu = first_cpu(cpupool0->cpu_valid);
+        if ( work_cpu == cpu )
+            work_cpu = next_cpu(cpu, cpupool0->cpu_valid);
+    }
+    return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
+
+out:
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n",
+        cpupool_id, cpu, ret);
+    return ret;
+}
+
+/*
+ * assign cpus to the default cpupool
+ * by default all cpus are used; fewer can be set via pool0_max_cpus=<n>
+ * possible failures:
+ * - no cpu assigned
+ */
+int __init cpupool0_cpu_assign(struct cpupool *c)
+{
+    if ( (cpupool0_max_cpus == 0) || (cpupool0_max_cpus > num_online_cpus()) )
+        cpupool0_max_cpus = num_online_cpus();
+    if ( !cpupool_assign_ncpu(cpupool0, cpupool0_max_cpus) )
+        return 1;
+    return 0;
+}
+
+/*
+ * add a new domain to a cpupool
+ * possible failures:
+ * - pool does not exist
+ * - no cpu assigned to pool
+ */
+int cpupool_add_domain(struct domain *d, int poolid)
+{
+    struct cpupool *c;
+    int rc = 1;
+    int n_dom;
+
+    if ( poolid == CPUPOOLID_NONE )
+        return 0;
+    spin_lock(&cpupool_lock);
+    c = cpupool_find_by_id(poolid, 1);
+    if ( (c != NULL) && cpus_weight(c->cpu_valid) )
+    {
+        c->n_dom++;
+        n_dom = c->n_dom;
+        d->cpupool = c;
+        rc = 0;
+    }
+    spin_unlock(&cpupool_lock);
+    if (!rc)
+        printk(XENLOG_DEBUG "cpupool_add_domain(dom=%d,pool=%d) n_dom %d\n",
+            d->domain_id, poolid, n_dom);
+    return rc;
+}
+
+/*
+ * remove a domain from a cpupool
+ */
+void cpupool_rm_domain(struct domain *d)
+{
+    int cpupool_id;
+    int n_dom;
+
+    if ( d->cpupool == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpupool_id = d->cpupool->cpupool_id;
+    d->cpupool->n_dom--;
+    n_dom = d->cpupool->n_dom;
+    d->cpupool = NULL;
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n",
+        d->domain_id, cpupool_id, n_dom);
+    return;
+}
+
+/*
+ * called to add a new cpu to the cpupool administration
+ * a hotplugged cpu is added to cpupool0 so it can also be used by dom0
+ */
+void cpupool_cpu_add(unsigned int cpu)
+{
+    if ( cpupool0 == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpu_clear(cpu, cpupool_locked_cpus);
+    cpu_set(cpu, cpupool_free_cpus);
+    cpupool_assign_cpu_locked(cpupool0, cpu);
+    spin_unlock(&cpupool_lock);
+    return;
+}
+
+/*
+ * called to remove a cpu from the cpupool administration
+ * the cpu to be removed is locked so it cannot be moved out of cpupool0
+ * while the unplug is in progress; returns failure if not in cpupool0
+ */
+int cpupool_cpu_remove(unsigned int cpu)
+{
+    int ret = 0;
+
+    spin_lock(&cpupool_lock);
+    if ( !cpu_isset(cpu, cpupool0->cpu_valid))
+        ret = -EBUSY;
+    else
+        cpu_set(cpu, cpupool_locked_cpus);
+    spin_unlock(&cpupool_lock);
+
+    return ret;
+}
+
+/*
+ * do cpupool related domctl operations
+ */
+int cpupool_do_domctl(struct xen_domctl_cpupool_op *op)
+{
+    int ret;
+    struct cpupool *c;
+
+    switch ( op->op )
+    {
+
+    case XEN_DOMCTL_CPUPOOL_OP_CREATE:
+    {
+        int poolid;
+        const struct scheduler *sched;
+
+        poolid = (op->cpupool_id == XEN_DOMCTL_CPUPOOL_PAR_ANY) ?
+            CPUPOOLID_NONE: op->cpupool_id;
+        sched = scheduler_get_by_id(op->sched_id);
+        ret = -ENOENT;
+        if ( sched == NULL )
+            break;
+        ret = 0;
+        c = cpupool_create(poolid, sched->opt_name);
+        if ( c == NULL )
+            ret = -EINVAL;
+        else
+            op->cpupool_id = c->cpupool_id;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_DESTROY:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 1);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        ret = (cpupool_destroy(c) != 0) ? -EBUSY : 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_INFO:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        op->cpupool_id = c->cpupool_id;
+        op->sched_id = c->sched.sched_id;
+        op->n_dom = c->n_dom;
+        cpumask_to_xenctl_cpumap(&(op->cpumap), &(c->cpu_valid));
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_ADDCPU:
+    {
+        unsigned cpu;
+
+        cpu = op->cpu;
+        printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d)\n",
+            op->cpupool_id, cpu);
+        spin_lock(&cpupool_lock);
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = first_cpu(cpupool_free_cpus);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            goto addcpu_out;
+        ret = -EBUSY;
+        if ( !cpu_isset(cpu, cpupool_free_cpus) )
+            goto addcpu_out;
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        ret = -ENOENT;
+        if ( c == NULL )
+            goto addcpu_out;
+        ret = cpupool_assign_cpu_locked(c, cpu);
+addcpu_out:
+        spin_unlock(&cpupool_lock);
+        printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n",
+            op->cpupool_id, cpu, ret);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_RMCPU:
+    {
+        unsigned cpu;
+
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        cpu = op->cpu;
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = last_cpu(c->cpu_valid);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            break;
+        /* caution: cpupool_unassign_cpu uses continue_hypercall_on_cpu and
+         * will continue after the local return
+         */
+        ret = cpupool_unassign_cpu(c, cpu);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN:
+    {
+        struct domain *d;
+
+        ret = -EINVAL;
+        if ( op->domid == 0 )
+            break;
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->domid);
+        if ( d == NULL )
+            break;
+        if ( d->cpupool == NULL )
+        {
+            ret = -EINVAL;
+            rcu_unlock_domain(d);
+            break;
+        }
+        printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d\n",
+            d->domain_id, op->cpupool_id);
+        ret = -ENOENT;
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 1);
+        if ( (c != NULL) && cpus_weight(c->cpu_valid) )
+        {
+            d->cpupool->n_dom--;
+            ret = sched_move_domain(d, c);
+            if ( ret )
+                d->cpupool->n_dom++;
+            else
+                c->n_dom++;
+        }
+        spin_unlock(&cpupool_lock);
+        printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d ret %d\n",
+            d->domain_id, op->cpupool_id, ret);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_FREEINFO:
+    {
+        cpumask_to_xenctl_cpumap(&(op->cpumap),
+            &cpupool_free_cpus);
+        ret = 0;
+    }
+    break;
+
+    default:
+        ret = -ENOSYS;
+
+    }
+
+    return ret;
+}
+
+void schedule_dump(struct cpupool *c);
+
+void dump_runq(unsigned char key)
+{
+    unsigned long    flags;
+    s_time_t         now = NOW();
+    struct cpupool **c;
+
+    spin_lock(&cpupool_lock);
+    local_irq_save(flags);
+
+    printk("sched_smt_power_savings: %s\n",
+            sched_smt_power_savings? "enabled":"disabled");
+    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+
+    printk("Idle cpupool:\n");
+    schedule_dump(NULL);
+
+    for_each_cpupool(c)
+    {
+        printk("Cpupool %d:\n", (*c)->cpupool_id);
+        schedule_dump(*c);
+    }
+
+    local_irq_restore(flags);
+    spin_unlock(&cpupool_lock);
+}
+
+static int __init cpupool_init(void)
+{
+    cpupool_free_cpus = cpu_online_map;
+    cpupool_list = NULL;
+    return 0;
+}
+__initcall(cpupool_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r c7d7797656df -r de94884a669c xen/common/domain.c
--- a/xen/common/domain.c       Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/common/domain.c       Wed Apr 21 12:48:03 2010 +0100
@@ -218,6 +218,7 @@ struct domain *domain_create(
     enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
            INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 };
     int init_status = 0;
+    int poolid = CPUPOOLID_NONE;
 
     if ( (d = alloc_domain_struct()) == NULL )
         return NULL;
@@ -282,6 +283,8 @@ struct domain *domain_create(
         if ( grant_table_create(d) != 0 )
             goto fail;
         init_status |= INIT_gnttab;
+
+        poolid = 0;
     }
 
     if ( arch_domain_create(d, domcr_flags) != 0 )
@@ -291,6 +294,9 @@ struct domain *domain_create(
     d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex);
     d->irq_caps   = rangeset_new(d, "Interrupts", 0);
     if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) )
+        goto fail;
+
+    if ( cpupool_add_domain(d, poolid) != 0 )
         goto fail;
 
     if ( sched_init_domain(d) != 0 )
@@ -601,6 +607,8 @@ static void complete_domain_destroy(stru
 
     rangeset_domain_destroy(d);
 
+    cpupool_rm_domain(d);
+
     sched_destroy_domain(d);
 
     /* Free page used by xen oprofile buffer. */
diff -r c7d7797656df -r de94884a669c xen/common/domctl.c
--- a/xen/common/domctl.c       Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/common/domctl.c       Wed Apr 21 12:48:03 2010 +0100
@@ -11,6 +11,7 @@
 #include <xen/lib.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/event.h>
 #include <xen/domain_page.h>
@@ -140,10 +141,12 @@ void getdomaininfo(struct domain *d, str
     info->shared_info_frame = mfn_to_gmfn(d, __pa(d->shared_info)>>PAGE_SHIFT);
     BUG_ON(SHARED_M2P(info->shared_info_frame));
 
+    info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
+
     memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t));
 }
 
-static unsigned int default_vcpu0_location(void)
+static unsigned int default_vcpu0_location(cpumask_t *online)
 {
     struct domain *d;
     struct vcpu   *v;
@@ -173,7 +176,7 @@ static unsigned int default_vcpu0_locati
     if ( cpus_weight(per_cpu(cpu_sibling_map, 0)) > 1 )
         cpu = next_cpu(cpu, per_cpu(cpu_sibling_map, 0));
     cpu_exclude_map = per_cpu(cpu_sibling_map, 0);
-    for_each_online_cpu ( i )
+    for_each_cpu_mask(i, *online)
     {
         if ( cpu_isset(i, cpu_exclude_map) )
             continue;
@@ -450,6 +453,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     {
         struct domain *d;
         unsigned int i, max = op->u.max_vcpus.max, cpu;
+        cpumask_t *online;
 
         ret = -ESRCH;
         if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
@@ -498,6 +502,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             goto maxvcpu_out;
 
         ret = -ENOMEM;
+        online = (d->cpupool == NULL) ? &cpu_online_map : &d->cpupool->cpu_valid;
         if ( max > d->max_vcpus )
         {
             struct vcpu **vcpus;
@@ -521,8 +526,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
                 continue;
 
             cpu = (i == 0) ?
-                default_vcpu0_location() :
-                cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map);
+                default_vcpu0_location(online) :
+                cycle_cpu(d->vcpu[i-1]->processor, *online);
 
             if ( alloc_vcpu(d, i, cpu) == NULL )
                 goto maxvcpu_out;
@@ -961,6 +966,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     }
     break;
 
+    case XEN_DOMCTL_cpupool_op:
+    {
+        ret = cpupool_do_domctl(&op->u.cpupool_op);
+        if ( (ret == 0) && copy_to_guest(u_domctl, op, 1) )
+            ret = -EFAULT;
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, u_domctl);
         break;
diff -r c7d7797656df -r de94884a669c xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/common/sched_credit.c Wed Apr 21 12:48:03 2010 +0100
@@ -70,11 +70,15 @@
 /*
  * Useful macros
  */
+#define CSCHED_PRIV(_ops)   \
+    ((struct csched_private *)((_ops)->sched_data))
 #define CSCHED_PCPU(_c)     \
     ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
 #define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
 #define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
 #define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+#define CSCHED_CPUONLINE(_pool)    \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
 
 
 /*
@@ -160,19 +164,22 @@ struct csched_private {
     struct timer  master_ticker;
     unsigned int master;
     cpumask_t idlers;
+    cpumask_t cpus;
     uint32_t weight;
     uint32_t credit;
     int credit_balance;
     uint32_t runq_sort;
+    int ticker_active;
 };
 
 
 /*
  * Global variables
  */
-static struct csched_private csched_priv;
+static struct csched_private *csched_priv0 = NULL;
 
 static void csched_tick(void *_cpu);
+static void csched_acct(void *dummy);
 
 static inline int
 __vcpu_on_runq(struct csched_vcpu *svc)
@@ -238,6 +245,7 @@ __runq_tickle(unsigned int cpu, struct c
 {
     struct csched_vcpu * const cur =
         CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
     cpumask_t mask;
 
     ASSERT(cur);
@@ -264,7 +272,7 @@ __runq_tickle(unsigned int cpu, struct c
      */
     if ( cur->pri > CSCHED_PRI_IDLE )
     {
-        if ( cpus_empty(csched_priv.idlers) )
+        if ( cpus_empty(prv->idlers) )
         {
             CSCHED_STAT_CRANK(tickle_idlers_none);
         }
@@ -272,7 +280,7 @@ __runq_tickle(unsigned int cpu, struct c
         {
             cpumask_t idle_mask;
 
-            cpus_and(idle_mask, csched_priv.idlers, new->vcpu->cpu_affinity);
+            cpus_and(idle_mask, prv->idlers, new->vcpu->cpu_affinity);
             if ( !cpus_empty(idle_mask) )
             {
                 CSCHED_STAT_CRANK(tickle_idlers_some);
@@ -294,40 +302,80 @@ __runq_tickle(unsigned int cpu, struct c
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
 }
 
-static int
-csched_pcpu_init(int cpu)
+static void
+csched_free_pdata(struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_pcpu *spc = pcpu;
+    unsigned long flags;
+
+    if ( spc == NULL )
+        return;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    prv->credit -= CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus--;
+    cpu_clear(cpu, prv->idlers);
+    cpu_clear(cpu, prv->cpus);
+    if ( (prv->master == cpu) && (prv->ncpus > 0) )
+    {
+        prv->master = first_cpu(prv->cpus);
+        migrate_timer(&prv->master_ticker, prv->master);
+    }
+    kill_timer(&spc->ticker);
+    if ( prv->ncpus == 0 )
+        kill_timer(&prv->master_ticker);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    xfree(spc);
+}
+
+static void *
+csched_alloc_pdata(struct scheduler *ops, int cpu)
 {
     struct csched_pcpu *spc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     /* Allocate per-PCPU info */
     spc = xmalloc(struct csched_pcpu);
     if ( spc == NULL )
-        return -1;
+        return NULL;
     memset(spc, 0, sizeof(*spc));
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
     /* Initialize/update system-wide config */
-    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
-    if ( csched_priv.ncpus <= cpu )
-        csched_priv.ncpus = cpu + 1;
-    if ( csched_priv.master >= csched_priv.ncpus )
-        csched_priv.master = cpu;
+    prv->credit += CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus++;
+    cpu_set(cpu, prv->cpus);
+    if ( (prv->ncpus == 1) && (prv != csched_priv0) )
+    {
+        prv->master = cpu;
+        init_timer( &prv->master_ticker, csched_acct, prv, cpu);
+        prv->ticker_active = 2;
+    }
 
     init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
+
+    if ( prv == csched_priv0 )
+        prv->master = first_cpu(prv->cpus);
+
     INIT_LIST_HEAD(&spc->runq);
-    spc->runq_sort_last = csched_priv.runq_sort;
+    spc->runq_sort_last = prv->runq_sort;
     spc->idle_bias = NR_CPUS - 1;
-    per_cpu(schedule_data, cpu).sched_priv = spc;
+    if ( per_cpu(schedule_data, cpu).sched_priv == NULL )
+        per_cpu(schedule_data, cpu).sched_priv = spc;
 
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
-    cpu_set(cpu, csched_priv.idlers);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    return 0;
+    cpu_set(cpu, prv->idlers);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    return spc;
 }
 
 #ifndef NDEBUG
@@ -400,17 +448,19 @@ __csched_vcpu_is_migrateable(struct vcpu
 }
 
 static int
-_csched_cpu_pick(struct vcpu *vc, bool_t commit)
+_csched_cpu_pick(struct scheduler *ops, struct vcpu *vc, bool_t commit)
 {
     cpumask_t cpus;
     cpumask_t idlers;
+    cpumask_t *online;
     int cpu;
 
     /*
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
-    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    online = CSCHED_CPUONLINE(vc->domain->cpupool);
+    cpus_and(cpus, *online, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : cycle_cpu(vc->processor, cpus);
@@ -428,7 +478,7 @@ _csched_cpu_pick(struct vcpu *vc, bool_t
      * like run two VCPUs on co-hyperthreads while there are idle cores
      * or sockets.
      */
-    cpus_and(idlers, cpu_online_map, csched_priv.idlers);
+    cpus_and(idlers, cpu_online_map, CSCHED_PRIV(ops)->idlers);
     cpu_set(cpu, idlers);
     cpus_and(cpus, cpus, idlers);
     cpu_clear(cpu, cpus);
@@ -474,18 +524,18 @@ _csched_cpu_pick(struct vcpu *vc, bool_t
 }
 
 static int
-csched_cpu_pick(struct vcpu *vc)
-{
-    return _csched_cpu_pick(vc, 1);
+csched_cpu_pick(struct scheduler *ops, struct vcpu *vc)
+{
+    return _csched_cpu_pick(ops, vc, 1);
 }
 
 static inline void
-__csched_vcpu_acct_start(struct csched_vcpu *svc)
+__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
     unsigned long flags;
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
     if ( list_empty(&svc->active_vcpu_elem) )
     {
@@ -496,16 +546,17 @@ __csched_vcpu_acct_start(struct csched_v
         list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
         if ( list_empty(&sdom->active_sdom_elem) )
         {
-            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
-            csched_priv.weight += sdom->weight;
-        }
-    }
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+            list_add(&sdom->active_sdom_elem, &prv->active_sdom);
+            prv->weight += sdom->weight;
+        }
+    }
+
+    spin_unlock_irqrestore(&prv->lock, flags);
 }
 
 static inline void
-__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+__csched_vcpu_acct_stop_locked(struct csched_private *prv,
+    struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
 
@@ -518,16 +569,17 @@ __csched_vcpu_acct_stop_locked(struct cs
     list_del_init(&svc->active_vcpu_elem);
     if ( list_empty(&sdom->active_vcpu) )
     {
-        BUG_ON( csched_priv.weight < sdom->weight );
+        BUG_ON( prv->weight < sdom->weight );
         list_del_init(&sdom->active_sdom_elem);
-        csched_priv.weight -= sdom->weight;
-    }
-}
-
-static void
-csched_vcpu_acct(unsigned int cpu)
+        prv->weight -= sdom->weight;
+    }
+}
+
+static void
+csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(current);
+    struct scheduler *ops = per_cpu(scheduler, cpu);
 
     ASSERT( current->processor == cpu );
     ASSERT( svc->sdom != NULL );
@@ -556,9 +608,9 @@ csched_vcpu_acct(unsigned int cpu)
      */
     if ( list_empty(&svc->active_vcpu_elem) )
     {
-        __csched_vcpu_acct_start(svc);
-    }
-    else if ( _csched_cpu_pick(current, 0) != cpu )
+        __csched_vcpu_acct_start(prv, svc);
+    }
+    else if ( _csched_cpu_pick(ops, current, 0) != cpu )
     {
         CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
         CSCHED_STAT_CRANK(migrate_running);
@@ -567,66 +619,75 @@ csched_vcpu_acct(unsigned int cpu)
     }
 }
 
-static int
-csched_vcpu_init(struct vcpu *vc)
-{
-    struct domain * const dom = vc->domain;
-    struct csched_dom *sdom = CSCHED_DOM(dom);
+static void *
+csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd)
+{
     struct csched_vcpu *svc;
-
-    CSCHED_STAT_CRANK(vcpu_init);
 
     /* Allocate per-VCPU info */
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
-        return -1;
+        return NULL;
     memset(svc, 0, sizeof(*svc));
 
     INIT_LIST_HEAD(&svc->runq_elem);
     INIT_LIST_HEAD(&svc->active_vcpu_elem);
-    svc->sdom = sdom;
+    svc->sdom = dd;
     svc->vcpu = vc;
     atomic_set(&svc->credit, 0);
     svc->flags = 0U;
-    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+    svc->pri = is_idle_domain(vc->domain) ?
+        CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
     CSCHED_VCPU_STATS_RESET(svc);
-    vc->sched_priv = svc;
-
-    /* Allocate per-PCPU info */
-    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
-    {
-        if ( csched_pcpu_init(vc->processor) != 0 )
-            return -1;
-    }
-
-    CSCHED_VCPU_CHECK(vc);
-    return 0;
-}
-
-static void
-csched_vcpu_destroy(struct vcpu *vc)
+    CSCHED_STAT_CRANK(vcpu_init);
+    return svc;
+}
+
+static void
+csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc = vc->sched_priv;
+
+    if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
+        __runq_insert(vc->processor, svc);
+}
+
+static void
+csched_free_vdata(struct scheduler *ops, void *priv)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_vcpu *svc = priv;
+    unsigned long flags;
+
+    if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+
+    spin_lock_irqsave(&(prv->lock), flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_stop_locked(prv, svc);
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     struct csched_dom * const sdom = svc->sdom;
-    unsigned long flags;
 
     CSCHED_STAT_CRANK(vcpu_destroy);
 
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    if ( !list_empty(&svc->active_vcpu_elem) )
-        __csched_vcpu_acct_stop_locked(svc);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    xfree(svc);
-}
-
-static void
-csched_vcpu_sleep(struct vcpu *vc)
+    csched_free_vdata(ops, svc);
+}
+
+static void
+csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -641,7 +702,7 @@ csched_vcpu_sleep(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_wake(struct vcpu *vc)
+csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     const unsigned int cpu = vc->processor;
@@ -697,10 +758,12 @@ csched_vcpu_wake(struct vcpu *vc)
 
 static int
 csched_dom_cntl(
+    struct scheduler *ops,
     struct domain *d,
     struct xen_domctl_scheduler_op *op)
 {
     struct csched_dom * const sdom = CSCHED_DOM(d);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
@@ -712,14 +775,14 @@ csched_dom_cntl(
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
 
-        spin_lock_irqsave(&csched_priv.lock, flags);
+        spin_lock_irqsave(&prv->lock, flags);
 
         if ( op->u.credit.weight != 0 )
         {
             if ( !list_empty(&sdom->active_sdom_elem) )
             {
-                csched_priv.weight -= sdom->weight;
-                csched_priv.weight += op->u.credit.weight;
+                prv->weight -= sdom->weight;
+                prv->weight += op->u.credit.weight;
             }
             sdom->weight = op->u.credit.weight;
         }
@@ -727,25 +790,20 @@ csched_dom_cntl(
         if ( op->u.credit.cap != (uint16_t)~0U )
             sdom->cap = op->u.credit.cap;
 
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        spin_unlock_irqrestore(&prv->lock, flags);
     }
 
     return 0;
 }
 
-static int
-csched_dom_init(struct domain *dom)
+static void *
+csched_alloc_domdata(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom;
-
-    CSCHED_STAT_CRANK(dom_init);
-
-    if ( is_idle_domain(dom) )
-        return 0;
 
     sdom = xmalloc(struct csched_dom);
     if ( sdom == NULL )
-        return -ENOMEM;
+        return NULL;
     memset(sdom, 0, sizeof(*sdom));
 
     /* Initialize credit and weight */
@@ -755,16 +813,40 @@ csched_dom_init(struct domain *dom)
     sdom->dom = dom;
     sdom->weight = CSCHED_DEFAULT_WEIGHT;
     sdom->cap = 0U;
+
+    return (void *)sdom;
+}
+
+static int
+csched_dom_init(struct scheduler *ops, struct domain *dom)
+{
+    struct csched_dom *sdom;
+
+    CSCHED_STAT_CRANK(dom_init);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = csched_alloc_domdata(ops, dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
     dom->sched_priv = sdom;
 
     return 0;
 }
 
 static void
-csched_dom_destroy(struct domain *dom)
+csched_free_domdata(struct scheduler *ops, void *data)
+{
+    xfree(data);
+}
+
+static void
+csched_dom_destroy(struct scheduler *ops, struct domain *dom)
 {
     CSCHED_STAT_CRANK(dom_destroy);
-    xfree(CSCHED_DOM(dom));
+    csched_free_domdata(ops, CSCHED_DOM(dom));
 }
 
 /*
@@ -775,7 +857,7 @@ csched_dom_destroy(struct domain *dom)
  * remember the last UNDER to make the move up operation O(1).
  */
 static void
-csched_runq_sort(unsigned int cpu)
+csched_runq_sort(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct list_head *runq, *elem, *next, *last_under;
@@ -783,7 +865,7 @@ csched_runq_sort(unsigned int cpu)
     unsigned long flags;
     int sort_epoch;
 
-    sort_epoch = csched_priv.runq_sort;
+    sort_epoch = prv->runq_sort;
     if ( sort_epoch == spc->runq_sort_last )
         return;
 
@@ -820,6 +902,7 @@ static void
 static void
 csched_acct(void* dummy)
 {
+    struct csched_private *prv = dummy;
     unsigned long flags;
     struct list_head *iter_vcpu, *next_vcpu;
     struct list_head *iter_sdom, *next_sdom;
@@ -836,22 +919,22 @@ csched_acct(void* dummy)
     int credit;
 
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    weight_total = csched_priv.weight;
-    credit_total = csched_priv.credit;
+    spin_lock_irqsave(&prv->lock, flags);
+
+    weight_total = prv->weight;
+    credit_total = prv->credit;
 
     /* Converge balance towards 0 when it drops negative */
-    if ( csched_priv.credit_balance < 0 )
-    {
-        credit_total -= csched_priv.credit_balance;
+    if ( prv->credit_balance < 0 )
+    {
+        credit_total -= prv->credit_balance;
         CSCHED_STAT_CRANK(acct_balance);
     }
 
     if ( unlikely(weight_total == 0) )
     {
-        csched_priv.credit_balance = 0;
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        prv->credit_balance = 0;
+        spin_unlock_irqrestore(&prv->lock, flags);
         CSCHED_STAT_CRANK(acct_no_work);
         goto out;
     }
@@ -863,7 +946,7 @@ csched_acct(void* dummy)
     credit_xtra = 0;
     credit_cap = 0U;
 
-    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom )
     {
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
 
@@ -883,9 +966,9 @@ csched_acct(void* dummy)
          * only when the system-wide credit balance is negative.
          */
         credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
-        if ( csched_priv.credit_balance < 0 )
-        {
-            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+        if ( prv->credit_balance < 0 )
+        {
+            credit_peak += ( ( -prv->credit_balance * sdom->weight) +
                              (weight_total - 1)
                            ) / weight_total;
         }
@@ -927,7 +1010,7 @@ csched_acct(void* dummy)
                  */
                 CSCHED_STAT_CRANK(acct_reorder);
                 list_del(&sdom->active_sdom_elem);
-                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+                list_add(&sdom->active_sdom_elem, &prv->active_sdom);
             }
 
             credit_fair = credit_peak;
@@ -993,7 +1076,7 @@ csched_acct(void* dummy)
                 /* Upper bound on credits means VCPU stops earning */
                 if ( credit > CSCHED_CREDITS_PER_TSLICE )
                 {
-                    __csched_vcpu_acct_stop_locked(svc);
+                    __csched_vcpu_acct_stop_locked(prv, svc);
                     credit = 0;
                     atomic_set(&svc->credit, credit);
                 }
@@ -1005,15 +1088,15 @@ csched_acct(void* dummy)
         }
     }
 
-    csched_priv.credit_balance = credit_balance;
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    prv->credit_balance = credit_balance;
+
+    spin_unlock_irqrestore(&prv->lock, flags);
 
     /* Inform each CPU that its runq needs to be sorted */
-    csched_priv.runq_sort++;
+    prv->runq_sort++;
 
 out:
-    set_timer( &csched_priv.master_ticker, NOW() +
+    set_timer( &prv->master_ticker, NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 }
 
@@ -1022,6 +1105,7 @@ csched_tick(void *_cpu)
 {
     unsigned int cpu = (unsigned long)_cpu;
     struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
 
     spc->tick++;
 
@@ -1029,7 +1113,7 @@ csched_tick(void *_cpu)
      * Accounting for running VCPU
      */
     if ( !is_idle_vcpu(current) )
-        csched_vcpu_acct(cpu);
+        csched_vcpu_acct(prv, cpu);
 
     /*
      * Check if runq needs to be sorted
@@ -1038,7 +1122,7 @@ csched_tick(void *_cpu)
      * modified priorities. This is a special O(n) sort and runs at most
      * once per accounting period (currently 30 milliseconds).
      */
-    csched_runq_sort(cpu);
+    csched_runq_sort(prv, cpu);
 
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
@@ -1091,16 +1175,19 @@ csched_runq_steal(int peer_cpu, int cpu,
 }
 
 static struct csched_vcpu *
-csched_load_balance(int cpu, struct csched_vcpu *snext)
+csched_load_balance(struct csched_private *prv, int cpu,
+    struct csched_vcpu *snext)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
+    cpumask_t *online;
     int peer_cpu;
 
     BUG_ON( cpu != snext->vcpu->processor );
+    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));
 
     /* If this CPU is going offline we shouldn't steal work. */
-    if ( unlikely(!cpu_online(cpu)) )
+    if ( unlikely(!cpu_isset(cpu, *online)) )
         goto out;
 
     if ( snext->pri == CSCHED_PRI_IDLE )
@@ -1114,7 +1201,7 @@ csched_load_balance(int cpu, struct csch
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
      */
-    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    cpus_andnot(workers, *online, prv->idlers);
     cpu_clear(cpu, workers);
     peer_cpu = cpu;
 
@@ -1156,11 +1243,12 @@ csched_load_balance(int cpu, struct csch
  * fast for the common case.
  */
 static struct task_slice
-csched_schedule(s_time_t now)
+csched_schedule(struct scheduler *ops, s_time_t now)
 {
     const int cpu = smp_processor_id();
     struct list_head * const runq = RUNQ(cpu);
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     struct csched_vcpu *snext;
     struct task_slice ret;
 
@@ -1207,7 +1295,7 @@ csched_schedule(s_time_t now)
     if ( snext->pri > CSCHED_PRI_TS_OVER )
         __runq_remove(snext);
     else
-        snext = csched_load_balance(cpu, snext);
+        snext = csched_load_balance(prv, cpu, snext);
 
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
@@ -1215,12 +1303,12 @@ csched_schedule(s_time_t now)
      */
     if ( snext->pri == CSCHED_PRI_IDLE )
     {
-        if ( !cpu_isset(cpu, csched_priv.idlers) )
-            cpu_set(cpu, csched_priv.idlers);
-    }
-    else if ( cpu_isset(cpu, csched_priv.idlers) )
-    {
-        cpu_clear(cpu, csched_priv.idlers);
+        if ( !cpu_isset(cpu, prv->idlers) )
+            cpu_set(cpu, prv->idlers);
+    }
+    else if ( cpu_isset(cpu, prv->idlers) )
+    {
+        cpu_clear(cpu, prv->idlers);
     }
 
     if ( !is_idle_vcpu(snext->vcpu) )
@@ -1267,7 +1355,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 }
 
 static void
-csched_dump_pcpu(int cpu)
+csched_dump_pcpu(struct scheduler *ops, int cpu)
 {
     struct list_head *runq, *iter;
     struct csched_pcpu *spc;
@@ -1305,9 +1393,10 @@ csched_dump_pcpu(int cpu)
 }
 
 static void
-csched_dump(void)
+csched_dump(struct scheduler *ops)
 {
     struct list_head *iter_sdom, *iter_svc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     int loop;
 #define idlers_buf keyhandler_scratch
 
@@ -1324,12 +1413,12 @@ csched_dump(void)
            "\tticks per tslice   = %d\n"
            "\tticks per acct     = %d\n"
            "\tmigration delay    = %uus\n",
-           csched_priv.ncpus,
-           csched_priv.master,
-           csched_priv.credit,
-           csched_priv.credit_balance,
-           csched_priv.weight,
-           csched_priv.runq_sort,
+           prv->ncpus,
+           prv->master,
+           prv->credit,
+           prv->credit_balance,
+           prv->weight,
+           prv->runq_sort,
            CSCHED_DEFAULT_WEIGHT,
            CSCHED_MSECS_PER_TICK,
            CSCHED_CREDITS_PER_MSEC,
@@ -1337,12 +1426,12 @@ csched_dump(void)
            CSCHED_TICKS_PER_ACCT,
            vcpu_migration_delay);
 
-    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
     printk("idlers: %s\n", idlers_buf);
 
     printk("active vcpus:\n");
     loop = 0;
-    list_for_each( iter_sdom, &csched_priv.active_sdom )
+    list_for_each( iter_sdom, &prv->active_sdom )
     {
         struct csched_dom *sdom;
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
@@ -1359,18 +1448,30 @@ csched_dump(void)
 #undef idlers_buf
 }
 
-static void
-csched_init(void)
-{
-    spin_lock_init(&csched_priv.lock);
-    INIT_LIST_HEAD(&csched_priv.active_sdom);
-    csched_priv.ncpus = 0;
-    csched_priv.master = UINT_MAX;
-    cpus_clear(csched_priv.idlers);
-    csched_priv.weight = 0U;
-    csched_priv.credit = 0U;
-    csched_priv.credit_balance = 0;
-    csched_priv.runq_sort = 0U;
+static int
+csched_init(struct scheduler *ops, int pool0)
+{
+    struct csched_private *prv;
+
+    prv = xmalloc(struct csched_private);
+    if ( prv == NULL )
+        return 1;
+    memset(prv, 0, sizeof(*prv));
+    if ( pool0 )
+        csched_priv0 = prv;
+    ops->sched_data = prv;
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->active_sdom);
+    prv->ncpus = 0;
+    prv->master = UINT_MAX;
+    cpus_clear(prv->idlers);
+    prv->weight = 0U;
+    prv->credit = 0U;
+    prv->credit_balance = 0;
+    prv->runq_sort = 0U;
+    prv->ticker_active = (csched_priv0 == prv) ? 0 : 1;
+
+    return 0;
 }
 
 /* Tickers cannot be kicked until SMP subsystem is alive. */
@@ -1380,8 +1481,10 @@ static __init int csched_start_tickers(v
     unsigned int cpu;
 
     /* Is the credit scheduler initialised? */
-    if ( csched_priv.ncpus == 0 )
+    if ( (csched_priv0 == NULL) || (csched_priv0->ncpus == 0) )
         return 0;
+
+    csched_priv0->ticker_active = 1;
 
     for_each_online_cpu ( cpu )
     {
@@ -1389,45 +1492,72 @@ static __init int csched_start_tickers(v
         set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
     }
 
-    init_timer( &csched_priv.master_ticker, csched_acct, NULL,
-                    csched_priv.master);
-
-    set_timer( &csched_priv.master_ticker, NOW() +
+    init_timer( &csched_priv0->master_ticker, csched_acct, csched_priv0,
+                    csched_priv0->master);
+
+    set_timer( &csched_priv0->master_ticker, NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 
     return 0;
 }
 __initcall(csched_start_tickers);
 
-static void csched_tick_suspend(void)
+static void
+csched_deinit(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+static void csched_tick_suspend(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
 
-    spc = CSCHED_PCPU(smp_processor_id());
+    spc = CSCHED_PCPU(cpu);
 
     stop_timer(&spc->ticker);
 }
 
-static void csched_tick_resume(void)
+static void csched_tick_resume(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
     uint64_t now = NOW();
-
-    spc = CSCHED_PCPU(smp_processor_id());
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( !prv->ticker_active )
+        return;
+
+
+    spc = CSCHED_PCPU(cpu);
 
     set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
             - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
-}
-
-const struct scheduler sched_credit_def = {
+
+    if ( (prv->ticker_active == 2) && (prv->master == cpu) )
+    {
+        set_timer( &prv->master_ticker, now +
+            MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT -
+            now % MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT);
+        prv->ticker_active = 1;
+    }
+}
+
+static struct csched_private _csched_priv;
+
+struct scheduler sched_credit_def = {
     .name           = "SMP Credit Scheduler",
     .opt_name       = "credit",
     .sched_id       = XEN_SCHEDULER_CREDIT,
+    .sched_data     = &_csched_priv,
 
     .init_domain    = csched_dom_init,
     .destroy_domain = csched_dom_destroy,
 
-    .init_vcpu      = csched_vcpu_init,
+    .insert_vcpu    = csched_vcpu_insert,
     .destroy_vcpu   = csched_vcpu_destroy,
 
     .sleep          = csched_vcpu_sleep,
@@ -1441,6 +1571,13 @@ const struct scheduler sched_credit_def 
     .dump_cpu_state = csched_dump_pcpu,
     .dump_settings  = csched_dump,
     .init           = csched_init,
+    .deinit         = csched_deinit,
+    .alloc_vdata    = csched_alloc_vdata,
+    .free_vdata     = csched_free_vdata,
+    .alloc_pdata    = csched_alloc_pdata,
+    .free_pdata     = csched_free_pdata,
+    .alloc_domdata  = csched_alloc_domdata,
+    .free_domdata   = csched_free_domdata,
 
     .tick_suspend   = csched_tick_suspend,
     .tick_resume    = csched_tick_resume,
diff -r c7d7797656df -r de94884a669c xen/common/sched_credit2.c
--- a/xen/common/sched_credit2.c        Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/common/sched_credit2.c        Wed Apr 21 12:48:03 2010 +0100
@@ -149,12 +149,16 @@
 /*
  * Useful macros
  */
+#define CSCHED_PRIV(_ops)   \
+    ((struct csched_private *)((_ops)->sched_data))
 #define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
 #define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
+#define CSCHED_CPUONLINE(_pool)    \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
 /* CPU to runq_id macro */
-#define c2r(_cpu)           (csched_priv.runq_map[(_cpu)])
+#define c2r(_ops, _cpu)     (CSCHED_PRIV(_ops)->runq_map[(_cpu)])
 /* CPU to runqueue struct macro */
-#define RQD(_cpu)          (&csched_priv.rqd[c2r(_cpu)])
+#define RQD(_ops, _cpu)     (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)])
 
 /*
  * Per-runqueue data
@@ -212,11 +216,6 @@ struct csched_dom {
     uint16_t nr_vcpus;
 };
 
-
-/*
- * Global variables
- */
-static struct csched_private csched_priv;
 
 /*
  * Time-to-credit, credit-to-time.
@@ -284,15 +283,15 @@ __runq_insert(struct list_head *runq, st
 }
 
 static void
-runq_insert(unsigned int cpu, struct csched_vcpu *svc)
-{
-    struct list_head * runq = &RQD(cpu)->runq;
+runq_insert(struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc)
+{
+    struct list_head * runq = &RQD(ops, cpu)->runq;
     int pos = 0;
 
     ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) );
 
     BUG_ON( __vcpu_on_runq(svc) );
-    BUG_ON( c2r(cpu) != c2r(svc->vcpu->processor) );
+    BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) );
 
     pos = __runq_insert(runq, svc);
 
@@ -324,11 +323,12 @@ void burn_credits(struct csched_runqueue
 /* Check to see if the item on the runqueue is higher priority than what's
  * currently running; if so, wake up the processor */
 static /*inline*/ void
-runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now)
+runq_tickle(struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now)
 {
     int i, ipid=-1;
     s_time_t lowest=(1<<30);
-    struct csched_runqueue_data *rqd = RQD(cpu);
+    struct csched_runqueue_data *rqd = RQD(ops, cpu);
+    cpumask_t *online;
 
     d2printk("rqt d%dv%d cd%dv%d\n",
              new->vcpu->domain->domain_id,
@@ -336,13 +336,14 @@ runq_tickle(unsigned int cpu, struct csc
              current->domain->domain_id,
              current->vcpu_id);
 
+    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));
     /* Find the cpu in this queue group that has the lowest credits */
     for ( i=rqd->cpu_min ; i < rqd->cpu_max ; i++ )
     {
         struct csched_vcpu * cur;
 
         /* Skip cpus that aren't online */
-        if ( !cpu_online(i) )
+        if ( !cpu_isset(i, *online) )
             continue;
 
         cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);
@@ -396,11 +397,11 @@ runq_tickle(unsigned int cpu, struct csc
 /*
  * Credit-related code
  */
-static void reset_credit(int cpu, s_time_t now)
+static void reset_credit(struct scheduler *ops, int cpu, s_time_t now)
 {
     struct list_head *iter;
 
-    list_for_each( iter, &RQD(cpu)->svc )
+    list_for_each( iter, &RQD(ops, cpu)->svc )
     {
         struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
 
@@ -521,64 +522,100 @@ __csched_vcpu_check(struct vcpu *vc)
 #define CSCHED_VCPU_CHECK(_vc)
 #endif
 
-static int
-csched_vcpu_init(struct vcpu *vc)
-{
-    struct domain * const dom = vc->domain;
-    struct csched_dom *sdom = CSCHED_DOM(dom);
+static void *
+csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd)
+{
     struct csched_vcpu *svc;
-
-    printk("%s: Initializing d%dv%d\n",
-           __func__, dom->domain_id, vc->vcpu_id);
 
     /* Allocate per-VCPU info */
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
-        return -1;
+        return NULL;
+    memset(svc, 0, sizeof(*svc));
 
     INIT_LIST_HEAD(&svc->rqd_elem);
     INIT_LIST_HEAD(&svc->sdom_elem);
     INIT_LIST_HEAD(&svc->runq_elem);
 
-    svc->sdom = sdom;
+    svc->sdom = dd;
     svc->vcpu = vc;
     svc->flags = 0U;
-    vc->sched_priv = svc;
 
     if ( ! is_idle_vcpu(vc) )
     {
-        BUG_ON( sdom == NULL );
+        BUG_ON( svc->sdom == NULL );
 
         svc->credit = CSCHED_CREDIT_INIT;
-        svc->weight = sdom->weight;
-
+        svc->weight = svc->sdom->weight;
+    }
+    else
+    {
+        BUG_ON( svc->sdom != NULL );
+        svc->credit = CSCHED_IDLE_CREDIT;
+        svc->weight = 0;
+    }
+
+    return svc;
+}
+
+static void
+csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc = vc->sched_priv;
+    struct domain * const dom = vc->domain;
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+
+    printk("%s: Inserting d%dv%d\n",
+           __func__, dom->domain_id, vc->vcpu_id);
+
+    if ( ! is_idle_vcpu(vc) )
+    {
         /* FIXME: Do we need the private lock here? */
-        list_add_tail(&svc->sdom_elem, &sdom->vcpu);
+        list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
 
         /* Add vcpu to runqueue of initial processor */
         /* FIXME: Abstract for multiple runqueues */
         vcpu_schedule_lock_irq(vc);
 
-        list_add_tail(&svc->rqd_elem, &RQD(vc->processor)->svc);
-        update_max_weight(RQD(vc->processor), svc->weight, 0);
+        list_add_tail(&svc->rqd_elem, &RQD(ops, vc->processor)->svc);
+        update_max_weight(RQD(ops, vc->processor), svc->weight, 0);
 
         vcpu_schedule_unlock_irq(vc);
 
         sdom->nr_vcpus++;
     }
-    else
-    {
-        BUG_ON( sdom != NULL );
-        svc->credit = CSCHED_IDLE_CREDIT;
-        svc->weight = 0;
-    }
 
     CSCHED_VCPU_CHECK(vc);
-    return 0;
-}
-
-static void
-csched_vcpu_destroy(struct vcpu *vc)
+}
+
+static void
+csched_free_vdata(struct scheduler *ops, void *priv)
+{
+    struct csched_vcpu *svc = priv;
+    struct vcpu *vc = svc->vcpu;
+
+    if ( ! is_idle_vcpu(vc) )
+    {
+        /* Remove from runqueue */
+        vcpu_schedule_lock_irq(vc);
+
+        list_del_init(&svc->rqd_elem);
+        update_max_weight(RQD(ops, vc->processor), 0, svc->weight);
+
+        vcpu_schedule_unlock_irq(vc);
+
+        /* Remove from sdom list.  Don't need a lock for this, as it's called
+         * synchronously when nothing else can happen. */
+        list_del_init(&svc->sdom_elem);
+
+        svc->sdom->nr_vcpus--;
+    }
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     struct csched_dom * const sdom = svc->sdom;
@@ -586,25 +623,11 @@ csched_vcpu_destroy(struct vcpu *vc)
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
-    /* Remove from runqueue */
-    vcpu_schedule_lock_irq(vc);
-
-    list_del_init(&svc->rqd_elem);
-    update_max_weight(RQD(vc->processor), 0, svc->weight);
-
-    vcpu_schedule_unlock_irq(vc);
-
-    /* Remove from sdom list.  Don't need a lock for this, as it's called
-     * syncronously when nothing else can happen. */
-    list_del_init(&svc->sdom_elem);
-
-    sdom->nr_vcpus--;
-
-    xfree(svc);
-}
-
-static void
-csched_vcpu_sleep(struct vcpu *vc)
+    csched_free_vdata(ops, svc);
+}
+
+static void
+csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -617,7 +640,7 @@ csched_vcpu_sleep(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_wake(struct vcpu *vc)
+csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     const unsigned int cpu = vc->processor;
@@ -654,8 +677,8 @@ csched_vcpu_wake(struct vcpu *vc)
     now = NOW();
 
     /* Put the VCPU on the runq */
-    runq_insert(cpu, svc);
-    runq_tickle(cpu, svc, now);
+    runq_insert(ops, cpu, svc);
+    runq_tickle(ops, cpu, svc, now);
 
 out:
     d2printk("w-\n");
@@ -663,7 +686,7 @@ out:
 }
 
 static void
-csched_context_saved(struct vcpu *vc)
+csched_context_saved(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -688,15 +711,15 @@ csched_context_saved(struct vcpu *vc)
 
         BUG_ON(__vcpu_on_runq(svc));
 
-        runq_insert(cpu, svc);
-        runq_tickle(cpu, svc, NOW());
+        runq_insert(ops, cpu, svc);
+        runq_tickle(ops, cpu, svc, NOW());
     }
 
     vcpu_schedule_unlock_irq(vc);
 }
 
 static int
-csched_cpu_pick(struct vcpu *vc)
+csched_cpu_pick(struct scheduler *ops, struct vcpu *vc)
 {
     /* FIXME: Chose a schedule group based on load */
     /* FIXME: Migrate the vcpu to the new runqueue list, updating
@@ -706,10 +729,12 @@ csched_cpu_pick(struct vcpu *vc)
 
 static int
 csched_dom_cntl(
+    struct scheduler *ops,
     struct domain *d,
     struct xen_domctl_scheduler_op *op)
 {
     struct csched_dom * const sdom = CSCHED_DOM(d);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
@@ -727,7 +752,7 @@ csched_dom_cntl(
 
             /* Must hold csched_priv lock to update sdom, runq lock to
              * update csvcs. */
-            spin_lock_irqsave(&csched_priv.lock, flags);
+            spin_lock_irqsave(&prv->lock, flags);
 
             old_weight = sdom->weight;
 
@@ -744,32 +769,28 @@ csched_dom_cntl(
                 vcpu_schedule_lock_irq(svc->vcpu);
 
                 svc->weight = sdom->weight;
-                update_max_weight(RQD(svc->vcpu->processor), svc->weight, old_weight);
+                update_max_weight(RQD(ops, svc->vcpu->processor), svc->weight, old_weight);
 
                 vcpu_schedule_unlock_irq(svc->vcpu);
             }
 
-            spin_unlock_irqrestore(&csched_priv.lock, flags);
+            spin_unlock_irqrestore(&prv->lock, flags);
         }
     }
 
     return 0;
 }
 
-static int
-csched_dom_init(struct domain *dom)
+static void *
+csched_alloc_domdata(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom;
     int flags;
 
-    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
-
-    if ( is_idle_domain(dom) )
-        return 0;
-
     sdom = xmalloc(struct csched_dom);
     if ( sdom == NULL )
-        return -ENOMEM;
+        return NULL;
+    memset(sdom, 0, sizeof(*sdom));
 
     /* Initialize credit and weight */
     INIT_LIST_HEAD(&sdom->vcpu);
@@ -778,40 +799,65 @@ csched_dom_init(struct domain *dom)
     sdom->weight = CSCHED_DEFAULT_WEIGHT;
     sdom->nr_vcpus = 0;
 
+    spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
+
+    list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom);
+
+    spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
+
+    return (void *)sdom;
+}
+
+static int
+csched_dom_init(struct scheduler *ops, struct domain *dom)
+{
+    struct csched_dom *sdom;
+
+    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = csched_alloc_domdata(ops, dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
     dom->sched_priv = sdom;
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    list_add_tail(&sdom->sdom_elem, &csched_priv.sdom);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
     return 0;
 }
 
 static void
-csched_dom_destroy(struct domain *dom)
+csched_free_domdata(struct scheduler *ops, void *data)
+{
+    unsigned long flags;
+    struct csched_dom *sdom = data;
+
+    spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
+
+    list_del_init(&sdom->sdom_elem);
+
+    spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
+
+    xfree(data);
+}
+
+static void
+csched_dom_destroy(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom = CSCHED_DOM(dom);
-    int flags;
 
     BUG_ON(!list_empty(&sdom->vcpu));
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    list_del_init(&sdom->sdom_elem);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    xfree(CSCHED_DOM(dom));
+    csched_free_domdata(ops, CSCHED_DOM(dom));
 }
 
 /* How long should we let this vcpu run for? */
 static s_time_t
-csched_runtime(int cpu, struct csched_vcpu *snext)
+csched_runtime(struct scheduler *ops, int cpu, struct csched_vcpu *snext)
 {
     s_time_t time = CSCHED_MAX_TIMER;
-    struct csched_runqueue_data *rqd = RQD(cpu);
+    struct csched_runqueue_data *rqd = RQD(ops, cpu);
     struct list_head *runq = &rqd->runq;
 
     if ( is_idle_vcpu(snext->vcpu) )
@@ -851,10 +897,10 @@ void __dump_execstate(void *unused);
  * fast for the common case.
  */
 static struct task_slice
-csched_schedule(s_time_t now)
+csched_schedule(struct scheduler *ops, s_time_t now)
 {
     const int cpu = smp_processor_id();
-    struct csched_runqueue_data *rqd = RQD(cpu);
+    struct csched_runqueue_data *rqd = RQD(ops, cpu);
     struct list_head * const runq = &rqd->runq;
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
     struct csched_vcpu *snext = NULL;
@@ -927,7 +973,7 @@ csched_schedule(s_time_t now)
     }
 
     if ( !is_idle_vcpu(snext->vcpu) && snext->credit <= CSCHED_CREDIT_RESET )
-        reset_credit(cpu, now);
+        reset_credit(ops, cpu, now);
 
 #if 0
     /*
@@ -955,7 +1001,7 @@ csched_schedule(s_time_t now)
     /*
      * Return task to run next...
      */
-    ret.time = csched_runtime(cpu, snext);
+    ret.time = csched_runtime(ops, cpu, snext);
     ret.task = snext->vcpu;
 
     CSCHED_VCPU_CHECK(ret.task);
@@ -977,7 +1023,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 }
 
 static void
-csched_dump_pcpu(int cpu)
+csched_dump_pcpu(struct scheduler *ops, int cpu)
 {
     struct list_head *runq, *iter;
     struct csched_vcpu *svc;
@@ -986,7 +1032,7 @@ csched_dump_pcpu(int cpu)
 
     /* FIXME: Do locking properly for access to runqueue structures */
 
-    runq = &RQD(cpu)->runq;
+    runq = &RQD(ops, cpu)->runq;
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu));
     printk(" sibling=%s, ", cpustr);
@@ -1014,22 +1060,23 @@ csched_dump_pcpu(int cpu)
 }
 
 static void
-csched_dump(void)
+csched_dump(struct scheduler *ops)
 {
     struct list_head *iter_sdom, *iter_svc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     int loop;
 
     printk("info:\n"
            "\tncpus              = %u\n"
            "\tdefault-weight     = %d\n",
-           csched_priv.ncpus,
+           prv->ncpus,
            CSCHED_DEFAULT_WEIGHT);
 
     /* FIXME: Locking! */
 
     printk("active vcpus:\n");
     loop = 0;
-    list_for_each( iter_sdom, &csched_priv.sdom )
+    list_for_each( iter_sdom, &prv->sdom )
     {
         struct csched_dom *sdom;
         sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);
@@ -1046,42 +1093,49 @@ csched_dump(void)
 }
 
 static void
-make_runq_map(void)
+make_runq_map(struct scheduler *ops)
 {
     int cpu, cpu_count=0;
+    struct csched_private *prv = CSCHED_PRIV(ops);
 
     /* FIXME: Read pcpu layout and do this properly */
     for_each_possible_cpu( cpu )
     {
-        csched_priv.runq_map[cpu] = 0;
+        prv->runq_map[cpu] = 0;
         cpu_count++;
     }
-    csched_priv.runq_count = 1;
+    prv->runq_count = 1;
 
     /* Move to the init code...? */
-    csched_priv.rqd[0].cpu_min = 0;
-    csched_priv.rqd[0].cpu_max = cpu_count;
-}
-
-static void
-csched_init(void)
+    prv->rqd[0].cpu_min = 0;
+    prv->rqd[0].cpu_max = cpu_count;
+}
+
+static int
+csched_init(struct scheduler *ops, int pool0)
 {
     int i;
+    struct csched_private *prv;
 
     printk("Initializing Credit2 scheduler\n" \
            " WARNING: This is experimental software in development.\n" \
            " Use at your own risk.\n");
 
-    spin_lock_init(&csched_priv.lock);
-    INIT_LIST_HEAD(&csched_priv.sdom);
-
-    csched_priv.ncpus = 0;
-
-    make_runq_map();
-
-    for ( i=0; i<csched_priv.runq_count ; i++ )
-    {
-        struct csched_runqueue_data *rqd = csched_priv.rqd + i;
+    prv = xmalloc(struct csched_private);
+    if ( prv == NULL )
+        return 1;
+    memset(prv, 0, sizeof(*prv));
+
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->sdom);
+
+    prv->ncpus = 0;
+
+    make_runq_map(ops);
+
+    for ( i=0; i<prv->runq_count ; i++ )
+    {
+        struct csched_runqueue_data *rqd = prv->rqd + i;
 
         rqd->max_weight = 1;
         rqd->id = i;
@@ -1096,24 +1150,40 @@ csched_init(void)
         spinlock_t *lock;
 
         /* Point the per-cpu schedule lock to the runq_id lock */
-        runq_id = csched_priv.runq_map[i];
+        runq_id = prv->runq_map[i];
         lock = &per_cpu(schedule_data, runq_id)._lock;
 
         per_cpu(schedule_data, i).schedule_lock = lock;
 
-        csched_priv.ncpus++;
-    }
-}
+        prv->ncpus++;
+    }
+
+    return 0;
+}
+
+static void
+csched_deinit(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+
+static struct csched_private _csched_priv;
 
 struct scheduler sched_credit2_def = {
     .name           = "SMP Credit Scheduler rev2",
     .opt_name       = "credit2",
     .sched_id       = XEN_SCHEDULER_CREDIT2,
+    .sched_data     = &_csched_priv,
 
     .init_domain    = csched_dom_init,
     .destroy_domain = csched_dom_destroy,
 
-    .init_vcpu      = csched_vcpu_init,
+    .insert_vcpu    = csched_vcpu_insert,
     .destroy_vcpu   = csched_vcpu_destroy,
 
     .sleep          = csched_vcpu_sleep,
@@ -1128,4 +1198,9 @@ struct scheduler sched_credit2_def = {
     .dump_cpu_state = csched_dump_pcpu,
     .dump_settings  = csched_dump,
     .init           = csched_init,
+    .deinit         = csched_deinit,
+    .alloc_vdata    = csched_alloc_vdata,
+    .free_vdata     = csched_free_vdata,
+    .alloc_domdata  = csched_alloc_domdata,
+    .free_domdata   = csched_free_domdata,
 };
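
A second pattern worth noting in the credit2 hunks: vcpu setup is
split into alloc_vdata(), which only builds and returns the private
block (NULL on failure), and insert_vcpu(), which links an
already-attached block into the runqueues. That split is what lets
common code pre-allocate data for every vcpu before committing to a
pool move. A rough stand-alone model, with invented names and none of
the locking:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct vcpu { int id; void *sched_priv; };

    struct vcpu_priv {                 /* stand-in for csched_vcpu */
        struct vcpu *vcpu;
        int credit;
    };

    /* Build the private block only; no queues are touched and NULL is
     * returned on failure, so the caller can back out cleanly. */
    static void *alloc_vdata(struct vcpu *vc)
    {
        struct vcpu_priv *svc = malloc(sizeof(*svc));
        if (svc == NULL)
            return NULL;
        memset(svc, 0, sizeof(*svc));
        svc->vcpu = vc;
        svc->credit = 100;             /* stand-in for CSCHED_CREDIT_INIT */
        return svc;
    }

    /* Link an already-attached block in; in the patch this is where the
     * runqueue and sdom lists are updated under the scheduler lock. */
    static void insert_vcpu(struct vcpu *vc)
    {
        struct vcpu_priv *svc = vc->sched_priv;
        printf("vcpu %d inserted with credit %d\n", vc->id, svc->credit);
    }

    int main(void)
    {
        struct vcpu v = { .id = 0 };
        void *priv = alloc_vdata(&v);  /* step 1: allocate */
        if (priv == NULL)
            return 1;
        v.sched_priv = priv;           /* step 2: common code attaches it */
        insert_vcpu(&v);               /* step 3: scheduler links it in */
        free(priv);
        return 0;
    }
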
diff -r c7d7797656df -r de94884a669c xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c   Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/common/sched_sedf.c   Wed Apr 21 12:48:03 2010 +0100
@@ -20,6 +20,9 @@
         if ( (_f) <= SEDFLEVEL )                \
             printk(_a );                        \
     } while ( 0 )
+
+#define SEDF_CPUONLINE(_pool)                                             \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
 
 #ifndef NDEBUG
 #define SEDF_STATS
@@ -132,7 +135,7 @@ struct sedf_cpu_info {
 #define sedf_runnable(edom)  (!(EDOM_INFO(edom)->status & SEDF_ASLEEP))
 
 
-static void sedf_dump_cpu_state(int i);
+static void sedf_dump_cpu_state(struct scheduler *ops, int i);
 
 static inline int extraq_on(struct vcpu *d, int i)
 {
@@ -329,30 +332,17 @@ static inline void __add_to_runqueue_sor
 }
 
 
-static int sedf_init_vcpu(struct vcpu *v)
+static void *sedf_alloc_vdata(struct scheduler *ops, struct vcpu *v, void *dd)
 {
     struct sedf_vcpu_info *inf;
 
-    if ( (v->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL )
-        return -1;
-    memset(v->sched_priv, 0, sizeof(struct sedf_vcpu_info));
-
-    inf = EDOM_INFO(v);
+    inf = xmalloc(struct sedf_vcpu_info);
+    if ( inf == NULL )
+        return NULL;
+
+    memset(inf, 0, sizeof(struct sedf_vcpu_info));
     inf->vcpu = v;
- 
-    /* Allocate per-CPU context if this is the first domain to be added. */
-    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
-    {
-        per_cpu(schedule_data, v->processor).sched_priv = 
-            xmalloc(struct sedf_cpu_info);
-        BUG_ON(per_cpu(schedule_data, v->processor).sched_priv == NULL);
-        memset(CPU_INFO(v->processor), 0, sizeof(*CPU_INFO(v->processor)));
-        INIT_LIST_HEAD(WAITQ(v->processor));
-        INIT_LIST_HEAD(RUNQ(v->processor));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_PEN_Q));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_UTIL_Q));
-    }
-       
+
     /* Every VCPU gets an equal share of extratime by default. */
     inf->deadl_abs   = 0;
     inf->latency     = 0;
@@ -383,39 +373,88 @@ static int sedf_init_vcpu(struct vcpu *v
     }
     else
     {
-        EDOM_INFO(v)->deadl_abs = 0;
-        EDOM_INFO(v)->status &= ~SEDF_ASLEEP;
-    }
-
-    return 0;
-}
-
-static void sedf_destroy_vcpu(struct vcpu *v)
-{
-    xfree(v->sched_priv);
-}
-
-static int sedf_init_domain(struct domain *d)
-{
-    d->sched_priv = xmalloc(struct sedf_dom_info);
+        inf->deadl_abs = 0;
+        inf->status &= ~SEDF_ASLEEP;
+    }
+
+    return inf;
+}
+
+static void *
+sedf_alloc_pdata(struct scheduler *ops, int cpu)
+{
+    struct sedf_cpu_info *spc;
+
+    spc = xmalloc(struct sedf_cpu_info);
+    BUG_ON(spc == NULL);
+    memset(spc, 0, sizeof(*spc));
+    INIT_LIST_HEAD(&spc->waitq);
+    INIT_LIST_HEAD(&spc->runnableq);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]);
+
+    return (void *)spc;
+}
+
+static void
+sedf_free_pdata(struct scheduler *ops, void *spc, int cpu)
+{
+    if ( spc == NULL )
+        return;
+
+    xfree(spc);
+}
+
+static void sedf_free_vdata(struct scheduler *ops, void *priv)
+{
+    xfree(priv);
+}
+
+static void sedf_destroy_vcpu(struct scheduler *ops, struct vcpu *v)
+{
+    sedf_free_vdata(ops, v->sched_priv);
+}
+
+static void *
+sedf_alloc_domdata(struct scheduler *ops, struct domain *d)
+{
+    void *mem;
+
+    mem = xmalloc(struct sedf_dom_info);
+    if ( mem == NULL )
+        return NULL;
+
+    memset(mem, 0, sizeof(struct sedf_dom_info));
+
+    return mem;
+}
+
+static int sedf_init_domain(struct scheduler *ops, struct domain *d)
+{
+    d->sched_priv = sedf_alloc_domdata(ops, d);
     if ( d->sched_priv == NULL )
         return -ENOMEM;
 
-    memset(d->sched_priv, 0, sizeof(struct sedf_dom_info));
-
     return 0;
 }
 
-static void sedf_destroy_domain(struct domain *d)
-{
-    xfree(d->sched_priv);
-}
-
-static int sedf_pick_cpu(struct vcpu *v)
+static void sedf_free_domdata(struct scheduler *ops, void *data)
+{
+    xfree(data);
+}
+
+static void sedf_destroy_domain(struct scheduler *ops, struct domain *d)
+{
+    sedf_free_domdata(ops, d->sched_priv);
+}
+
+static int sedf_pick_cpu(struct scheduler *ops, struct vcpu *v)
 {
     cpumask_t online_affinity;
-
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    cpumask_t *online;
+
+    online = SEDF_CPUONLINE(v->domain->cpupool);
+    cpus_and(online_affinity, v->cpu_affinity, *online);
     return first_cpu(online_affinity);
 }
 
@@ -751,7 +790,7 @@ static struct task_slice sedf_do_extra_s
    -timeslice for the current period used up
    -domain on waitqueue has started it's period
    -and various others ;) in general: determine which domain to run next*/
-static struct task_slice sedf_do_schedule(s_time_t now)
+static struct task_slice sedf_do_schedule(struct scheduler *ops, s_time_t now)
 {
     int                   cpu      = smp_processor_id();
     struct list_head     *runq     = RUNQ(cpu);
@@ -786,6 +825,13 @@ static struct task_slice sedf_do_schedul
     }
  check_waitq:
     update_queues(now, runq, waitq);
+
+    if ( unlikely(!cpu_isset(cpu, *SEDF_CPUONLINE(per_cpu(cpupool, cpu)))) )
+    {
+        ret.task = IDLETASK(cpu);
+        ret.time = SECONDS(1);
+        goto sched_done;
+    }
  
     /*now simply pick the first domain from the runqueue, which has the
       earliest deadline, because the list is sorted*/
@@ -824,6 +870,7 @@ static struct task_slice sedf_do_schedul
                                      extraq, cpu);
     }
 
+  sched_done:
     /*TODO: Do something USEFUL when this happens and find out, why it
       still can happen!!!*/
     if ( ret.time < 0)
@@ -841,7 +888,7 @@ static struct task_slice sedf_do_schedul
 }
 
 
-static void sedf_sleep(struct vcpu *d)
+static void sedf_sleep(struct scheduler *ops, struct vcpu *d)
 {
     PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",
           d->domain->domain_id, d->vcpu_id);
@@ -1060,7 +1107,7 @@ static inline int should_switch(struct v
     return 1;
 }
 
-static void sedf_wake(struct vcpu *d)
+static void sedf_wake(struct scheduler *ops, struct vcpu *d)
 {
     s_time_t              now = NOW();
     struct sedf_vcpu_info* inf = EDOM_INFO(d);
@@ -1213,8 +1260,8 @@ static void sedf_dump_domain(struct vcpu
 }
 
 
-/* dumps all domains on hte specified cpu */
-static void sedf_dump_cpu_state(int i)
+/* dumps all domains on the specified cpu */
+static void sedf_dump_cpu_state(struct scheduler *ops, int i)
 {
     struct list_head      *list, *queue, *tmp;
     struct sedf_vcpu_info *d_inf;
@@ -1287,7 +1334,7 @@ static void sedf_dump_cpu_state(int i)
 
 
 /* Adjusts periods and slices of the domains accordingly to their weights. */
-static int sedf_adjust_weights(struct xen_domctl_scheduler_op *cmd)
+static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_scheduler_op *cmd)
 {
     struct vcpu *p;
     struct domain      *d;
@@ -1308,6 +1355,8 @@ static int sedf_adjust_weights(struct xe
     rcu_read_lock(&domlist_read_lock);
     for_each_domain( d )
     {
+        if ( c != d->cpupool )
+            continue;
         for_each_vcpu( d, p )
         {
             if ( EDOM_INFO(p)->weight )
@@ -1359,7 +1408,7 @@ static int sedf_adjust_weights(struct xe
 
 
 /* set or fetch domain scheduling parameters */
-static int sedf_adjust(struct domain *p, struct xen_domctl_scheduler_op *op)
+static int sedf_adjust(struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op)
 {
     struct vcpu *v;
     int rc;
@@ -1368,9 +1417,6 @@ static int sedf_adjust(struct domain *p,
           "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n",
           p->domain_id, op->u.sedf.period, op->u.sedf.slice,
           op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no");
-
-    if ( !p->vcpu )
-        return -EINVAL;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo )
     {
@@ -1421,7 +1467,7 @@ static int sedf_adjust(struct domain *p,
             }
         }
 
-        rc = sedf_adjust_weights(op);
+        rc = sedf_adjust_weights(p->cpupool, op);
         if ( rc )
             return rc;
 
@@ -1449,7 +1495,7 @@ static int sedf_adjust(struct domain *p,
     return 0;
 }
 
-const struct scheduler sched_sedf_def = {
+struct scheduler sched_sedf_def = {
     .name     = "Simple EDF Scheduler",
     .opt_name = "sedf",
     .sched_id = XEN_SCHEDULER_SEDF,
@@ -1457,8 +1503,14 @@ const struct scheduler sched_sedf_def = 
     .init_domain    = sedf_init_domain,
     .destroy_domain = sedf_destroy_domain,
 
-    .init_vcpu      = sedf_init_vcpu,
     .destroy_vcpu   = sedf_destroy_vcpu,
+
+    .alloc_vdata    = sedf_alloc_vdata,
+    .free_vdata     = sedf_free_vdata,
+    .alloc_pdata    = sedf_alloc_pdata,
+    .free_pdata     = sedf_free_pdata,
+    .alloc_domdata  = sedf_alloc_domdata,
+    .free_domdata   = sedf_free_domdata,
 
     .do_schedule    = sedf_do_schedule,
     .pick_cpu       = sedf_pick_cpu,
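
Both sedf and credit2 now derive the set of usable cpus from the
cpupool: SEDF_CPUONLINE() and CSCHED_CPUONLINE() return the pool's
cpu_valid mask, or the global free-cpu mask when no pool applies, and
affinity is intersected with that instead of cpu_online_map. A toy
model of that selection, with a cpumask reduced to a plain bitmask and
all values invented:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t cpumask_t;

    static cpumask_t cpupool_free_cpus = 0xf0;   /* cpus 4-7 unassigned */

    struct cpupool { int id; cpumask_t cpu_valid; };

    /* Stand-in for SEDF_CPUONLINE()/CSCHED_CPUONLINE(). */
    static const cpumask_t *pool_online(const struct cpupool *pool)
    {
        return (pool == NULL) ? &cpupool_free_cpus : &pool->cpu_valid;
    }

    int main(void)
    {
        struct cpupool pool0 = { .id = 0, .cpu_valid = 0x0f }; /* cpus 0-3 */
        cpumask_t affinity = 0x3c;                             /* cpus 2-5 */

        /* Affinity restricted to whichever mask applies. */
        printf("in pool0: %#llx\n",
               (unsigned long long)(affinity & *pool_online(&pool0)));
        printf("no pool:  %#llx\n",
               (unsigned long long)(affinity & *pool_online(NULL)));
        return 0;
    }
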
diff -r c7d7797656df -r de94884a669c xen/common/schedule.c
--- a/xen/common/schedule.c     Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/common/schedule.c     Wed Apr 21 12:48:03 2010 +0100
@@ -53,11 +53,12 @@ static void poll_timer_fn(void *data);
 
 /* This is global for now so that private implementations can reach it */
 DEFINE_PER_CPU(struct schedule_data, schedule_data);
+DEFINE_PER_CPU(struct scheduler *, scheduler);
 
 extern const struct scheduler sched_sedf_def;
 extern const struct scheduler sched_credit_def;
 extern const struct scheduler sched_credit2_def;
-static const struct scheduler *__initdata schedulers[] = {
+static const struct scheduler *schedulers[] = {
     &sched_sedf_def,
     &sched_credit_def,
     &sched_credit2_def,
@@ -66,9 +67,15 @@ static const struct scheduler *__initdat
 
 static struct scheduler __read_mostly ops;
 
-#define SCHED_OP(fn, ...)                                 \
-         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
-          : (typeof(ops.fn(__VA_ARGS__)))0 )
+#define SCHED_OP(opsptr, fn, ...)                                          \
+         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
+          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
+
+#define DOM2OP(_d)    (((_d)->cpupool == NULL) ? &ops : &((_d)->cpupool->sched))
+#define VCPU2OP(_v)   (DOM2OP((_v)->domain))
+#define VCPU2ONLINE(_v)                                                    \
+         (((_v)->domain->cpupool == NULL) ? &cpu_online_map                \
+         : &(_v)->domain->cpupool->cpu_valid)
 
 static inline void trace_runstate_change(struct vcpu *v, int new_state)
 {
@@ -209,7 +216,86 @@ int sched_init_vcpu(struct vcpu *v, unsi
 
     TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
 
-    return SCHED_OP(init_vcpu, v);
+    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
+    {
+        per_cpu(schedule_data, v->processor).sched_priv =
+            SCHED_OP(DOM2OP(d), alloc_pdata, processor);
+        if ( per_cpu(schedule_data, v->processor).sched_priv == NULL )
+            return 1;
+    }
+
+    v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
+    if ( v->sched_priv == NULL )
+        return 1;
+
+    if ( is_idle_domain(d) )
+        per_cpu(schedule_data, v->processor).sched_idlevpriv = v->sched_priv;
+
+    return 0;
+}
+
+int sched_move_domain(struct domain *d, struct cpupool *c)
+{
+    struct vcpu *v;
+    unsigned int new_p;
+    void **vcpu_priv;
+    void *domdata;
+
+    domdata = SCHED_OP(&(c->sched), alloc_domdata, d);
+    if ( domdata == NULL )
+        return -ENOMEM;
+
+    vcpu_priv = xmalloc_array(void *, d->max_vcpus);
+    if ( vcpu_priv == NULL )
+    {
+        SCHED_OP(&(c->sched), free_domdata, domdata);
+        return -ENOMEM;
+    }
+
+    memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
+    for_each_vcpu ( d, v )
+    {
+        vcpu_priv[v->vcpu_id] = SCHED_OP(&(c->sched), alloc_vdata, v, domdata);
+        if ( vcpu_priv[v->vcpu_id] == NULL )
+        {
+            for_each_vcpu ( d, v )
+            {
+                if ( vcpu_priv[v->vcpu_id] != NULL )
+                    xfree(vcpu_priv[v->vcpu_id]);
+            }
+            xfree(vcpu_priv);
+            SCHED_OP(&(c->sched), free_domdata, domdata);
+            return -ENOMEM;
+        }
+    }
+
+    domain_pause(d);
+
+    new_p = first_cpu(c->cpu_valid);
+    for_each_vcpu ( d, v )
+    {
+        migrate_timer(&v->periodic_timer, new_p);
+        migrate_timer(&v->singleshot_timer, new_p);
+        migrate_timer(&v->poll_timer, new_p);
+
+        SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
+
+        cpus_setall(v->cpu_affinity);
+        v->processor = new_p;
+        v->sched_priv = vcpu_priv[v->vcpu_id];
+
+        new_p = cycle_cpu(new_p, c->cpu_valid);
+    }
+
+    d->cpupool = c;
+    SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
+    d->sched_priv = domdata;
+
+    domain_unpause(d);
+
+    xfree(vcpu_priv);
+
+    return 0;
 }
 
 void sched_destroy_vcpu(struct vcpu *v)
@@ -219,17 +305,17 @@ void sched_destroy_vcpu(struct vcpu *v)
     kill_timer(&v->poll_timer);
     if ( test_and_clear_bool(v->is_urgent) )
         atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
-    SCHED_OP(destroy_vcpu, v);
+    SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
 }
 
 int sched_init_domain(struct domain *d)
 {
-    return SCHED_OP(init_domain, d);
+    return SCHED_OP(DOM2OP(d), init_domain, d);
 }
 
 void sched_destroy_domain(struct domain *d)
 {
-    SCHED_OP(destroy_domain, d);
+    SCHED_OP(DOM2OP(d), destroy_domain, d);
 }
 
 void vcpu_sleep_nosync(struct vcpu *v)
@@ -243,7 +329,7 @@ void vcpu_sleep_nosync(struct vcpu *v)
         if ( v->runstate.state == RUNSTATE_runnable )
             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
 
-        SCHED_OP(sleep, v);
+        SCHED_OP(VCPU2OP(v), sleep, v);
     }
 
     vcpu_schedule_unlock_irqrestore(v, flags);
@@ -271,7 +357,7 @@ void vcpu_wake(struct vcpu *v)
     {
         if ( v->runstate.state >= RUNSTATE_blocked )
             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
-        SCHED_OP(wake, v);
+        SCHED_OP(VCPU2OP(v), wake, v);
     }
     else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
     {
@@ -326,7 +412,7 @@ static void vcpu_migrate(struct vcpu *v)
 
     /* Select new CPU. */
     old_cpu = v->processor;
-    new_cpu = SCHED_OP(pick_cpu, v);
+    new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);
 
     /*
      * Transfer urgency status to new CPU before switching CPUs, as once
@@ -369,19 +455,29 @@ void vcpu_force_reschedule(struct vcpu *
 }
 
 /*
- * This function is used by cpu_hotplug code from stop_machine context.
- * Hence we can avoid needing to take certain locks.
+ * This function is used by cpu_hotplug code from stop_machine context
+ * and from cpupools to switch schedulers on a cpu.
  */
-void cpu_disable_scheduler(void)
+int cpu_disable_scheduler(unsigned int cpu)
 {
     struct domain *d;
     struct vcpu *v;
-    unsigned int cpu = smp_processor_id();
+    struct cpupool *c;
+    int    ret = 0;
+
+    c = per_cpu(cpupool, cpu);
+    if ( c == NULL )
+        return ret;
 
     for_each_domain ( d )
     {
+        if ( d->cpupool != c )
+            continue;
+
         for_each_vcpu ( d, v )
         {
+            vcpu_schedule_lock_irq(v);
+
             if ( (cpus_weight(v->cpu_affinity) == 1) &&
                  cpu_isset(cpu, v->cpu_affinity) )
             {
@@ -395,26 +491,46 @@ void cpu_disable_scheduler(void)
              * be chosen when the timer is next re-set.
              */
             if ( v->singleshot_timer.cpu == cpu )
-                migrate_timer(&v->singleshot_timer, 0);
+            {
+                int cpu_mig = first_cpu(c->cpu_valid);
+                if ( cpu_mig == cpu )
+                    cpu_mig = next_cpu(cpu_mig, c->cpu_valid);
+                migrate_timer(&v->singleshot_timer, cpu_mig);
+            }
 
             if ( v->processor == cpu )
             {
                 set_bit(_VPF_migrating, &v->pause_flags);
+                vcpu_schedule_unlock_irq(v);
                 vcpu_sleep_nosync(v);
                 vcpu_migrate(v);
             }
+            else
+            {
+                vcpu_schedule_unlock_irq(v);
+            }
+
+            /*
+             * A vcpu active in the hypervisor will not be migratable.
+             * The caller should try again after releasing and reacquiring
+             * all locks.
+             */
+            if ( v->processor == cpu )
+                ret = -EAGAIN;
         }
     }
+    return ret;
 }
 
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
     cpumask_t online_affinity, old_affinity;
+    cpumask_t *online;
 
     if ( v->domain->is_pinned )
         return -EINVAL;
-
-    cpus_and(online_affinity, *affinity, cpu_online_map);
+    online = VCPU2ONLINE(v);
+    cpus_and(online_affinity, *affinity, *online);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
@@ -723,7 +839,7 @@ long sched_adjust(struct domain *d, stru
     struct vcpu *v;
     long ret;
     
-    if ( (op->sched_id != ops.sched_id) ||
+    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
          ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
           (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
         return -EINVAL;
@@ -750,7 +866,7 @@ long sched_adjust(struct domain *d, stru
     if ( d == current->domain )
         vcpu_schedule_lock_irq(current);
 
-    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
+    if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
 
     if ( d == current->domain )
@@ -797,6 +913,7 @@ static void schedule(void)
 {
     struct vcpu          *prev = current, *next = NULL;
     s_time_t              now = NOW();
+    struct scheduler     *sched = this_cpu(scheduler);
     struct schedule_data *sd;
     struct task_slice     next_slice;
 
@@ -812,7 +929,7 @@ static void schedule(void)
     stop_timer(&sd->s_timer);
     
     /* get policy-specific decision on scheduling... */
-    next_slice = ops.do_schedule(now);
+    next_slice = sched->do_schedule(sched, now);
 
     next = next_slice.task;
 
@@ -871,6 +988,10 @@ static void schedule(void)
     update_vcpu_system_time(next);
     vcpu_periodic_timer_work(next);
 
+    TRACE_4D(TRC_SCHED_SWITCH,
+             prev->domain->domain_id, prev->vcpu_id,
+             next->domain->domain_id, next->vcpu_id);
+
     context_switch(prev, next);
 }
 
@@ -884,7 +1005,7 @@ void context_saved(struct vcpu *prev)
     /* Check for migration request /after/ clearing running flag. */
     smp_mb();
 
-    SCHED_OP(context_saved, prev);
+    SCHED_OP(VCPU2OP(prev), context_saved, prev);
 
     if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
         vcpu_migrate(prev);
@@ -920,6 +1041,19 @@ static void poll_timer_fn(void *data)
         vcpu_unblock(v);
 }
 
+/* Get scheduler by id */
+const struct scheduler *scheduler_get_by_id(unsigned int id)
+{
+    int i;
+
+    for ( i = 0; schedulers[i] != NULL; i++ )
+    {
+        if ( schedulers[i]->sched_id == id )
+            return schedulers[i];
+    }
+    return NULL;
+}
+
 /* Initialise the data structures. */
 void __init scheduler_init(void)
 {
@@ -927,64 +1061,138 @@ void __init scheduler_init(void)
 
     open_softirq(SCHEDULE_SOFTIRQ, schedule);
 
+    for ( i = 0; schedulers[i] != NULL; i++ )
+    {
+        ops = *schedulers[i];
+        if ( strcmp(ops.opt_name, opt_sched) == 0 )
+            break;
+    }
+    
+    if ( schedulers[i] == NULL )
+    {
+        printk("Could not find scheduler: %s\n", opt_sched);
+        ops = *schedulers[0];
+    }
+
     for_each_possible_cpu ( i )
     {
+        per_cpu(scheduler, i) = &ops;
         spin_lock_init(&per_cpu(schedule_data, i)._lock);
         per_cpu(schedule_data, i).schedule_lock
             = &per_cpu(schedule_data, i)._lock;
         init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
     }
 
-    for ( i = 0; schedulers[i] != NULL; i++ )
-    {
-        ops = *schedulers[i];
-        if ( strcmp(ops.opt_name, opt_sched) == 0 )
+    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
+    if ( SCHED_OP(&ops, init, 1) )
+        panic("scheduler returned error on init\n");
+}
+
+/* switch scheduler on cpu */
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
+{
+    unsigned long flags;
+    struct vcpu *v;
+    void *vpriv = NULL;
+    void *ppriv;
+    void *ppriv_old;
+    struct scheduler *old_ops;
+    struct scheduler *new_ops;
+
+    old_ops = per_cpu(scheduler, cpu);
+    new_ops = (c == NULL) ? &ops : &(c->sched);
+    v = per_cpu(schedule_data, cpu).idle;
+    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
+    if ( c != NULL )
+        vpriv = SCHED_OP(new_ops, alloc_vdata, v, v->domain->sched_priv);
+
+    spin_lock_irqsave(per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( c == NULL )
+    {
+        vpriv = v->sched_priv;
+        v->sched_priv = per_cpu(schedule_data, cpu).sched_idlevpriv;
+    }
+    else
+    {
+        v->sched_priv = vpriv;
+        vpriv = NULL;
+    }
+    SCHED_OP(old_ops, tick_suspend, cpu);
+    per_cpu(scheduler, cpu) = new_ops;
+    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
+    per_cpu(schedule_data, cpu).sched_priv = ppriv;
+    SCHED_OP(new_ops, tick_resume, cpu);
+    SCHED_OP(new_ops, insert_vcpu, v);
+
+    spin_unlock_irqrestore(per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( vpriv != NULL )
+        SCHED_OP(old_ops, free_vdata, vpriv);
+    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
+}
+
+/* init scheduler global data */
+int schedule_init_global(char *name, struct scheduler *sched)
+{
+    int i;
+    const struct scheduler *data;
+
+    data = &ops;
+    for ( i = 0; (schedulers[i] != NULL) && (name != NULL) ; i++ )
+    {
+        if ( strcmp(schedulers[i]->opt_name, name) == 0 )
+        {
+            data = schedulers[i];
             break;
-    }
-    
-    if ( schedulers[i] == NULL )
-    {
-        printk("Could not find scheduler: %s\n", opt_sched);
-        ops = *schedulers[0];
-    }
-
-    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(init);
-}
-
-void dump_runq(unsigned char key)
-{
-    s_time_t      now = NOW();
-    int           i;
-    unsigned long flags;
-
-    local_irq_save(flags);
-
-    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(dump_settings);
-    printk("sched_smt_power_savings: %s\n",
-            sched_smt_power_savings? "enabled":"disabled");
-    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
-
-    for_each_online_cpu ( i )
+        }
+    }
+    memcpy(sched, data, sizeof(*sched));
+    return SCHED_OP(sched, init, 0);
+}
+
+/* deinitialize scheduler global data */
+void schedule_deinit_global(struct scheduler *sched)
+{
+    SCHED_OP(sched, deinit);
+}
+
+void schedule_dump(struct cpupool *c)
+{
+    int               i;
+    struct scheduler *sched;
+    cpumask_t        *cpus;
+
+    sched = (c == NULL) ? &ops : &(c->sched);
+    cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
+    printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
+    SCHED_OP(sched, dump_settings);
+
+    for_each_cpu_mask (i, *cpus)
     {
         spin_lock(per_cpu(schedule_data, i).schedule_lock);
         printk("CPU[%02d] ", i);
-        SCHED_OP(dump_cpu_state, i);
+        SCHED_OP(sched, dump_cpu_state, i);
         spin_unlock(per_cpu(schedule_data, i).schedule_lock);
     }
-
-    local_irq_restore(flags);
 }
 
 void sched_tick_suspend(void)
 {
-    SCHED_OP(tick_suspend);
+    struct scheduler *sched;
+    unsigned int cpu = smp_processor_id();
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_suspend, cpu);
 }
 
 void sched_tick_resume(void)
 {
-    SCHED_OP(tick_resume);
+    struct scheduler *sched;
+    unsigned int cpu = smp_processor_id();
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_resume, cpu);
 }
 
 #ifdef CONFIG_COMPAT
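
The schedule.c hunks above rework SCHED_OP() so the scheduler instance
is an explicit argument and is also forwarded as the hook's first
parameter, with a typed zero substituted for hooks an implementation
leaves NULL. The sketch below reproduces that dispatch shape in
stand-alone GNU C (the patch's macro likewise relies on typeof and
##__VA_ARGS__, so build with gcc); the hook names here are invented:

    #include <stdio.h>

    struct scheduler {
        const char *name;
        int  (*pick_cpu)(struct scheduler *, int vcpu);
        void (*tick_resume)(struct scheduler *, unsigned int cpu);
    };

    /* Same shape as the patch's SCHED_OP(): call the hook if present,
     * forwarding the ops pointer; otherwise yield a zero of the hook's
     * return type (a no-op for void hooks). */
    #define SCHED_OP(opsptr, fn, ...)                                     \
        (((opsptr)->fn != NULL) ? (opsptr)->fn(opsptr, ##__VA_ARGS__)     \
         : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0)

    static int my_pick_cpu(struct scheduler *ops, int vcpu)
    {
        printf("%s picks a cpu for vcpu %d\n", ops->name, vcpu);
        return vcpu % 4;
    }

    int main(void)
    {
        struct scheduler s = { .name = "demo", .pick_cpu = my_pick_cpu };

        int cpu = SCHED_OP(&s, pick_cpu, 7);  /* dispatches to my_pick_cpu */
        printf("-> cpu %d\n", cpu);
        SCHED_OP(&s, tick_resume, 0u);        /* NULL hook: typed no-op */
        return 0;
    }
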
diff -r c7d7797656df -r de94884a669c xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/include/public/domctl.h       Wed Apr 21 12:48:03 2010 +0100
@@ -35,7 +35,7 @@
 #include "xen.h"
 #include "grant_table.h"
 
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000006
+#define XEN_DOMCTL_INTERFACE_VERSION 0x00000007
 
 struct xenctl_cpumap {
     XEN_GUEST_HANDLE_64(uint8) bitmap;
@@ -60,10 +60,10 @@ struct xen_domctl_createdomain {
  /* Should domain memory integrity be verifed by tboot during Sx? */
 #define _XEN_DOMCTL_CDF_s3_integrity  2
 #define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
-    uint32_t flags;
  /* Disable out-of-sync shadow page tables? */
 #define _XEN_DOMCTL_CDF_oos_off       3
 #define XEN_DOMCTL_CDF_oos_off        (1U<<_XEN_DOMCTL_CDF_oos_off)
+    uint32_t flags;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
@@ -106,6 +106,7 @@ struct xen_domctl_getdomaininfo {
     uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
     uint32_t ssidref;
     xen_domain_handle_t handle;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
@@ -784,6 +785,30 @@ struct xen_domctl_mem_sharing_op {
 };
 typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t);
+
+/*
+ * cpupool operations
+ */
+/* XEN_DOMCTL_cpupool_op */
+#define XEN_DOMCTL_CPUPOOL_OP_CREATE                1  /* C */
+#define XEN_DOMCTL_CPUPOOL_OP_DESTROY               2  /* D */
+#define XEN_DOMCTL_CPUPOOL_OP_INFO                  3  /* I */
+#define XEN_DOMCTL_CPUPOOL_OP_ADDCPU                4  /* A */
+#define XEN_DOMCTL_CPUPOOL_OP_RMCPU                 5  /* R */
+#define XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN            6  /* M */
+#define XEN_DOMCTL_CPUPOOL_OP_FREEINFO              7  /* F */
+#define XEN_DOMCTL_CPUPOOL_PAR_ANY     0xFFFFFFFF
+struct xen_domctl_cpupool_op {
+    uint32_t op;          /* IN */
+    uint32_t cpupool_id;  /* IN: CDIARM OUT: CI */
+    uint32_t sched_id;    /* IN: C      OUT: I  */
+    uint32_t domid;       /* IN: M              */
+    uint32_t cpu;         /* IN: AR             */
+    uint32_t n_dom;       /*            OUT: I  */
+    struct xenctl_cpumap cpumap; /*     OUT: IF */
+};
+typedef struct xen_domctl_cpupool_op xen_domctl_cpupool_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpupool_op_t);
 
 
 struct xen_domctl {
@@ -846,6 +871,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_gettscinfo                    59
 #define XEN_DOMCTL_settscinfo                    60
 #define XEN_DOMCTL_getpageframeinfo3             61
+#define XEN_DOMCTL_cpupool_op                    62
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -894,6 +920,7 @@ struct xen_domctl {
         struct xen_domctl_debug_op          debug_op;
         struct xen_domctl_mem_event_op      mem_event_op;
         struct xen_domctl_mem_sharing_op    mem_sharing_op;
+        struct xen_domctl_cpupool_op        cpupool_op;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
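
For the new XEN_DOMCTL_cpupool_op interface, the per-field letters in
the comments above record which operations read or write each field
(for example, cpu is input for ADDCPU and RMCPU only). The fragment
below is a guess at how a toolstack might fill the structure for a
create-then-add-cpu sequence; the hypercall plumbing is elided, the
struct is trimmed, and the XEN_SCHEDULER_CREDIT value is assumed
rather than taken from this patch:

    #include <stdint.h>
    #include <stdio.h>

    #define XEN_DOMCTL_CPUPOOL_OP_CREATE 1
    #define XEN_DOMCTL_CPUPOOL_OP_ADDCPU 4
    #define XEN_DOMCTL_CPUPOOL_PAR_ANY   0xFFFFFFFFu
    #define XEN_SCHEDULER_CREDIT         5   /* value assumed for the sketch */

    struct cpupool_op {          /* trimmed stand-in for the real struct */
        uint32_t op, cpupool_id, sched_id, domid, cpu;
    };

    int main(void)
    {
        /* Create a pool with any free id, running the credit scheduler. */
        struct cpupool_op create = {
            .op = XEN_DOMCTL_CPUPOOL_OP_CREATE,
            .cpupool_id = XEN_DOMCTL_CPUPOOL_PAR_ANY,
            .sched_id = XEN_SCHEDULER_CREDIT,
        };
        /* Then assign cpu 2 to it; real code would use the pool id the
         * create op returned rather than a constant. */
        struct cpupool_op addcpu = {
            .op = XEN_DOMCTL_CPUPOOL_OP_ADDCPU,
            .cpupool_id = 1,
            .cpu = 2,
        };
        printf("create op=%u, addcpu cpu=%u\n",
               (unsigned)create.op, (unsigned)addcpu.cpu);
        return 0;
    }
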
diff -r c7d7797656df -r de94884a669c xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/include/xen/sched-if.h        Wed Apr 21 12:48:03 2010 +0100
@@ -9,6 +9,12 @@
 #define __XEN_SCHED_IF_H__
 
 #include <xen/percpu.h>
+
+/* A global pointer to the initial cpupool (POOL0). */
+extern struct cpupool *cpupool0;
+
+/* cpus currently in no cpupool */
+extern cpumask_t cpupool_free_cpus;
 
 /*
  * In order to allow a scheduler to remap the lock->cpu mapping,
@@ -26,11 +32,14 @@ struct schedule_data {
     struct vcpu        *curr;           /* current task                    */
     struct vcpu        *idle;           /* idle task for this cpu          */
     void               *sched_priv;
+    void               *sched_idlevpriv; /* default scheduler vcpu data    */
     struct timer        s_timer;        /* scheduling timer                */
     atomic_t            urgent_count;   /* how many urgent vcpus           */
 } __cacheline_aligned;
 
 DECLARE_PER_CPU(struct schedule_data, schedule_data);
+DECLARE_PER_CPU(struct scheduler *, scheduler);
+DECLARE_PER_CPU(struct cpupool *, cpupool);
 
 static inline void vcpu_schedule_lock(struct vcpu *v)
 {
@@ -78,29 +87,50 @@ struct scheduler {
     char *name;             /* full name for this scheduler      */
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
+    void *sched_data;       /* global data pointer               */
 
-    void         (*init)           (void);
+    int          (*init)           (struct scheduler *, int);
+    void         (*deinit)         (struct scheduler *);
 
-    int          (*init_domain)    (struct domain *);
-    void         (*destroy_domain) (struct domain *);
+    void         (*free_vdata)     (struct scheduler *, void *);
+    void *       (*alloc_vdata)    (struct scheduler *, struct vcpu *,
+                                    void *);
+    void         (*free_pdata)     (struct scheduler *, void *, int);
+    void *       (*alloc_pdata)    (struct scheduler *, int);
+    void         (*free_domdata)   (struct scheduler *, void *);
+    void *       (*alloc_domdata)  (struct scheduler *, struct domain *);
 
-    int          (*init_vcpu)      (struct vcpu *);
-    void         (*destroy_vcpu)   (struct vcpu *);
+    int          (*init_domain)    (struct scheduler *, struct domain *);
+    void         (*destroy_domain) (struct scheduler *, struct domain *);
 
-    void         (*sleep)          (struct vcpu *);
-    void         (*wake)           (struct vcpu *);
-    void         (*context_saved)  (struct vcpu *);
+    void         (*insert_vcpu)    (struct scheduler *, struct vcpu *);
+    void         (*destroy_vcpu)   (struct scheduler *, struct vcpu *);
 
-    struct task_slice (*do_schedule) (s_time_t);
+    void         (*sleep)          (struct scheduler *, struct vcpu *);
+    void         (*wake)           (struct scheduler *, struct vcpu *);
+    void         (*context_saved)  (struct scheduler *, struct vcpu *);
 
-    int          (*pick_cpu)       (struct vcpu *);
-    int          (*adjust)         (struct domain *,
+    struct task_slice (*do_schedule) (struct scheduler *, s_time_t);
+
+    int          (*pick_cpu)       (struct scheduler *, struct vcpu *);
+    int          (*adjust)         (struct scheduler *, struct domain *,
                                     struct xen_domctl_scheduler_op *);
-    void         (*dump_settings)  (void);
-    void         (*dump_cpu_state) (int);
+    void         (*dump_settings)  (struct scheduler *);
+    void         (*dump_cpu_state) (struct scheduler *, int);
 
-    void         (*tick_suspend)    (void);
-    void         (*tick_resume)     (void);
+    void         (*tick_suspend)    (struct scheduler *, unsigned int);
+    void         (*tick_resume)     (struct scheduler *, unsigned int);
 };
 
+struct cpupool
+{
+    int              cpupool_id;
+    cpumask_t        cpu_valid;      /* all cpus assigned to pool */
+    struct cpupool   *next;
+    unsigned int     n_dom;
+    struct scheduler sched;
+};
+
+const struct scheduler *scheduler_get_by_id(unsigned int id);
+
 #endif /* __XEN_SCHED_IF_H__ */
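
Note that struct cpupool embeds a full struct scheduler by value
rather than a pointer: each pool gets a private copy of the chosen
scheduler template, matched by opt_name and copied with memcpy, as
schedule_init_global() in the schedule.c hunks does, and that copy's
sched_data then points at pool-local state. A compact stand-alone
model of the copy-the-template step, with invented values:

    #include <stdio.h>
    #include <string.h>

    struct scheduler { const char *opt_name; int sched_id; };

    static const struct scheduler sched_credit_def = { "credit", 5 };
    static const struct scheduler sched_sedf_def   = { "sedf",   4 };
    static const struct scheduler *schedulers[] = {
        &sched_sedf_def, &sched_credit_def, NULL,
    };

    struct cpupool { int cpupool_id; struct scheduler sched; };

    /* Same shape as schedule_init_global(): find the template by option
     * name, fall back to a default, and copy it into the pool. */
    static int pool_sched_init(struct cpupool *c, const char *name)
    {
        const struct scheduler *data = schedulers[0];   /* default */
        for (int i = 0; schedulers[i] != NULL && name != NULL; i++)
            if (strcmp(schedulers[i]->opt_name, name) == 0) {
                data = schedulers[i];
                break;
            }
        memcpy(&c->sched, data, sizeof(c->sched));      /* private copy */
        return 0;
    }

    int main(void)
    {
        struct cpupool pool1 = { .cpupool_id = 1 };
        pool_sched_init(&pool1, "credit");
        printf("pool %d runs \"%s\" (id %d)\n", pool1.cpupool_id,
               pool1.sched.opt_name, pool1.sched.sched_id);
        return 0;
    }
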
diff -r c7d7797656df -r de94884a669c xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Apr 21 08:31:31 2010 +0100
+++ b/xen/include/xen/sched.h   Wed Apr 21 12:48:03 2010 +0100
@@ -213,6 +213,7 @@ struct domain
 
     /* Scheduling. */
     void            *sched_priv;    /* scheduler-specific data */
+    struct cpupool  *cpupool;
 
     struct domain   *next_in_list;
     struct domain   *next_in_hashbucket;
@@ -465,6 +466,7 @@ void sched_destroy_vcpu(struct vcpu *v);
 void sched_destroy_vcpu(struct vcpu *v);
 int  sched_init_domain(struct domain *d);
 void sched_destroy_domain(struct domain *d);
+int sched_move_domain(struct domain *d, struct cpupool *c);
 long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
 int  sched_id(void);
 void sched_tick_suspend(void);
@@ -575,8 +577,13 @@ void domain_unpause_by_systemcontroller(
 void domain_unpause_by_systemcontroller(struct domain *d);
 void cpu_init(void);
 
+struct scheduler;
+
+int schedule_init_global(char *name, struct scheduler *sched);
+void schedule_deinit_global(struct scheduler *sched);
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c);
 void vcpu_force_reschedule(struct vcpu *v);
-void cpu_disable_scheduler(void);
+int cpu_disable_scheduler(unsigned int cpu);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
@@ -606,6 +613,19 @@ extern enum cpufreq_controller {
 extern enum cpufreq_controller {
     FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
 } cpufreq_controller;
+
+#define CPUPOOLID_NONE    -1
+
+struct cpupool *cpupool_create(int poolid, char *sched);
+int cpupool_destroy(struct cpupool *c);
+int cpupool0_cpu_assign(struct cpupool *c);
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu);
+void cpupool_cpu_add(unsigned int cpu);
+int cpupool_cpu_remove(unsigned int cpu);
+int cpupool_add_domain(struct domain *d, int poolid);
+void cpupool_rm_domain(struct domain *d);
+int cpupool_do_domctl(struct xen_domctl_cpupool_op *op);
+#define num_cpupool_cpus(c) (cpus_weight((c)->cpu_valid))
 
 #endif /* __SCHED_H__ */
 

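One schedule.c addition deserves a closing note: sched_move_domain()
allocates every piece of new per-vcpu and per-domain data before
pausing the domain, so the pause/swap/unpause window contains no
operation that can fail. A stand-alone model of that allocate-then-swap
shape, with invented names and no locking or pausing:

    #include <stdio.h>
    #include <stdlib.h>

    struct domain { int nvcpus; void **vcpu_priv; };

    static void *alloc_vdata(int id) { (void)id; return malloc(16); }

    static int move_domain(struct domain *d)
    {
        void **vcpu_priv = calloc(d->nvcpus, sizeof(void *));
        if (vcpu_priv == NULL)
            return -1;

        /* Phase 1: allocate everything that could fail. */
        for (int i = 0; i < d->nvcpus; i++) {
            vcpu_priv[i] = alloc_vdata(i);
            if (vcpu_priv[i] == NULL) {
                for (int j = 0; j < i; j++)
                    free(vcpu_priv[j]);
                free(vcpu_priv);
                return -1;             /* nothing was changed yet */
            }
        }

        /* Phase 2: swap in the new data (the domain is paused here in
         * the patch); no allocation, hence no failure path. The old
         * per-vcpu blocks are freed via the old scheduler's free_vdata
         * there; this model just drops the old array. */
        free(d->vcpu_priv);
        d->vcpu_priv = vcpu_priv;
        return 0;
    }

    int main(void)
    {
        struct domain d = { .nvcpus = 3 };
        printf("move: %s\n", move_domain(&d) == 0 ? "ok" : "failed");
        return 0;
    }
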
