# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 027812e4a63cde88a0cc03a3a83d40325f4e34f8
# Parent 26c03c17c418ba106ebda01502713da2fc9d28c6
Fix a context-switching race which could cause vcpu_pause()
to return before the paused VCPU had genuinely stopped running.
This requires moving the clearing of VCPUF_running under the
protection of the scheduler lock.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
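
For context: vcpu_pause() relies on VCPUF_running as its indication that the
target VCPU is off-CPU. The sketch below is illustrative only (the helper name
is hypothetical, not the Xen implementation); it shows why such a wait is only
a correct barrier when the flag transition is ordered by the scheduler lock
against the context switch itself, which is what this changeset arranges.

    /*
     * Illustrative sketch only -- not the Xen implementation.  A synchronous
     * pause must not return until the target VCPU's context has left the
     * physical CPU; it does so by waiting for VCPUF_running to clear.
     */
    static void example_wait_until_descheduled(struct vcpu *v)
    {
        /* Assume v has already been taken off the runqueue elsewhere. */
        while ( test_bit(_VCPUF_running, &v->vcpu_flags) )
            cpu_relax();

        /*
         * This loop is a correct barrier only if the flag is cleared under
         * the scheduler lock, strictly ordered against the save of v's
         * context.  Clearing it late and outside the lock, as the old
         * context_switch() did, lets the loop exit while v's state is
         * still live on another CPU.
         */
    }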
diff -r 26c03c17c418 -r 027812e4a63c xen/arch/ia64/xenmisc.c
--- a/xen/arch/ia64/xenmisc.c Tue Aug 16 17:02:49 2005
+++ b/xen/arch/ia64/xenmisc.c Tue Aug 16 18:02:24 2005
@@ -280,7 +280,6 @@
unsigned long context_switch_count = 0;
-// context_switch
void context_switch(struct vcpu *prev, struct vcpu *next)
{
//printk("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
@@ -290,22 +289,14 @@
//if (prev->domain->domain_id == 0 && next->domain->domain_id == 1) cs01foo();
//printk("@@sw %d->%d\n",prev->domain->domain_id,next->domain->domain_id);
#ifdef CONFIG_VTI
- unsigned long psr;
- /* Interrupt is enabled after next task is chosen.
- * So we have to disable it for stack switch.
- */
- local_irq_save(psr);
vtm_domain_out(prev);
- /* Housekeeping for prev domain */
-#endif // CONFIG_VTI
-
+#endif
context_switch_count++;
switch_to(prev,next,prev);
#ifdef CONFIG_VTI
- /* Post-setup for new domain */
vtm_domain_in(current);
- local_irq_restore(psr);
-#endif // CONFIG_VTI
+#endif
+
// leave this debug for now: it acts as a heartbeat when more than
// one domain is active
{
@@ -315,25 +306,27 @@
if (!cnt[id]--) { printk("%x",id); cnt[id] = 500000; }
if (!i--) { printk("+",id); i = 1000000; }
}
- clear_bit(_VCPUF_running, &prev->vcpu_flags);
- //if (!is_idle_task(next->domain) )
- //send_guest_virq(next, VIRQ_TIMER);
+
#ifdef CONFIG_VTI
if (VMX_DOMAIN(current))
vmx_load_all_rr(current);
- return;
-#else // CONFIG_VTI
+#else
if (!is_idle_task(current->domain)) {
load_region_regs(current);
if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
}
if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
-#endif // CONFIG_VTI
+#endif
+}
+
+void context_switch_finalise(struct vcpu *next)
+{
+ /* nothing to do */
}
void continue_running(struct vcpu *same)
{
- /* nothing to do */
+ /* nothing to do */
}
void panic_domain(struct pt_regs *regs, const char *fmt, ...)
diff -r 26c03c17c418 -r 027812e4a63c xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Tue Aug 16 17:02:49 2005
+++ b/xen/arch/x86/domain.c Tue Aug 16 18:02:24 2005
@@ -48,6 +48,8 @@
struct percpu_ctxt {
struct vcpu *curr_vcpu;
+ unsigned int context_not_finalised;
+ unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];
@@ -541,51 +543,59 @@
__r; })
#if CONFIG_VMX
-#define load_msrs(_p, _n) if (vmx_switch_on) vmx_load_msrs((_p), (_n))
+#define load_msrs(n) if (vmx_switch_on) vmx_load_msrs(n)
#else
-#define load_msrs(_p, _n) ((void)0)
+#define load_msrs(n) ((void)0)
#endif
-static void load_segments(struct vcpu *p, struct vcpu *n)
-{
- struct vcpu_guest_context *pctxt = &p->arch.guest_context;
+/*
+ * save_segments() writes a mask of segments which are dirty (non-zero),
+ * allowing load_segments() to avoid some expensive segment loads and
+ * MSR writes.
+ */
+#define DIRTY_DS 0x01
+#define DIRTY_ES 0x02
+#define DIRTY_FS 0x04
+#define DIRTY_GS 0x08
+#define DIRTY_FS_BASE 0x10
+#define DIRTY_GS_BASE_USER 0x20
+
+static void load_segments(struct vcpu *n)
+{
struct vcpu_guest_context *nctxt = &n->arch.guest_context;
int all_segs_okay = 1;
+ unsigned int dirty_segment_mask, cpu = smp_processor_id();
+
+ /* Load and clear the dirty segment mask. */
+ dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
+ percpu_ctxt[cpu].dirty_segment_mask = 0;
/* Either selector != 0 ==> reload. */
- if ( unlikely(pctxt->user_regs.ds | nctxt->user_regs.ds) )
+ if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
/* Either selector != 0 ==> reload. */
- if ( unlikely(pctxt->user_regs.es | nctxt->user_regs.es) )
+ if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
/*
* Either selector != 0 ==> reload.
* Also reload to reset FS_BASE if it was non-zero.
*/
- if ( unlikely(pctxt->user_regs.fs |
- pctxt->fs_base |
+ if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
nctxt->user_regs.fs) )
- {
all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
- if ( pctxt->user_regs.fs ) /* != 0 selector kills fs_base */
- pctxt->fs_base = 0;
- }
/*
* Either selector != 0 ==> reload.
* Also reload to reset GS_BASE if it was non-zero.
*/
- if ( unlikely(pctxt->user_regs.gs |
- pctxt->gs_base_user |
+ if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
nctxt->user_regs.gs) )
{
/* Reset GS_BASE with user %gs? */
- if ( pctxt->user_regs.gs || !nctxt->gs_base_user )
+ if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
- if ( pctxt->user_regs.gs ) /* != 0 selector kills gs_base_user */
- pctxt->gs_base_user = 0;
}
/* This can only be non-zero if selector is NULL. */
@@ -650,7 +660,9 @@
static void save_segments(struct vcpu *v)
{
- struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
+ struct vcpu_guest_context *ctxt = &v->arch.guest_context;
+ struct cpu_user_regs *regs = &ctxt->user_regs;
+ unsigned int dirty_segment_mask = 0;
if ( VMX_DOMAIN(v) )
rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);
@@ -659,18 +671,34 @@
__asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
__asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
__asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );
-}
-
-static void clear_segments(void)
-{
- __asm__ __volatile__ (
- " movl %0,%%ds; "
- " movl %0,%%es; "
- " movl %0,%%fs; "
- " movl %0,%%gs; "
- ""safe_swapgs" "
- " movl %0,%%gs"
- : : "r" (0) );
+
+ if ( regs->ds )
+ dirty_segment_mask |= DIRTY_DS;
+
+ if ( regs->es )
+ dirty_segment_mask |= DIRTY_ES;
+
+ if ( regs->fs )
+ {
+ dirty_segment_mask |= DIRTY_FS;
+ ctxt->fs_base = 0; /* != 0 selector kills fs_base */
+ }
+ else if ( ctxt->fs_base )
+ {
+ dirty_segment_mask |= DIRTY_FS_BASE;
+ }
+
+ if ( regs->gs )
+ {
+ dirty_segment_mask |= DIRTY_GS;
+ ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
+ }
+ else if ( ctxt->gs_base_user )
+ {
+ dirty_segment_mask |= DIRTY_GS_BASE_USER;
+ }
+
+ percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}
long do_switch_to_user(void)
@@ -706,10 +734,9 @@
#elif defined(__i386__)
-#define load_segments(_p, _n) ((void)0)
-#define load_msrs(_p, _n) ((void)0)
-#define save_segments(_p) ((void)0)
-#define clear_segments() ((void)0)
+#define load_segments(n) ((void)0)
+#define load_msrs(n) ((void)0)
+#define save_segments(p) ((void)0)
static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
@@ -726,9 +753,9 @@
static void __context_switch(void)
{
struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
- unsigned int cpu = smp_processor_id();
- struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
- struct vcpu *n = current;
+ unsigned int cpu = smp_processor_id();
+ struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
+ struct vcpu *n = current;
if ( !is_idle_task(p->domain) )
{
@@ -786,23 +813,27 @@
void context_switch(struct vcpu *prev, struct vcpu *next)
{
- struct vcpu *realprev;
-
- local_irq_disable();
+ unsigned int cpu = smp_processor_id();
set_current(next);
- if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_vcpu) == next) ||
- is_idle_task(next->domain) )
- {
- local_irq_enable();
- }
- else
+ if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
{
__context_switch();
-
- local_irq_enable();
-
+ percpu_ctxt[cpu].context_not_finalised = 1;
+ }
+}
+
+void context_switch_finalise(struct vcpu *next)
+{
+ unsigned int cpu = smp_processor_id();
+
+ if ( percpu_ctxt[cpu].context_not_finalised )
+ {
+ percpu_ctxt[cpu].context_not_finalised = 0;
+
+ BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);
+
if ( VMX_DOMAIN(next) )
{
vmx_restore_msrs(next);
@@ -810,18 +841,10 @@
else
{
load_LDT(next);
- load_segments(realprev, next);
- load_msrs(realprev, next);
- }
- }
-
- /*
- * We do this late on because it doesn't need to be protected by the
- * schedule_lock, and because we want this to be the very last use of
- * 'prev' (after this point, a dying domain's info structure may be freed
- * without warning).
- */
- clear_bit(_VCPUF_running, &prev->vcpu_flags);
+ load_segments(next);
+ load_msrs(next);
+ }
+ }
schedule_tail(next);
BUG();
@@ -835,12 +858,19 @@
int __sync_lazy_execstate(void)
{
- if ( percpu_ctxt[smp_processor_id()].curr_vcpu == current )
- return 0;
- __context_switch();
- load_LDT(current);
- clear_segments();
- return 1;
+ unsigned long flags;
+ int switch_required;
+
+ local_irq_save(flags);
+
+ switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);
+
+ if ( switch_required )
+ __context_switch();
+
+ local_irq_restore(flags);
+
+ return switch_required;
}
void sync_lazy_execstate_cpu(unsigned int cpu)
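
Two things motivate the new per-CPU dirty_segment_mask in the hunks above: the
in-tree comment notes that it lets load_segments() skip expensive segment loads
and MSR writes, and dropping the prev argument means the deferred loads in
context_switch_finalise() never have to touch the outgoing VCPU, whose info
structure may already have been freed (the removed "very last use of 'prev'"
comment). Below is a generic sketch of that save-side/load-side split, with
hypothetical ex_* names rather than the Xen code:

    /*
     * Generic shape of the dirty-mask optimisation (hypothetical ex_* names).
     * The save side records which selectors were non-null in per-CPU state;
     * the load side consults only that mask and the incoming context, so it
     * never needs to dereference the outgoing VCPU.
     */
    #define EX_DIRTY_FS 0x1
    #define EX_DIRTY_GS 0x2

    static unsigned int ex_dirty_mask[NR_CPUS];

    static void ex_save_segments(unsigned int cpu,
                                 unsigned int old_fs, unsigned int old_gs)
    {
        unsigned int mask = 0;
        if ( old_fs ) mask |= EX_DIRTY_FS;
        if ( old_gs ) mask |= EX_DIRTY_GS;
        ex_dirty_mask[cpu] = mask;
    }

    static int ex_load_segments(unsigned int cpu,
                                unsigned int new_fs, unsigned int new_gs)
    {
        unsigned int mask = ex_dirty_mask[cpu];
        int reloads = 0;

        ex_dirty_mask[cpu] = 0;

        /* Reload only if the old selector was dirty or the new one is set. */
        if ( (mask & EX_DIRTY_FS) | new_fs )
            reloads++;      /* loadsegment(fs, new_fs) in the real code */
        if ( (mask & EX_DIRTY_GS) | new_gs )
            reloads++;      /* loadsegment(gs, new_gs) in the real code */

        return reloads;
    }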
diff -r 26c03c17c418 -r 027812e4a63c xen/arch/x86/vmx.c
--- a/xen/arch/x86/vmx.c Tue Aug 16 17:02:49 2005
+++ b/xen/arch/x86/vmx.c Tue Aug 16 18:02:24 2005
@@ -65,7 +65,7 @@
* are not modified once set for generic domains, we don't save them,
* but simply reset them to the values set at percpu_traps_init().
*/
-void vmx_load_msrs(struct vcpu *p, struct vcpu *n)
+void vmx_load_msrs(struct vcpu *n)
{
struct msr_state *host_state;
host_state = &percpu_msr[smp_processor_id()];
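
The comment kept in this hunk explains that the host values of these MSRs never
change once set, so they are not saved on each switch; they are simply reset to
the values recorded at percpu_traps_init(). Below is a sketch of that
capture-once/restore-on-switch pattern; the ex_* names and the particular MSRs
shown are illustrative assumptions, not the set Xen actually manages here.

    /*
     * Capture-once / restore-on-switch pattern for host MSRs (hypothetical
     * ex_* names; the MSR selection is illustrative only).  Because the host
     * values never change after boot, switching back to host context is a
     * plain wrmsr of the cached values -- no rdmsr of whatever the guest
     * left behind is required.
     */
    struct ex_host_msrs {
        unsigned long star, lstar, cstar;
    };
    static struct ex_host_msrs ex_host_msrs[NR_CPUS];

    static void ex_capture_host_msrs(unsigned int cpu)   /* boot time */
    {
        rdmsrl(MSR_STAR,  ex_host_msrs[cpu].star);
        rdmsrl(MSR_LSTAR, ex_host_msrs[cpu].lstar);
        rdmsrl(MSR_CSTAR, ex_host_msrs[cpu].cstar);
    }

    static void ex_restore_host_msrs(unsigned int cpu)   /* context switch */
    {
        wrmsrl(MSR_STAR,  ex_host_msrs[cpu].star);
        wrmsrl(MSR_LSTAR, ex_host_msrs[cpu].lstar);
        wrmsrl(MSR_CSTAR, ex_host_msrs[cpu].cstar);
    }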
diff -r 26c03c17c418 -r 027812e4a63c xen/common/schedule.c
--- a/xen/common/schedule.c Tue Aug 16 17:02:49 2005
+++ b/xen/common/schedule.c Tue Aug 16 18:02:24 2005
@@ -474,13 +474,14 @@
set_ac_timer(&schedule_data[cpu].s_timer, now + r_time);
- /* Must be protected by the schedule_lock! */
+ if ( unlikely(prev == next) )
+ {
+ spin_unlock_irq(&schedule_data[cpu].schedule_lock);
+ return continue_running(prev);
+ }
+
+ clear_bit(_VCPUF_running, &prev->vcpu_flags);
set_bit(_VCPUF_running, &next->vcpu_flags);
-
- spin_unlock_irq(&schedule_data[cpu].schedule_lock);
-
- if ( unlikely(prev == next) )
- return continue_running(prev);
perfc_incrc(sched_ctx);
@@ -517,6 +518,10 @@
next->domain->domain_id, next->vcpu_id);
context_switch(prev, next);
+
+ spin_unlock_irq(&schedule_data[cpu].schedule_lock);
+
+ context_switch_finalise(next);
}
/* No locking needed -- pointer comparison is safe :-) */
diff -r 26c03c17c418 -r 027812e4a63c xen/include/asm-x86/vmx_vmcs.h
--- a/xen/include/asm-x86/vmx_vmcs.h Tue Aug 16 17:02:49 2005
+++ b/xen/include/asm-x86/vmx_vmcs.h Tue Aug 16 18:02:24 2005
@@ -28,10 +28,10 @@
extern void stop_vmx(void);
#if defined (__x86_64__)
-extern void vmx_load_msrs(struct vcpu *p, struct vcpu *n);
+extern void vmx_load_msrs(struct vcpu *n);
void vmx_restore_msrs(struct vcpu *d);
#else
-#define vmx_load_msrs(_p, _n) ((void)0)
+#define vmx_load_msrs(_n) ((void)0)
#define vmx_restore_msrs(_v) ((void)0)
#endif
diff -r 26c03c17c418 -r 027812e4a63c xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Tue Aug 16 17:02:49 2005
+++ b/xen/include/xen/sched.h Tue Aug 16 18:02:24 2005
@@ -258,12 +258,32 @@
extern void sync_lazy_execstate_all(void);
extern int __sync_lazy_execstate(void);
-/* Called by the scheduler to switch to another vcpu. */
+/*
+ * Called by the scheduler to switch to another VCPU. On entry, although
+ * VCPUF_running is no longer asserted for @prev, its context is still running
+ * on the local CPU and is not committed to memory. The local scheduler lock
+ * is therefore still held, and interrupts are disabled, because the local CPU
+ * is in an inconsistent state.
+ *
+ * The callee must ensure that the local CPU is no longer running in @prev's
+ * context, and that the context is saved to memory, before returning.
+ * Alternatively, if implementing lazy context switching, it suffices to ensure
+ * that invoking __sync_lazy_execstate() will switch and commit @prev's state.
+ */
extern void context_switch(
struct vcpu *prev,
struct vcpu *next);
-/* Called by the scheduler to continue running the current vcpu. */
+/*
+ * On some architectures (notably x86) it is not possible to entirely load
+ * @next's context with interrupts disabled. These may implement a function to
+ * finalise loading the new context after interrupts are re-enabled. This
+ * function is not given @prev and is not permitted to access it.
+ */
+extern void context_switch_finalise(
+ struct vcpu *next);
+
+/* Called by the scheduler to continue running the current VCPU. */
extern void continue_running(
struct vcpu *same);
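
To make the new contract concrete, here is a sketch of how a hypothetical
architecture port might split its work between the two calls (the ex_* helpers
and the per-CPU pending array are illustrative assumptions, not code from the
Xen tree): everything that is safe with the scheduler lock held and interrupts
disabled stays in context_switch(), while anything that needs interrupts
enabled is remembered per-CPU and performed in context_switch_finalise(),
which must never dereference @prev.

    static struct vcpu *ex_pending_finalise[NR_CPUS];

    void context_switch(struct vcpu *prev, struct vcpu *next)
    {
        unsigned int cpu = smp_processor_id();

        /* Scheduler lock held, IRQs off: commit prev's state to memory. */
        ex_save_cpu_state(prev);                 /* hypothetical helper */
        ex_switch_stacks(prev, next);            /* hypothetical helper */

        ex_pending_finalise[cpu] = next;         /* defer the slow part */
    }

    void context_switch_finalise(struct vcpu *next)
    {
        unsigned int cpu = smp_processor_id();

        /* Lock dropped, IRQs on: @prev may already have been freed. */
        if ( ex_pending_finalise[cpu] == next )
        {
            ex_pending_finalise[cpu] = NULL;
            ex_load_slow_state(next);            /* e.g. segment/MSR loads */
        }
    }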
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog