diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c c/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c
--- b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c	1969-12-31 18:00:00.000000000 -0600
+++ c/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c	2005-06-03 09:41:10.933009544 -0500
@@ -0,0 +1,76 @@
+/*
+ * Spin and read/write lock operations.
+ *
+ * Copyright (C) 2001-2004 Paul Mackerras, IBM
+ * Copyright (C) 2001 Anton Blanchard, IBM
+ * Copyright (C) 2002 Dave Engebretsen, IBM
+ *   Rework to support virtual processors
+ * Copyright (C) 2005 Ryan Harper, IBM
+ *   Rework for Xen on x86
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <asm/processor.h>
+#include <asm-xen/hypervisor.h>
+
+/* waiting for a spinlock... */
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+void __spin_yield(spinlock_t *lock)
+{
+        unsigned int lock_value, holder_cpu, yield_count;
+        shared_info_t *s = HYPERVISOR_shared_info;
+
+        lock_value = lock->slock;
+        if (lock_value == 1)
+                return;
+        holder_cpu = lock->cpu;
+        BUG_ON(holder_cpu >= NR_CPUS);
+        yield_count = s->vcpu_data[holder_cpu].yield_count;
+        if ((yield_count & 1) == 0)
+                return;         /* virtual cpu is currently running */
+        rmb();
+        if (lock->slock != lock_value)
+                return;         /* something has changed */
+        HYPERVISOR_confer(holder_cpu, yield_count);
+}
+
+void __rw_yield(rwlock_t *rw)
+{
+        unsigned int lock_value, holder_cpu, yield_count;
+        shared_info_t *s = HYPERVISOR_shared_info;
+
+        lock_value = rw->lock;
+        if (lock_value == RW_LOCK_BIAS)
+                return;
+        holder_cpu = rw->cpu;
+        BUG_ON(holder_cpu >= NR_CPUS);
+        yield_count = s->vcpu_data[holder_cpu].yield_count;
+        if ((yield_count & 1) == 0)
+                return;         /* virtual cpu is currently running */
+        rmb();
+        if (rw->lock != lock_value)
+                return;         /* something has changed */
+        HYPERVISOR_confer(holder_cpu, yield_count);
+}
+
+void spin_unlock_wait(spinlock_t *lock)
+{
+        while (spin_is_locked(lock)) {
+                cpu_relax();
+                if (SHARED_PROCESSOR)
+                        __spin_yield(lock);
+        }
+        cpu_relax();
+}
+EXPORT_SYMBOL(spin_unlock_wait);
+#endif
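The even/odd yield_count test above is the core of the protocol this file ports from ppc64: an even count means the holder vcpu is currently running (so spinning is useful), an odd count means it has been preempted or has conferred (so spinning just burns the caller's slice). A minimal standalone sketch of that test in userspace C; every name below is invented for illustration and nothing in it is part of the patch:

/* Illustrative userspace model of the parity test in __spin_yield() above.
 * Only the even/odd convention is taken from the patch; everything else
 * here is invented for the demo. Build with: cc -o parity parity.c */
#include <stdio.h>

/* Even count: the holder vcpu is running, keep spinning.
 * Odd count: the holder was preempted/conferred, confer our slice instead. */
static int holder_is_preempted(unsigned int yield_count)
{
        return (yield_count & 1) != 0;
}

int main(void)
{
        unsigned int samples[] = { 0, 1, 2, 7, 8 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("yield_count=%u -> %s\n", samples[i],
                       holder_is_preempted(samples[i]) ?
                       "confer to holder" : "keep spinning");
        return 0;
}

The re-read of lock->slock after the rmb() avoids conferring once the lock has already changed hands; the remaining window is closed on the hypervisor side, where do_confer() re-checks yield_count before acting.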
diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile c/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile
--- b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile	1969-12-31 18:00:00.000000000 -0600
+++ c/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile	2005-06-03 09:41:10.948007446 -0500
@@ -0,0 +1,11 @@
+#
+# Makefile for i386-specific library files..
+#
+
+
+lib-y = checksum.o delay.o usercopy.o getuser.o memcpy.o strstr.o \
+        bitops.o
+
+lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+lib-$(CONFIG_XEN) += locks.o
diff -urN b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32 c/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32
--- b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32	2005-06-03 09:02:36.837705157 -0500
+++ c/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32	2005-06-03 09:41:10.949007306 -0500
@@ -117,8 +117,8 @@
 CONFIG_SMP=y
 CONFIG_NR_CPUS=8
 # CONFIG_SCHED_SMT is not set
-CONFIG_PREEMPT=y
-CONFIG_PREEMPT_BKL=y
+# CONFIG_PREEMPT is not set
+# CONFIG_PREEMPT_BKL is not set
 CONFIG_X86_CPUID=y
 
 #
diff -urN b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S c/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S
--- b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S	2005-06-02 22:21:42.000000000 -0500
+++ c/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S	2005-06-03 09:41:10.953006747 -0500
@@ -80,7 +80,7 @@
 #define evtchn_upcall_pending		/* 0 */
 #define evtchn_upcall_mask		1
 
-#define sizeof_vcpu_shift		3
+#define sizeof_vcpu_shift		4
 
 #ifdef CONFIG_SMP
 #define preempt_disable(reg)	incl TI_preempt_count(reg)
diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h c/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h
--- b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h	2005-06-02 22:21:42.000000000 -0500
+++ c/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h	2005-06-03 09:41:10.954006607 -0500
@@ -517,4 +517,20 @@
     return ret;
 }
 
+static inline int
+HYPERVISOR_confer(
+    unsigned int vcpu, unsigned int yield_count)
+{
+    int ret;
+    unsigned long ign1, ign2;
+
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+        : "0" (__HYPERVISOR_confer), "1" (vcpu), "2" (yield_count)
+        : "memory");
+
+    return ret;
+}
+
 #endif /* __HYPERCALL_H__ */
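The stub above follows the usual convention of this header: the hypercall number is loaded into %eax and the two arguments into %ebx/%ecx before TRAP_INSTR, and Xen dispatches through the hypercall table that the xen/arch/x86/x86_32/entry.S hunk below extends with slot 27. A userspace model of that dispatch step, with an invented stand-in handler (only the slot number 27 and the two-argument shape come from the patch):

/* Userspace model of the hypercall_table dispatch; the table size, the
 * handler body, and the argument values are stand-ins for the real thing. */
#include <stdio.h>

#define __HYPERVISOR_confer 27
#define NR_HYPERCALLS_DEMO  32

typedef long (*hypercall_fn_t)(unsigned int, unsigned int);

static long do_confer_stub(unsigned int vcpu, unsigned int yield_count)
{
        printf("do_confer(vcpu=%u, yield_count=%u)\n", vcpu, yield_count);
        return 0;
}

static hypercall_fn_t hypercall_table[NR_HYPERCALLS_DEMO] = {
        [__HYPERVISOR_confer] = do_confer_stub,
};

int main(void)
{
        /* The guest stub puts 27 in %eax and the args in %ebx/%ecx;
         * after TRAP_INSTR the hypervisor effectively does this: */
        unsigned int eax = __HYPERVISOR_confer, ebx = 1, ecx = 3;

        return (int)hypercall_table[eax](ebx, ecx);
}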
diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h c/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h
--- b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h	2005-06-02 22:21:37.000000000 -0500
+++ c/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h	2005-06-03 09:41:10.975003670 -0500
@@ -22,10 +22,36 @@
 #ifdef CONFIG_PREEMPT
 	unsigned int break_lock;
 #endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+	unsigned int cpu;
+#endif
 } spinlock_t;
 
 #define SPINLOCK_MAGIC	0xdead4ead
 
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ */
+typedef struct {
+	volatile unsigned int lock;
+#ifdef CONFIG_DEBUG_SPINLOCK
+	unsigned magic;
+#endif
+#ifdef CONFIG_PREEMPT
+	unsigned int break_lock;
+#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+	unsigned int cpu;
+#endif
+} rwlock_t;
+
 #ifdef CONFIG_DEBUG_SPINLOCK
 #define SPINLOCK_MAGIC_INIT	, SPINLOCK_MAGIC
 #else
@@ -44,7 +70,20 @@
  */
 #define spin_is_locked(x)	(*(volatile signed char *)(&(x)->slock) <= 0)
 
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+#include <asm-xen/hypervisor.h>
+#define SPINLOCK_CPU	(smp_processor_id())
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR	(HYPERVISOR_shared_info->shproc != 0)
+extern void __spin_yield(spinlock_t *lock);
+extern void __rw_yield(rwlock_t *rw);
+extern void spin_unlock_wait(spinlock_t *lock);
+#else
+#define __spin_yield(x)	barrier()
+#define __rw_yield(x)	barrier()
+#define SHARED_PROCESSOR	0
 #define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
+#endif
 
 #define spin_lock_string \
 	"\n1:\t" \
@@ -125,6 +164,9 @@
 		"xchgb %b0,%1"
 		:"=q" (oldval), "=m" (lock->slock)
 		:"0" (0) : "memory");
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+	lock->cpu = SPINLOCK_CPU;
+#endif
 	return oldval > 0;
 }
 
@@ -136,43 +178,55 @@
 		BUG();
 	}
 #endif
-	__asm__ __volatile__(
-		spin_lock_string
-		:"=m" (lock->slock) : : "memory");
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+	while (1) {
+		if ( likely(_raw_spin_trylock(lock)) )
+			break;
+		do {
+			cpu_relax();
+			if (SHARED_PROCESSOR)
+				__spin_yield(lock);
+		} while (likely(spin_is_locked(lock)));
+		cpu_relax();
+	}
+#else
+	__asm__ __volatile__(
+		spin_lock_string
+		:"=m" (lock->slock) : : "memory");
+#endif
 }
 
 static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
 {
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+	unsigned long flags_dis;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
 		printk("eip: %p\n", __builtin_return_address(0));
 		BUG();
 	}
 #endif
-	__asm__ __volatile__(
-		spin_lock_string_flags
-		:"=m" (lock->slock) : "r" (flags) : "memory");
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+	while (1) {
+		if ( likely(_raw_spin_trylock(lock)) )
+			break;
+		local_save_flags(flags_dis);
+		local_irq_restore(flags);
+		do {
+			cpu_relax();
+			if (SHARED_PROCESSOR)
+				__spin_yield(lock);
+		} while (likely(spin_is_locked(lock)));
+		cpu_relax();
+		local_irq_restore(flags_dis);
+	}
+#else
+	__asm__ __volatile__(
+		spin_lock_string_flags
+		:"=m" (lock->slock) : "r" (flags) : "memory");
 #endif
-} rwlock_t;
+}
 
 #define RWLOCK_MAGIC	0xdeaf1eed
 
@@ -198,6 +252,18 @@
  */
 #define write_can_lock(x)	((x)->lock == RW_LOCK_BIAS)
 
+static inline int _raw_write_trylock(rwlock_t *lock)
+{
+	atomic_t *count = (atomic_t *)lock;
+	if (atomic_sub_and_test(RW_LOCK_BIAS, count)) {
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+		lock->cpu = SPINLOCK_CPU;
+#endif
+		return 1;
+	}
+	atomic_add(RW_LOCK_BIAS, count);
+	return 0;
+}
 /*
  * On x86, we implement read-write locks as a 32-bit counter
  * with the high bit (sign) being the "contended" bit.
@@ -222,7 +288,20 @@
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(rw->magic != RWLOCK_MAGIC);
 #endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+	while (1) {
+		if ( likely(_raw_write_trylock(rw)) )
+			break;
+		do {
+			cpu_relax();
+			if (SHARED_PROCESSOR)
+				__rw_yield(rw);
+		} while ( likely(!write_can_lock(rw)) );
+		cpu_relax();
+	}
+#else
 	__build_write_lock(rw, "__write_lock_failed");
+#endif
 }
 
 #define _raw_read_unlock(rw)	asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
@@ -238,13 +317,6 @@
 	return 0;
 }
 
-static inline int _raw_write_trylock(rwlock_t *lock)
-{
-	atomic_t *count = (atomic_t *)lock;
-	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
-		return 1;
-	atomic_add(RW_LOCK_BIAS, count);
-	return 0;
-}
+
 
 #endif /* __ASM_SPINLOCK_H */
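Under CONFIG_XEN && CONFIG_SMP the open-coded spin is replaced by a trylock loop: the trylock records the owning cpu (so the yield helpers can find the holder), and the failure path gives the loop a natural point to back off. _raw_write_trylock is hoisted above _raw_write_lock for the same reason, since the lock path now calls it. The same loop shape in portable userspace C, with sched_yield(2) standing in for the confer path (the lock and helper names are invented for the demo):

/* Userspace analogue of the Xen-aware _raw_spin_lock() loop above:
 * trylock first, and on contention yield the CPU rather than spin hard.
 * sched_yield() stands in for __spin_yield()/HYPERVISOR_confer.
 * Build with: cc -pthread -o demo demo.c */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_spinlock_t demo_lock;

static void demo_lock_acquire(void)
{
        while (pthread_spin_trylock(&demo_lock) != 0) {
                /* In the patch this is: cpu_relax(); if (SHARED_PROCESSOR)
                 * __spin_yield(lock); repeated while the lock stays held. */
                sched_yield();
        }
}

int main(void)
{
        pthread_spin_init(&demo_lock, PTHREAD_PROCESS_PRIVATE);
        demo_lock_acquire();
        printf("lock acquired\n");
        pthread_spin_unlock(&demo_lock);
        pthread_spin_destroy(&demo_lock);
        return 0;
}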
diff -urN b/xen/arch/x86/domain.c c/xen/arch/x86/domain.c
--- b/xen/arch/x86/domain.c	2005-06-02 22:21:41.000000000 -0500
+++ c/xen/arch/x86/domain.c	2005-06-03 09:42:37.487868084 -0500
@@ -240,6 +240,8 @@
     memset(d->shared_info, 0, PAGE_SIZE);
     v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
     v->cpumap = CPUMAP_RUNANYWHERE;
+    /* default vcpus to sharing physical cpus */
+    d->shared_info->shproc = 1;
     SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
     machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
                             PAGE_SHIFT] = INVALID_M2P_ENTRY;
diff -urN b/xen/arch/x86/x86_32/entry.S c/xen/arch/x86/x86_32/entry.S
--- b/xen/arch/x86/x86_32/entry.S	2005-06-02 22:21:43.000000000 -0500
+++ c/xen/arch/x86/x86_32/entry.S	2005-06-03 09:41:11.000000173 -0500
@@ -751,6 +751,7 @@
         .long do_boot_vcpu
         .long do_ni_hypercall       /* 25 */
         .long do_mmuext_op
+        .long do_confer
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
diff -urN b/xen/common/domain.c c/xen/common/domain.c
--- b/xen/common/domain.c	2005-06-02 22:21:37.000000000 -0500
+++ c/xen/common/domain.c	2005-06-03 09:42:09.839752947 -0500
@@ -392,6 +392,7 @@
     atomic_set(&v->pausecnt, 0);
 
     v->cpumap = CPUMAP_RUNANYWHERE;
+    set_bit(_VCPUF_canconfer, &v->vcpu_flags);
 
     memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
diff -urN b/xen/common/schedule.c c/xen/common/schedule.c
--- b/xen/common/schedule.c	2005-06-02 22:21:42.000000000 -0500
+++ c/xen/common/schedule.c	2005-06-03 09:41:49.540601494 -0500
@@ -219,6 +219,11 @@
     spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
     if ( likely(domain_runnable(v)) )
     {
+        /* mark current's confer state */
+        if ( test_bit(_VCPUF_conferring, &current->vcpu_flags) ) {
+            clear_bit(_VCPUF_conferring, &current->vcpu_flags);
+            set_bit(_VCPUF_conferred, &current->vcpu_flags);
+        }
         SCHED_OP(wake, v);
 #ifdef WAKE_HISTO
         v->wokenup = NOW();
@@ -260,6 +265,51 @@
     return 0;
 }
 
+/* Confer control to another vcpu */
+long do_confer(unsigned int vcpu, unsigned int yield_count)
+{
+    struct domain *d = current->domain;
+
+    /* Validate CONFER prereqs:
+     * - vcpu is within bounds
+     * - vcpu is valid in this domain
+     * - current has not already conferred its slice to vcpu
+     * - vcpu is not already running
+     * - designated vcpu's yield_count matches value from call
+     *
+     * If all are ok, then set conferred value and enter scheduler
+     */
+
+    if (unlikely(vcpu >= MAX_VIRT_CPUS))
+        return 0;
+
+    if (unlikely(d->vcpu[vcpu] == NULL))
+        return 0;
+
+    if (unlikely(!test_bit(_VCPUF_canconfer, &current->vcpu_flags)))
+        return 0;
+
+    /* even counts indicate a running vcpu, odd is preempted/conferred */
+    /* don't confer if holder is currently running */
+    if (unlikely((d->vcpu[vcpu]->vcpu_info->yield_count & 1) == 0))
+        return 0;
+
+    if (unlikely(d->vcpu[vcpu]->vcpu_info->yield_count != yield_count))
+        return 0;
+
+    /*
+     * set current's state to conferring, wake target
+     */
+    clear_bit(_VCPUF_canconfer, &current->vcpu_flags);
+    set_bit(_VCPUF_conferring, &current->vcpu_flags);
+    domain_wake(d->vcpu[vcpu]);
+
+    /* give up my timeslice */
+    do_yield();
+
+    return 0;
+}
+
 /*
  * Demultiplex scheduler-related hypercalls.
  */
@@ -422,7 +472,15 @@
     r_time = next_slice.time;
     next = next_slice.task;
-
+
+    /*
+     * arm canconfer so this vcpu can confer during its new slice,
+     * and clear any confer state left over from its last slice
+     */
+    set_bit(_VCPUF_canconfer, &next->vcpu_flags);
+    clear_bit(_VCPUF_conferring, &next->vcpu_flags);
+    clear_bit(_VCPUF_conferred, &next->vcpu_flags);
+
     schedule_data[cpu].curr = next;
     next->lastschd = now;
@@ -434,6 +492,12 @@
 
     spin_unlock_irq(&schedule_data[cpu].schedule_lock);
 
+    /* bump vcpu yield_count when the controlling domain is not idle */
+    if ( !is_idle_task(prev->domain) )
+        prev->vcpu_info->yield_count++;
+    if ( !is_idle_task(next->domain) )
+        next->vcpu_info->yield_count++;
+
     if ( unlikely(prev == next) )
         return continue_running(prev);
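The prerequisite list in do_confer() reduces to five cheap state tests, which are easiest to see as a pure function. A standalone model of just those checks (structs cut down to the fields the checks touch; all names invented):

/* Standalone model of the admission checks in do_confer() above. */
#include <stdio.h>

#define MAX_VIRT_CPUS 32

struct demo_vcpu {
        int exists;                 /* models d->vcpu[n] != NULL */
        unsigned int yield_count;   /* models vcpu_info->yield_count */
};

static int confer_allowed(const struct demo_vcpu *vcpus, int can_confer,
                          unsigned int vcpu, unsigned int yield_count)
{
        if (vcpu >= MAX_VIRT_CPUS)
                return 0;       /* out of bounds */
        if (!vcpus[vcpu].exists)
                return 0;       /* no such vcpu in this domain */
        if (!can_confer)
                return 0;       /* already conferred this slice */
        if ((vcpus[vcpu].yield_count & 1) == 0)
                return 0;       /* even count: target is running */
        if (vcpus[vcpu].yield_count != yield_count)
                return 0;       /* caller's sample is stale */
        return 1;
}

int main(void)
{
        struct demo_vcpu vcpus[MAX_VIRT_CPUS] = { { 0, 0 } };

        vcpus[1].exists = 1;
        vcpus[1].yield_count = 3;       /* odd: preempted */

        printf("%d\n", confer_allowed(vcpus, 1, 1, 3)); /* 1: confer */
        printf("%d\n", confer_allowed(vcpus, 1, 1, 1)); /* 0: stale count */
        printf("%d\n", confer_allowed(vcpus, 0, 1, 3)); /* 0: already conferred */
        return 0;
}

The final comparison is what makes a stale confer harmless: if the target has run and been preempted again since the caller sampled the count, the values differ and the hypercall returns without yielding.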
diff -urN b/xen/include/public/xen.h c/xen/include/public/xen.h
--- b/xen/include/public/xen.h	2005-06-02 22:21:41.000000000 -0500
+++ c/xen/include/public/xen.h	2005-06-03 09:41:11.040994437 -0500
@@ -58,6 +58,7 @@
 #define __HYPERVISOR_boot_vcpu            24
 #define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
 #define __HYPERVISOR_mmuext_op            26
+#define __HYPERVISOR_confer               27
 
 /*
  * VIRTUAL INTERRUPTS
@@ -324,8 +325,11 @@
     u8 evtchn_upcall_mask;              /* 1 */
     u8 pad0, pad1;
     u32 evtchn_pending_sel;             /* 4 */
-    arch_vcpu_info_t arch;              /* 8 */
-} PACKED vcpu_info_t;                   /* 8 + arch */
+    /* even while the vcpu is running, odd while it is preempted/conferred */
+    u32 yield_count;                    /* 8 */
+    u32 pad2;                           /* 12 */
+    arch_vcpu_info_t arch;              /* 16 */
+} PACKED vcpu_info_t;                   /* 16 + arch */
 
 /*
  * Xen/kernel shared data -- pointer provided in start_info.
@@ -337,6 +341,9 @@
 
     u32 n_vcpu;
 
+    /* set if the domain's vcpus share physical cpus */
+    int shproc;
+
     /*
      * A domain can have up to 1024 "event channels" on which it can send
     * and receive asynchronous event notifications. There are three classes
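The offset comments above pin the fixed part of vcpu_info_t at 16 bytes, which is what the sizeof_vcpu_shift bump from 3 to 4 in the guest entry.S accounts for: vcpu_data[] is indexed by the vcpu id shifted by that constant, and the old 8-byte/shift-3 pairing implies the x86 arch_vcpu_info_t is empty at this point in the tree (an inference, not something the patch states). A quick offline check of the layout, with the struct transcribed from the hunk, the arch member omitted, and u8/u32 mapped to stdint types:

/* Offline sanity check: the fixed part of the reworked vcpu_info_t must
 * be 16 bytes for sizeof_vcpu_shift = 4 to be right. */
#include <stdio.h>
#include <stdint.h>

struct vcpu_info_fixed {
        uint8_t  evtchn_upcall_pending; /* 0 */
        uint8_t  evtchn_upcall_mask;    /* 1 */
        uint8_t  pad0, pad1;
        uint32_t evtchn_pending_sel;    /* 4 */
        uint32_t yield_count;           /* 8 */
        uint32_t pad2;                  /* 12 */
} __attribute__((packed));              /* 16 bytes + arch part */

int main(void)
{
        printf("sizeof = %zu (want 16 = 1 << 4)\n",
               sizeof(struct vcpu_info_fixed));
        return sizeof(struct vcpu_info_fixed) == 16 ? 0 : 1;
}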
diff -urN b/xen/include/xen/sched.h c/xen/include/xen/sched.h
--- b/xen/include/xen/sched.h	2005-06-02 22:21:36.000000000 -0500
+++ c/xen/include/xen/sched.h	2005-06-03 09:41:11.042994158 -0500
@@ -342,6 +342,15 @@
  /* Initialization completed. */
 #define _VCPUF_initialised     8
 #define VCPUF_initialised      (1UL<<_VCPUF_initialised)
+ /* Able to give its time slice to another vcpu */
+#define _VCPUF_canconfer       9
+#define VCPUF_canconfer        (1UL<<_VCPUF_canconfer)
+ /* Currently giving its time slice to another vcpu */
+#define _VCPUF_conferring      10
+#define VCPUF_conferring       (1UL<<_VCPUF_conferring)
+ /* Has already given its time slice to another vcpu */
+#define _VCPUF_conferred       11
+#define VCPUF_conferred        (1UL<<_VCPUF_conferred)
 
 /*
  * Per-domain flags (domain_flags).
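Taken together, the three new flags form a per-slice state machine for the conferring vcpu: __enter_scheduler() arms canconfer at the start of every slice, do_confer() trades it for conferring, and domain_wake() on the target retires conferring to conferred. A compressed walk-through of that lifecycle (bit numbers from the hunk above; the helpers are demo stand-ins for the kernel's atomic bitops):

/* Walk-through of the confer flag lifecycle for a single vcpu. */
#include <stdio.h>

#define _VCPUF_canconfer   9
#define _VCPUF_conferring  10
#define _VCPUF_conferred   11

static unsigned long vcpu_flags;

static void set_bit_demo(int nr)   { vcpu_flags |= 1UL << nr; }
static void clear_bit_demo(int nr) { vcpu_flags &= ~(1UL << nr); }
static int test_bit_demo(int nr)   { return !!(vcpu_flags & (1UL << nr)); }

int main(void)
{
        /* __enter_scheduler(): a fresh slice may confer once */
        set_bit_demo(_VCPUF_canconfer);
        clear_bit_demo(_VCPUF_conferring);
        clear_bit_demo(_VCPUF_conferred);

        /* do_confer(): prereqs passed, give the slice away */
        if (test_bit_demo(_VCPUF_canconfer)) {
                clear_bit_demo(_VCPUF_canconfer);
                set_bit_demo(_VCPUF_conferring);
        }

        /* domain_wake() on the target marks the conferrer done */
        if (test_bit_demo(_VCPUF_conferring)) {
                clear_bit_demo(_VCPUF_conferring);
                set_bit_demo(_VCPUF_conferred);
        }

        printf("canconfer=%d conferring=%d conferred=%d\n",
               test_bit_demo(_VCPUF_canconfer),
               test_bit_demo(_VCPUF_conferring),
               test_bit_demo(_VCPUF_conferred));
        return 0;
}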