x86: improve CR0 read/write handling

With the only bit in CR0 permitted to be changed by PV guests being TS,
optimize the handling towards that: Keep a cached value in a per-CPU
variable, and issue HYPERVISOR_fpu_taskswitch hypercalls for updates in
all but the unusual case should something in the system still try to
modify another bit (the attempt of which would then be logged by the
hypervisor). This removes the need to have the hypervisor emulate MOV
to/from CR0 instructions in all halfway frequently executed code paths.

Signed-off-by: Jan Beulich

---
v2: Add safety measure for nested updates (those will now always access
non-cached state).

--- a/arch/i386/kernel/cpu/common-xen.c
+++ b/arch/i386/kernel/cpu/common-xen.c
@@ -32,6 +32,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 #ifndef CONFIG_XEN
 DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
 EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+#else
+DEFINE_PER_CPU(unsigned int, xen_x86_cr0);
+DEFINE_PER_CPU(unsigned int, xen_x86_cr0_upd) = ~0;
+EXPORT_PER_CPU_SYMBOL(xen_x86_cr0);
+EXPORT_PER_CPU_SYMBOL(xen_x86_cr0_upd);
 #endif
 
 static int cachesize_override __cpuinitdata = -1;
@@ -681,6 +686,8 @@ old_gdt:
 	cpu_gdt_descr->size = GDT_SIZE - 1;
 	cpu_gdt_descr->address = (unsigned long)gdt;
 #else
+	__get_cpu_var(xen_x86_cr0) = raw_read_cr0();
+	xen_clear_cr0_upd();
 	if (cpu == 0 && cpu_gdt_descr->address == 0) {
 		gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
 		/* alloc_bootmem_pages panics on failure, so no check */
--- a/arch/i386/kernel/process-xen.c
+++ b/arch/i386/kernel/process-xen.c
@@ -639,8 +639,14 @@ struct task_struct fastcall * __switch_t
 	BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
 #endif
 	BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
+	if (_mcl->op == __HYPERVISOR_fpu_taskswitch)
+		__get_cpu_var(xen_x86_cr0_upd) = X86_CR0_TS;
 	if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
 		BUG();
+	if (_mcl->op == __HYPERVISOR_fpu_taskswitch) {
+		__get_cpu_var(xen_x86_cr0) |= X86_CR0_TS;
+		xen_clear_cr0_upd();
+	}
 
 	/*
 	 * Restore %fs and %gs if needed.
--- a/arch/i386/kernel/traps-xen.c
+++ b/arch/i386/kernel/traps-xen.c
@@ -1057,6 +1057,7 @@ asmlinkage void math_state_restore(struc
 	struct task_struct *tsk = thread->task;
 
 	/* NB. 'clts' is done for us by Xen during virtual trap. */
+	__get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS;
 	if (!tsk_used_math(tsk))
 		init_fpu(tsk);
 	restore_fpu(tsk);
--- a/arch/x86_64/kernel/process-xen.c
+++ b/arch/x86_64/kernel/process-xen.c
@@ -572,8 +572,14 @@ __switch_to(struct task_struct *prev_p,
 	BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
 #endif
 	BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
+	if (_mcl->op == __HYPERVISOR_fpu_taskswitch)
+		__get_cpu_var(xen_x86_cr0_upd) = X86_CR0_TS;
 	if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
 		BUG();
+	if (_mcl->op == __HYPERVISOR_fpu_taskswitch) {
+		__get_cpu_var(xen_x86_cr0) |= X86_CR0_TS;
+		xen_clear_cr0_upd();
+	}
 
 	/*
 	 * Switch DS and ES.
--- a/arch/x86_64/kernel/setup64-xen.c
+++ b/arch/x86_64/kernel/setup64-xen.c
@@ -126,6 +126,11 @@ void __init setup_per_cpu_areas(void)
 }
 
 #ifdef CONFIG_XEN
+DEFINE_PER_CPU(unsigned long, xen_x86_cr0);
+DEFINE_PER_CPU(unsigned long, xen_x86_cr0_upd) = ~0;
+EXPORT_PER_CPU_SYMBOL(xen_x86_cr0);
+EXPORT_PER_CPU_SYMBOL(xen_x86_cr0_upd);
+
static void switch_pt(void)
{
	xen_pt_switch(__pa_symbol(init_level4_pgt));
@@ -174,6 +179,8 @@ void pda_init(int cpu)
 	if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
 					(unsigned long)pda))
 		BUG();
+	__get_cpu_var(xen_x86_cr0) = raw_read_cr0();
+	xen_clear_cr0_upd();
 #endif
 	pda->cpunumber = cpu;
 	pda->irqcount = -1;
--- a/arch/x86_64/kernel/traps-xen.c
+++ b/arch/x86_64/kernel/traps-xen.c
@@ -1075,8 +1075,9 @@ asmlinkage void __attribute__((weak)) mc
 asmlinkage void math_state_restore(void)
 {
 	struct task_struct *me = current;
-	/* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
+	/* NB. 'clts' is done for us by Xen during virtual trap. */
+	__get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS;
 	if (!used_math())
 		init_fpu(me);
 	restore_fpu_checking(&me->thread.i387.fxsave);
--- a/include/asm-i386/mach-xen/asm/system.h
+++ b/include/asm-i386/mach-xen/asm/system.h
@@ -2,8 +2,10 @@
 #define __ASM_SYSTEM_H
 
 #include
+#include
 #include
 #include
+#include
 #include /* for LOCK_PREFIX */
 #include
 #include
@@ -90,15 +91,50 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 
 #define savesegment(seg, value) \
 	asm volatile("mov %%" #seg ",%0":"=rm" (value))
 
-#define read_cr0() ({ \
+DECLARE_PER_CPU(unsigned int, xen_x86_cr0);
+DECLARE_PER_CPU(unsigned int, xen_x86_cr0_upd);
+
+#define xen_read_cr0_upd() ({ \
+	unsigned int u__ = __get_cpu_var(xen_x86_cr0_upd); \
+	rmb(); \
+	u__; \
+})
+#define xen_clear_cr0_upd() do { \
+	wmb(); \
+	__get_cpu_var(xen_x86_cr0_upd) = 0; \
+} while (0)
+
+#define raw_read_cr0() ({ \
 	unsigned int __dummy; \
 	__asm__ __volatile__( \
 		"movl %%cr0,%0\n\t" \
 		:"=r" (__dummy)); \
 	__dummy; \
 })
-#define write_cr0(x) \
+#define read_cr0() (likely(!xen_read_cr0_upd()) ? \
+	__get_cpu_var(xen_x86_cr0) : raw_read_cr0())
+#define raw_write_cr0(x) \
 	__asm__ __volatile__("movl %0,%%cr0": :"r" (x))
+#define write_cr0(x) do { \
+	unsigned int x__ = (x); \
+	unsigned int upd = x__ ^ __get_cpu_var(xen_x86_cr0); \
+	if (unlikely(cmpxchg(&__get_cpu_var(xen_x86_cr0_upd), 0, upd))) { \
+		raw_write_cr0(x__); \
+		break; \
+	} \
+	switch (upd) { \
+	case 0: \
+		continue; \
+	case X86_CR0_TS: \
+		HYPERVISOR_fpu_taskswitch(!!(x__ & X86_CR0_TS)); \
+		break; \
+	default: \
+		raw_write_cr0(x__); \
+		break; \
+	} \
+	__get_cpu_var(xen_x86_cr0) = x__; \
+	xen_clear_cr0_upd(); \
+} while (0)
 
 #define read_cr2() (current_vcpu_info()->arch.cr2)
 #define write_cr2(x) \
@@ -142,8 +178,27 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 
 /*
  * Clear and set 'TS' bit respectively
  */
-#define clts() (HYPERVISOR_fpu_taskswitch(0))
-#define stts() (HYPERVISOR_fpu_taskswitch(1))
+#define X86_CR0_TS 8
+#define clts() ({ \
+	if (unlikely(xen_read_cr0_upd())) \
+		HYPERVISOR_fpu_taskswitch(0); \
+	else if (__get_cpu_var(xen_x86_cr0) & X86_CR0_TS) { \
+		__get_cpu_var(xen_x86_cr0_upd) = X86_CR0_TS; \
+		HYPERVISOR_fpu_taskswitch(0); \
+		__get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS; \
+		xen_clear_cr0_upd(); \
+	} \
+})
+#define stts() ({ \
+	if (unlikely(xen_read_cr0_upd())) \
+		HYPERVISOR_fpu_taskswitch(1); \
+	else if (!(__get_cpu_var(xen_x86_cr0) & X86_CR0_TS)) { \
+		__get_cpu_var(xen_x86_cr0_upd) = X86_CR0_TS; \
+		HYPERVISOR_fpu_taskswitch(1); \
+		__get_cpu_var(xen_x86_cr0) |= X86_CR0_TS; \
+		xen_clear_cr0_upd(); \
+	} \
+})
 
 #endif	/* __KERNEL__ */
--- a/include/asm-x86_64/mach-xen/asm/system.h
+++ b/include/asm-x86_64/mach-xen/asm/system.h
@@ -7,7 +7,7 @@
 #include
 #include
 
-#include
+#include
 
 #ifdef __KERNEL__
@@ -71,19 +71,64 @@ struct alt_instr {
 /*
  * Clear and set 'TS' bit respectively
  */
-#define clts() (HYPERVISOR_fpu_taskswitch(0))
+#define X86_CR0_TS 8
+#define clts() ({ \
+	if (unlikely(xen_read_cr0_upd())) \
+		HYPERVISOR_fpu_taskswitch(0); \
+	else if (__get_cpu_var(xen_x86_cr0) & X86_CR0_TS) { \
+		__get_cpu_var(xen_x86_cr0_upd) = X86_CR0_TS; \
+		HYPERVISOR_fpu_taskswitch(0); \
+		__get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS; \
+		xen_clear_cr0_upd(); \
+	} \
+})
 
-static inline unsigned long read_cr0(void)
+DECLARE_PER_CPU(unsigned long, xen_x86_cr0);
+DECLARE_PER_CPU(unsigned long, xen_x86_cr0_upd);
+
+#define xen_read_cr0_upd() ({ \
+	unsigned long u__ = __get_cpu_var(xen_x86_cr0_upd); \
+	rmb(); \
+	u__; \
+})
+#define xen_clear_cr0_upd() do { \
+	wmb(); \
+	__get_cpu_var(xen_x86_cr0_upd) = 0; \
+} while (0)
+
+static inline unsigned long raw_read_cr0(void)
 {
 	unsigned long cr0;
 	asm volatile("movq %%cr0,%0" : "=r" (cr0));
 	return cr0;
 }
+#define read_cr0() (likely(!xen_read_cr0_upd()) ? \
+	__get_cpu_var(xen_x86_cr0) : raw_read_cr0())
 
-static inline void write_cr0(unsigned long val)
+static inline void raw_write_cr0(unsigned long val)
 {
 	asm volatile("movq %0,%%cr0" :: "r" (val));
 }
+#define write_cr0(x) do { \
+	unsigned long x__ = (x); \
+	unsigned long upd = x__ ^ __get_cpu_var(xen_x86_cr0); \
+	if (unlikely(cmpxchg(&__get_cpu_var(xen_x86_cr0_upd), 0, upd))) { \
+		raw_write_cr0(x__); \
+		break; \
+	} \
+	switch (upd) { \
+	case 0: \
+		continue; \
+	case X86_CR0_TS: \
+		HYPERVISOR_fpu_taskswitch(!!(x__ & X86_CR0_TS)); \
+		break; \
+	default: \
+		raw_write_cr0(x__); \
+		break; \
+	} \
+	__get_cpu_var(xen_x86_cr0) = x__; \
+	xen_clear_cr0_upd(); \
+} while (0)
 
 #define read_cr3() ({ \
 	unsigned long __dummy; \
@@ -103,7 +148,16 @@ static inline void write_cr4(unsigned lo
 	asm volatile("movq %0,%%cr4" :: "r" (val));
 }
 
-#define stts() (HYPERVISOR_fpu_taskswitch(1))
+#define stts() ({ \
+	if (unlikely(xen_read_cr0_upd())) \
+		HYPERVISOR_fpu_taskswitch(1); \
+	else if (!(__get_cpu_var(xen_x86_cr0) & X86_CR0_TS)) { \
+		__get_cpu_var(xen_x86_cr0_upd) = X86_CR0_TS; \
+		HYPERVISOR_fpu_taskswitch(1); \
+		__get_cpu_var(xen_x86_cr0) |= X86_CR0_TS; \
+		xen_clear_cr0_upd(); \
+	} \
+})
 
 #define wbinvd() \
 	__asm__ __volatile__ ("wbinvd": : :"memory");