diff -Naurp ../xeno-unstable.bk/xen/arch/x86/apic.c xen/arch/x86/apic.c --- ../xeno-unstable.bk/xen/arch/x86/apic.c 2005-04-14 14:56:31.000000000 -0500 +++ xen/arch/x86/apic.c 2005-04-15 08:33:29.000000000 -0500 @@ -722,10 +722,19 @@ int reprogram_ac_timer(s_time_t timeout) return 1; } +extern int nmi_profiling_started; +extern int nmi_sanity_check(struct xen_regs * regs, int cpu); + void smp_apic_timer_interrupt(struct xen_regs * regs) { ack_APIC_irq(); perfc_incrc(apic_timer); + /* we may lose NMI samples for several reasons. + * Here, we protect against that by doing an nmi sanity + * check every timer interrupt on every processor + */ + if (nmi_profiling_started) + nmi_sanity_check(regs, smp_processor_id()); raise_softirq(AC_TIMER_SOFTIRQ); } diff -Naurp ../xeno-unstable.bk/xen/arch/x86/Makefile xen/arch/x86/Makefile --- ../xeno-unstable.bk/xen/arch/x86/Makefile 2005-04-14 14:56:31.000000000 -0500 +++ xen/arch/x86/Makefile 2005-04-15 08:33:52.000000000 -0500 @@ -15,7 +15,10 @@ ifneq ($(crash_debug),y) OBJS := $(patsubst cdb%.o,,$(OBJS)) endif +OBJS += oprofile/oprofile.o + default: $(TARGET) + make -C oprofile $(TARGET): $(TARGET)-syms boot/mkelf32 ./boot/mkelf32 $(TARGET)-syms $(TARGET) 0x100000 @@ -33,11 +36,15 @@ asm-offsets.s: $(TARGET_SUBARCH)/asm-off boot/mkelf32: boot/mkelf32.c $(HOSTCC) $(HOSTCFLAGS) -o $@ $< +oprofile/oprofile.o: + $(MAKE) -C oprofile + clean: rm -f *.o *.s *~ core boot/*.o boot/*~ boot/core boot/mkelf32 rm -f x86_32/*.o x86_32/*~ x86_32/core rm -f x86_64/*.o x86_64/*~ x86_64/core rm -f mtrr/*.o mtrr/*~ mtrr/core + rm -f oprofile/*.o delete-unfresh-files: # nothing diff -Naurp ../xeno-unstable.bk/xen/arch/x86/microcode.c xen/arch/x86/microcode.c --- ../xeno-unstable.bk/xen/arch/x86/microcode.c 2005-04-14 14:56:31.000000000 -0500 +++ xen/arch/x86/microcode.c 2005-04-15 08:33:55.000000000 -0500 @@ -87,13 +87,6 @@ #define vmalloc(_s) xmalloc_bytes(_s) #define vfree(_p) xfree(_p) #define num_online_cpus() smp_num_cpus -static inline int on_each_cpu( - void (*func) (void *info), void *info, int retry, int wait) -{ - int ret = smp_call_function(func, info, retry, wait); - func(info); - return ret; -} #if 0 MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); diff -Naurp ../xeno-unstable.bk/xen/arch/x86/nmi.c xen/arch/x86/nmi.c --- ../xeno-unstable.bk/xen/arch/x86/nmi.c 2005-04-14 14:56:32.000000000 -0500 +++ xen/arch/x86/nmi.c 2005-04-15 08:33:57.000000000 -0500 @@ -5,6 +5,10 @@ * * Started by Ingo Molnar * + * Modified by Aravind Menon for supporting oprofile + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * * Fixes: * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. * Mikael Pettersson : Power Management for local APIC NMI watchdog. @@ -34,6 +38,28 @@ unsigned int nmi_perfctr_msr; /* the MSR extern int logical_proc_id[]; +/* + * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: + * - it may be reserved by some other driver, or not + * - when not reserved by some other driver, it may be used for + * the NMI watchdog, or not + * + * This is maintained separately from nmi_active because the NMI + * watchdog may also be driven from the I/O APIC timer. 
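+ * The Xen oprofile code (arch/x86/oprofile) takes ownership through
+ * reserve_lapic_nmi()/release_lapic_nmi() below, temporarily disabling
+ * the watchdog for the duration of a profiling session.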
+ */ +static spinlock_t lapic_nmi_owner_lock = SPIN_LOCK_UNLOCKED; +static unsigned int lapic_nmi_owner; +#define LAPIC_NMI_WATCHDOG (1<<0) +#define LAPIC_NMI_RESERVED (1<<1) + +/* nmi_active: + * +1: the lapic NMI watchdog is active, but can be disabled + * 0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * -1: the lapic NMI watchdog is disabled, but can be enabled + */ +int nmi_active; + #define K7_EVNTSEL_ENABLE (1 << 22) #define K7_EVNTSEL_INT (1 << 20) #define K7_EVNTSEL_OS (1 << 17) @@ -70,9 +96,9 @@ extern int logical_proc_id[]; */ #define MSR_P4_IQ_COUNTER0 0x30C #define MSR_P4_IQ_COUNTER1 0x30D -#define MSR_P4_IQ_CCCR0 0x36C -#define MSR_P4_IQ_CCCR1 0x36D -#define MSR_P4_CRU_ESCR0 0x3B8 /* ESCR no. 4 */ +//#define MSR_P4_IQ_CCCR0 0x36C +//#define MSR_P4_IQ_CCCR1 0x36D +//#define MSR_P4_CRU_ESCR0 0x3B8 /* ESCR no. 4 */ #define P4_NMI_CRU_ESCR0 \ (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS0|P4_ESCR_USR0| \ P4_ESCR_OS1|P4_ESCR_USR1) @@ -128,6 +154,69 @@ static inline void nmi_pm_init(void) { } * Original code written by Keith Owens. */ +static void disable_lapic_nmi_watchdog(void) +{ + if (nmi_active <= 0) + return; + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + wrmsr(MSR_K7_EVNTSEL0, 0, 0); + break; + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + wrmsr(MSR_P6_EVNTSEL0, 0, 0); + break; + case 15: + if (logical_proc_id[smp_processor_id()] == 0) + { + wrmsr(MSR_P4_IQ_CCCR0, 0, 0); + wrmsr(MSR_P4_CRU_ESCR0, 0, 0); + } else { + wrmsr(MSR_P4_IQ_CCCR1, 0, 0); + } + break; + } + break; + } + nmi_active = -1; + /* tell do_nmi() and others that we're not active any more */ + nmi_watchdog = 0; +} + +static void enable_lapic_nmi_watchdog(void) +{ + if (nmi_active < 0) { + nmi_watchdog = NMI_LOCAL_APIC; + setup_apic_nmi_watchdog(); + } +} + +int reserve_lapic_nmi(void) +{ + unsigned int old_owner; + spin_lock(&lapic_nmi_owner_lock); + old_owner = lapic_nmi_owner; + lapic_nmi_owner |= LAPIC_NMI_RESERVED; + spin_unlock(&lapic_nmi_owner_lock); + if (old_owner & LAPIC_NMI_RESERVED) + return -EBUSY; + if (old_owner & LAPIC_NMI_WATCHDOG) + disable_lapic_nmi_watchdog(); + return 0; +} + +void release_lapic_nmi(void) +{ + unsigned int new_owner; + spin_lock(&lapic_nmi_owner_lock); + new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; + lapic_nmi_owner = new_owner; + spin_unlock(&lapic_nmi_owner_lock); + if (new_owner & LAPIC_NMI_WATCHDOG) + enable_lapic_nmi_watchdog(); +} + static void __pminit clear_msr_range(unsigned int base, unsigned int n) { unsigned int i; @@ -252,6 +341,8 @@ void __pminit setup_apic_nmi_watchdog(vo default: return; } + lapic_nmi_owner = LAPIC_NMI_WATCHDOG; + nmi_active = 1; nmi_pm_init(); } @@ -316,3 +407,7 @@ void nmi_watchdog_tick (struct xen_regs } } } + +EXPORT_SYMBOL(reserve_lapic_nmi); +EXPORT_SYMBOL(release_lapic_nmi); + diff -Naurp ../xeno-unstable.bk/xen/arch/x86/oprofile/Makefile xen/arch/x86/oprofile/Makefile --- ../xeno-unstable.bk/xen/arch/x86/oprofile/Makefile 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/oprofile/Makefile 2005-04-15 08:33:57.000000000 -0500 @@ -0,0 +1,34 @@ + +include $(BASEDIR)/Rules.mk + + +# FLAGS is identical to CFLAGS except for -Werror +FLAGS := -nostdinc -fno-builtin -fno-common -fno-strict-aliasing +FLAGS += -iwithprefix include -Wall -pipe +FLAGS += -I$(BASEDIR)/include -Wno-pointer-arith -Wredundant-decls + +ifeq ($(optimize),y) +FLAGS += -O3 -fomit-frame-pointer +else +x86_32/usercopy.o: CFLAGS += -O1 +endif + +# Prevent floating-point variables from creeping into Xen. 
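+# (-msoft-float below keeps gcc from emitting hardware floating-point
+#  instructions, matching the flags used for the rest of the hypervisor)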
+FLAGS += -msoft-float + +ifeq ($(TARGET_SUBARCH),x86_32) +FLAGS += -m32 -march=i686 +LDFLAGS := --oformat elf32-i386 +endif + +ifeq ($(TARGET_SUBARCH),x86_64) +FLAGS += -m64 -mno-red-zone -fpic -fno-reorder-blocks +FLAGS += -fno-asynchronous-unwind-tables +endif + +default: $(OBJS) + $(LD) $(LDFLAGS) -r -o oprofile.o $(OBJS) + +%.o: %.c $(HDRS) Makefile + $(CC) $(FLAGS) -c $< -o $@ + diff -Naurp ../xeno-unstable.bk/xen/arch/x86/oprofile/nmi_int.c xen/arch/x86/oprofile/nmi_int.c --- ../xeno-unstable.bk/xen/arch/x86/oprofile/nmi_int.c 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/oprofile/nmi_int.c 2005-04-15 08:33:57.000000000 -0500 @@ -0,0 +1,430 @@ +/** + * @file nmi_int.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "op_counter.h" +#include "op_x86_model.h" + +static struct op_x86_model_spec const * model; +static struct op_msrs cpu_msrs[NR_CPUS]; +static unsigned long saved_lvtpc[NR_CPUS]; + +#define VIRQ_BITMASK_SIZE (MAX_OPROF_DOMAINS/32 + 1) + +extern int active_domains[MAX_OPROF_DOMAINS]; +extern unsigned int adomains; + +extern struct domain * primary_profiler; +extern struct domain * adomain_ptrs[MAX_OPROF_DOMAINS]; +extern unsigned long virq_ovf_pending[VIRQ_BITMASK_SIZE]; + +extern int is_active(struct domain *d); +extern int active_id(struct domain *d); +extern int is_passive(struct domain *d); +extern int is_profiled(struct domain *d); + + +int nmi_profiling_started = 0; + +int active_virq_count = 0; +int passive_virq_count = 0; +int other_virq_count = 0; +int other_id = -1; +int xen_count = 0; +int dom_count = 0; +int ovf = 0; + +int nmi_callback(struct xen_regs * regs, int cpu) +{ + int xen_mode = 0; + + ovf = model->check_ctrs(cpu, &cpu_msrs[cpu], regs); + xen_mode = RING_0(regs); + if (ovf) { + if (xen_mode) + xen_count++; + else + dom_count++; + + if (is_active(current->domain)) { + /* This is lightly incorrect. If we do not deliver + OVF virtual interrupts in a synchronous + manner, a process switch may happen in the domain + between the point the sample was collected and + the point at which a VIRQ was delivered. However, + it is not safe to call send_guest_virq from this + NMI context, it may lead to a deadlock since NMIs are + unmaskable. One optimization that we can do is + that if the sample occurs while domain code is + runnng, we know that it is safe to call + send_guest_virq, since we know no Xen code + is running at that time. 
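+           Samples taken while Xen code was running are therefore only
+           flagged in virq_ovf_pending[] here; the pending VIRQ_PMC_OVF
+           is delivered later from nmi_sanity_check(), which runs in
+           timer-interrupt context.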
+ However, this may distort the sample distribution, + because we may lose more Xen mode samples.*/ + active_virq_count++; + if (!xen_mode) { + send_guest_virq(current, VIRQ_PMC_OVF); + clear_bit(active_id(current->domain), &virq_ovf_pending[0]); + } else + set_bit(active_id(current->domain), &virq_ovf_pending[0]); + primary_profiler->shared_info->active_samples++; + } + else if (is_passive(current->domain)) { + set_bit(active_id(primary_profiler), &virq_ovf_pending[0]); + passive_virq_count++; + primary_profiler->shared_info->passive_samples++; + } + else { + other_virq_count++; + other_id = current->domain->id; + primary_profiler->shared_info->other_samples++; + } + } + return 1; +} + +static void free_msrs(void) +{ + int i; + for (i = 0; i < NR_CPUS; ++i) { + xfree(cpu_msrs[i].counters); + cpu_msrs[i].counters = NULL; + xfree(cpu_msrs[i].controls); + cpu_msrs[i].controls = NULL; + } +} + +static int allocate_msrs(void) +{ + int success = 1; + + int i; + for (i = 0; i < NR_CPUS; ++i) { + //if (!cpu_online(i)) + if (!test_bit(i, &cpu_online_map)) + continue; + + cpu_msrs[i].counters = xmalloc(struct op_msr); + if (!cpu_msrs[i].counters) { + success = 0; + break; + } + cpu_msrs[i].controls = xmalloc(struct op_msr); + if (!cpu_msrs[i].controls) { + success = 0; + break; + } + } + if (!success) + free_msrs(); + + return success; +} + +static void nmi_cpu_save_registers(struct op_msrs * msrs) +{ + unsigned int const nr_ctrs = model->num_counters; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr * counters = msrs->counters; + struct op_msr * controls = msrs->controls; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + rdmsr(counters[i].addr, + counters[i].saved.low, + counters[i].saved.high); + } + + for (i = 0; i < nr_ctrls; ++i) { + rdmsr(controls[i].addr, + controls[i].saved.low, + controls[i].saved.high); + } +} + +static void nmi_save_registers(void * dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs * msrs = &cpu_msrs[cpu]; + model->fill_in_addresses(msrs); + nmi_cpu_save_registers(msrs); +} + +int nmi_reserve_counters(void) +{ + if (!allocate_msrs()) + return -ENOMEM; + + /* We walk a thin line between law and rape here. + * We need to be careful to install our NMI handler + * without actually triggering any NMIs as this will + * break the core code horrifically. 
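+	 * The NMI callback itself is only installed later, from
+	 * nmi_enable_virq().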
+ */ + /* Don't we need to do this on all CPUs?*/ + if (reserve_lapic_nmi() < 0) { + free_msrs(); + return -EBUSY; + } + /* We need to serialize save and setup for HT because the subset + * of msrs are distinct for save and setup operations + */ + on_each_cpu(nmi_save_registers, NULL, 0, 1); + return 0; +} + +static void nmi_cpu_setup(void * dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs * msrs = &cpu_msrs[cpu]; + model->setup_ctrs(msrs); +} + +int nmi_setup_events(void) +{ + on_each_cpu(nmi_cpu_setup, NULL, 0, 1); + return 0; +} + +int nmi_enable_virq() +{ + set_nmi_callback(nmi_callback); + return 0; +} + +static void nmi_cpu_start(void * dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs const * msrs = &cpu_msrs[cpu]; + saved_lvtpc[cpu] = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, APIC_DM_NMI); + model->start(msrs); +} + +int nmi_start(void) +{ + on_each_cpu(nmi_cpu_start, NULL, 0, 1); + nmi_profiling_started = 1; + return 0; +} + +static void nmi_cpu_stop(void * dummy) +{ + unsigned int v; + int cpu = smp_processor_id(); + struct op_msrs const * msrs = &cpu_msrs[cpu]; + model->stop(msrs); + + /* restoring APIC_LVTPC can trigger an apic error because the delivery + * mode and vector nr combination can be illegal. That's by design: on + * power on apic lvt contain a zero vector nr which are legal only for + * NMI delivery mode. So inhibit apic err before restoring lvtpc + */ + if (!(apic_read(APIC_LVTPC) & APIC_DM_NMI) + || (apic_read(APIC_LVTPC) & APIC_LVT_MASKED)) { + printk("nmi_stop: APIC not good %p\n", apic_read(APIC_LVTPC)); + mdelay(5000); + } + v = apic_read(APIC_LVTERR); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, saved_lvtpc[cpu]); + apic_write(APIC_LVTERR, v); +} + +void nmi_stop(void) +{ + nmi_profiling_started = 0; + on_each_cpu(nmi_cpu_stop, NULL, 0, 1); + active_virq_count = 0; + passive_virq_count = 0; + other_virq_count = 0; + xen_count = 0; + dom_count = 0; +} + +extern unsigned int read_ctr(struct op_msrs const * const msrs, int ctr); + +void nmi_sanity_check(struct xen_regs *regs, int cpu) +{ + int i; + int masked = 0; + + /* We may have missed some NMI interrupts if we were already + in an NMI context at that time. If this happens, then + the counters are not reset and in the case of P4, the + APIC LVT disable mask is set. In both cases we end up + losing samples. On P4, this condition can be detected + by checking the APIC LVT mask. But in P6, we need to + examine the counters for overflow. 
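+	   (nmi_callback() already performs that overflow check via
+	   model->check_ctrs(), so we simply invoke it again from here.)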
So, every timer + interrupt, we check that everything is OK */ + + if (apic_read(APIC_LVTPC) & APIC_LVT_MASKED) + masked = 1; + + nmi_callback(regs, cpu); + + if (ovf && masked) { + if (is_active(current->domain)) + current->domain->shared_info->nmi_restarts++; + else if (is_passive(current->domain)) + primary_profiler->shared_info->nmi_restarts++; + } + + /*if (jiffies %1000 == 0) { + printk("cpu %d: sample count %d %d %d at %u\n", cpu, active_virq_count, passive_virq_count, other_virq_count, jiffies); + printk("other task id %d\n", other_id); + printk("%d in xen, %d in domain\n", xen_count, dom_count); + printk("counters %p %p\n", read_ctr(&cpu_msrs[cpu], 0), read_ctr(&cpu_msrs[cpu], 1)); + }*/ + + + for (i = 0; i < adomains; i++) + if (test_and_clear_bit(i, &virq_ovf_pending[0])) { + send_guest_virq(adomain_ptrs[i], VIRQ_PMC_OVF); + } +} + +void nmi_disable_virq(void) +{ + unset_nmi_callback(); +} + +static void nmi_restore_registers(struct op_msrs * msrs) +{ + unsigned int const nr_ctrs = model->num_counters; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr * counters = msrs->counters; + struct op_msr * controls = msrs->controls; + unsigned int i; + + for (i = 0; i < nr_ctrls; ++i) { + wrmsr(controls[i].addr, + controls[i].saved.low, + controls[i].saved.high); + } + + for (i = 0; i < nr_ctrs; ++i) { + wrmsr(counters[i].addr, + counters[i].saved.low, + counters[i].saved.high); + } +} + +static void nmi_cpu_shutdown(void * dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs * msrs = &cpu_msrs[cpu]; + nmi_restore_registers(msrs); +} + +void nmi_release_counters(void) +{ + on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); + release_lapic_nmi(); + free_msrs(); +} + +struct op_counter_config counter_config[OP_MAX_COUNTER]; + +static int __init p4_init(void) +{ + __u8 cpu_model = current_cpu_data.x86_model; + + if (cpu_model > 3) + return 0; + +#ifndef CONFIG_SMP + model = &op_p4_spec; + return 1; +#else + //switch (smp_num_siblings) { + if (cpu_has_ht) + { + model = &op_p4_ht2_spec; + return 1; + } + else + { + model = &op_p4_spec; + return 1; + } +#endif + return 0; +} + + +static int __init ppro_init(void) +{ + __u8 cpu_model = current_cpu_data.x86_model; + + if (cpu_model > 0xd) + return 0; + + model = &op_ppro_spec; + return 1; +} + +int nmi_init(int *num_events, int *is_primary) +{ + __u8 vendor = current_cpu_data.x86_vendor; + __u8 family = current_cpu_data.x86; + int prim = 0; + + if (!cpu_has_apic) + return -ENODEV; + + if (primary_profiler == NULL) { + primary_profiler = current->domain; + prim = 1; + } + + if (primary_profiler != current->domain) + goto out; + + switch (vendor) { + case X86_VENDOR_INTEL: + switch (family) { + /* Pentium IV */ + case 0xf: + if (!p4_init()) + return -ENODEV; + break; + /* A P6-class processor */ + case 6: + if (!ppro_init()) + return -ENODEV; + break; + default: + return -ENODEV; + } + break; + default: + return -ENODEV; + } +out: + if (copy_to_user((void *)num_events, (void *)&model->num_counters, sizeof(int))) + return -EFAULT; + if (copy_to_user((void *)is_primary, (void *)&prim, sizeof(int))) + return -EFAULT; + + return 0; +} + diff -Naurp ../xeno-unstable.bk/xen/arch/x86/oprofile/op_counter.h xen/arch/x86/oprofile/op_counter.h --- ../xeno-unstable.bk/xen/arch/x86/oprofile/op_counter.h 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/oprofile/op_counter.h 2005-04-15 08:33:57.000000000 -0500 @@ -0,0 +1,33 @@ +/** + * @file op_counter.h + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + 
* @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + */ + +#ifndef OP_COUNTER_H +#define OP_COUNTER_H + +#define OP_MAX_COUNTER 8 + +/* Per-perfctr configuration as set via + * oprofilefs. + */ +struct op_counter_config { + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long kernel; + unsigned long user; + unsigned long unit_mask; +}; + +extern struct op_counter_config counter_config[]; + +#endif /* OP_COUNTER_H */ diff -Naurp ../xeno-unstable.bk/xen/arch/x86/oprofile/op_model_p4.c xen/arch/x86/oprofile/op_model_p4.c --- ../xeno-unstable.bk/xen/arch/x86/oprofile/op_model_p4.c 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/oprofile/op_model_p4.c 2005-04-15 08:33:57.000000000 -0500 @@ -0,0 +1,744 @@ +/** + * @file op_model_p4.c + * P4 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + */ + +#include +#include +#include +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_EVENTS 39 + +#define NUM_COUNTERS_NON_HT 8 +#define NUM_ESCRS_NON_HT 45 +#define NUM_CCCRS_NON_HT 18 +#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) + +#define NUM_COUNTERS_HT2 4 +#define NUM_ESCRS_HT2 23 +#define NUM_CCCRS_HT2 9 +#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) + +static unsigned int num_counters = NUM_COUNTERS_NON_HT; + + +/* this has to be checked dynamically since the + hyper-threadedness of a chip is discovered at + kernel boot-time. */ +static inline void setup_num_counters(void) +{ +#ifdef CONFIG_SMP + if (cpu_has_ht) + num_counters = NUM_COUNTERS_HT2; +#endif +} + +static int inline addr_increment(void) +{ +#ifdef CONFIG_SMP + return cpu_has_ht ? 2 : 1; +#else + return 1; +#endif +} + + +/* tables to simulate simplified hardware view of p4 registers */ +struct p4_counter_binding { + int virt_counter; + int counter_address; + int cccr_address; +}; + +struct p4_event_binding { + int escr_select; /* value to put in CCCR */ + int event_select; /* value to put in ESCR */ + struct { + int virt_counter; /* for this counter... */ + int escr_address; /* use this ESCR */ + } bindings[2]; +}; + +/* nb: these CTR_* defines are a duplicate of defines in + event/i386.p4*events. */ + + +#define CTR_BPU_0 (1 << 0) +#define CTR_MS_0 (1 << 1) +#define CTR_FLAME_0 (1 << 2) +#define CTR_IQ_4 (1 << 3) +#define CTR_BPU_2 (1 << 4) +#define CTR_MS_2 (1 << 5) +#define CTR_FLAME_2 (1 << 6) +#define CTR_IQ_5 (1 << 7) + +static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { + { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, + { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, + { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, + { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, + { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, + { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, + { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, + { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } +}; + +#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT + +/* All cccr we don't use. 
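+   (p4_setup_ctrs() still clears these -- see "clear cccrs outside our
+   concern" below -- so stale CCCR state cannot raise stray PMIs.)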
*/ +static int p4_unused_cccr[NUM_UNUSED_CCCRS] = { + MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3, + MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3, + MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3, + MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR1, + MSR_P4_IQ_CCCR2, MSR_P4_IQ_CCCR3 +}; + +/* p4 event codes in libop/op_event.h are indices into this table. */ + +static struct p4_event_binding p4_events[NUM_EVENTS] = { + + { /* BRANCH_RETIRED */ + 0x05, 0x06, + { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, + {CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* MISPRED_BRANCH_RETIRED */ + 0x04, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* TC_DELIVER_MODE */ + 0x01, 0x01, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { CTR_MS_2, MSR_P4_TC_ESCR1} } + }, + + { /* BPU_FETCH_REQUEST */ + 0x00, 0x03, + { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, + { CTR_BPU_2, MSR_P4_BPU_ESCR1} } + }, + + { /* ITLB_REFERENCE */ + 0x03, 0x18, + { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, + { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } + }, + + { /* MEMORY_CANCEL */ + 0x05, 0x02, + { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, + { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } + }, + + { /* MEMORY_COMPLETE */ + 0x02, 0x08, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* LOAD_PORT_REPLAY */ + 0x02, 0x04, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* STORE_PORT_REPLAY */ + 0x02, 0x05, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* MOB_LOAD_REPLAY */ + 0x02, 0x03, + { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, + { CTR_BPU_2, MSR_P4_MOB_ESCR1} } + }, + + { /* PAGE_WALK_TYPE */ + 0x04, 0x01, + { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, + { CTR_BPU_2, MSR_P4_PMH_ESCR1} } + }, + + { /* BSQ_CACHE_REFERENCE */ + 0x07, 0x0c, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { CTR_BPU_2, MSR_P4_BSU_ESCR1} } + }, + + { /* IOQ_ALLOCATION */ + 0x06, 0x03, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { 0, 0 } } + }, + + { /* IOQ_ACTIVE_ENTRIES */ + 0x06, 0x1a, + { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, + { 0, 0 } } + }, + + { /* FSB_DATA_ACTIVITY */ + 0x06, 0x17, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* BSQ_ALLOCATION */ + 0x07, 0x05, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { 0, 0 } } + }, + + { /* BSQ_ACTIVE_ENTRIES */ + 0x07, 0x06, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, + { 0, 0 } } + }, + + { /* X87_ASSIST */ + 0x05, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* SSE_INPUT_ASSIST */ + 0x01, 0x34, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_SP_UOP */ + 0x01, 0x08, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_DP_UOP */ + 0x01, 0x0c, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_SP_UOP */ + 0x01, 0x0a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_DP_UOP */ + 0x01, 0x0e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 64BIT_MMX_UOP */ + 0x01, 0x02, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 128BIT_MMX_UOP */ + 0x01, 0x1a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_FP_UOP */ + 0x01, 0x04, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_SIMD_MOVES_UOP */ + 0x01, 0x2e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + 
+ { /* MACHINE_CLEAR */ + 0x05, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* GLOBAL_POWER_EVENTS */ + 0x06, 0x13 /* older manual says 0x05, newer 0x13 */, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* TC_MS_XFER */ + 0x00, 0x05, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* UOP_QUEUE_WRITES */ + 0x00, 0x09, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* FRONT_END_EVENT */ + 0x05, 0x08, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* EXECUTION_EVENT */ + 0x05, 0x0c, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* REPLAY_EVENT */ + 0x05, 0x09, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* INSTR_RETIRED */ + 0x04, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOPS_RETIRED */ + 0x04, 0x01, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOP_TYPE */ + 0x02, 0x02, + { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, + { CTR_IQ_5, MSR_P4_RAT_ESCR1} } + }, + + { /* RETIRED_MISPRED_BRANCH_TYPE */ + 0x02, 0x05, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + }, + + { /* RETIRED_BRANCH_TYPE */ + 0x02, 0x04, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + } +}; + + +#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) + +#define ESCR_RESERVED_BITS 0x80000003 +#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) +#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) +#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) +#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) +#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) +#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) +#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) +#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) +#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) + +#define CCCR_RESERVED_BITS 0x38030FFF +#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) +#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) +#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) +#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) +#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) +#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) +#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) +#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) +#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) +#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) +#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) + +#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) +#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) +#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) + + +/* this assigns a "stagger" to the current CPU, which is used throughout + the code in this module as an extra array offset, to select the "even" + or "odd" part of all the divided resources. 
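+   In this Xen port get_stagger() below is hard-wired to return 0, so
+   both hyperthreads of a package currently address the same set of
+   counters.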
*/ +static unsigned int get_stagger(void) +{ +#ifdef CONFIG_SMP + /*int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu]));*/ + /* We want the two logical cpus of a physical cpu to use + disjoint set of counters. The following code is wrong. */ + return 0; +#endif + return 0; +} + + +/* finally, mediate access to a real hardware counter + by passing a "virtual" counter numer to this macro, + along with your stagger setting. */ +#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) + +static unsigned long reset_value[NUM_COUNTERS_NON_HT]; + + +static void p4_fill_in_addresses(struct op_msrs * const msrs) +{ + unsigned int i; + unsigned int addr, stag; + + setup_num_counters(); + stag = get_stagger(); + + /* the counter registers we pay attention to */ + for (i = 0; i < num_counters; ++i) { + msrs->counters[i].addr = + p4_counters[VIRT_CTR(stag, i)].counter_address; + } + + /* FIXME: bad feeling, we don't save the 10 counters we don't use. */ + + /* 18 CCCR registers */ + for (i = 0, addr = MSR_P4_BPU_CCCR0 + stag; + addr <= MSR_P4_IQ_CCCR5; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + /* 43 ESCR registers in three or four discontiguous group */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1 + * to avoid special case in nmi_{save|restore}_registers() */ + if (boot_cpu_data.x86_model >= 0x3) { + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + } else { + for (addr = MSR_P4_IQ_ESCR0 + stag; + addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + } + + for (addr = MSR_P4_RAT_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + /* there are 2 remaining non-contiguously located ESCRs */ + + if (num_counters == NUM_COUNTERS_NON_HT) { + /* standard non-HT CPUs handle both remaining ESCRs*/ + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else if (stag == 0) { + /* HT CPUs give the first remainder to the even thread, as + the 32nd control register */ + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else { + /* and two copies of the second to the odd thread, + for the 22st and 23nd control registers */ + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + } +} + + +static void pmc_setup_one_p4_counter(unsigned int ctr) +{ + int i; + int const maxbind = 2; + unsigned int cccr = 0; + unsigned int escr = 0; + unsigned int high = 0; + unsigned int counter_bit; + struct p4_event_binding *ev = NULL; + unsigned int stag; + + stag = get_stagger(); + + /* convert from counter *number* to counter *bit* */ + counter_bit = 1 << VIRT_CTR(stag, ctr); + + /* find our event binding structure. 
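+	   (counter_config[ctr].event is a 1-based index into p4_events[],
+	   range-checked just below.)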
*/ + if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", + counter_config[ctr].event); + return; + } + + ev = &(p4_events[counter_config[ctr].event - 1]); + + for (i = 0; i < maxbind; i++) { + if (ev->bindings[i].virt_counter & counter_bit) { + + /* modify ESCR */ + ESCR_READ(escr, high, ev, i); + ESCR_CLEAR(escr); + if (stag == 0) { + ESCR_SET_USR_0(escr, counter_config[ctr].user); + ESCR_SET_OS_0(escr, counter_config[ctr].kernel); + } else { + ESCR_SET_USR_1(escr, counter_config[ctr].user); + ESCR_SET_OS_1(escr, counter_config[ctr].kernel); + } + ESCR_SET_EVENT_SELECT(escr, ev->event_select); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + ESCR_WRITE(escr, high, ev, i); + + /* modify CCCR */ + CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + CCCR_CLEAR(cccr); + CCCR_SET_REQUIRED_BITS(cccr); + CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); + if (stag == 0) { + CCCR_SET_PMI_OVF_0(cccr); + } else { + CCCR_SET_PMI_OVF_1(cccr); + } + CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + return; + } + } + + printk(KERN_ERR + "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", + counter_config[ctr].event, stag, ctr); +} + + +static void p4_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int i; + unsigned int low, high; + unsigned int addr; + unsigned int stag; + + stag = get_stagger(); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (! MISC_PMC_ENABLED_P(low)) { + printk(KERN_ERR "oprofile: P4 PMC not available\n"); + return; + } + + /* clear the cccrs we will use */ + for (i = 0 ; i < num_counters ; i++) { + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } + + /* clear cccrs outside our concern */ + for (i = stag ; i < NUM_UNUSED_CCCRS ; i += addr_increment()) { + rdmsr(p4_unused_cccr[i], low, high); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + wrmsr(p4_unused_cccr[i], low, high); + } + + /* clear all escrs (including those outside our concern) */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr < MSR_P4_IQ_ESCR0; addr += addr_increment()) { + wrmsr(addr, 0, 0); + } + + /* On older models clear also MSR_P4_IQ_ESCR0/1 */ + if (boot_cpu_data.x86_model < 0x3) { + wrmsr(MSR_P4_IQ_ESCR0, 0, 0); + wrmsr(MSR_P4_IQ_ESCR1, 0, 0); + } + + for (addr = MSR_P4_RAT_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; addr += addr_increment()){ + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; addr += addr_increment()){ + wrmsr(addr, 0, 0); + } + + if (num_counters == NUM_COUNTERS_NON_HT) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } else if (stag == 0) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + } else { + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } + + /* setup all counters */ + for (i = 0 ; i < num_counters ; ++i) { + if (counter_config[i].enabled) { + reset_value[i] = counter_config[i].count; + pmc_setup_one_p4_counter(i); + CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + } else { + reset_value[i] = 0; + } + } +} + + +extern void pmc_log_event(struct domain *d, unsigned int eip, int mode, int event); +extern int is_profiled(struct domain * d); +extern struct domain * primary_profiler; + +static int p4_check_ctrs(unsigned int const cpu, + struct op_msrs const * const msrs, + 
struct xen_regs * const regs) +{ + unsigned long ctr, low, high, stag, real; + int i, ovf = 0; + unsigned long eip = regs->eip; + int mode = 0; + + if (RING_1(regs)) + mode = 1; + else if (RING_0(regs)) + mode = 2; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + + /* + * there is some eccentricity in the hardware which + * requires that we perform 2 extra corrections: + * + * - check both the CCCR:OVF flag for overflow and the + * counter high bit for un-flagged overflows. + * + * - write the counter back twice to ensure it gets + * updated properly. + * + * the former seems to be related to extra NMIs happening + * during the current NMI; the latter is reported as errata + * N15 in intel doc 249199-029, pentium 4 specification + * update, though their suggested work-around does not + * appear to solve the problem. + */ + + real = VIRT_CTR(stag, i); + + CCCR_READ(low, high, real); + CTR_READ(ctr, high, real); + if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + pmc_log_event(current, eip, mode, i); + CTR_WRITE(reset_value[i], real); + CCCR_CLEAR_OVF(low); + CCCR_WRITE(low, high, real); + CTR_WRITE(reset_value[i], real); + ovf = 1; + } + } + + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* See op_model_ppro.c */ + return ovf; +} + + +static void p4_start(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_ENABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +static void p4_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_DISABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +#ifdef CONFIG_SMP +struct op_x86_model_spec const op_p4_ht2_spec = { + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop +}; +#endif + +struct op_x86_model_spec const op_p4_spec = { + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop +}; diff -Naurp ../xeno-unstable.bk/xen/arch/x86/oprofile/op_model_ppro.c xen/arch/x86/oprofile/op_model_ppro.c --- ../xeno-unstable.bk/xen/arch/x86/oprofile/op_model_ppro.c 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/oprofile/op_model_ppro.c 2005-04-15 08:33:57.000000000 -0500 @@ -0,0 +1,166 @@ +/** + * @file op_model_ppro.h + * pentium pro / P6 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_COUNTERS 2 +#define NUM_CONTROLS 2 + +#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) +#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) + +#define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0) +#define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_CLEAR(x) (x &= (1<<21)) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16)) +#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_EVENT(val, e) (val |= e) + +static unsigned long reset_value[NUM_COUNTERS]; + +static void ppro_fill_in_addresses(struct op_msrs * const msrs) +{ + msrs->counters[0].addr = MSR_P6_PERFCTR0; + msrs->counters[1].addr = MSR_P6_PERFCTR1; + + msrs->controls[0].addr = MSR_P6_EVNTSEL0; + msrs->controls[1].addr = MSR_P6_EVNTSEL1; +} + + +static void ppro_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* clear all counters */ + for (i = 0 ; i < NUM_CONTROLS; ++i) { + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR(low); + CTRL_WRITE(low, high, msrs, i); + } + + /* avoid a false detection of ctr overflows in NMI handler */ + for (i = 0; i < NUM_COUNTERS; ++i) { + CTR_WRITE(1, msrs, i); + } + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if (counter_config[i].enabled) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); + + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR(low); + CTRL_SET_ENABLE(low); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT(low, counter_config[i].event); + CTRL_WRITE(low, high, msrs, i); + } + } +} + +extern void pmc_log_event(struct domain *d, unsigned int eip, int mode, int event); +extern int is_profiled(struct domain * d); +extern struct domain * primary_profiler; + +static int ppro_check_ctrs(unsigned int const cpu, + struct op_msrs const * const msrs, + struct xen_regs * const regs) +{ + unsigned int low, high; + int i, ovf = 0; + unsigned long eip = regs->eip; + int mode = 0; + + if (RING_1(regs)) + mode = 1; + else if (RING_0(regs)) + mode = 2; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + pmc_log_event(current, eip, mode, i); + CTR_WRITE(reset_value[i], msrs, i); + ovf = 1; + } + } + + /* Only P6 based Pentium M need to re-unmask the apic vector but it + * doesn't hurt other P6 variant */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* We can't work out if we really handled an interrupt. We + * might have caught a *second* counter just after overflowing + * the interrupt for this counter then arrives + * and we don't find a counter that's overflowed, so we + * would return 0 and get dazed + confused. Instead we always + * assume we found an overflow. This sucks. 
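+	 * (In this Xen port the return value is ovf, i.e. an overflow is
+	 * only reported when one of the counters really wrapped.)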
+ */ + return ovf; +} + + +static void ppro_start(struct op_msrs const * const msrs) +{ + unsigned int low,high; + CTRL_READ(low, high, msrs, 0); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, 0); +} + +static void ppro_stop(struct op_msrs const * const msrs) +{ + unsigned int low,high; + CTRL_READ(low, high, msrs, 0); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, 0); +} + +unsigned int read_ctr(struct op_msrs const * const msrs, int i) +{ + unsigned int low, high; + CTR_READ(low, high, msrs, i); + return low; +} + +struct op_x86_model_spec const op_ppro_spec = { + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop +}; diff -Naurp ../xeno-unstable.bk/xen/arch/x86/oprofile/op_x86_model.h xen/arch/x86/oprofile/op_x86_model.h --- ../xeno-unstable.bk/xen/arch/x86/oprofile/op_x86_model.h 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/oprofile/op_x86_model.h 2005-04-15 08:33:57.000000000 -0500 @@ -0,0 +1,55 @@ +/** + * @file op_x86_model.h + * interface to x86 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + */ + +#ifndef OP_X86_MODEL_H +#define OP_X86_MODEL_H + +struct op_saved_msr { + unsigned int high; + unsigned int low; +}; + +struct op_msr { + unsigned long addr; + struct op_saved_msr saved; +}; + +struct op_msrs { + struct op_msr * counters; + struct op_msr * controls; +}; + +struct pt_regs; + +/* The model vtable abstracts the differences between + * various x86 CPU model's perfctr support. + */ +struct op_x86_model_spec { + unsigned int const num_counters; + unsigned int const num_controls; + void (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_msrs const * const msrs); + int (*check_ctrs)(unsigned int const cpu, + struct op_msrs const * const msrs, + struct xen_regs * const regs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); +}; + +extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec const op_p4_spec; +extern struct op_x86_model_spec const op_p4_ht2_spec; +extern struct op_x86_model_spec const op_athlon_spec; + +#endif /* OP_X86_MODEL_H */ diff -Naurp ../xeno-unstable.bk/xen/arch/x86/oprofile/pmc.c xen/arch/x86/oprofile/pmc.c --- ../xeno-unstable.bk/xen/arch/x86/oprofile/pmc.c 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/oprofile/pmc.c 2005-04-15 08:33:57.000000000 -0500 @@ -0,0 +1,281 @@ +/* + * Copyright (C) 2005 Hewlett-Packard Co. 
+ * written by Aravind Menon, email: xenoprof@xxxxxxxxxxxxx + */ + +#include + +#include "op_counter.h" + +int active_domains[MAX_OPROF_DOMAINS]; +int passive_domains[MAX_OPROF_DOMAINS]; +unsigned int adomains = 0; +unsigned int pdomains = 0; +unsigned int activated = 0; + +#define VIRQ_BITMASK_SIZE (MAX_OPROF_DOMAINS/32 + 1) + +struct domain * primary_profiler = NULL; +struct domain * adomain_ptrs[MAX_OPROF_DOMAINS]; +unsigned int virq_ovf_pending[VIRQ_BITMASK_SIZE]; + +int is_active(struct domain *d) +{ + int i; + for (i = 0; i < adomains; i++) + if (d->id == active_domains[i]) + return 1; + return 0; +} + +int active_id(struct domain *d) +{ + int i; + for (i = 0; i < adomains; i++) + if (d == adomain_ptrs[i]) + return i; + return -1; +} + +void free_adomain_ptrs() +{ + int i; + int num = adomains; + + adomains = 0; + for (i = 0; i < VIRQ_BITMASK_SIZE; i++) + virq_ovf_pending[i] = 0; + + for (i = 0; i < num; i++) { + put_domain(adomain_ptrs[i]); + adomain_ptrs[i] = NULL; + } +} + +int set_adomain_ptrs(int num) +{ + int i; + struct domain *d; + + for (i = 0; i < VIRQ_BITMASK_SIZE; i++) + virq_ovf_pending[i] = 0; + + for (i = 0; i < num; i++) { + d = find_domain_by_id(active_domains[i]); + if (!d) { + free_adomain_ptrs(); + return -EFAULT; + } + adomain_ptrs[i] = d; + adomains++; + } + return 0; +} + +int set_active(struct domain *d) +{ + if (is_active(d)) + return 0; + /* hack if we run out of space */ + if (adomains >= MAX_OPROF_DOMAINS) { + adomains--; + put_domain(adomain_ptrs[adomains]); + } + active_domains[adomains] = d->id; + if (get_domain(d)) + adomain_ptrs[adomains++] = d; + else { + free_adomain_ptrs(); + return -EFAULT; + } + return 0; +} + +int is_passive(struct domain *d) +{ + int i; + for (i = 0; i < pdomains; i++) + if (d->id == passive_domains[i]) + return 1; + return 0; +} + +int is_profiled(struct domain *d) +{ + if (is_active(d) || is_passive(d)) + return 1; + return 0; +} + +void pmc_log_event(struct domain *d, unsigned int eip, int mode, int event) +{ + shared_info_t *s = NULL; + struct domain *dest = d; + int head = 0; + + if (!is_profiled(d)) + return; + + if (is_passive(d)) { + dest = primary_profiler; + goto log_passive; + } + +log_active: + s = dest->shared_info; + + head = s->event_head; + if (head >= MAX_OPROF_EVENTS) + head = 0; + + if (s->losing_samples) + s->samples_lost++; + if (head == s->event_tail - 1 || (head == MAX_OPROF_EVENTS - 1 && s->event_tail == 0)) + s->losing_samples = 1; + + s->event_log[head].eip = eip; + s->event_log[head].mode = mode; + s->event_log[head].event = event; + head++; + s->event_head = head; + return; + +log_passive: + /* We use the following inefficient format for logging events from other + domains. We put a special record indicating that the next record is + for another domain. 
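+	   The escape record is written with eip == ~1UL and mode == ~0,
+	   and its event field carries the passive domain's id; the real
+	   sample is then appended via the log_active path.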
This is done for each sample from another + domain */ + s = dest->shared_info; + + head = s->event_head; + if (head >= MAX_OPROF_EVENTS) + head = 0; + + if (s->losing_samples) + s->samples_lost++; + if (head == s->event_tail - 1 || (head == MAX_OPROF_EVENTS - 1 && s->event_tail == 0)) + s->losing_samples = 1; + + s->event_log[head].eip = ~1UL; + s->event_log[head].mode = ~0; + s->event_log[head].event = d->id; + head++; + s->event_head = head; + goto log_active; +} + +static void pmc_event_init(struct domain *d) +{ + shared_info_t *s = d->shared_info; + s->event_head = 0; + s->event_tail = 0; + s->losing_samples = 0; + s->samples_lost = 0; + s->nmi_restarts = 0; + s->active_samples = 0; + s->passive_samples = 0; + s->other_samples = 0; +} + +extern int nmi_init(int *num_events, int *is_primary); +extern int nmi_reserve_counters(void); +extern int nmi_setup_events(void); +extern int nmi_enable_virq(void); +extern int nmi_start(void); +extern void nmi_stop(void); +extern void nmi_disable_virq(void); +extern void nmi_release_counters(void); + +#define PRIV_OP(op) ((op == PMC_SET_ACTIVE) || (op == PMC_SET_PASSIVE) || (op == PMC_RESERVE_COUNTERS) \ + || (op == PMC_SETUP_EVENTS) || (op == PMC_START) || (op == PMC_STOP) \ + || (op == PMC_RELEASE_COUNTERS) || (op == PMC_SHUTDOWN)) + +int do_pmc_op(int op, unsigned int arg1, unsigned int arg2) +{ + int ret = 0; + + if (PRIV_OP(op) && current->domain != primary_profiler) + return -EPERM; + + switch (op) { + case PMC_INIT: + ret = nmi_init((int *)arg1, (int *)arg2); + break; + + case PMC_SET_ACTIVE: + if (adomains != 0) + return -EPERM; + if (copy_from_user((void *)&active_domains, + (void *)arg1, arg2*sizeof(int))) + return -EFAULT; + if (set_adomain_ptrs(arg2)) + return -EFAULT; + if (set_active(current->domain)) + return -EFAULT; + break; + + case PMC_SET_PASSIVE: + if (pdomains != 0) + return -EPERM; + if (copy_from_user((void *)&passive_domains, + (void *)arg1, arg2*sizeof(int))) + return -EFAULT; + pdomains = arg2; + break; + + case PMC_RESERVE_COUNTERS: + ret = nmi_reserve_counters(); + break; + + case PMC_SETUP_EVENTS: + if (copy_from_user((void *)&counter_config, + (void *)arg1, arg2*sizeof(struct op_counter_config))) + return -EFAULT; + ret = nmi_setup_events(); + break; + + case PMC_ENABLE_VIRQ: + if (!is_active(current->domain)) { + if (current->domain != primary_profiler) + return -EPERM; + else + set_active(current->domain); + } + ret = nmi_enable_virq(); + pmc_event_init(current->domain); + activated++; + break; + + case PMC_START: + if (activated < adomains) + return -EPERM; + ret = nmi_start(); + break; + + case PMC_STOP: + nmi_stop(); + break; + + case PMC_DISABLE_VIRQ: + if (!is_active(current->domain)) + return -EPERM; + nmi_disable_virq(); + activated--; + break; + + case PMC_RELEASE_COUNTERS: + nmi_release_counters(); + break; + + case PMC_SHUTDOWN: + free_adomain_ptrs(); + pdomains = 0; + activated = 0; + primary_profiler = NULL; + break; + + default: + ret = -EINVAL; + } + return ret; +} diff -Naurp ../xeno-unstable.bk/xen/arch/x86/traps.c xen/arch/x86/traps.c --- ../xeno-unstable.bk/xen/arch/x86/traps.c 2005-04-15 08:27:45.000000000 -0500 +++ xen/arch/x86/traps.c 2005-04-15 08:33:59.000000000 -0500 @@ -2,6 +2,10 @@ * arch/x86/traps.c * * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser + * + * Modified by Aravind Menon for supporting oprofile + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -51,6 +55,7 @@ #include #include #include +#include #include #include @@ -898,13 +903,12 @@ static void unknown_nmi_error(unsigned c printk("Do you have a strange power saving mode enabled?\n"); } -asmlinkage void do_nmi(struct xen_regs *regs, unsigned long reason) +static void default_do_nmi(struct xen_regs * regs, unsigned long reason) { ++nmi_count(smp_processor_id()); if ( nmi_watchdog ) nmi_watchdog_tick(regs); - if ( reason & 0x80 ) mem_parity_error(regs); else if ( reason & 0x40 ) @@ -913,6 +917,36 @@ asmlinkage void do_nmi(struct xen_regs * unknown_nmi_error((unsigned char)(reason&0xff)); } +static int dummy_nmi_callback(struct xen_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; + +asmlinkage void do_nmi(struct xen_regs * regs, unsigned long reason) +{ + int cpu; + cpu = smp_processor_id(); + + if (!nmi_callback(regs, cpu)) + default_do_nmi(regs, reason); +} + +void set_nmi_callback(nmi_callback_t callback) +{ + nmi_callback = callback; +} + +void unset_nmi_callback(void) +{ + nmi_callback = dummy_nmi_callback; +} + +EXPORT_SYMBOL(set_nmi_callback); +EXPORT_SYMBOL(unset_nmi_callback); + + asmlinkage int math_state_restore(struct xen_regs *regs) { /* Prevent recursion. */ diff -Naurp ../xeno-unstable.bk/xen/arch/x86/traps.c.orig xen/arch/x86/traps.c.orig --- ../xeno-unstable.bk/xen/arch/x86/traps.c.orig 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/traps.c.orig 2005-04-15 08:33:16.000000000 -0500 @@ -0,0 +1,1194 @@ +/****************************************************************************** + * arch/x86/traps.c + * + * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * opt_nmi: one of 'ignore', 'dom0', or 'fatal'. + * fatal: Xen prints diagnostic message and then hangs. + * dom0: The NMI is virtualised to DOM0. + * ignore: The NMI error is cleared and ignored. 
+ */ +#ifdef NDEBUG +char opt_nmi[10] = "dom0"; +#else +char opt_nmi[10] = "fatal"; +#endif +string_param("nmi", opt_nmi); + +/* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/ +idt_entry_t idt_table[IDT_ENTRIES]; + +asmlinkage void divide_error(void); +asmlinkage void debug(void); +asmlinkage void nmi(void); +asmlinkage void int3(void); +asmlinkage void overflow(void); +asmlinkage void bounds(void); +asmlinkage void invalid_op(void); +asmlinkage void device_not_available(void); +asmlinkage void coprocessor_segment_overrun(void); +asmlinkage void invalid_TSS(void); +asmlinkage void segment_not_present(void); +asmlinkage void stack_segment(void); +asmlinkage void general_protection(void); +asmlinkage void page_fault(void); +asmlinkage void coprocessor_error(void); +asmlinkage void simd_coprocessor_error(void); +asmlinkage void alignment_check(void); +asmlinkage void spurious_interrupt_bug(void); +asmlinkage void machine_check(void); + +/* + * This is called for faults at very unexpected times (e.g., when interrupts + * are disabled). In such situations we can't do much that is safe. We try to + * print out some tracing and then we just spin. + */ +asmlinkage void fatal_trap(int trapnr, struct xen_regs *regs) +{ + int cpu = smp_processor_id(); + unsigned long cr2; + static char *trapstr[] = { + "divide error", "debug", "nmi", "bkpt", "overflow", "bounds", + "invalid operation", "device not available", "double fault", + "coprocessor segment", "invalid tss", "segment not found", + "stack error", "general protection fault", "page fault", + "spurious interrupt", "coprocessor error", "alignment check", + "machine check", "simd error" + }; + + watchdog_on = 0; + + show_registers(regs); + + if ( trapnr == TRAP_page_fault ) + { + __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : ); + printk("Faulting linear address might be %p\n", cr2); + } + + printk("************************************\n"); + printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n", + cpu, trapnr, trapstr[trapnr], regs->error_code, + (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT"); + printk("System shutting down -- need manual reset.\n"); + printk("************************************\n"); + + (void)debugger_trap_fatal(trapnr, regs); + + /* Lock up the console to prevent spurious output from other CPUs. */ + console_force_lock(); + + /* Wait for manual reset. 
*/ + for ( ; ; ) + __asm__ __volatile__ ( "hlt" ); +} + +static inline int do_trap(int trapnr, char *str, + struct xen_regs *regs, + int use_error_code) +{ + struct exec_domain *ed = current; + struct trap_bounce *tb = &ed->arch.trap_bounce; + trap_info_t *ti; + unsigned long fixup; + + DEBUGGER_trap_entry(trapnr, regs); + + if ( !GUEST_MODE(regs) ) + goto xen_fault; + +#ifndef NDEBUG + if ( (ed->arch.traps[trapnr].address == 0) && (ed->domain->id == 0) ) + goto xen_fault; +#endif + + ti = current->arch.traps + trapnr; + tb->flags = TBF_EXCEPTION; + tb->cs = ti->cs; + tb->eip = ti->address; + if ( use_error_code ) + { + tb->flags |= TBF_EXCEPTION_ERRCODE; + tb->error_code = regs->error_code; + } + if ( TI_GET_IF(ti) ) + ed->vcpu_info->evtchn_upcall_mask = 1; + return 0; + + xen_fault: + + if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) + { + DPRINTK("Trap %d: %p -> %p\n", trapnr, regs->eip, fixup); + regs->eip = fixup; + return 0; + } + + DEBUGGER_trap_fatal(trapnr, regs); + + show_registers(regs); + panic("CPU%d FATAL TRAP: vector = %d (%s)\n" + "[error_code=%04x]\n", + smp_processor_id(), trapnr, str, regs->error_code); + return 0; +} + +#define DO_ERROR_NOCODE(trapnr, str, name) \ +asmlinkage int do_##name(struct xen_regs *regs) \ +{ \ + return do_trap(trapnr, str, regs, 0); \ +} + +#define DO_ERROR(trapnr, str, name) \ +asmlinkage int do_##name(struct xen_regs *regs) \ +{ \ + return do_trap(trapnr, str, regs, 1); \ +} + +DO_ERROR_NOCODE( 0, "divide error", divide_error) +DO_ERROR_NOCODE( 4, "overflow", overflow) +DO_ERROR_NOCODE( 5, "bounds", bounds) +DO_ERROR_NOCODE( 6, "invalid operand", invalid_op) +DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, "invalid TSS", invalid_TSS) +DO_ERROR(11, "segment not present", segment_not_present) +DO_ERROR(12, "stack segment", stack_segment) +DO_ERROR_NOCODE(16, "fpu error", coprocessor_error) +DO_ERROR(17, "alignment check", alignment_check) +DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error) + +asmlinkage int do_int3(struct xen_regs *regs) +{ + struct exec_domain *ed = current; + struct trap_bounce *tb = &ed->arch.trap_bounce; + trap_info_t *ti; + + DEBUGGER_trap_entry(TRAP_int3, regs); + + if ( !GUEST_MODE(regs) ) + { + DEBUGGER_trap_fatal(TRAP_int3, regs); + show_registers(regs); + panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id()); + } + + ti = current->arch.traps + 3; + tb->flags = TBF_EXCEPTION; + tb->cs = ti->cs; + tb->eip = ti->address; + if ( TI_GET_IF(ti) ) + ed->vcpu_info->evtchn_upcall_mask = 1; + + return 0; +} + +asmlinkage void do_machine_check(struct xen_regs *regs) +{ + fatal_trap(TRAP_machine_check, regs); +} + +void propagate_page_fault(unsigned long addr, u16 error_code) +{ + trap_info_t *ti; + struct exec_domain *ed = current; + struct trap_bounce *tb = &ed->arch.trap_bounce; + + ti = ed->arch.traps + 14; + tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE | TBF_EXCEPTION_CR2; + tb->cr2 = addr; + tb->error_code = error_code; + tb->cs = ti->cs; + tb->eip = ti->address; + if ( TI_GET_IF(ti) ) + ed->vcpu_info->evtchn_upcall_mask = 1; + + ed->arch.guest_cr2 = addr; +} + +asmlinkage int do_page_fault(struct xen_regs *regs) +{ + unsigned long off, addr, fixup; + struct exec_domain *ed = current; + struct domain *d = ed->domain; + int ret; + + __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : ); + + DEBUGGER_trap_entry(TRAP_page_fault, regs); + + //printk("do_page_fault(eip=%p, va=%p, code=%d)\n", regs->eip, addr, regs->error_code); + + 
perfc_incrc(page_faults); + + if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) + { + LOCK_BIGLOCK(d); + if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) && + unlikely((addr >> L2_PAGETABLE_SHIFT) == + d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) ) + { + ptwr_flush(d, PTWR_PT_ACTIVE); + UNLOCK_BIGLOCK(d); + return EXCRET_fault_fixed; + } + + if ( (addr < PAGE_OFFSET) && + ((regs->error_code & 3) == 3) && /* write-protection fault */ + ptwr_do_page_fault(d, addr) ) + { + if ( unlikely(shadow_mode_enabled(d)) ) + (void)shadow_fault(addr, regs); + UNLOCK_BIGLOCK(d); + return EXCRET_fault_fixed; + } + UNLOCK_BIGLOCK(d); + } + + if ( unlikely(shadow_mode_enabled(d)) && + ((addr < HYPERVISOR_VIRT_START) || + (shadow_mode_external(d) && GUEST_CONTEXT(ed, regs))) && + shadow_fault(addr, regs) ) + { + return EXCRET_fault_fixed; + } + + if ( unlikely(addr >= LDT_VIRT_START(ed)) && + (addr < (LDT_VIRT_START(ed) + (ed->arch.ldt_ents*LDT_ENTRY_SIZE))) ) + { + /* + * Copy a mapping from the guest's LDT, if it is valid. Otherwise we + * send the fault up to the guest OS to be handled. + */ + extern int map_ldt_shadow_page(unsigned int); + LOCK_BIGLOCK(d); + off = addr - LDT_VIRT_START(ed); + addr = ed->arch.ldt_base + off; + ret = map_ldt_shadow_page(off >> PAGE_SHIFT); + UNLOCK_BIGLOCK(d); + if ( likely(ret) ) + return EXCRET_fault_fixed; /* successfully copied the mapping */ + } + + if ( !GUEST_MODE(regs) ) + goto xen_fault; + +#ifndef NDEBUG + if ( (ed->arch.traps[TRAP_page_fault].address == 0) && (d->id == 0) ) + goto xen_fault; +#endif + + propagate_page_fault(addr, regs->error_code); + return 0; + + xen_fault: + + if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) + { + perfc_incrc(copy_user_faults); + if ( !shadow_mode_enabled(d) ) + DPRINTK("Page fault: %p -> %p\n", regs->eip, fixup); + regs->eip = fixup; + return 0; + } + + DEBUGGER_trap_fatal(TRAP_page_fault, regs); + + show_registers(regs); + show_page_walk(addr); + panic("CPU%d FATAL PAGE FAULT\n" + "[error_code=%04x]\n" + "Faulting linear address might be %p\n", + smp_processor_id(), regs->error_code, addr); + return 0; +} + +long do_fpu_taskswitch(int set) +{ + struct exec_domain *ed = current; + + if ( set ) + { + set_bit(EDF_GUEST_STTS, &ed->ed_flags); + stts(); + } + else + { + clear_bit(EDF_GUEST_STTS, &ed->ed_flags); + if ( test_bit(EDF_USEDFPU, &ed->ed_flags) ) + clts(); + } + + return 0; +} + +/* Has the guest requested sufficient permission for this I/O access? */ +static inline int guest_io_okay( + unsigned int port, unsigned int bytes, + struct exec_domain *ed, struct xen_regs *regs) +{ + u16 x; +#if defined(__x86_64__) + /* If in user mode, switch to kernel mode just to read I/O bitmap. */ + extern void toggle_guest_mode(struct exec_domain *); + int user_mode = !(ed->arch.flags & TF_kernel_mode); +#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(ed) +#elif defined(__i386__) +#define TOGGLE_MODE() ((void)0) +#endif + + if ( ed->arch.iopl >= (KERNEL_MODE(ed, regs) ? 1 : 3) ) + return 1; + + if ( ed->arch.iobmp_limit > (port + bytes) ) + { + TOGGLE_MODE(); + __get_user(x, (u16 *)(ed->arch.iobmp+(port>>3))); + TOGGLE_MODE(); + if ( (x & (((1<domain; + u16 x; + + if ( d->arch.iobmp_mask != NULL ) + { + x = *(u16 *)(d->arch.iobmp_mask + (port >> 3)); + if ( (x & (((1<eip; + u8 opcode, modrm_reg = 0, rep_prefix = 0; + unsigned int port, i, op_bytes = 4, data; + + /* Legacy prefixes. 
*/ + for ( i = 0; i < 8; i++ ) + { + switch ( opcode = insn_fetch(u8, 1, eip) ) + { + case 0x66: /* operand-size override */ + op_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x67: /* address-size override */ + case 0x2e: /* CS override */ + case 0x3e: /* DS override */ + case 0x26: /* ES override */ + case 0x64: /* FS override */ + case 0x65: /* GS override */ + case 0x36: /* SS override */ + case 0xf0: /* LOCK */ + case 0xf2: /* REPNE/REPNZ */ + break; + case 0xf3: /* REP/REPE/REPZ */ + rep_prefix = 1; + break; + default: + goto done_prefixes; + } + } + done_prefixes: + +#ifdef __x86_64__ + /* REX prefix. */ + if ( (opcode & 0xf0) == 0x40 ) + { + modrm_reg = (opcode & 4) << 1; /* REX.R */ + /* REX.W, REX.B and REX.X do not need to be decoded. */ + opcode = insn_fetch(u8, 1, eip); + } +#endif + + /* Input/Output String instructions. */ + if ( (opcode >= 0x6c) && (opcode <= 0x6f) ) + { + if ( rep_prefix && (regs->ecx == 0) ) + goto done; + + continue_io_string: + switch ( opcode ) + { + case 0x6c: /* INSB */ + op_bytes = 1; + case 0x6d: /* INSW/INSL */ + if ( !guest_io_okay((u16)regs->edx, op_bytes, ed, regs) ) + goto fail; + switch ( op_bytes ) + { + case 1: + data = (u8)inb_user((u16)regs->edx, ed, regs); + if ( put_user((u8)data, (u8 *)regs->edi) ) + goto write_fault; + break; + case 2: + data = (u16)inw_user((u16)regs->edx, ed, regs); + if ( put_user((u16)data, (u16 *)regs->edi) ) + goto write_fault; + break; + case 4: + data = (u32)inl_user((u16)regs->edx, ed, regs); + if ( put_user((u32)data, (u32 *)regs->edi) ) + goto write_fault; + break; + } + regs->edi += (regs->eflags & EF_DF) ? -op_bytes : op_bytes; + break; + + case 0x6e: /* OUTSB */ + op_bytes = 1; + case 0x6f: /* OUTSW/OUTSL */ + if ( !guest_io_okay((u16)regs->edx, op_bytes, ed, regs) ) + goto fail; + switch ( op_bytes ) + { + case 1: + if ( get_user(data, (u8 *)regs->esi) ) + goto read_fault; + outb_user((u8)data, (u16)regs->edx, ed, regs); + break; + case 2: + if ( get_user(data, (u16 *)regs->esi) ) + goto read_fault; + outw_user((u16)data, (u16)regs->edx, ed, regs); + break; + case 4: + if ( get_user(data, (u32 *)regs->esi) ) + goto read_fault; + outl_user((u32)data, (u16)regs->edx, ed, regs); + break; + } + regs->esi += (regs->eflags & EF_DF) ? -op_bytes : op_bytes; + break; + } + + if ( rep_prefix && (--regs->ecx != 0) ) + { + if ( !hypercall_preempt_check() ) + goto continue_io_string; + eip = regs->eip; + } + + goto done; + } + + /* I/O Port and Interrupt Flag instructions. 
*/ + switch ( opcode ) + { + case 0xe4: /* IN imm8,%al */ + op_bytes = 1; + case 0xe5: /* IN imm8,%eax */ + port = insn_fetch(u8, 1, eip); + exec_in: + if ( !guest_io_okay(port, op_bytes, ed, regs) ) + goto fail; + switch ( op_bytes ) + { + case 1: + regs->eax &= ~0xffUL; + regs->eax |= (u8)inb_user(port, ed, regs); + break; + case 2: + regs->eax &= ~0xffffUL; + regs->eax |= (u16)inw_user(port, ed, regs); + break; + case 4: + regs->eax = (u32)inl_user(port, ed, regs); + break; + } + goto done; + + case 0xec: /* IN %dx,%al */ + op_bytes = 1; + case 0xed: /* IN %dx,%eax */ + port = (u16)regs->edx; + goto exec_in; + + case 0xe6: /* OUT %al,imm8 */ + op_bytes = 1; + case 0xe7: /* OUT %eax,imm8 */ + port = insn_fetch(u8, 1, eip); + exec_out: + if ( !guest_io_okay(port, op_bytes, ed, regs) ) + goto fail; + switch ( op_bytes ) + { + case 1: + outb_user((u8)regs->eax, port, ed, regs); + break; + case 2: + outw_user((u16)regs->eax, port, ed, regs); + break; + case 4: + outl_user((u32)regs->eax, port, ed, regs); + break; + } + goto done; + + case 0xee: /* OUT %al,%dx */ + op_bytes = 1; + case 0xef: /* OUT %eax,%dx */ + port = (u16)regs->edx; + goto exec_out; + + case 0xfa: /* CLI */ + case 0xfb: /* STI */ + if ( ed->arch.iopl < (KERNEL_MODE(ed, regs) ? 1 : 3) ) + goto fail; + /* + * This is just too dangerous to allow, in my opinion. Consider if the + * caller then tries to reenable interrupts using POPF: we can't trap + * that and we'll end up with hard-to-debug lockups. Fast & loose will + * do for us. :-) + */ + /*ed->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/ + goto done; + + case 0x0f: /* Two-byte opcode */ + break; + + default: + goto fail; + } + + /* Remaining instructions only emulated from guest kernel. */ + if ( !KERNEL_MODE(ed, regs) ) + goto fail; + + /* Privileged (ring 0) instructions. */ + opcode = insn_fetch(u8, 1, eip); + switch ( opcode ) + { + case 0x06: /* CLTS */ + (void)do_fpu_taskswitch(0); + break; + + case 0x09: /* WBINVD */ + /* Ignore the instruction if unprivileged. */ + if ( !IS_CAPABLE_PHYSDEV(ed->domain) ) + DPRINTK("Non-physdev domain attempted WBINVD.\n"); + else + wbinvd(); + break; + + case 0x20: /* MOV CR?, */ + opcode = insn_fetch(u8, 1, eip); + if ( (opcode & 0xc0) != 0xc0 ) + goto fail; + modrm_reg |= opcode & 7; + reg = decode_register(modrm_reg, regs, 0); + switch ( (opcode >> 3) & 7 ) + { + case 0: /* Read CR0 */ + *reg = + (read_cr0() & ~X86_CR0_TS) | + (test_bit(EDF_GUEST_STTS, &ed->ed_flags) ? X86_CR0_TS : 0); + break; + + case 2: /* Read CR2 */ + *reg = ed->arch.guest_cr2; + break; + + case 3: /* Read CR3 */ + *reg = pagetable_val(ed->arch.guest_table); + break; + + default: + goto fail; + } + break; + + case 0x22: /* MOV ,CR? */ + opcode = insn_fetch(u8, 1, eip); + if ( (opcode & 0xc0) != 0xc0 ) + goto fail; + modrm_reg |= opcode & 7; + reg = decode_register(modrm_reg, regs, 0); + switch ( (opcode >> 3) & 7 ) + { + case 0: /* Write CR0 */ + (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS)); + break; + + case 2: /* Write CR2 */ + ed->arch.guest_cr2 = *reg; + break; + + case 3: /* Write CR3 */ + LOCK_BIGLOCK(ed->domain); + (void)new_guest_cr3(*reg); + UNLOCK_BIGLOCK(ed->domain); + break; + + default: + goto fail; + } + break; + + case 0x30: /* WRMSR */ + /* Ignore the instruction if unprivileged. 
*/ + if ( !IS_PRIV(ed->domain) ) + DPRINTK("Non-priv domain attempted WRMSR(%p,%08lx,%08lx).\n", + regs->ecx, (long)regs->eax, (long)regs->edx); + else if ( wrmsr_user(regs->ecx, regs->eax, regs->edx) ) + goto fail; + break; + + case 0x32: /* RDMSR */ + if ( !IS_PRIV(ed->domain) ) + DPRINTK("Non-priv domain attempted RDMSR(%p,%08lx,%08lx).\n", + regs->ecx, (long)regs->eax, (long)regs->edx); + /* Everyone can read the MSR space. */ + if ( rdmsr_user(regs->ecx, regs->eax, regs->edx) ) + goto fail; + break; + + default: + goto fail; + } + + done: + regs->eip = eip; + return EXCRET_fault_fixed; + + fail: + return 0; + + read_fault: + propagate_page_fault(eip, 4); /* user mode, read fault */ + return EXCRET_fault_fixed; + + write_fault: + propagate_page_fault(eip, 6); /* user mode, write fault */ + return EXCRET_fault_fixed; +} + +asmlinkage int do_general_protection(struct xen_regs *regs) +{ + struct exec_domain *ed = current; + struct trap_bounce *tb = &ed->arch.trap_bounce; + trap_info_t *ti; + unsigned long fixup; + + DEBUGGER_trap_entry(TRAP_gp_fault, regs); + + if ( regs->error_code & 1 ) + goto hardware_gp; + + if ( !GUEST_MODE(regs) ) + goto gp_in_kernel; + + /* + * Cunning trick to allow arbitrary "INT n" handling. + * + * We set DPL == 0 on all vectors in the IDT. This prevents any INT + * instruction from trapping to the appropriate vector, when that might not + * be expected by Xen or the guest OS. For example, that entry might be for + * a fault handler (unlike traps, faults don't increment EIP), or might + * expect an error code on the stack (which a software trap never + * provides), or might be a hardware interrupt handler that doesn't like + * being called spuriously. + * + * Instead, a GPF occurs with the faulting IDT vector in the error code. + * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is + * clear to indicate that it's a software fault, not hardware. + * + * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is + * okay because they can only be triggered by an explicit DPL-checked + * instruction. The DPL specified by the guest OS for these vectors is NOT + * CHECKED!! + */ + if ( (regs->error_code & 3) == 2 ) + { + /* This fault must be due to instruction. */ + ti = current->arch.traps + (regs->error_code>>3); + if ( PERMIT_SOFTINT(TI_GET_DPL(ti), ed, regs) ) + { + tb->flags = TBF_EXCEPTION; + regs->eip += 2; + goto finish_propagation; + } + } + + /* Emulate some simple privileged and I/O instructions. */ + if ( (regs->error_code == 0) && + emulate_privileged_op(regs) ) + return 0; + +#if defined(__i386__) + if ( VM_ASSIST(ed->domain, VMASST_TYPE_4gb_segments) && + (regs->error_code == 0) && + gpf_emulate_4gb(regs) ) + return 0; +#endif + +#ifndef NDEBUG + if ( (ed->arch.traps[TRAP_gp_fault].address == 0) && + (ed->domain->id == 0) ) + goto gp_in_kernel; +#endif + + /* Pass on GPF as is. 
*/ + ti = current->arch.traps + 13; + tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; + tb->error_code = regs->error_code; + finish_propagation: + tb->cs = ti->cs; + tb->eip = ti->address; + if ( TI_GET_IF(ti) ) + ed->vcpu_info->evtchn_upcall_mask = 1; + return 0; + + gp_in_kernel: + + if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) + { + DPRINTK("GPF (%04x): %p -> %p\n", + regs->error_code, regs->eip, fixup); + regs->eip = fixup; + return 0; + } + + DEBUGGER_trap_fatal(TRAP_gp_fault, regs); + + hardware_gp: + show_registers(regs); + panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n", + smp_processor_id(), regs->error_code); + return 0; +} + +unsigned long nmi_softirq_reason; +static void nmi_softirq(void) +{ + if ( dom0 == NULL ) + return; + + if ( test_and_clear_bit(0, &nmi_softirq_reason) ) + send_guest_virq(dom0->exec_domain[0], VIRQ_PARITY_ERR); + + if ( test_and_clear_bit(1, &nmi_softirq_reason) ) + send_guest_virq(dom0->exec_domain[0], VIRQ_IO_ERR); +} + +asmlinkage void mem_parity_error(struct xen_regs *regs) +{ + /* Clear and disable the parity-error line. */ + outb((inb(0x61)&15)|4,0x61); + + switch ( opt_nmi[0] ) + { + case 'd': /* 'dom0' */ + set_bit(0, &nmi_softirq_reason); + raise_softirq(NMI_SOFTIRQ); + case 'i': /* 'ignore' */ + break; + default: /* 'fatal' */ + console_force_unlock(); + printk("\n\nNMI - MEMORY ERROR\n"); + fatal_trap(TRAP_nmi, regs); + } +} + +asmlinkage void io_check_error(struct xen_regs *regs) +{ + /* Clear and disable the I/O-error line. */ + outb((inb(0x61)&15)|8,0x61); + + switch ( opt_nmi[0] ) + { + case 'd': /* 'dom0' */ + set_bit(0, &nmi_softirq_reason); + raise_softirq(NMI_SOFTIRQ); + case 'i': /* 'ignore' */ + break; + default: /* 'fatal' */ + console_force_unlock(); + printk("\n\nNMI - I/O ERROR\n"); + fatal_trap(TRAP_nmi, regs); + } +} + +static void unknown_nmi_error(unsigned char reason) +{ + printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); + printk("Dazed and confused, but trying to continue\n"); + printk("Do you have a strange power saving mode enabled?\n"); +} + +asmlinkage void do_nmi(struct xen_regs *regs, unsigned long reason) +{ + ++nmi_count(smp_processor_id()); + + if ( nmi_watchdog ) + nmi_watchdog_tick(regs); + + if ( reason & 0x80 ) + mem_parity_error(regs); + else if ( reason & 0x40 ) + io_check_error(regs); + else if ( !nmi_watchdog ) + unknown_nmi_error((unsigned char)(reason&0xff)); +} + +asmlinkage int math_state_restore(struct xen_regs *regs) +{ + /* Prevent recursion. 
*/
+    clts();
+
+    if ( !test_bit(EDF_USEDFPU, &current->ed_flags) )
+    {
+        if ( test_bit(EDF_DONEFPUINIT, &current->ed_flags) )
+            restore_fpu(current);
+        else
+            init_fpu();
+        set_bit(EDF_USEDFPU, &current->ed_flags); /* so we fnsave on switch_to() */
+    }
+
+    if ( test_and_clear_bit(EDF_GUEST_STTS, &current->ed_flags) )
+    {
+        struct trap_bounce *tb = &current->arch.trap_bounce;
+        tb->flags = TBF_EXCEPTION;
+        tb->cs = current->arch.traps[7].cs;
+        tb->eip = current->arch.traps[7].address;
+    }
+
+    return EXCRET_fault_fixed;
+}
+
+asmlinkage int do_debug(struct xen_regs *regs)
+{
+    unsigned long condition;
+    struct exec_domain *ed = current;
+    struct trap_bounce *tb = &ed->arch.trap_bounce;
+
+    __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
+
+    /* Mask out spurious debug traps due to lazy DR7 setting */
+    if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
+         (ed->arch.debugreg[7] == 0) )
+    {
+        __asm__("mov %0,%%db7" : : "r" (0UL));
+        goto out;
+    }
+
+    DEBUGGER_trap_entry(TRAP_debug, regs);
+
+    if ( !GUEST_MODE(regs) )
+    {
+        /* Clear TF just for absolute sanity. */
+        regs->eflags &= ~EF_TF;
+        /*
+         * We ignore watchpoints when they trigger within Xen. This may happen
+         * when a buffer is passed to us which previously had a watchpoint set
+         * on it. No need to bump EIP; the only faulting trap is an instruction
+         * breakpoint, which can't happen to us.
+         */
+        goto out;
+    }
+
+    /* Save debug status register where guest OS can peek at it */
+    ed->arch.debugreg[6] = condition;
+
+    tb->flags = TBF_EXCEPTION;
+    tb->cs = ed->arch.traps[1].cs;
+    tb->eip = ed->arch.traps[1].address;
+
+ out:
+    return EXCRET_not_a_fault;
+}
+
+asmlinkage int do_spurious_interrupt_bug(struct xen_regs *regs)
+{
+    return EXCRET_not_a_fault;
+}
+
+void set_intr_gate(unsigned int n, void *addr)
+{
+#ifdef __i386__
+    int i;
+    /* Keep secondary tables in sync with IRQ updates. */
+    for ( i = 1; i < NR_CPUS; i++ )
+        if ( idt_tables[i] != NULL )
+            _set_gate(&idt_tables[i][n], 14, 0, addr);
+#endif
+    _set_gate(&idt_table[n], 14, 0, addr);
+}
+
+void set_system_gate(unsigned int n, void *addr)
+{
+    _set_gate(idt_table+n,14,3,addr);
+}
+
+void set_task_gate(unsigned int n, unsigned int sel)
+{
+    idt_table[n].a = sel << 16;
+    idt_table[n].b = 0x8500;
+}
+
+void set_tss_desc(unsigned int n, void *addr)
+{
+    _set_tssldt_desc(
+        gdt_table + __TSS(n),
+        (unsigned long)addr,
+        offsetof(struct tss_struct, __cacheline_filler) - 1,
+        9);
+}
+
+void __init trap_init(void)
+{
+    extern void percpu_traps_init(void);
+    extern void cpu_init(void);
+
+    /*
+     * Note that interrupt gates are always used, rather than trap gates. We
+     * must have interrupts disabled until DS/ES/FS/GS are saved because the
+     * first activation must have the "bad" value(s) for these registers and
+     * we may lose them if another activation is installed before they are
+     * saved. The page-fault handler also needs interrupts disabled until %cr2
+     * has been read and saved on the stack.
+     */
+    set_intr_gate(TRAP_divide_error,&divide_error);
+    set_intr_gate(TRAP_debug,&debug);
+    set_intr_gate(TRAP_nmi,&nmi);
+    set_system_gate(TRAP_int3,&int3);         /* usable from all privileges */
+    set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
+    set_intr_gate(TRAP_bounds,&bounds);
+    set_intr_gate(TRAP_invalid_op,&invalid_op);
+    set_intr_gate(TRAP_no_device,&device_not_available);
+    set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
+    set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
+    set_intr_gate(TRAP_no_segment,&segment_not_present);
+    set_intr_gate(TRAP_stack_error,&stack_segment);
+    set_intr_gate(TRAP_gp_fault,&general_protection);
+    set_intr_gate(TRAP_page_fault,&page_fault);
+    set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
+    set_intr_gate(TRAP_copro_error,&coprocessor_error);
+    set_intr_gate(TRAP_alignment_check,&alignment_check);
+    set_intr_gate(TRAP_machine_check,&machine_check);
+    set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
+
+    percpu_traps_init();
+
+    cpu_init();
+
+    open_softirq(NMI_SOFTIRQ, nmi_softirq);
+}
+
+
+long do_set_trap_table(trap_info_t *traps)
+{
+    trap_info_t cur;
+    trap_info_t *dst = current->arch.traps;
+    long rc = 0;
+
+    LOCK_BIGLOCK(current->domain);
+
+    for ( ; ; )
+    {
+        if ( hypercall_preempt_check() )
+        {
+            rc = hypercall1_create_continuation(
+                __HYPERVISOR_set_trap_table, traps);
+            break;
+        }
+
+        if ( copy_from_user(&cur, traps, sizeof(cur)) )
+        {
+            rc = -EFAULT;
+            break;
+        }
+
+        if ( cur.address == 0 )
+            break;
+
+        if ( !VALID_CODESEL(cur.cs) )
+        {
+            rc = -EPERM;
+            break;
+        }
+
+        memcpy(dst+cur.vector, &cur, sizeof(cur));
+        traps++;
+    }
+
+    UNLOCK_BIGLOCK(current->domain);
+
+    return rc;
+}
+
+
+#if defined(__i386__)
+#define DB_VALID_ADDR(_a) \
+    ((_a) <= (PAGE_OFFSET - 4))
+#elif defined(__x86_64__)
+#define DB_VALID_ADDR(_a) \
+    ((_a) >= HYPERVISOR_VIRT_END) || ((_a) <= (HYPERVISOR_VIRT_START-8))
+#endif
+long set_debugreg(struct exec_domain *p, int reg, unsigned long value)
+{
+    int i;
+
+    switch ( reg )
+    {
+    case 0:
+        if ( !DB_VALID_ADDR(value) ) return -EPERM;
+        if ( p == current )
+            __asm__ ( "mov %0, %%db0" : : "r" (value) );
+        break;
+    case 1:
+        if ( !DB_VALID_ADDR(value) ) return -EPERM;
+        if ( p == current )
+            __asm__ ( "mov %0, %%db1" : : "r" (value) );
+        break;
+    case 2:
+        if ( !DB_VALID_ADDR(value) ) return -EPERM;
+        if ( p == current )
+            __asm__ ( "mov %0, %%db2" : : "r" (value) );
+        break;
+    case 3:
+        if ( !DB_VALID_ADDR(value) ) return -EPERM;
+        if ( p == current )
+            __asm__ ( "mov %0, %%db3" : : "r" (value) );
+        break;
+    case 6:
+        /*
+         * DR6: Bits 4-11,16-31 reserved (set to 1).
+         *      Bit 12 reserved (set to 0).
+         */
+        value &= 0xffffefff; /* reserved bits => 0 */
+        value |= 0xffff0ff0; /* reserved bits => 1 */
+        if ( p == current )
+            __asm__ ( "mov %0, %%db6" : : "r" (value) );
+        break;
+    case 7:
+        /*
+         * DR7: Bit 10 reserved (set to 1).
+         *      Bits 11-12,14-15 reserved (set to 0).
+         * Privileged bits:
+         *      GD (bit 13): must be 0.
+         *      R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
+         *      LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
+         */
+        /* DR7 == 0 => debugging disabled for this domain.
*/ + if ( value != 0 ) + { + value &= 0xffff27ff; /* reserved bits => 0 */ + value |= 0x00000400; /* reserved bits => 1 */ + if ( (value & (1<<13)) != 0 ) return -EPERM; + for ( i = 0; i < 16; i += 2 ) + if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM; + } + if ( p == current ) + __asm__ ( "mov %0, %%db7" : : "r" (value) ); + break; + default: + return -EINVAL; + } + + p->arch.debugreg[reg] = value; + return 0; +} + +long do_set_debugreg(int reg, unsigned long value) +{ + return set_debugreg(current, reg, value); +} + +unsigned long do_get_debugreg(int reg) +{ + if ( (reg < 0) || (reg > 7) ) return -EINVAL; + return current->arch.debugreg[reg]; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -Naurp ../xeno-unstable.bk/xen/arch/x86/x86_32/entry.S xen/arch/x86/x86_32/entry.S --- ../xeno-unstable.bk/xen/arch/x86/x86_32/entry.S 2005-04-15 08:27:45.000000000 -0500 +++ xen/arch/x86/x86_32/entry.S 2005-04-15 08:40:37.000000000 -0500 @@ -3,6 +3,10 @@ * * Copyright (c) 2002-2004, K A Fraser * Copyright (c) 1991, 1992 Linus Torvalds + * + * Modified by Aravind Menon for supporting oprofile + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. * * Calling back to a guest OS: * =========================== @@ -563,10 +567,10 @@ ENTRY(nmi) jnz do_watchdog_tick movl %ds,%eax cmpw $(__HYPERVISOR_DS),%ax - jne defer_nmi + jne force_nmi movl %es,%eax cmpw $(__HYPERVISOR_DS),%ax - jne defer_nmi + jne force_nmi do_watchdog_tick: movl $(__HYPERVISOR_DS),%edx @@ -579,6 +583,32 @@ do_watchdog_tick: addl $8,%esp jmp ret_from_intr +force_nmi: + movl %ds,-4(%esp) + movl %es,-8(%esp) + movl %fs,-12(%esp) + movl %gs,-16(%esp) + subl $16,%esp + + movl $(__HYPERVISOR_DS),%edx + movl %edx,%ds + movl %edx,%es + + movl %esp,%edx + addl $16,%edx + pushl %ebx + pushl %edx + call SYMBOL_NAME(do_nmi) + addl $8,%esp + + addl $16,%esp + movl -4(%esp),%ds + movl -8(%esp),%es + movl -12(%esp),%fs + movl -16(%esp),%gs + + jmp restore_all_xen + defer_nmi: movl $FIXMAP_apic_base,%eax # apic_wait_icr_idle() @@ -739,6 +769,7 @@ ENTRY(hypercall_table) .long SYMBOL_NAME(do_boot_vcpu) .long SYMBOL_NAME(do_ni_hypercall) /* 25 */ .long SYMBOL_NAME(do_mmuext_op) + .long SYMBOL_NAME(do_pmc_op) .rept NR_hypercalls-((.-hypercall_table)/4) .long SYMBOL_NAME(do_ni_hypercall) .endr diff -Naurp ../xeno-unstable.bk/xen/arch/x86/x86_32/entry.S.orig xen/arch/x86/x86_32/entry.S.orig --- ../xeno-unstable.bk/xen/arch/x86/x86_32/entry.S.orig 1969-12-31 18:00:00.000000000 -0600 +++ xen/arch/x86/x86_32/entry.S.orig 2005-04-15 08:33:16.000000000 -0500 @@ -0,0 +1,744 @@ +/* + * Hypercall and fault low-level handling routines. + * + * Copyright (c) 2002-2004, K A Fraser + * Copyright (c) 1991, 1992 Linus Torvalds + * + * Calling back to a guest OS: + * =========================== + * + * First, we require that all callbacks (either via a supplied + * interrupt-descriptor-table, or via the special event or failsafe callbacks + * in the shared-info-structure) are to ring 1. This just makes life easier, + * in that it means we don't have to do messy GDT/LDT lookups to find + * out which the privilege-level of the return code-selector. That code + * would just be a hassle to write, and would need to account for running + * off the end of the GDT/LDT, for example. For all callbacks we check + * that the provided return CS is not == __HYPERVISOR_{CS,DS}. 
Apart from that + * we're safe as don't allow a guest OS to install ring-0 privileges into the + * GDT/LDT. It's up to the guest OS to ensure all returns via the IDT are to + * ring 1. If not, we load incorrect SS/ESP values from the TSS (for ring 1 + * rather than the correct ring) and bad things are bound to ensue -- IRET is + * likely to fault, and we may end up killing the domain (no harm can + * come to Xen, though). + * + * When doing a callback, we check if the return CS is in ring 0. If so, + * callback is delayed until next return to ring != 0. + * If return CS is in ring 1, then we create a callback frame + * starting at return SS/ESP. The base of the frame does an intra-privilege + * interrupt-return. + * If return CS is in ring > 1, we create a callback frame starting + * at SS/ESP taken from appropriate section of the current TSS. The base + * of the frame does an inter-privilege interrupt-return. + * + * Note that the "failsafe callback" uses a special stackframe: + * { return_DS, return_ES, return_FS, return_GS, return_EIP, + * return_CS, return_EFLAGS[, return_ESP, return_SS] } + * That is, original values for DS/ES/FS/GS are placed on stack rather than + * in DS/ES/FS/GS themselves. Why? It saves us loading them, only to have them + * saved/restored in guest OS. Furthermore, if we load them we may cause + * a fault if they are invalid, which is a hassle to deal with. We avoid + * that problem if we don't load them :-) This property allows us to use + * the failsafe callback as a fallback: if we ever fault on loading DS/ES/FS/GS + * on return to ring != 0, we can simply package it up as a return via + * the failsafe callback, and let the guest OS sort it out (perhaps by + * killing an application process). Note that we also do this for any + * faulting IRET -- just let the guest OS handle it via the event + * callback. + * + * We terminate a domain in the following cases: + * - creating a callback stack frame (due to bad ring-1 stack). + * - faulting IRET on entry to failsafe callback handler. + * So, each domain must keep its ring-1 %ss/%esp and failsafe callback + * handler in good order (absolutely no faults allowed!). + */ + +#include +#include +#include +#include +#include +#include +#include + +#define GET_CURRENT(reg) \ + movl $STACK_SIZE-4, reg; \ + orl %esp, reg; \ + andl $~3,reg; \ + movl (reg),reg; + +#ifdef CONFIG_VMX +/* + * At VMExit time the processor saves the guest selectors, esp, eip, + * and eflags. Therefore we don't save them, but simply decrement + * the kernel stack pointer to make it consistent with the stack frame + * at usual interruption time. The eflags of the host is not saved by VMX, + * and we set it to the fixed value. + * + * We also need the room, especially because orig_eax field is used + * by do_IRQ(). Compared the xen_regs, we skip pushing for the following: + * (10) u32 gs; + * (9) u32 fs; + * (8) u32 ds; + * (7) u32 es; + * <- get_stack_bottom() (= HOST_ESP) + * (6) u32 ss; + * (5) u32 esp; + * (4) u32 eflags; + * (3) u32 cs; + * (2) u32 eip; + * (2/1) u16 entry_vector; + * (1/1) u16 error_code; + * However, get_stack_bottom() actually returns 20 bytes before the real + * bottom of the stack to allow space for: + * domain pointer, DS, ES, FS, GS. Therefore, we effectively skip 6 registers. 
+ */ +#define VMX_MONITOR_EFLAGS 0x202 /* IF on */ +#define NR_SKIPPED_REGS 6 /* See the above explanation */ +#define VMX_SAVE_ALL_NOSEGREGS \ + pushl $VMX_MONITOR_EFLAGS; \ + popf; \ + subl $(NR_SKIPPED_REGS*4), %esp; \ + movl $0, 0xc(%esp); /* eflags==0 identifies xen_regs as VMX guest */ \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; + +ENTRY(vmx_asm_vmexit_handler) + /* selectors are restored/saved by VMX */ + VMX_SAVE_ALL_NOSEGREGS + call SYMBOL_NAME(vmx_vmexit_handler) + jmp vmx_asm_do_resume + +ENTRY(vmx_asm_do_launch) + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + addl $(NR_SKIPPED_REGS*4), %esp + /* VMLUANCH */ + .byte 0x0f,0x01,0xc2 + pushf + call SYMBOL_NAME(vm_launch_fail) + hlt + + ALIGN + +ENTRY(vmx_asm_do_resume) +vmx_test_all_events: + GET_CURRENT(%ebx) +/*test_all_events:*/ + xorl %ecx,%ecx + notl %ecx + cli # tests must not race interrupts +/*test_softirqs:*/ + movl EDOMAIN_processor(%ebx),%eax + shl $6,%eax # sizeof(irq_cpustat) == 64 + test %ecx,SYMBOL_NAME(irq_stat)(%eax,1) + jnz vmx_process_softirqs + +vmx_restore_all_guest: + call SYMBOL_NAME(load_cr2) + /* + * Check if we are going back to VMX-based VM + * By this time, all the setups in the VMCS must be complete. + */ + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + addl $(NR_SKIPPED_REGS*4), %esp + /* VMRESUME */ + .byte 0x0f,0x01,0xc3 + pushf + call SYMBOL_NAME(vm_resume_fail) + /* Should never reach here */ + hlt + + ALIGN +vmx_process_softirqs: + sti + call SYMBOL_NAME(do_softirq) + jmp vmx_test_all_events +#endif + + ALIGN +restore_all_guest: + testl $X86_EFLAGS_VM,XREGS_eflags(%esp) + jnz restore_all_vm86 +FLT1: movl XREGS_ds(%esp),%ds +FLT2: movl XREGS_es(%esp),%es +FLT3: movl XREGS_fs(%esp),%fs +FLT4: movl XREGS_gs(%esp),%gs +restore_all_vm86: + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + addl $4,%esp +FLT5: iret +.section .fixup,"ax" +FIX5: subl $28,%esp + pushl 28(%esp) # error_code/entry_vector + movl %eax,XREGS_eax+4(%esp) + movl %ebp,XREGS_ebp+4(%esp) + movl %edi,XREGS_edi+4(%esp) + movl %esi,XREGS_esi+4(%esp) + movl %edx,XREGS_edx+4(%esp) + movl %ecx,XREGS_ecx+4(%esp) + movl %ebx,XREGS_ebx+4(%esp) +FIX1: SET_XEN_SEGMENTS(a) + movl %eax,%fs + movl %eax,%gs + sti + popl %esi + pushfl # EFLAGS + movl $__HYPERVISOR_CS,%eax + pushl %eax # CS + movl $DBLFLT1,%eax + pushl %eax # EIP + pushl %esi # error_code/entry_vector + jmp error_code +DBLFLT1:GET_CURRENT(%ebx) + jmp test_all_events +failsafe_callback: + GET_CURRENT(%ebx) + leal EDOMAIN_trap_bounce(%ebx),%edx + movl EDOMAIN_failsafe_addr(%ebx),%eax + movl %eax,TRAPBOUNCE_eip(%edx) + movl EDOMAIN_failsafe_sel(%ebx),%eax + movw %ax,TRAPBOUNCE_cs(%edx) + movw $TBF_FAILSAFE,TRAPBOUNCE_flags(%edx) + call create_bounce_frame + xorl %eax,%eax + movl %eax,XREGS_ds(%esp) + movl %eax,XREGS_es(%esp) + movl %eax,XREGS_fs(%esp) + movl %eax,XREGS_gs(%esp) + jmp test_all_events +.previous +.section __pre_ex_table,"a" + .long FLT1,FIX1 + .long FLT2,FIX1 + .long FLT3,FIX1 + .long FLT4,FIX1 + .long FLT5,FIX5 +.previous +.section __ex_table,"a" + .long DBLFLT1,failsafe_callback +.previous + + ALIGN +restore_all_xen: + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + addl $4,%esp + iret + + ALIGN +ENTRY(hypercall) + subl $4,%esp + SAVE_ALL(b) + sti + GET_CURRENT(%ebx) + andl $(NR_hypercalls-1),%eax + PERFC_INCR(PERFC_hypercalls, %eax) + call 
*SYMBOL_NAME(hypercall_table)(,%eax,4) + movl %eax,XREGS_eax(%esp) # save the return value + +test_all_events: + xorl %ecx,%ecx + notl %ecx + cli # tests must not race interrupts +/*test_softirqs:*/ + movl EDOMAIN_processor(%ebx),%eax + shl $6,%eax # sizeof(irq_cpustat) == 64 + test %ecx,SYMBOL_NAME(irq_stat)(%eax,1) + jnz process_softirqs +/*test_guest_events:*/ + movl EDOMAIN_vcpu_info(%ebx),%eax + testb $0xFF,VCPUINFO_upcall_mask(%eax) + jnz restore_all_guest + testb $0xFF,VCPUINFO_upcall_pending(%eax) + jz restore_all_guest +/*process_guest_events:*/ + sti + leal EDOMAIN_trap_bounce(%ebx),%edx + movl EDOMAIN_event_addr(%ebx),%eax + movl %eax,TRAPBOUNCE_eip(%edx) + movl EDOMAIN_event_sel(%ebx),%eax + movw %ax,TRAPBOUNCE_cs(%edx) + movw $TBF_INTERRUPT,TRAPBOUNCE_flags(%edx) + call create_bounce_frame + movl EDOMAIN_vcpu_info(%ebx),%eax + movb $1,VCPUINFO_upcall_mask(%eax) # Upcalls are masked during delivery + jmp test_all_events + + ALIGN +process_softirqs: + sti + call SYMBOL_NAME(do_softirq) + jmp test_all_events + +/* CREATE A BASIC EXCEPTION FRAME ON GUEST OS (RING-1) STACK: */ +/* {EIP, CS, EFLAGS, [ESP, SS]} */ +/* %edx == trap_bounce, %ebx == struct exec_domain */ +/* %eax,%ecx are clobbered. %gs:%esi contain new XREGS_ss/XREGS_esp. */ +create_bounce_frame: + movl XREGS_eflags+4(%esp),%ecx + movb XREGS_cs+4(%esp),%cl + testl $(2|X86_EFLAGS_VM),%ecx + jz ring1 /* jump if returning to an existing ring-1 activation */ + movl EDOMAIN_kernel_sp(%ebx),%esi +FLT6: movl EDOMAIN_kernel_ss(%ebx),%gs + testl $X86_EFLAGS_VM,XREGS_eflags+4(%esp) + jz nvm86_1 + subl $16,%esi /* push ES/DS/FS/GS (VM86 stack frame) */ + movl XREGS_es+4(%esp),%eax +FLT7: movl %eax,%gs:(%esi) + movl XREGS_ds+4(%esp),%eax +FLT8: movl %eax,%gs:4(%esi) + movl XREGS_fs+4(%esp),%eax +FLT9: movl %eax,%gs:8(%esi) + movl XREGS_gs+4(%esp),%eax +FLT10: movl %eax,%gs:12(%esi) +nvm86_1:subl $8,%esi /* push SS/ESP (inter-priv iret) */ + movl XREGS_esp+4(%esp),%eax +FLT11: movl %eax,%gs:(%esi) + movl XREGS_ss+4(%esp),%eax +FLT12: movl %eax,%gs:4(%esi) + jmp 1f +ring1: /* obtain ss/esp from oldss/oldesp -- a ring-1 activation exists */ + movl XREGS_esp+4(%esp),%esi +FLT13: movl XREGS_ss+4(%esp),%gs +1: /* Construct a stack frame: EFLAGS, CS/EIP */ + subl $12,%esi + movl XREGS_eip+4(%esp),%eax +FLT14: movl %eax,%gs:(%esi) + movl XREGS_cs+4(%esp),%eax +FLT15: movl %eax,%gs:4(%esi) + movl XREGS_eflags+4(%esp),%eax +FLT16: movl %eax,%gs:8(%esi) + movb TRAPBOUNCE_flags(%edx),%cl + test $TBF_EXCEPTION_ERRCODE,%cl + jz 1f + subl $4,%esi # push error_code onto guest frame + movl TRAPBOUNCE_error_code(%edx),%eax +FLT17: movl %eax,%gs:(%esi) + testb $TBF_EXCEPTION_CR2,%cl + jz 2f + subl $4,%esi # push %cr2 onto guest frame + movl TRAPBOUNCE_cr2(%edx),%eax +FLT18: movl %eax,%gs:(%esi) +1: testb $TBF_FAILSAFE,%cl + jz 2f + subl $16,%esi # add DS/ES/FS/GS to failsafe stack frame + testl $X86_EFLAGS_VM,XREGS_eflags+4(%esp) + jz nvm86_2 + xorl %eax,%eax # VM86: we write zero selector values +FLT19: movl %eax,%gs:(%esi) +FLT20: movl %eax,%gs:4(%esi) +FLT21: movl %eax,%gs:8(%esi) +FLT22: movl %eax,%gs:12(%esi) + jmp 2f +nvm86_2:movl XREGS_ds+4(%esp),%eax # non-VM86: write real selector values +FLT23: movl %eax,%gs:(%esi) + movl XREGS_es+4(%esp),%eax +FLT24: movl %eax,%gs:4(%esi) + movl XREGS_fs+4(%esp),%eax +FLT25: movl %eax,%gs:8(%esi) + movl XREGS_gs+4(%esp),%eax +FLT26: movl %eax,%gs:12(%esi) +2: testl $X86_EFLAGS_VM,XREGS_eflags+4(%esp) + jz nvm86_3 + xorl %eax,%eax /* zero DS-GS, just as a real CPU would */ + movl %eax,XREGS_ds+4(%esp) + 
movl %eax,XREGS_es+4(%esp) + movl %eax,XREGS_fs+4(%esp) + movl %eax,XREGS_gs+4(%esp) +nvm86_3:/* Rewrite our stack frame and return to ring 1. */ + /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */ + andl $0xfffcbeff,XREGS_eflags+4(%esp) + movl %gs,XREGS_ss+4(%esp) + movl %esi,XREGS_esp+4(%esp) + movzwl TRAPBOUNCE_cs(%edx),%eax + movl %eax,XREGS_cs+4(%esp) + movl TRAPBOUNCE_eip(%edx),%eax + movl %eax,XREGS_eip+4(%esp) + movb $0,TRAPBOUNCE_flags(%edx) + ret +.section __ex_table,"a" + .long FLT6,domain_crash_synchronous , FLT7,domain_crash_synchronous + .long FLT8,domain_crash_synchronous , FLT9,domain_crash_synchronous + .long FLT10,domain_crash_synchronous , FLT11,domain_crash_synchronous + .long FLT12,domain_crash_synchronous , FLT13,domain_crash_synchronous + .long FLT14,domain_crash_synchronous , FLT15,domain_crash_synchronous + .long FLT16,domain_crash_synchronous , FLT17,domain_crash_synchronous + .long FLT18,domain_crash_synchronous , FLT19,domain_crash_synchronous + .long FLT20,domain_crash_synchronous , FLT21,domain_crash_synchronous + .long FLT22,domain_crash_synchronous , FLT23,domain_crash_synchronous + .long FLT24,domain_crash_synchronous , FLT25,domain_crash_synchronous + .long FLT26,domain_crash_synchronous +.previous + + ALIGN +process_guest_exception_and_events: + leal EDOMAIN_trap_bounce(%ebx),%edx + testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%edx) + jz test_all_events + call create_bounce_frame + jmp test_all_events + + ALIGN +ENTRY(ret_from_intr) + GET_CURRENT(%ebx) + movl XREGS_eflags(%esp),%eax + movb XREGS_cs(%esp),%al + testl $(3|X86_EFLAGS_VM),%eax + jnz test_all_events + jmp restore_all_xen + +ENTRY(divide_error) + pushl $TRAP_divide_error<<16 + ALIGN +error_code: + SAVE_ALL_NOSEGREGS(a) + SET_XEN_SEGMENTS(a) + testb $X86_EFLAGS_IF>>8,XREGS_eflags+1(%esp) + jz exception_with_ints_disabled + sti # re-enable interrupts + xorl %eax,%eax + movw XREGS_entry_vector(%esp),%ax + movl %esp,%edx + pushl %edx # push the xen_regs pointer + GET_CURRENT(%ebx) + PERFC_INCR(PERFC_exceptions, %eax) + call *SYMBOL_NAME(exception_table)(,%eax,4) + addl $4,%esp + movl XREGS_eflags(%esp),%eax + movb XREGS_cs(%esp),%al + testl $(3|X86_EFLAGS_VM),%eax + jz restore_all_xen + jmp process_guest_exception_and_events + +exception_with_ints_disabled: + movl XREGS_eflags(%esp),%eax + movb XREGS_cs(%esp),%al + testl $(3|X86_EFLAGS_VM),%eax # interrupts disabled outside Xen? + jnz FATAL_exception_with_ints_disabled + pushl %esp + call search_pre_exception_table + addl $4,%esp + testl %eax,%eax # no fixup code for faulting EIP? 
+ jz FATAL_exception_with_ints_disabled + movl %eax,XREGS_eip(%esp) + movl %esp,%esi + subl $4,%esp + movl %esp,%edi + movl $XREGS_kernel_sizeof/4,%ecx + rep; movsl # make room for error_code/entry_vector + movl XREGS_error_code(%esp),%eax # error_code/entry_vector + movl %eax,XREGS_kernel_sizeof(%esp) + jmp restore_all_xen # return to fixup code + +FATAL_exception_with_ints_disabled: + xorl %esi,%esi + movw XREGS_entry_vector(%esp),%si + movl %esp,%edx + pushl %edx # push the xen_regs pointer + pushl %esi # push the trapnr (entry vector) + call SYMBOL_NAME(fatal_trap) + ud2 + +ENTRY(coprocessor_error) + pushl $TRAP_copro_error<<16 + jmp error_code + +ENTRY(simd_coprocessor_error) + pushl $TRAP_simd_error<<16 + jmp error_code + +ENTRY(device_not_available) + pushl $TRAP_no_device<<16 + jmp error_code + +ENTRY(debug) + pushl $TRAP_debug<<16 + jmp error_code + +ENTRY(int3) + pushl $TRAP_int3<<16 + jmp error_code + +ENTRY(overflow) + pushl $TRAP_overflow<<16 + jmp error_code + +ENTRY(bounds) + pushl $TRAP_bounds<<16 + jmp error_code + +ENTRY(invalid_op) + pushl $TRAP_invalid_op<<16 + jmp error_code + +ENTRY(coprocessor_segment_overrun) + pushl $TRAP_copro_seg<<16 + jmp error_code + +ENTRY(invalid_TSS) + movw $TRAP_invalid_tss,2(%esp) + jmp error_code + +ENTRY(segment_not_present) + movw $TRAP_no_segment,2(%esp) + jmp error_code + +ENTRY(stack_segment) + movw $TRAP_stack_error,2(%esp) + jmp error_code + +ENTRY(general_protection) + movw $TRAP_gp_fault,2(%esp) + jmp error_code + +ENTRY(alignment_check) + movw $TRAP_alignment_check,2(%esp) + jmp error_code + +ENTRY(page_fault) + movw $TRAP_page_fault,2(%esp) + jmp error_code + +ENTRY(machine_check) + pushl $TRAP_machine_check<<16 + jmp error_code + +ENTRY(spurious_interrupt_bug) + pushl $TRAP_spurious_int<<16 + jmp error_code + +ENTRY(nmi) + # Save state but do not trash the segment registers! + # We may otherwise be unable to reload them or copy them to ring 1. + pushl %eax + SAVE_ALL_NOSEGREGS(a) + + # Check for hardware problems. + inb $0x61,%al + testb $0x80,%al + jne nmi_parity_err + testb $0x40,%al + jne nmi_io_err + movl %eax,%ebx + + # Okay, its almost a normal NMI tick. We can only process it if: + # A. We are the outermost Xen activation (in which case we have + # the selectors safely saved on our stack) + # B. DS and ES contain sane Xen values. + # In all other cases we bail without touching DS-GS, as we have + # interrupted an enclosing Xen activation in tricky prologue or + # epilogue code. 
+ movl XREGS_eflags(%esp),%eax + movb XREGS_cs(%esp),%al + testl $(3|X86_EFLAGS_VM),%eax + jnz do_watchdog_tick + movl %ds,%eax + cmpw $(__HYPERVISOR_DS),%ax + jne defer_nmi + movl %es,%eax + cmpw $(__HYPERVISOR_DS),%ax + jne defer_nmi + +do_watchdog_tick: + movl $(__HYPERVISOR_DS),%edx + movl %edx,%ds + movl %edx,%es + movl %esp,%edx + pushl %ebx # reason + pushl %edx # regs + call SYMBOL_NAME(do_nmi) + addl $8,%esp + jmp ret_from_intr + +defer_nmi: + movl $FIXMAP_apic_base,%eax + # apic_wait_icr_idle() +1: movl %ss:APIC_ICR(%eax),%ebx + testl $APIC_ICR_BUSY,%ebx + jnz 1b + # __send_IPI_shortcut(APIC_DEST_SELF, TRAP_deferred_nmi) + movl $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \ + TRAP_deferred_nmi),%ss:APIC_ICR(%eax) + jmp restore_all_xen + +nmi_parity_err: + # Clear and disable the parity-error line + andb $0xf,%al + orb $0x4,%al + outb %al,$0x61 + cmpb $'i',%ss:SYMBOL_NAME(opt_nmi) # nmi=ignore + je nmi_out + bts $0,%ss:SYMBOL_NAME(nmi_softirq_reason) + bts $NMI_SOFTIRQ,%ss:SYMBOL_NAME(irq_stat) + cmpb $'d',%ss:SYMBOL_NAME(opt_nmi) # nmi=dom0 + je nmi_out + movl $(__HYPERVISOR_DS),%edx # nmi=fatal + movl %edx,%ds + movl %edx,%es + movl %esp,%edx + push %edx + call SYMBOL_NAME(mem_parity_error) + addl $4,%esp +nmi_out:movl %ss:XREGS_eflags(%esp),%eax + movb %ss:XREGS_cs(%esp),%al + testl $(3|X86_EFLAGS_VM),%eax + jz restore_all_xen + movl $(__HYPERVISOR_DS),%edx + movl %edx,%ds + movl %edx,%es + GET_CURRENT(%ebx) + jmp test_all_events + +nmi_io_err: + # Clear and disable the I/O-error line + andb $0xf,%al + orb $0x8,%al + outb %al,$0x61 + cmpb $'i',%ss:SYMBOL_NAME(opt_nmi) # nmi=ignore + je nmi_out + bts $1,%ss:SYMBOL_NAME(nmi_softirq_reason) + bts $NMI_SOFTIRQ,%ss:SYMBOL_NAME(irq_stat) + cmpb $'d',%ss:SYMBOL_NAME(opt_nmi) # nmi=dom0 + je nmi_out + movl $(__HYPERVISOR_DS),%edx # nmi=fatal + movl %edx,%ds + movl %edx,%es + movl %esp,%edx + push %edx + call SYMBOL_NAME(io_check_error) + addl $4,%esp + jmp nmi_out + + +ENTRY(setup_vm86_frame) + # Copies the entire stack frame forwards by 16 bytes. 
+ .macro copy_vm86_words count=18 + .if \count + pushl ((\count-1)*4)(%esp) + popl ((\count-1)*4)+16(%esp) + copy_vm86_words "(\count-1)" + .endif + .endm + copy_vm86_words + addl $16,%esp + ret + +do_switch_vm86: + # Discard the return address + addl $4,%esp + + # GS:ESI == Ring-1 stack activation + movl XREGS_esp(%esp),%esi +VFLT1: movl XREGS_ss(%esp),%gs + + # ES:EDI == Ring-0 stack activation + leal XREGS_eip(%esp),%edi + + # Restore the hypercall-number-clobbered EAX on our stack frame +VFLT2: movl %gs:(%esi),%eax + movl %eax,XREGS_eax(%esp) + addl $4,%esi + + # Copy the VM86 activation from the ring-1 stack to the ring-0 stack + movl $(XREGS_user_sizeof-XREGS_eip)/4,%ecx +VFLT3: movl %gs:(%esi),%eax + stosl + addl $4,%esi + loop VFLT3 + + # Fix up EFLAGS: IOPL=0, IF=1, VM=1 + andl $~X86_EFLAGS_IOPL,XREGS_eflags(%esp) + orl $X86_EFLAGS_IF|X86_EFLAGS_VM,XREGS_eflags(%esp) + + jmp test_all_events + +.section __ex_table,"a" + .long VFLT1,domain_crash_synchronous + .long VFLT2,domain_crash_synchronous + .long VFLT3,domain_crash_synchronous +.previous + +.data + +ENTRY(exception_table) + .long SYMBOL_NAME(do_divide_error) + .long SYMBOL_NAME(do_debug) + .long 0 # nmi + .long SYMBOL_NAME(do_int3) + .long SYMBOL_NAME(do_overflow) + .long SYMBOL_NAME(do_bounds) + .long SYMBOL_NAME(do_invalid_op) + .long SYMBOL_NAME(math_state_restore) + .long 0 # double fault + .long SYMBOL_NAME(do_coprocessor_segment_overrun) + .long SYMBOL_NAME(do_invalid_TSS) + .long SYMBOL_NAME(do_segment_not_present) + .long SYMBOL_NAME(do_stack_segment) + .long SYMBOL_NAME(do_general_protection) + .long SYMBOL_NAME(do_page_fault) + .long SYMBOL_NAME(do_spurious_interrupt_bug) + .long SYMBOL_NAME(do_coprocessor_error) + .long SYMBOL_NAME(do_alignment_check) + .long SYMBOL_NAME(do_machine_check) + .long SYMBOL_NAME(do_simd_coprocessor_error) + +ENTRY(hypercall_table) + .long SYMBOL_NAME(do_set_trap_table) /* 0 */ + .long SYMBOL_NAME(do_mmu_update) + .long SYMBOL_NAME(do_set_gdt) + .long SYMBOL_NAME(do_stack_switch) + .long SYMBOL_NAME(do_set_callbacks) + .long SYMBOL_NAME(do_fpu_taskswitch) /* 5 */ + .long SYMBOL_NAME(do_sched_op) + .long SYMBOL_NAME(do_dom0_op) + .long SYMBOL_NAME(do_set_debugreg) + .long SYMBOL_NAME(do_get_debugreg) + .long SYMBOL_NAME(do_update_descriptor) /* 10 */ + .long SYMBOL_NAME(do_set_fast_trap) + .long SYMBOL_NAME(do_dom_mem_op) + .long SYMBOL_NAME(do_multicall) + .long SYMBOL_NAME(do_update_va_mapping) + .long SYMBOL_NAME(do_set_timer_op) /* 15 */ + .long SYMBOL_NAME(do_event_channel_op) + .long SYMBOL_NAME(do_xen_version) + .long SYMBOL_NAME(do_console_io) + .long SYMBOL_NAME(do_physdev_op) + .long SYMBOL_NAME(do_grant_table_op) /* 20 */ + .long SYMBOL_NAME(do_vm_assist) + .long SYMBOL_NAME(do_update_va_mapping_otherdomain) + .long SYMBOL_NAME(do_switch_vm86) + .long SYMBOL_NAME(do_boot_vcpu) + .long SYMBOL_NAME(do_ni_hypercall) /* 25 */ + .long SYMBOL_NAME(do_mmuext_op) + .rept NR_hypercalls-((.-hypercall_table)/4) + .long SYMBOL_NAME(do_ni_hypercall) + .endr diff -Naurp ../xeno-unstable.bk/xen/include/asm-x86/apic.h xen/include/asm-x86/apic.h --- ../xeno-unstable.bk/xen/include/asm-x86/apic.h 2005-04-14 14:56:32.000000000 -0500 +++ xen/include/asm-x86/apic.h 2005-04-15 08:34:22.000000000 -0500 @@ -77,6 +77,8 @@ extern void init_apic_mappings (void); extern void smp_local_timer_interrupt (struct xen_regs * regs); extern void setup_APIC_clocks (void); extern void setup_apic_nmi_watchdog (void); +extern int reserve_lapic_nmi(void); +extern void release_lapic_nmi(void); extern void 
nmi_watchdog_tick (struct xen_regs * regs); extern void touch_nmi_watchdog(void); extern int APIC_init_uniprocessor (void); diff -Naurp ../xeno-unstable.bk/xen/include/asm-x86/msr.h xen/include/asm-x86/msr.h --- ../xeno-unstable.bk/xen/include/asm-x86/msr.h 2005-04-14 14:56:32.000000000 -0500 +++ xen/include/asm-x86/msr.h 2005-04-15 08:34:24.000000000 -0500 @@ -11,6 +11,21 @@ : /* no outputs */ \ : "c" (msr), "a" (val1), "d" (val2)) +#define rdmsrl(msr,val) do { \ + unsigned long l__,h__; \ + rdmsr (msr, l__, h__); \ + val = l__; \ + val |= ((u64)h__<<32); \ +} while(0) + +static inline void wrmsrl (unsigned long msr, unsigned long long val) +{ + unsigned long lo, hi; + lo = (unsigned long) val; + hi = val >> 32; + wrmsr (msr, lo, hi); +} + #define rdmsr_user(msr,val1,val2) ({\ int _rc; \ __asm__ __volatile__( \ @@ -47,16 +62,8 @@ #define rdtscl(low) \ __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx") -#if defined(__i386__) #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) -#elif defined(__x86_64__) -#define rdtscll(val) do { \ - unsigned int a,d; \ - asm volatile("rdtsc" : "=a" (a), "=d" (d)); \ - (val) = ((unsigned long)a) | (((unsigned long)d)<<32); \ -} while(0) -#endif #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) @@ -140,12 +147,38 @@ #define MSR_IA32_UCODE_WRITE 0x79 #define MSR_IA32_UCODE_REV 0x8b +#define MSR_P6_PERFCTR0 0xc1 +#define MSR_P6_PERFCTR1 0xc2 + #define MSR_IA32_BBL_CR_CTL 0x119 +#define MSR_IA32_SYSENTER_CS 0x174 +#define MSR_IA32_SYSENTER_ESP 0x175 +#define MSR_IA32_SYSENTER_EIP 0x176 + #define MSR_IA32_MCG_CAP 0x179 #define MSR_IA32_MCG_STATUS 0x17a #define MSR_IA32_MCG_CTL 0x17b +/* P4/Xeon+ specific */ +#define MSR_IA32_MCG_EAX 0x180 +#define MSR_IA32_MCG_EBX 0x181 +#define MSR_IA32_MCG_ECX 0x182 +#define MSR_IA32_MCG_EDX 0x183 +#define MSR_IA32_MCG_ESI 0x184 +#define MSR_IA32_MCG_EDI 0x185 +#define MSR_IA32_MCG_EBP 0x186 +#define MSR_IA32_MCG_ESP 0x187 +#define MSR_IA32_MCG_EFLAGS 0x188 +#define MSR_IA32_MCG_EIP 0x189 +#define MSR_IA32_MCG_RESERVED 0x18A + +#define MSR_P6_EVNTSEL0 0x186 +#define MSR_P6_EVNTSEL1 0x187 + +#define MSR_IA32_PERF_STATUS 0x198 +#define MSR_IA32_PERF_CTL 0x199 + #define MSR_IA32_THERM_CONTROL 0x19a #define MSR_IA32_THERM_INTERRUPT 0x19b #define MSR_IA32_THERM_STATUS 0x19c @@ -178,6 +211,92 @@ #define MSR_P6_EVNTSEL0 0x186 #define MSR_P6_EVNTSEL1 0x187 +/* Pentium IV performance counter MSRs */ +#define MSR_P4_BPU_PERFCTR0 0x300 +#define MSR_P4_BPU_PERFCTR1 0x301 +#define MSR_P4_BPU_PERFCTR2 0x302 +#define MSR_P4_BPU_PERFCTR3 0x303 +#define MSR_P4_MS_PERFCTR0 0x304 +#define MSR_P4_MS_PERFCTR1 0x305 +#define MSR_P4_MS_PERFCTR2 0x306 +#define MSR_P4_MS_PERFCTR3 0x307 +#define MSR_P4_FLAME_PERFCTR0 0x308 +#define MSR_P4_FLAME_PERFCTR1 0x309 +#define MSR_P4_FLAME_PERFCTR2 0x30a +#define MSR_P4_FLAME_PERFCTR3 0x30b +#define MSR_P4_IQ_PERFCTR0 0x30c +#define MSR_P4_IQ_PERFCTR1 0x30d +#define MSR_P4_IQ_PERFCTR2 0x30e +#define MSR_P4_IQ_PERFCTR3 0x30f +#define MSR_P4_IQ_PERFCTR4 0x310 +#define MSR_P4_IQ_PERFCTR5 0x311 +#define MSR_P4_BPU_CCCR0 0x360 +#define MSR_P4_BPU_CCCR1 0x361 +#define MSR_P4_BPU_CCCR2 0x362 +#define MSR_P4_BPU_CCCR3 0x363 +#define MSR_P4_MS_CCCR0 0x364 +#define MSR_P4_MS_CCCR1 0x365 +#define MSR_P4_MS_CCCR2 0x366 +#define MSR_P4_MS_CCCR3 0x367 +#define MSR_P4_FLAME_CCCR0 0x368 +#define MSR_P4_FLAME_CCCR1 0x369 +#define MSR_P4_FLAME_CCCR2 0x36a +#define MSR_P4_FLAME_CCCR3 0x36b +#define MSR_P4_IQ_CCCR0 0x36c +#define MSR_P4_IQ_CCCR1 0x36d +#define MSR_P4_IQ_CCCR2 0x36e +#define MSR_P4_IQ_CCCR3 0x36f 
+#define MSR_P4_IQ_CCCR4 0x370 +#define MSR_P4_IQ_CCCR5 0x371 +#define MSR_P4_ALF_ESCR0 0x3ca +#define MSR_P4_ALF_ESCR1 0x3cb +#define MSR_P4_BPU_ESCR0 0x3b2 +#define MSR_P4_BPU_ESCR1 0x3b3 +#define MSR_P4_BSU_ESCR0 0x3a0 +#define MSR_P4_BSU_ESCR1 0x3a1 +#define MSR_P4_CRU_ESCR0 0x3b8 +#define MSR_P4_CRU_ESCR1 0x3b9 +#define MSR_P4_CRU_ESCR2 0x3cc +#define MSR_P4_CRU_ESCR3 0x3cd +#define MSR_P4_CRU_ESCR4 0x3e0 +#define MSR_P4_CRU_ESCR5 0x3e1 +#define MSR_P4_DAC_ESCR0 0x3a8 +#define MSR_P4_DAC_ESCR1 0x3a9 +#define MSR_P4_FIRM_ESCR0 0x3a4 +#define MSR_P4_FIRM_ESCR1 0x3a5 +#define MSR_P4_FLAME_ESCR0 0x3a6 +#define MSR_P4_FLAME_ESCR1 0x3a7 +#define MSR_P4_FSB_ESCR0 0x3a2 +#define MSR_P4_FSB_ESCR1 0x3a3 +#define MSR_P4_IQ_ESCR0 0x3ba +#define MSR_P4_IQ_ESCR1 0x3bb +#define MSR_P4_IS_ESCR0 0x3b4 +#define MSR_P4_IS_ESCR1 0x3b5 +#define MSR_P4_ITLB_ESCR0 0x3b6 +#define MSR_P4_ITLB_ESCR1 0x3b7 +#define MSR_P4_IX_ESCR0 0x3c8 +#define MSR_P4_IX_ESCR1 0x3c9 +#define MSR_P4_MOB_ESCR0 0x3aa +#define MSR_P4_MOB_ESCR1 0x3ab +#define MSR_P4_MS_ESCR0 0x3c0 +#define MSR_P4_MS_ESCR1 0x3c1 +#define MSR_P4_PMH_ESCR0 0x3ac +#define MSR_P4_PMH_ESCR1 0x3ad +#define MSR_P4_RAT_ESCR0 0x3bc +#define MSR_P4_RAT_ESCR1 0x3bd +#define MSR_P4_SAAT_ESCR0 0x3ae +#define MSR_P4_SAAT_ESCR1 0x3af +#define MSR_P4_SSU_ESCR0 0x3be +#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */ +#define MSR_P4_TBPU_ESCR0 0x3c2 +#define MSR_P4_TBPU_ESCR1 0x3c3 +#define MSR_P4_TC_ESCR0 0x3c4 +#define MSR_P4_TC_ESCR1 0x3c5 +#define MSR_P4_U2L_ESCR0 0x3b0 +#define MSR_P4_U2L_ESCR1 0x3b1 + + + /* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */ #define MSR_K7_EVNTSEL0 0xC0010000 @@ -196,7 +315,7 @@ #define MSR_K7_FID_VID_CTL 0xC0010041 #define MSR_K7_VID_STATUS 0xC0010042 -/* K6 MSRs */ +/* AMD Defined MSRs */ #define MSR_K6_EFER 0xC0000080 #define MSR_K6_STAR 0xC0000081 #define MSR_K6_WHCR 0xC0000082 @@ -205,6 +324,28 @@ #define MSR_K6_PSOR 0xC0000087 #define MSR_K6_PFIR 0xC0000088 +#define MSR_K7_EVNTSEL0 0xC0010000 +#define MSR_K7_EVNTSEL1 0xC0010001 +#define MSR_K7_EVNTSEL2 0xC0010002 +#define MSR_K7_EVNTSEL3 0xC0010003 +#define MSR_K7_PERFCTR0 0xC0010004 +#define MSR_K7_PERFCTR1 0xC0010005 +#define MSR_K7_PERFCTR2 0xC0010006 +#define MSR_K7_PERFCTR3 0xC0010007 +#define MSR_K7_HWCR 0xC0010015 +#define MSR_K7_CLK_CTL 0xC001001b +#define MSR_K7_FID_VID_CTL 0xC0010041 +#define MSR_K7_FID_VID_STATUS 0xC0010042 + +/* extended feature register */ +#define MSR_EFER 0xc0000080 + +/* EFER bits: */ + +/* Execute Disable enable */ +#define _EFER_NX 11 +#define EFER_NX (1<<_EFER_NX) + /* Centaur-Hauls/IDT defined MSRs. */ #define MSR_IDT_FCR1 0x107 #define MSR_IDT_FCR2 0x108 @@ -224,6 +365,7 @@ /* VIA Cyrix defined MSRs*/ #define MSR_VIA_FCR 0x1107 #define MSR_VIA_LONGHAUL 0x110a +#define MSR_VIA_RNG 0x110b #define MSR_VIA_BCR2 0x1147 /* Transmeta defined MSRs */ @@ -232,4 +374,6 @@ #define MSR_TMTA_LRTI_READOUT 0x80868018 #define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a + + #endif /* __ASM_MSR_H */ diff -Naurp ../xeno-unstable.bk/xen/include/asm-x86/nmi.h xen/include/asm-x86/nmi.h --- ../xeno-unstable.bk/xen/include/asm-x86/nmi.h 1969-12-31 18:00:00.000000000 -0600 +++ xen/include/asm-x86/nmi.h 2005-04-15 08:34:24.000000000 -0500 @@ -0,0 +1,26 @@ +/* + * linux/include/asm-i386/nmi.h + */ +#ifndef ASM_NMI_H +#define ASM_NMI_H + +struct xen_regs; + +typedef int (*nmi_callback_t)(struct xen_regs * regs, int cpu); + +/** + * set_nmi_callback + * + * Set a handler for an NMI. Only one handler may be + * set. 
Return 1 if the NMI was handled. + */ +void set_nmi_callback(nmi_callback_t callback); + +/** + * unset_nmi_callback + * + * Remove the handler previously set. + */ +void unset_nmi_callback(void); + +#endif /* ASM_NMI_H */ diff -Naurp ../xeno-unstable.bk/xen/include/public/xen.h xen/include/public/xen.h --- ../xeno-unstable.bk/xen/include/public/xen.h 2005-04-14 14:56:32.000000000 -0500 +++ xen/include/public/xen.h 2005-04-15 08:34:26.000000000 -0500 @@ -4,6 +4,10 @@ * Guest OS interface to Xen. * * Copyright (c) 2004, K A Fraser + * + * Modified by Aravind Menon for supporting oprofile + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. */ #ifndef __XEN_PUBLIC_XEN_H__ @@ -59,6 +63,7 @@ #define __HYPERVISOR_boot_vcpu 24 #define __HYPERVISOR_set_segment_base 25 /* x86/64 only */ #define __HYPERVISOR_mmuext_op 26 +#define __HYPERVISOR_pmc_op 27 /* * MULTICALLS @@ -81,6 +86,7 @@ #define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ #define VIRQ_PARITY_ERR 4 /* (DOM0) NMI parity error. */ #define VIRQ_IO_ERR 5 /* (DOM0) NMI I/O error. */ +#define VIRQ_PMC_OVF 6 /* PMC Overflow */ #define NR_VIRQS 7 /* @@ -245,6 +251,21 @@ struct mmuext_op { #define VMASST_TYPE_writable_pagetables 2 #define MAX_VMASST_TYPE 2 +/* + * Commands to HYPERVISOR_pmc_op(). + */ +#define PMC_INIT 0 +#define PMC_SET_ACTIVE 1 +#define PMC_SET_PASSIVE 2 +#define PMC_RESERVE_COUNTERS 3 +#define PMC_SETUP_EVENTS 4 +#define PMC_ENABLE_VIRQ 5 +#define PMC_START 6 +#define PMC_STOP 7 +#define PMC_DISABLE_VIRQ 8 +#define PMC_RELEASE_COUNTERS 9 +#define PMC_SHUTDOWN 10 + #ifndef __ASSEMBLY__ typedef u16 domid_t; @@ -300,6 +321,8 @@ typedef struct /* Support for multi-processor guests. */ #define MAX_VIRT_CPUS 32 +#define MAX_OPROF_EVENTS 32 +#define MAX_OPROF_DOMAINS 25 /* * Per-VCPU information goes here. This will be cleaned up more when Xen * actually supports multi-VCPU guests. @@ -413,6 +436,20 @@ typedef struct shared_info_st arch_shared_info_t arch; + /* Oprofile structures */ + u8 event_head; + u8 event_tail; + struct { + u32 eip; + u8 mode; + u8 event; + } PACKED event_log[MAX_OPROF_EVENTS]; + u8 losing_samples; + u64 samples_lost; + u32 nmi_restarts; + u64 active_samples; + u64 passive_samples; + u64 other_samples; } PACKED shared_info_t; /* diff -Naurp ../xeno-unstable.bk/xen/include/xen/smp.h xen/include/xen/smp.h --- ../xeno-unstable.bk/xen/include/xen/smp.h 2005-04-14 14:56:32.000000000 -0500 +++ xen/include/xen/smp.h 2005-04-15 08:34:28.000000000 -0500 @@ -47,6 +47,18 @@ extern int smp_call_function( void (*func) (void *info), void *info, int retry, int wait); /* + * Call a function on all processors + */ +static inline int on_each_cpu(void (*func) (void *info), void *info, + int retry, int wait) +{ + int ret = 0; + ret = smp_call_function(func, info, retry, wait); + func(info); + return ret; +} + +/* * True once the per process idle is forked */ extern int smp_threads_ready;
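
A note on the final hunk: on_each_cpu() simply wraps smp_call_function() and then runs the function locally, so the caller ends up executing func on every online CPU. Below is a minimal usage sketch; reset_perfctr0() and reset_all_counters() are illustrative names, not part of this patch, and MSR_P6_PERFCTR0 (added to asm-x86/msr.h above) is only meaningful on P6-family processors.

/* Sketch only: clear performance counter 0 on every CPU before sampling
 * starts.  The helper names are hypothetical. */
#include <xen/smp.h>
#include <asm/msr.h>

static void reset_perfctr0(void *unused)
{
    /* Executes on whichever CPU runs it; clears that CPU's counter 0. */
    wrmsr(MSR_P6_PERFCTR0, 0, 0);
}

static void reset_all_counters(void)
{
    /* retry=1, wait=1: call the other CPUs via IPI, then run locally too. */
    on_each_cpu(reset_perfctr0, NULL, 1, 1);
}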
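
The new reserve_lapic_nmi()/release_lapic_nmi() pair (asm-x86/apic.h) and set_nmi_callback()/unset_nmi_callback() (asm-x86/nmi.h) are meant to be used together: take the local APIC NMI away from the watchdog, then install a profiling handler. The sketch below shows that pairing; profile_nmi() is only a stand-in for the real handler, which lives in the new oprofile/ files not included in this excerpt.

#include <asm/apic.h>
#include <asm/nmi.h>

/* Stand-in for the real OProfile NMI handler (not shown in this patch). */
static int profile_nmi(struct xen_regs *regs, int cpu)
{
    /* ...record regs->eip for this cpu, re-arm the overflowed counter... */
    return 1;   /* handled */
}

static int claim_nmi_for_profiling(void)
{
    int rc = reserve_lapic_nmi();   /* detaches the NMI watchdog if active */
    if ( rc != 0 )
        return rc;                  /* -EBUSY: another owner holds the LAPIC NMI */
    set_nmi_callback(profile_nmi);
    return 0;
}

static void release_nmi_from_profiling(void)
{
    unset_nmi_callback();
    release_lapic_nmi();            /* re-enables the watchdog if it was running */
}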
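
The public side of the new hypercall is only partially visible here: public/xen.h defines __HYPERVISOR_pmc_op (27) and the PMC_* command numbers, but the argument layout and the ordering enforced by do_pmc_op() live in the new oprofile/ files, which this excerpt does not show. The following is therefore only a guess at a plausible dom0-side call sequence: HYPERVISOR_pmc_op(), struct pmc_setup, the include path, and the start/stop ordering are all assumptions, not the interface the patch defines.

#include <public/xen.h>   /* PMC_* command numbers; include path illustrative */

/* Hypothetical guest-side wrapper around hypercall 27 (__HYPERVISOR_pmc_op). */
extern long HYPERVISOR_pmc_op(unsigned int op, void *arg);

/* Hypothetical per-counter description handed to PMC_SETUP_EVENTS. */
struct pmc_setup {
    unsigned int  counter;    /* which hardware counter             */
    unsigned int  event;      /* vendor-specific event selector     */
    unsigned int  unit_mask;  /* vendor-specific unit mask          */
    unsigned long count;      /* take a sample every 'count' events */
};

static long start_profiling(struct pmc_setup *ev, int n)
{
    long rc;
    int  i;

    if ( (rc = HYPERVISOR_pmc_op(PMC_INIT, NULL)) != 0 )
        return rc;
    if ( (rc = HYPERVISOR_pmc_op(PMC_RESERVE_COUNTERS, NULL)) != 0 )
        return rc;
    for ( i = 0; i < n; i++ )
        if ( (rc = HYPERVISOR_pmc_op(PMC_SETUP_EVENTS, &ev[i])) != 0 )
            goto release;
    if ( (rc = HYPERVISOR_pmc_op(PMC_ENABLE_VIRQ, NULL)) != 0 )
        goto release;
    if ( (rc = HYPERVISOR_pmc_op(PMC_START, NULL)) == 0 )
        return 0;
    HYPERVISOR_pmc_op(PMC_DISABLE_VIRQ, NULL);
 release:
    HYPERVISOR_pmc_op(PMC_RELEASE_COUNTERS, NULL);
    return rc;
}

static void stop_profiling(void)
{
    HYPERVISOR_pmc_op(PMC_STOP, NULL);
    HYPERVISOR_pmc_op(PMC_DISABLE_VIRQ, NULL);
    HYPERVISOR_pmc_op(PMC_RELEASE_COUNTERS, NULL);
    HYPERVISOR_pmc_op(PMC_SHUTDOWN, NULL);
}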
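
The shared_info additions form a small sample ring: MAX_OPROF_EVENTS entries of {eip, mode, event} indexed by event_head/event_tail, plus loss counters. Which index Xen advances is not visible in this excerpt, so the sketch below assumes Xen produces at event_head and the guest's VIRQ_PMC_OVF handler consumes at event_tail; oprofile_add_sample() and the losing_samples reset are likewise assumptions.

#include <public/xen.h>

extern shared_info_t *HYPERVISOR_shared_info;  /* mapped by the guest at start of day */

/* Hypothetical sink feeding one sample into the in-guest OProfile buffer. */
extern void oprofile_add_sample(unsigned long eip, unsigned int mode,
                                unsigned int event);

/* Bound to VIRQ_PMC_OVF by the dom0 driver (binding code not shown here). */
static void pmc_ovf_virq_handler(void)
{
    shared_info_t *s = HYPERVISOR_shared_info;

    while ( s->event_tail != s->event_head )
    {
        unsigned int i = s->event_tail;

        oprofile_add_sample(s->event_log[i].eip,
                            s->event_log[i].mode,
                            s->event_log[i].event);

        /* Advance the consumer index around the MAX_OPROF_EVENTS ring. */
        s->event_tail = (i + 1) % MAX_OPROF_EVENTS;
    }

    /* Assumption: clearing this tells Xen it may queue samples again. */
    if ( s->losing_samples )
        s->losing_samples = 0;
}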