diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/arch/x86/cpu/mcheck/mce.c --- a/xen/arch/x86/cpu/mcheck/mce.c Thu Aug 23 10:20:43 2007 +0200 +++ b/xen/arch/x86/cpu/mcheck/mce.c Thu Aug 23 10:52:15 2007 +0200 @@ -8,11 +8,13 @@ #include #include #include +#include #include #include #include "mce.h" +#include "x86_mca.h" int mce_disabled = 0; unsigned int nr_mce_banks; @@ -89,3 +91,89 @@ static int __init mcheck_enable(char *st custom_param("nomce", mcheck_disable); custom_param("mce", mcheck_enable); + + + + +void x86_mcinfo_clear(struct shared_info *si) +{ + memset(&si->arch.mc_info, 0, sizeof(struct arch_mc_info)); + x86_mcinfo_nentries(si) = 0; +} + + +int x86_mcinfo_add(struct shared_info *si, void *mcinfo) +{ + int i; + unsigned long end1, end2; + struct mcinfo_common *mic, *mic_base, *mic_index; + + mic = (struct mcinfo_common *)mcinfo; + mic_index = mic_base = x86_mcinfo_first(si); + + /* go to first free entry */ + for (i = 0; i < x86_mcinfo_nentries(si); i++) { + mic_index = x86_mcinfo_next(mic_index); + } + + /* check if there is enough size */ + end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct arch_mc_info)); + end2 = (unsigned long)((uint8_t *)mic_index + mic->size); + + if (end1 < end2) + return -ENOSPC; /* No space. Can't add entry. */ + + /* there's enough space. add entry. */ + memcpy(mic_index, mic, mic->size); + x86_mcinfo_nentries(si)++; + + return 0; +} + +/* Dump machine check information in a format, + * mcelog can parse. This is used only when + * Dom0 does not take the notification. 
*/ +void x86_mcinfo_dump(struct shared_info *si) +{ + struct mcinfo_common *mic; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + + /* first print the global info */ + x86_mcinfo_lookup(mic, si, MC_TYPE_GLOBAL); + if (mic == NULL) + return; + mc_global = (struct mcinfo_global *)mic; + if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) { + printk(KERN_WARNING + "CPU%d: Machine Check Exception: %16"PRIx64"\n", + mc_global->mc_coreid, mc_global->mc_gstatus); + } else { + printk(KERN_WARNING "Machine Check: The hardware reports a non " + "fatal, correctable incident occurred on " + "CPU %d.\n", + mc_global->mc_coreid); + } + + /* then the bank information */ + x86_mcinfo_lookup(mic, si, MC_TYPE_BANK); /* finds the first entry */ + do { + if (mic == NULL) + return; + if (mic->type != MC_TYPE_BANK) + continue; + + mc_bank = (struct mcinfo_bank *)mic; + + printk(KERN_WARNING "Bank %d: %16"PRIx64, + mc_bank->mc_bank, + mc_bank->mc_status); + if (mc_bank->mc_status & MCi_STATUS_MISCV) + printk("[%16"PRIx64"]", mc_bank->mc_misc); + if (mc_bank->mc_status & MCi_STATUS_ADDRV) + printk(" at %16"PRIx64, mc_bank->mc_addr); + + printk("\n"); + mic = x86_mcinfo_next(mic); /* next entry */ + } while (mic != NULL); +} diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/arch/x86/cpu/mcheck/mce.h --- a/xen/arch/x86/cpu/mcheck/mce.h Thu Aug 23 10:20:43 2007 +0200 +++ b/xen/arch/x86/cpu/mcheck/mce.h Thu Aug 23 10:52:15 2007 +0200 @@ -6,6 +6,10 @@ void intel_p6_mcheck_init(struct cpuinfo void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); +void x86_mcinfo_clear(struct shared_info *si); +int x86_mcinfo_add(struct shared_info *si, void *mcinfo); +void x86_mcinfo_dump(struct shared_info *si); + /* Call the installed machine check handler for this CPU setup. 
*/ extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code); diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/arch/x86/cpu/mcheck/non-fatal.c --- a/xen/arch/x86/cpu/mcheck/non-fatal.c Thu Aug 23 10:20:43 2007 +0200 +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c Thu Aug 23 10:52:15 2007 +0200 @@ -13,19 +13,22 @@ #include #include #include -#include +#include #include #include #include #include "mce.h" -static int firstbank; +static int firstbank = 0; static struct timer mce_timer; #define MCE_PERIOD MILLISECS(15000) - -static void mce_checkregs (void *info) +#define MCE_MIN MILLISECS(2000) +#define MCE_MAX MILLISECS(30000) + + +static void mce_intel_checkregs (void *info) { u32 low, high; int i; @@ -50,10 +53,171 @@ static void mce_checkregs (void *info) } } -static void mce_work_fn(void *data) +static void mce_intel_work_fn(void *data) { - on_each_cpu(mce_checkregs, NULL, 1, 1); + on_each_cpu(mce_intel_checkregs, NULL, 1, 1); set_timer(&mce_timer, NOW() + MCE_PERIOD); +} + + + +/* The MCi_STATUS_* #defines are needed here */ +#include "x86_mca.h" + +static s_time_t period = MCE_PERIOD; +static int hw_threshold = 0; +static int adjust = 0; + + +/* The polling service routine: + * Collects information of correctable errors and notifies + * Dom0 via an event. 
+ */ +void mce_amd_checkregs(void *info) +{ + struct shared_info *si; + struct vcpu *vcpu = current; + struct mcinfo_global mc_global; + struct mcinfo_bank mc_info; + uint64_t status, addrv, miscv; + unsigned int i; + unsigned int event_enabled; + + event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA); + adjust = 0; + si = (struct shared_info *)dom0->shared_info; /* cast silences gcc4 */ + memset(&mc_global, 0, sizeof(mc_global)); + mc_global.common.type = MC_TYPE_GLOBAL; + mc_global.common.size = sizeof(mc_global); + + mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */ + mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */ + mc_global.mc_core_threadid = 0; + mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */ +#if 0 /* todo: on which socket is this physical core? */ + mc_global.mc_socketid = ???; +#endif + mc_global.mc_flags |= MC_FLAG_CORRECTABLE; + rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus); + x86_mcinfo_clear(si); + x86_mcinfo_add(si, &mc_global); + + for (i = 0; i < nr_mce_banks; i++) { + rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status); + + if (!(status & MCi_STATUS_VAL)) + continue; + + memset(&mc_info, 0, sizeof(mc_info)); + mc_info.common.type = MC_TYPE_BANK; + mc_info.common.size = sizeof(mc_info); + mc_info.mc_bank = i; + mc_info.mc_status = status; + + /* Increase polling frequency */ + adjust = 1; + + addrv = 0; + if (status & MCi_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv); + + miscv = 0; + if (status & MCi_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv); + + mc_info.mc_addr = addrv; + mc_info.mc_misc = miscv; + x86_mcinfo_add(si, &mc_info); + + /* clear status */ + wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL); + wmb(); + } + + if (adjust > 0) { + /* If Dom0 enabled the VIRQ_MCA event, then ... */ + if (event_enabled) + /* ... notify it. */ + send_guest_global_virq(dom0, VIRQ_MCA); + else + /* ... 
or dump it */ + x86_mcinfo_dump(si); + } +} + +/* polling service routine invoker: + * Adjust poll frequency at runtime. No error means slow polling frequency, + * an error means higher polling frequency. + * It uses hw threshold register introduced in AMD K8 RevF to detect + * multiple correctable errors between two polls. In that case, + * increase polling frequency higher than normal. + */ +static void mce_amd_work_fn(void *data) +{ + on_each_cpu(mce_amd_checkregs, data, 1, 1); + + if (adjust > 0) { + if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) { + /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */ + printk("MCE: polling routine found correctable error\n"); + } + } + + if (hw_threshold) { + uint64_t value; + uint32_t counter; + + rdmsrl(MSR_K8_MC4_MISC, value); + /* Only the error counter field is of interest + * Bit field is described in AMD K8 BKDG chapter 6.4.5.5 + */ + counter = (value & 0xFFF00000000ULL) >> 32U; + + /* HW does not count *all* kinds of correctable errors. + * Thus it is possible, that the polling routine finds an + * correctable error even if the HW reports nothing. + * However, the other way around is not possible (= BUG). + */ + if (counter > 0) { + /* HW reported correctable errors, + * the polling routine did not find... + */ + BUG_ON(adjust == 0); + /* subtract 1 to not double count the error + * from the polling service routine */ + adjust += (counter - 1); + + /* Restart counter */ + /* No interrupt, reset counter value */ + value &= ~(0x60FFF00000000ULL); + /* Counter enable */ + value |= (1ULL << 51); + wrmsrl(MSR_K8_MC4_MISC, value); + wmb(); + } + } + + if (adjust > 0) { + /* Increase polling frequency */ + adjust++; /* adjust == 1 must have an effect */ + period /= adjust; + } else { + /* Decrease polling frequency */ + period *= 2; + } + if (period > MCE_MAX) { + /* limit: Poll at least every 30s */ + period = MCE_MAX; + } + if (period < MCE_MIN) { + /* limit: Poll every 2s. 
+ * When this is reached an uncorrectable error + * is expected to happen, if Dom0 does nothing. + */ + period = MCE_MIN; + } + + set_timer(&mce_timer, NOW() + period); } static int __init init_nonfatal_mce_checker(void) @@ -68,19 +232,63 @@ static int __init init_nonfatal_mce_chec if (!cpu_has(c, X86_FEATURE_MCA)) return -ENODEV; - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - /* * Check for non-fatal errors every MCE_RATE s */ - init_timer(&mce_timer, mce_work_fn, NULL, 0); - set_timer(&mce_timer, NOW() + MCE_PERIOD); - printk(KERN_INFO "Machine check exception polling timer started.\n"); + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (c->x86 == 6) { /* K7 */ + firstbank = 1; + init_timer(&mce_timer, mce_intel_work_fn, NULL, 0); + set_timer(&mce_timer, NOW() + MCE_PERIOD); + break; + } + + /* Assume we are on K8 or newer AMD CPU here */ + if (cpu_has(c, X86_FEATURE_SVME)) { + uint64_t value; + + /* hw threshold registers present */ + hw_threshold = 1; + rdmsrl(MSR_K8_MC4_MISC, value); + + if (value & (1ULL << 61)) { /* Locked bit */ + /* Locked by BIOS. 
Not available for use */ + hw_threshold = 0; + } + if (!(value & (1ULL << 63))) { /* Valid bit */ + /* No CtrP present */ + hw_threshold = 0; + } else { + if (!(value & (1ULL << 62))) { /* Counter Bit */ + /* No counter field present */ + hw_threshold = 0; + } + } + + if (hw_threshold) { + /* No interrupt, reset counter value */ + value &= ~(0x60FFF00000000ULL); + /* Counter enable */ + value |= (1ULL << 51); + wrmsrl(MSR_K8_MC4_MISC, value); + /* serialize */ + wmb(); + printk(KERN_INFO "MCA: Use hw thresholding to adjust polling frequency\n"); + } + } + + init_timer(&mce_timer, mce_amd_work_fn, NULL, 0); + set_timer(&mce_timer, NOW() + period); + break; + + case X86_VENDOR_INTEL: + init_timer(&mce_timer, mce_intel_work_fn, NULL, 0); + set_timer(&mce_timer, NOW() + MCE_PERIOD); + break; + } + + printk(KERN_INFO "MCA: Machine check polling timer started.\n"); return 0; } __initcall(init_nonfatal_mce_checker); diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/common/event_channel.c --- a/xen/common/event_channel.c Thu Aug 23 10:20:43 2007 +0200 +++ b/xen/common/event_channel.c Thu Aug 23 10:52:15 2007 +0200 @@ -539,6 +539,21 @@ void evtchn_set_pending(struct vcpu *v, } +int guest_enabled_event(struct vcpu *v, int virq) +{ + int port; + + if ( unlikely(v == NULL) ) + return 0; + + port = v->virq_to_evtchn[virq]; + if ( port == 0 ) + return 0; + + /* virq is in use */ + return 1; +} + void send_guest_vcpu_virq(struct vcpu *v, int virq) { int port; diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/include/asm-x86/event.h --- a/xen/include/asm-x86/event.h Thu Aug 23 10:20:43 2007 +0200 +++ b/xen/include/asm-x86/event.h Thu Aug 23 10:52:15 2007 +0200 @@ -61,7 +61,12 @@ static inline void local_event_delivery_ /* No arch specific virq definition now. Default to global. 
*/ static inline int arch_virq_is_global(int virq) { - return 1; + switch (virq) { + case VIRQ_MCA: + return 1; + default: + return 1; + } } #endif diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/include/public/arch-x86/xen.h --- a/xen/include/public/arch-x86/xen.h Thu Aug 23 10:20:43 2007 +0200 +++ b/xen/include/public/arch-x86/xen.h Thu Aug 23 10:52:15 2007 +0200 @@ -82,6 +82,8 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); #ifndef __ASSEMBLY__ +#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */ + /* * Machine Check Architecure: * structs are read-only and used to report all kinds of diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/include/xen/event.h --- a/xen/include/xen/event.h Thu Aug 23 10:20:43 2007 +0200 +++ b/xen/include/xen/event.h Thu Aug 23 10:52:15 2007 +0200 @@ -51,6 +51,9 @@ void free_xen_event_channel( void free_xen_event_channel( struct vcpu *local_vcpu, int port); +/* Query if event channel is in use by the guest */ +int guest_enabled_event(struct vcpu *v, int virq); + /* Notify remote end of a Xen-attached event channel.*/ void notify_via_xen_event_channel(int lport);