
[Xen-devel] [PATCH] re-work MCA telemetry internals; use common code for Intel/AMD MCA



The following patch reworks the MCA error telemetry handling inside Xen and shares as much code as possible between the Intel and AMD implementations.

I've had this patch sitting around for a while, but it hadn't been ported to -unstable yet. I have now finished porting and testing it and am submitting it, because the Intel folks want to go ahead and submit their new changes, and we agreed that I should push our changes first.

Brief explanation of the telemetry part: previously, telemetry was collected in a global array and accessed through index variables. That had a few problems: races with new machine checks (or CMCIs) arriving while telemetry was still being handled, and a rather hairy interaction with tracking which domains had or hadn't been notified yet. Our changes (I should say: Gavin Maltby's changes, as he did the bulk of this work for our 3.1-based tree; I merely ported and extended it to 3.3 and beyond) make telemetry access transactional (think of a database): a producer reserves an entry, fills it in, and then either commits or dismisses it. The internal database updates are also atomic, since the final commit is done by a pointer swap. There is a brief explanation of the mechanism in mctelem.h, and a condensed sketch of the flow follows below. This patch also removes the dom0->domU notification path, which is fine, since Intel's upcoming changes will replace domU notification with a vMCE mechanism anyway.
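
To make that concrete, here is a rough sketch of the transactional flow, condensed from the poller changes in the patch below. The mctelem_reserve/mctelem_dataptr/mctelem_commit/mctelem_dismiss calls and the MC_NONURGENT class are the real interfaces introduced here; everything else (including the have_valid_data flag) is purely illustrative:

    /* Sketch only; assumes the declarations this patch adds for mctelem.
     * A producer reserves a telemetry entry, fills it in, and then either
     * commits it (made visible to dom0 by an atomic pointer swap) or
     * dismisses it so the entry can be recycled. */
    mctelem_cookie_t mctc;
    struct mc_info *mci;
    int have_valid_data = 0;            /* illustrative placeholder */

    if ((mctc = mctelem_reserve(MC_NONURGENT)) == NULL)
        return;                         /* e.g. dom0 has not consumed older telemetry yet */

    mci = mctelem_dataptr(mctc);        /* private to this producer until committed */
    /* ... read the MCA bank MSRs and x86_mcinfo_add() records into *mci ... */

    if (have_valid_data)
        mctelem_commit(mctc);           /* publish; don't reference the cookie afterwards */
    else
        mctelem_dismiss(mctc);          /* nothing logged; entry is recycled */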

The common code part is pretty much what it says: it defines a common MCE handler, with a few hooks for the needs of specific CPU families.
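
To illustrate the hook structure, here is a minimal sketch of what a CPU-family init could look like. x86_mce_vector_register(), x86_mce_callback_register(), mcheck_cmn_handler() and mca_allbanks are the interfaces from the patch below; the my_* names are made up for the example:

    /* Family-specific #MC entry point just funnels into the common handler,
     * which does the bank logout, telemetry and domain impact policy. */
    static void my_machine_check(struct cpu_user_regs *regs, long error_code)
    {
        mcheck_cmn_handler(regs, error_code, mca_allbanks);
    }

    /* Optional per-bank hook: append model-specific telemetry, if any. */
    static enum mca_extinfo
    my_bank_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
    {
        /* rdmsrl() any extra MSRs and x86_mcinfo_add() them to mi here */
        return MCA_EXTINFO_IGNORED;
    }

    int my_mcheck_init(struct cpuinfo_x86 *c)
    {
        x86_mce_vector_register(my_machine_check);
        x86_mce_callback_register(my_bank_handler);
        /* ... per-bank MSR setup, then set_in_cr4(X86_CR4_MCE) ... */
        return 1;
    }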

I've been told that Intel's upcoming patch will need to make some parts of the common code specific to the Intel CPU again, but we'll work together to use as much common code as possible.

- Frank

Replace hypervisor MCA telemetry structures with something more robust
and designed to make terminal error telemetry available to the dom0
panic flow for diagnosis on reboot.

Use common code for a lot of the AMD and Intel MCE handling code.

Signed-off-by: Gavin Maltby <gavin.maltby@xxxxxxx>
Signed-off-by: Frank van der Linden <frank.vanderlinden@xxxxxxx>

diff --git a/xen/arch/x86/cpu/mcheck/Makefile b/xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile
+++ b/xen/arch/x86/cpu/mcheck/Makefile
@@ -2,6 +2,7 @@ obj-y += k7.o
 obj-y += k7.o
 obj-y += amd_k8.o
 obj-y += amd_f10.o
+obj-y += mctelem.o
 obj-y += mce.o
 obj-y += mce_intel.o
 obj-y += non-fatal.o
diff --git a/xen/arch/x86/cpu/mcheck/amd_f10.c b/xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c
@@ -49,20 +49,21 @@
 #include "x86_mca.h"
 
 
-static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+static enum mca_extinfo
+amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
 {
        struct mcinfo_extended mc_ext;
 
        /* Family 0x10 introduced additional MSR that belong to the
         * northbridge bank (4). */
-       if (bank != 4)
-               return 0;
+       if (mi == NULL || bank != 4)
+               return MCA_EXTINFO_IGNORED;
 
        if (!(status & MCi_STATUS_VAL))
-               return 0;
+               return MCA_EXTINFO_IGNORED;
 
        if (!(status & MCi_STATUS_MISCV))
-               return 0;
+               return MCA_EXTINFO_IGNORED;
 
        memset(&mc_ext, 0, sizeof(mc_ext));
        mc_ext.common.type = MC_TYPE_EXTENDED;
@@ -78,23 +79,25 @@ static int amd_f10_handler(struct mc_inf
        rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
        
        x86_mcinfo_add(mi, &mc_ext);
-       return 1;
+       return MCA_EXTINFO_LOCAL;
 }
 
 
 extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
 
 /* AMD Family10 machine check */
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
 { 
        uint64_t value;
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
-       mc_callback_bank_extended = amd_f10_handler;
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k8_machine_check);
+       x86_mce_callback_register(amd_f10_handler);
        cpu_nr = smp_processor_id();
-       wmb();
 
        rdmsrl(MSR_IA32_MCG_CAP, value);
        if (value & MCG_CTL_P)  /* Control register present ? */
@@ -104,18 +107,9 @@ void amd_f10_mcheck_init(struct cpuinfo_
        for (i = 0; i < nr_mce_banks; i++) {
                switch (i) {
                case 4: /* Northbridge */
-                       /* Enable error reporting of all errors,
-                        * enable error checking and
-                        * disable sync flooding */
-                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
                        wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
-
-                       /* XXX: We should write the value 0x1087821UL into
-                        * to register F3x180 here, which sits in
-                        * the PCI extended configuration space.
-                        * Since this is not possible here, we can only hope,
-                        * Dom0 is doing that.
-                        */
                        break;
 
                default:
@@ -128,4 +122,5 @@ void amd_f10_mcheck_init(struct cpuinfo_
 
        set_in_cr4(X86_CR4_MCE);
        printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
+       return 1;
 }
diff --git a/xen/arch/x86/cpu/mcheck/amd_k8.c b/xen/arch/x86/cpu/mcheck/amd_k8.c
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c
@@ -67,234 +67,27 @@
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 
 /* Machine Check Handler for AMD K8 family series */
 void k8_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-       struct vcpu *vcpu = current;
-       struct domain *curdom;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv, uc;
-       uint32_t i;
-       unsigned int cpu_nr;
-       uint32_t xen_impacted = 0;
-#define DOM_NORMAL     0
-#define DOM0_TRAP      1
-#define DOMU_TRAP      2
-#define DOMU_KILLED    4
-       uint32_t dom_state = DOM_NORMAL;
-
-       /* This handler runs as interrupt gate. So IPIs from the
-        * polling service routine are defered until we finished.
-        */
-
-        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
-        * an other physical CPU or the impacted process in the guest
-        * continues running with corrupted data, otherwise. */
-        vcpu_schedule_lock_irq(vcpu);
-
-       mc_data = x86_mcinfo_getptr();
-       cpu_nr = smp_processor_id();
-       BUG_ON(cpu_nr != vcpu->processor);
-
-       curdom = vcpu->domain;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
-
-       x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
-           &mc_global.mc_coreid, &mc_global.mc_core_threadid,
-           &mc_global.mc_apicid, NULL, NULL, NULL);
-
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       /* Quick check, who is impacted */
-       xen_impacted = is_idle_domain(curdom);
-
-       /* Dom0 */
-       x86_mcinfo_clear(mc_data);
-       x86_mcinfo_add(mc_data, &mc_global);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               /* An error happened in this bank.
-                * This is expected to be an uncorrectable error,
-                * since correctable errors get polled.
-                */
-               uc = status & MCi_STATUS_UC;
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-                       
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-
-               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
-               wmb();
-               add_taint(TAINT_MACHINE_CHECK);
-       }
-
-       status = mc_global.mc_gstatus;
-
-       /* clear MCIP or cpu enters shutdown state
-        * in case another MCE occurs. */
-       status &= ~MCG_STATUS_MCIP;
-       wrmsrl(MSR_IA32_MCG_STATUS, status);
-       wmb();
-
-       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
-        * The thread started here:
-        * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
-        */
-
-       /* MCG_STATUS_RIPV: 
-        * When this bit is not set, then the instruction pointer onto the stack
-        * to resume at is not valid. If xen is interrupted, then we panic anyway
-        * right below. Otherwise it is up to the guest to figure out if 
-        * guest kernel or guest userland is affected and should kill either
-        * itself or the affected process.
-        */
-
-       /* MCG_STATUS_EIPV:
-        * Evaluation of EIPV is the job of the guest.
-        */
-
-       if (xen_impacted) {
-               /* Now we are going to panic anyway. Allow interrupts, so that
-                * printk on serial console can work. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* Uh, that means, machine check exception
-                * inside Xen occured. */
-               printk("Machine check exception occured in Xen.\n");
-
-               /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
-                * to the error then it makes sense to print a stack trace.
-                * That can be useful for more detailed error analysis and/or
-                * error case studies to figure out, if we can clear
-                * xen_impacted and kill a DomU instead
-                * (i.e. if a guest only control structure is affected, but then
-                * we must ensure the bad pages are not re-used again).
-                */
-               if (status & MCG_STATUS_EIPV) {
-                       printk("MCE: Instruction Pointer is related to the error. "
-                               "Therefore, print the execution state.\n");
-                       show_execution_state(regs);
-               }
-               x86_mcinfo_dump(mc_data);
-               mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
-       }
-
-       /* If Dom0 registered a machine check handler, which is only possible
-        * with a PV MCA driver, then ... */
-       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
-               dom_state = DOM0_TRAP;
-
-               /* ... deliver machine check trap to Dom0. */
-               send_guest_trap(dom0, 0, TRAP_machine_check);
-
-               /* Xen may tell Dom0 now to notify the DomU.
-                * But this will happen through a hypercall. */
-       } else
-               /* Dom0 did not register a machine check handler, but if DomU
-                * did so, then... */
-                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
-                       dom_state = DOMU_TRAP;
-
-                       /* ... deliver machine check trap to DomU */
-                       send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
-       } else {
-               /* hmm... noone feels responsible to handle the error.
-                * So, do a quick check if a DomU is impacted or not.
-                */
-               if (curdom == dom0) {
-                       /* Dom0 is impacted. Since noone can't handle
-                        * this error, panic! */
-                       x86_mcinfo_dump(mc_data);
-                       mc_panic("MCE occured in Dom0, which it can't handle\n");
-
-                       /* UNREACHED */
-               } else {
-                       dom_state = DOMU_KILLED;
-
-                       /* Enable interrupts. This basically results in
-                        * calling sti on the *physical* cpu. But after
-                        * domain_crash() the vcpu pointer is invalid.
-                        * Therefore, we must unlock the irqs before killing
-                        * it. */
-                       vcpu_schedule_unlock_irq(vcpu);
-
-                       /* DomU is impacted. Kill it and continue. */
-                       domain_crash(curdom);
-               }
-       }
-
-
-       switch (dom_state) {
-       case DOM0_TRAP:
-       case DOMU_TRAP:
-               /* Enable interrupts. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* guest softirqs and event callbacks are scheduled
-                * immediately after this handler exits. */
-               break;
-       case DOMU_KILLED:
-               /* Nothing to do here. */
-               break;
-       default:
-               BUG();
-       }
+       mcheck_cmn_handler(regs, error_code, mca_allbanks);
 }
 
-
 /* AMD K8 machine check */
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c)
 {
        uint64_t value;
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
+       /* Check for PPro style MCA; our caller has confirmed MCE support. */
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k8_machine_check);
        cpu_nr = smp_processor_id();
-       wmb();
 
        rdmsrl(MSR_IA32_MCG_CAP, value);
        if (value & MCG_CTL_P)  /* Control register present ? */
@@ -304,10 +97,8 @@ void amd_k8_mcheck_init(struct cpuinfo_x
        for (i = 0; i < nr_mce_banks; i++) {
                switch (i) {
                case 4: /* Northbridge */
-                       /* Enable error reporting of all errors,
-                        * enable error checking and
-                        * disable sync flooding */
-                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
                        wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
                        break;
 
@@ -321,4 +112,6 @@ void amd_k8_mcheck_init(struct cpuinfo_x
 
        set_in_cr4(X86_CR4_MCE);
        printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+
+       return 1;
 }
diff --git a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
@@ -58,22 +58,23 @@
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/event.h>
-#include <asm/processor.h> 
+
+#include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(10000)
 #define MCE_MIN    MILLISECS(2000)
 #define MCE_MAX    MILLISECS(30000)
 
 static s_time_t period = MCE_PERIOD;
 static int hw_threshold = 0;
 static int adjust = 0;
+static int variable_period = 1;
 
 /* The polling service routine:
  * Collects information of correctable errors and notifies
@@ -81,99 +82,46 @@ static int adjust = 0;
  */
 void mce_amd_checkregs(void *info)
 {
-       struct vcpu *vcpu = current;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv;
-       unsigned int i;
+       mctelem_cookie_t mctc;
+       struct mca_summary bs;
        unsigned int event_enabled;
-       unsigned int cpu_nr;
-       int error_found;
-
-       /* We don't need a slot yet. Only allocate one on error. */
-       mc_data = NULL;
-
-       cpu_nr = smp_processor_id();
-       BUG_ON(cpu_nr != vcpu->processor);
+
+       mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
+
        event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
-       error_found = 0;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-
-       x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
-           &mc_global.mc_coreid, &mc_global.mc_core_threadid,
-           &mc_global.mc_apicid, NULL, NULL, NULL);
-
-       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               if (mc_data == NULL) {
-                       /* Now we need a slot to fill in error telemetry. */
-                       mc_data = x86_mcinfo_getptr();
-                       BUG_ON(mc_data == NULL);
-                       x86_mcinfo_clear(mc_data);
-                       x86_mcinfo_add(mc_data, &mc_global);
-               }
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               /* Increase polling frequency */
-               error_found = 1;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-               x86_mcinfo_add(mc_data, &mc_info);
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
-               wmb();
-       }
-
-       if (error_found > 0) {
-               /* If Dom0 enabled the VIRQ_MCA event, then ... */
-               if (event_enabled)
-                       /* ... notify it. */
+
+       if (bs.errcnt && mctc != NULL) {
+               static uint64_t dumpcount = 0;
+
+               /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+                * Otherwise, if dom0 has had plenty of time to register
+                * the virq handler but still hasn't then dump telemetry
+                * to the Xen console.  The call count may be incremented
+                * on multiple cpus at once and is indicative only - just
+                * a simple-minded attempt to avoid spamming the console
+                * for corrected errors in early startup. */
+
+               if (event_enabled) {
+                       mctelem_commit(mctc);
                        send_guest_global_virq(dom0, VIRQ_MCA);
-               else
-                       /* ... or dump it */
-                       x86_mcinfo_dump(mc_data);
-       }
-
-       adjust += error_found;
+               } else if (++dumpcount >= 10) {
+                       x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+                       mctelem_dismiss(mctc);
+               } else {
+                       mctelem_dismiss(mctc);
+               }
+               
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
+       }
+
+       /* adjust is global and all cpus may attempt to increment it without
+        * synchronisation, so they race and the final adjust count
+        * (number of cpus seeing any error) is approximate.  We can
+        * guarantee that if any cpu observes an error that the
+        * adjust count is at least 1. */
+       if (bs.errcnt)
+               adjust++;
 }
 
 /* polling service routine invoker:
@@ -188,7 +136,7 @@ static void mce_amd_work_fn(void *data)
        on_each_cpu(mce_amd_checkregs, data, 1, 1);
 
        if (adjust > 0) {
-               if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+               if (!guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
                        /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
                        printk("MCE: polling routine found correctable error. "
                                " Use mcelog to parse above error output.\n");
@@ -229,19 +177,19 @@ static void mce_amd_work_fn(void *data)
                }
        }
 
-       if (adjust > 0) {
+       if (variable_period && adjust > 0) {
                /* Increase polling frequency */
                adjust++; /* adjust == 1 must have an effect */
                period /= adjust;
-       } else {
+       } else if (variable_period) {
                /* Decrease polling frequency */
                period *= 2;
        }
-       if (period > MCE_MAX) {
+       if (variable_period && period > MCE_MAX) {
                /* limit: Poll at least every 30s */
                period = MCE_MAX;
        }
-       if (period < MCE_MIN) {
+       if (variable_period && period < MCE_MIN) {
                /* limit: Poll every 2s.
                 * When this is reached an uncorrectable error
                 * is expected to happen, if Dom0 does nothing.
@@ -262,7 +210,7 @@ void amd_nonfatal_mcheck_init(struct cpu
 
        /* The threshold bitfields in MSR_IA32_MC4_MISC has
         * been introduced along with the SVME feature bit. */
-       if (cpu_has(c, X86_FEATURE_SVME)) {
+       if (variable_period && cpu_has(c, X86_FEATURE_SVME)) {
                uint64_t value;
 
                /* hw threshold registers present */
diff --git a/xen/arch/x86/cpu/mcheck/k7.c b/xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c
+++ b/xen/arch/x86/cpu/mcheck/k7.c
@@ -68,13 +68,16 @@ static fastcall void k7_machine_check(st
 
 
 /* AMD K7 machine check */
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        int i;
 
-       machine_check_vector = k7_machine_check;
-       wmb();
+       /* Check for PPro style MCA; our caller has confirmed MCE support. */
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k7_machine_check);
 
        rdmsr (MSR_IA32_MCG_CAP, l, h);
        if (l & (1<<8)) /* Control register present ? */
@@ -92,4 +95,6 @@ void amd_k7_mcheck_init(struct cpuinfo_x
        set_in_cr4 (X86_CR4_MCE);
        printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
                smp_processor_id());
+
+       return 1;
 }
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -10,104 +10,490 @@
 #include <xen/smp.h>
 #include <xen/errno.h>
 #include <xen/console.h>
-
-#include <asm/processor.h> 
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
+
+#include <asm/processor.h>
 #include <asm/system.h>
+#include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 int mce_disabled = 0;
 unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
-/* XXX For now a fixed array is used. Later this should be changed
- * to a dynamic allocated array with the size calculated in relation
- * to physical cpus present in the machine.
- * The more physical cpus are available, the more entries you need.
- */
-#define MAX_MCINFO     20
-
-struct mc_machine_notify {
-       struct mc_info mc;
-       uint32_t fetch_idx;
-       uint32_t valid;
-};
-
-struct mc_machine {
-
-       /* Array structure used for collecting machine check error telemetry. */
-       struct mc_info mc[MAX_MCINFO];
-
-       /* We handle multiple machine check reports lockless by
-        * iterating through the array using the producer/consumer concept.
-        */
-       /* Producer array index to fill with machine check error data.
-        * Index must be increased atomically. */
-       uint32_t error_idx;
-
-       /* Consumer array index to fetch machine check error data from.
-        * Index must be increased atomically. */
-       uint32_t fetch_idx;
-
-       /* Integer array holding the indeces of the mc array that allows
-         * a Dom0 to notify a DomU to re-fetch the same machine check error
-         * data. The notification and refetch also uses its own 
-        * producer/consumer mechanism, because Dom0 may decide to not report
-        * every error to the impacted DomU.
-        */
-       struct mc_machine_notify notify[MAX_MCINFO];
-
-       /* Array index to get fetch_idx from.
-        * Index must be increased atomically. */
-       uint32_t notifyproducer_idx;
-       uint32_t notifyconsumer_idx;
-};
-
-/* Global variable with machine check information. */
-struct mc_machine mc_data;
+static void mcinfo_clear(struct mc_info *);
+
+#define        SEG_PL(segsel) ((segsel) & 0x3)
+
+#if 1  /* XXFM switch to 0 for putback */
+
+#define        x86_mcerr(str, err) _x86_mcerr(str, err)
+
+static int _x86_mcerr(const char *msg, int err)
+{
+       printk("x86_mcerr: %s, returning %d\n",
+           msg != NULL ? msg : "", err);
+       return err;
+}
+#else
+#define x86_mcerr(str,err)
+#endif
+
+cpu_banks_t mca_allbanks;
 
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
-{      
+{
        printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
                smp_processor_id());
 }
 
 
+static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
+
+void x86_mce_vector_register(x86_mce_vector_t hdlr)
+{
+       _machine_check_vector = hdlr;
+       wmb();
+}
+
 /* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+void machine_check_vector(struct cpu_user_regs *regs, long error_code)
+{
+       _machine_check_vector(regs, error_code);
+}
 
 /* Init machine check callback handler
  * It is used to collect additional information provided by newer
  * CPU families/models without the need to duplicate the whole handler.
  * This avoids having many handlers doing almost nearly the same and each
  * with its own tweaks ands bugs. */
-int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
-
-
-static void amd_mcheck_init(struct cpuinfo_x86 *ci)
-{
+static x86_mce_callback_t mc_callback_bank_extended = NULL;
+
+void x86_mce_callback_register(x86_mce_callback_t cbfunc)
+{
+       mc_callback_bank_extended = cbfunc;
+}
+
+/* Utility function to perform MCA bank telemetry readout and to push that
+ * telemetry towards an interested dom0 for logging and diagnosis.
+ * The caller - #MC handler or MCA poll function - must arrange that we
+ * do not migrate cpus. */
+
+/* XXFM Could add overflow counting? */
+mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
+    struct mca_summary *sp)
+{
+       struct vcpu *v = current;
+       struct domain *d;
+       uint64_t gstatus, status, addr, misc;
+       struct mcinfo_global mcg;       /* on stack */
+       struct mcinfo_common *mic;
+       struct mcinfo_global *mig;      /* on stack */
+       mctelem_cookie_t mctc = NULL;
+       uint32_t uc = 0, pcc = 0;
+       struct mc_info *mci = NULL;
+       mctelem_class_t which = MC_URGENT;      /* XXXgcc */
+       unsigned int cpu_nr;
+       int errcnt = 0;
+       int i;
+       enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
+
+       cpu_nr = smp_processor_id();
+       BUG_ON(cpu_nr != v->processor);
+
+       rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+
+       memset(&mcg, 0, sizeof (mcg));
+       mcg.common.type = MC_TYPE_GLOBAL;
+       mcg.common.size = sizeof (mcg);
+       if (v != NULL && ((d = v->domain) != NULL)) {
+               mcg.mc_domid = d->domain_id;
+               mcg.mc_vcpuid = v->vcpu_id;
+       } else {
+               mcg.mc_domid = -1;
+               mcg.mc_vcpuid = -1;
+       }
+       mcg.mc_gstatus = gstatus;       /* MCG_STATUS */
+
+       switch (who) {
+       case MCA_MCE_HANDLER:
+               mcg.mc_flags = MC_FLAG_MCE;
+               which = MC_URGENT;
+               break;
+
+       case MCA_POLLER:
+       case MCA_RESET:
+               mcg.mc_flags = MC_FLAG_POLLED;
+               which = MC_NONURGENT;
+               break;
+
+       case MCA_CMCI_HANDLER:
+               mcg.mc_flags = MC_FLAG_CMCI;
+               which = MC_NONURGENT;
+               break;
+
+       default:
+               BUG();
+       }
+
+       /* Retrieve detector information */
+       x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
+           &mcg.mc_coreid, &mcg.mc_core_threadid,
+           &mcg.mc_apicid, NULL, NULL, NULL);
+
+       for (i = 0; i < 32 && i < nr_mce_banks; i++) {
+               struct mcinfo_bank mcb;         /* on stack */
+
+               /* Skip bank if corresponding bit in bankmask is clear */
+               if (!test_bit(i, bankmask))
+                       continue;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+               if (!(status & MCi_STATUS_VAL))
+                       continue;       /* this bank has no valid telemetry */
+
+               /* If this is the first bank with valid MCA DATA, then
+                * try to reserve an entry from the urgent/nonurgent queue
+                * depending on whethere we are called from an exception or
+                * a poller;  this can fail (for example dom0 may not
+                * yet have consumed past telemetry). */
+               if (errcnt == 0) {
+                       if ((mctc = mctelem_reserve(which)) != NULL) {
+                               mci = mctelem_dataptr(mctc);
+                               mcinfo_clear(mci);
+                       }
+               }
+
+               memset(&mcb, 0, sizeof (mcb));
+               mcb.common.type = MC_TYPE_BANK;
+               mcb.common.size = sizeof (mcb);
+               mcb.mc_bank = i;
+               mcb.mc_status = status;
+
+               /* form a mask of which banks have logged uncorrected errors */
+               if ((status & MCi_STATUS_UC) != 0)
+                       uc |= (1 << i);
+
+               /* likewise for those with processor context corrupt */
+               if ((status & MCi_STATUS_PCC) != 0)
+                       pcc |= (1 << i);
+
+               addr = misc = 0;
+
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+                       d = maddr_get_owner(addr);
+                       if (d != NULL && (who == MCA_POLLER ||
+                           who == MCA_CMCI_HANDLER))
+                               mcb.mc_domid = d->domain_id;
+               }
+
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
+
+               mcb.mc_addr = addr;
+               mcb.mc_misc = misc;
+
+               if (who == MCA_CMCI_HANDLER) {
+                       rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+                       rdtscll(mcb.mc_tsc);
+               }
+
+               /* Increment the error count;  if this is the first bank
+                * with a valid error then add the global info to the mcinfo. */
+               if (errcnt++ == 0 && mci != NULL)
+                       x86_mcinfo_add(mci, &mcg);
+
+               /* Add the bank data */
+               if (mci != NULL)
+                       x86_mcinfo_add(mci, &mcb);
+
+               if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
+                       cbret = mc_callback_bank_extended(mci, i, status);
+               }
+
+               /* Clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               wmb();
+       }
+
+       if (mci != NULL && errcnt > 0) {
+               x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
+               mig = (struct mcinfo_global *)mic;
+               if (pcc)
+                       mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
+               else if (uc)
+                       mcg.mc_flags |= MC_FLAG_RECOVERABLE;
+               else
+                       mcg.mc_flags |= MC_FLAG_CORRECTABLE;
+       }
+
+
+       if (sp) {
+               sp->errcnt = errcnt;
+               sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
+               sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
+               sp->uc = uc;
+               sp->pcc = pcc;
+       }
+
+       return mci != NULL ? mctc : NULL;       /* may be NULL */
+}
+
+#define DOM_NORMAL     0
+#define DOM0_TRAP      1
+#define DOMU_TRAP      2
+#define DOMU_KILLED    4
+
+/* Shared #MC handler. */
+void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
+    cpu_banks_t bankmask)
+{
+       int xen_state_lost, dom0_state_lost, domU_state_lost;
+       struct vcpu *v = current;
+       struct domain *curdom = v->domain;
+       domid_t domid = curdom->domain_id;
+       int ctx_xen, ctx_dom0, ctx_domU;
+       uint32_t dom_state = DOM_NORMAL;
+       mctelem_cookie_t mctc = NULL;
+       struct mca_summary bs;
+       struct mc_info *mci = NULL;
+       int irqlocked = 0;
+       uint64_t gstatus;
+       int ripv;
+
+       /* This handler runs as interrupt gate. So IPIs from the
+        * polling service routine are defered until we're finished.
+        */
+
+       /* Disable interrupts for the _vcpu_. It may not re-scheduled to
+        * another physical CPU. */
+       vcpu_schedule_lock_irq(v);
+       irqlocked = 1;
+
+       /* Read global status;  if it does not indicate machine check
+        * in progress then bail as long as we have a valid ip to return to. */
+       rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
+       if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
+               add_taint(TAINT_MACHINE_CHECK); /* questionable */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               goto cmn_handler_done;
+       }
+
+       /* Go and grab error telemetry.  We must choose whether to commit
+        * for logging or dismiss the cookie that is returned, and must not
+        * reference the cookie after that action.
+        */
+       mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs);
+       if (mctc != NULL)
+               mci = (struct mc_info *)mctelem_dataptr(mctc);
+
+       /* Clear MCIP or another #MC will enter shutdown state */
+       gstatus &= ~MCG_STATUS_MCIP;
+       wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       wmb();
+
+       /* If no valid errors and our stack is intact, we're done */
+       if (ripv && bs.errcnt == 0) {
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               goto cmn_handler_done;
+       }
+
+       if (bs.uc || bs.pcc)
+               add_taint(TAINT_MACHINE_CHECK);
+
+       /* Machine check exceptions will usually be for UC and/or PCC errors,
+        * but it is possible to configure machine check for some classes
+        * of corrected error.
+        *
+        * UC errors could compromise any domain or the hypervisor
+        * itself - for example a cache writeback of modified data that
+        * turned out to be bad could be for data belonging to anyone, not
+        * just the current domain.  In the absence of known data poisoning
+        * to prevent consumption of such bad data in the system we regard
+        * all UC errors as terminal.  It may be possible to attempt some
+        * heuristics based on the address affected, which guests have
+        * mappings to that mfn etc.
+        *
+        * PCC errors apply to the current context.
+        *
+        * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
+        * and not PCC is terminal - the return instruction pointer
+        * pushed onto the stack is bogus.  If the interrupt context is
+        * the hypervisor or dom0 the game is over, otherwise we can
+        * limit the impact to a single domU but only if we trampoline
+        * somewhere safely - we can't return and unwind the stack.
+        * Since there is no trampoline in place we will treat !RIPV
+        * as terminal for any context.
+        */
+       ctx_xen = SEG_PL(regs->cs) == 0;
+       ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
+       ctx_domU = !ctx_xen && !ctx_dom0;
+
+       xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
+           !ripv;
+       dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
+       domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
+
+       if (xen_state_lost) {
+               /* Now we are going to panic anyway. Allow interrupts, so that
+                * printk on serial console can work. */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+
+               printk("Terminal machine check exception occured in "
+                   "hypervisor context.\n");
+
+               /* If MCG_STATUS_EIPV indicates, the IP on the stack is related
+                * to the error then it makes sense to print a stack trace.
+                * That can be useful for more detailed error analysis and/or
+                * error case studies to figure out, if we can clear
+                * xen_impacted and kill a DomU instead
+                * (i.e. if a guest only control structure is affected, but then
+                * we must ensure the bad pages are not re-used again).
+                */
+               if (bs.eipv & MCG_STATUS_EIPV) {
+                       printk("MCE: Instruction Pointer is related to the "
+                           "error, therefore print the execution state.\n");
+                       show_execution_state(regs);
+               }
+
+               /* Commit the telemetry so that panic flow can find it. */
+               if (mctc != NULL) {
+                       x86_mcinfo_dump(mci);
+                       mctelem_commit(mctc);
+               }
+               mc_panic("Hypervisor state lost due to machine check "
+                   "exception.\n");
+               /*NOTREACHED*/
+       }
+
+       /*
+        * Xen hypervisor state is intact.  If dom0 state is lost then
+        * give it a chance to decide what to do if it has registered
+        * a handler for this event, otherwise panic.
+        *
+        * XXFM Could add some Solaris dom0 contract kill here?
+        */
+       if (dom0_state_lost) {
+               if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+                       dom_state = DOM0_TRAP;
+                       send_guest_trap(dom0, 0, TRAP_machine_check);
+                       /* XXFM case of return with !ripv ??? */
+               } else {
+                       /* Commit telemetry for panic flow. */
+                       if (mctc != NULL) {
+                               x86_mcinfo_dump(mci);
+                               mctelem_commit(mctc);
+                       }
+                       mc_panic("Dom0 state lost due to machine check "
+                           "exception\n");
+                       /*NOTREACHED*/
+               }
+       }
+
+       /*
+        * If a domU has lost state then send it a trap if it has registered
+        * a handler, otherwise crash the domain.
+        * XXFM Revisit this functionality.
+        */
+       if (domU_state_lost) {
+               if (guest_has_trap_callback(v->domain, v->vcpu_id,
+                   TRAP_machine_check)) {
+                       dom_state = DOMU_TRAP;
+                       send_guest_trap(curdom, v->vcpu_id,
+                           TRAP_machine_check);
+               } else {
+                       dom_state = DOMU_KILLED;
+                       /* Enable interrupts. This basically results in
+                        * calling sti on the *physical* cpu. But after
+                        * domain_crash() the vcpu pointer is invalid.
+                        * Therefore, we must unlock the irqs before killing
+                        * it. */
+                       vcpu_schedule_unlock_irq(v);
+                       irqlocked = 0;
+
+                       /* DomU is impacted. Kill it and continue. */
+                       domain_crash(curdom);
+               }
+       }
+
+       switch (dom_state) {
+       case DOM0_TRAP:
+       case DOMU_TRAP:
+               /* Enable interrupts. */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+
+               /* guest softirqs and event callbacks are scheduled
+                * immediately after this handler exits. */
+               break;
+       case DOMU_KILLED:
+               /* Nothing to do here. */
+               break;
+
+       case DOM_NORMAL:
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               break;
+       }
+
+cmn_handler_done:
+       BUG_ON(irqlocked);
+       BUG_ON(!ripv);
+
+       if (bs.errcnt) {
+               /* Not panicing, so forward telemetry to dom0 now if it
+                * is interested. */
+               if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+                       if (mctc != NULL)
+                               mctelem_commit(mctc);
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               } else {
+                       x86_mcinfo_dump(mci);
+                       if (mctc != NULL)
+                               mctelem_dismiss(mctc);
+               }
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
+       }
+}
+
+static int amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+       int rc = 0;
 
        switch (ci->x86) {
        case 6:
-               amd_k7_mcheck_init(ci);
+               rc = amd_k7_mcheck_init(ci);
                break;
 
        case 0xf:
-               amd_k8_mcheck_init(ci);
+               rc = amd_k8_mcheck_init(ci);
                break;
 
        case 0x10:
-               amd_f10_mcheck_init(ci);
+               rc = amd_f10_mcheck_init(ci);
                break;
 
        default:
                /* Assume that machine check support is available.
                 * The minimum provided support is at least the K8. */
-               amd_k8_mcheck_init(ci);
-       }
+               rc = amd_k8_mcheck_init(ci);
+       }
+
+       return rc;
 }
 
 /*check the existence of Machine Check*/
@@ -116,50 +502,81 @@ int mce_available(struct cpuinfo_x86 *c)
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+/*
+ * Check if bank 0 is usable for MCE. It isn't for AMD K7,
+ * and Intel P6 family before model 0x1a.
+ */
+int mce_firstbank(struct cpuinfo_x86 *c)
+{
+       if (c->x86 == 6) {
+               if (c->x86_vendor == X86_VENDOR_AMD)
+                       return 1;
+
+               if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
+                       return 1;
+       }
+
+       return 0;
+}
+
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
+       int inited = 0, i;
+
        if (mce_disabled == 1) {
                printk(XENLOG_INFO "MCE support disabled by bootparam\n");
                return;
        }
 
+       for (i = 0; i < MAX_NR_BANKS; i++)
+               set_bit(i,mca_allbanks);
+
+       /* Enforce at least MCE support in CPUID information.  Individual
+        * families may also need to enforce a check for MCA support. */
        if (!cpu_has(c, X86_FEATURE_MCE)) {
                printk(XENLOG_INFO "CPU%i: No machine check support available\n",
                        smp_processor_id());
                return;
        }
 
-       memset(&mc_data, 0, sizeof(struct mc_machine));
+       mctelem_init(sizeof (struct mc_info));
 
        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
-               amd_mcheck_init(c);
+               inited = amd_mcheck_init(c);
                break;
 
        case X86_VENDOR_INTEL:
+               switch (c->x86) {
+               case 5:
 #ifndef CONFIG_X86_64
-               if (c->x86==5)
-                       intel_p5_mcheck_init(c);
+                       inited = intel_p5_mcheck_init(c);
 #endif
-               /*If it is P6 or P4 family, including CORE 2 DUO series*/
-               if (c->x86 == 6 || c->x86==15)
-               {
-                       printk(KERN_DEBUG "MCE: Intel newly family MC Init\n");
-                       intel_mcheck_init(c);
+                       break;
+
+               case 6:
+               case 15:
+                       inited = intel_mcheck_init(c);
+                       break;
                }
                break;
 
 #ifndef CONFIG_X86_64
        case X86_VENDOR_CENTAUR:
-               if (c->x86==5)
-                       winchip_mcheck_init(c);
+               if (c->x86==5) {
+                       inited = winchip_mcheck_init(c);
+               }
                break;
 #endif
 
        default:
                break;
        }
+
+       if (!inited)
+               printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
+                   smp_processor_id());
 }
 
 
@@ -176,190 +593,11 @@ custom_param("nomce", mcheck_disable);
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
 
-
-#include <xen/guest_access.h>
-#include <asm/traps.h>
-
-struct mc_info *x86_mcinfo_getptr(void)
-{
-       struct mc_info *mi;
-       uint32_t entry, next;
-
-       for (;;) {
-               entry = mc_data.error_idx;
-               smp_rmb();
-               next = entry + 1;
-               if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
-                       break;
-       }
-
-       mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
-       BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
-
-       return mi;
-}
-
-static int x86_mcinfo_matches_guest(const struct mc_info *mi,
-                       const struct domain *d, const struct vcpu *v)
-{
-       struct mcinfo_common *mic;
-       struct mcinfo_global *mig;
-
-       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
-       mig = (struct mcinfo_global *)mic;
-       if (mig == NULL)
-               return 0;
-
-       if (d->domain_id != mig->mc_domid)
-               return 0;
-
-       if (v->vcpu_id != mig->mc_vcpuid)
-               return 0;
-
-       return 1;
-}
-
-
-#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
-
-static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
-                               const struct domain *d, const struct vcpu *v)
-{
-       struct mc_info *mi;
-
-       /* This function is called from the fetch hypercall with
-        * the mc_lock spinlock held. Thus, no need for locking here.
-        */
-       mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
-       if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
-               /* Bogus domU command detected. */
-               *fetch_idx = 0;
-               return NULL;
-       }
-
-       *fetch_idx = mc_data.fetch_idx;
-       mc_data.fetch_idx++;
-       BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
-
-       return mi;
-}
-
-
-static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
-{
-       struct mc_machine_notify *mn;
-       struct mcinfo_common *mic = NULL;
-       struct mcinfo_global *mig;
-       struct domain *d;
-       int i;
-
-       /* This function is called from the notifier hypercall with
-        * the mc_notify_lock spinlock held. Thus, no need for locking here.
-        */
-
-       /* First invalidate entries for guests that disappeared after
-        * notification (e.g. shutdown/crash). This step prevents the
-        * notification array from filling up with stalling/leaking entries.
-        */
-       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
-               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
-               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
-               BUG_ON(mic == NULL);
-               mig = (struct mcinfo_global *)mic;
-               d = get_domain_by_id(mig->mc_domid);
-               if (d == NULL) {
-                       /* Domain does not exist. */
-                       mn->valid = 0;
-               }
-               if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
-                       mc_data.notifyconsumer_idx++;
-       }
-
-       /* Now put in the error telemetry. Since all error data fetchable
-        * by domUs are uncorrectable errors, they are very important.
-        * So we dump them before overriding them. When a guest takes that long,
-        * then we can assume something bad already happened (crash, hang, etc.)
-        */
-       mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
-
-       if (mn->valid) {
-               struct mcinfo_common *mic = NULL;
-               struct mcinfo_global *mig;
-
-               /* To not loose the information, we dump it. */
-               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
-               BUG_ON(mic == NULL);
-               mig = (struct mcinfo_global *)mic;
-               printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
-                       "fetch machine check error telemetry. But Domain ID "
-                       "did not do that in time.\n",
-                       mig->mc_domid);
-               x86_mcinfo_dump(&mn->mc);
-       }
-
-       memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
-               sizeof(struct mc_info));
-       mn->fetch_idx = mc_notifydomain->fetch_idx;
-       mn->valid = 1;
-
-       mc_data.notifyproducer_idx++;
-
-       /* By design there can never be more notifies than machine check errors.
-        * If that ever happens, then we hit a bug. */
-       BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
-       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
-}
-
-static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
-                               const struct domain *d, const struct vcpu *v)
-{
-       struct mc_machine_notify *mn = NULL;
-       uint32_t i;
-       int found;
-
-       /* This function is called from the fetch hypercall with
-        * the mc_notify_lock spinlock held. Thus, no need for locking here.
-        */
-
-       /* The notifier data is filled in the order guests get notified, but
-        * guests may fetch them in a different order. That's why we need
-        * the game with valid/invalid entries. */
-       found = 0;
-       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
-               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
-               if (!mn->valid) {
-                       if (i == mc_data.notifyconsumer_idx)
-                               mc_data.notifyconsumer_idx++;
-                       continue;
-               }
-               if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
-                       found = 1;
-                       break;
-               }
-       }
-
-       if (!found) {
-               /* This domain has never been notified. This must be
-                * a bogus domU command. */
-               *fetch_idx = 0;
-               return NULL;
-       }
-
-       BUG_ON(mn == NULL);
-       *fetch_idx = mn->fetch_idx;
-       mn->valid = 0;
-
-       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
-       return &mn->mc;
-}
-
-
-void x86_mcinfo_clear(struct mc_info *mi)
+static void mcinfo_clear(struct mc_info *mi)
 {
        memset(mi, 0, sizeof(struct mc_info));
        x86_mcinfo_nentries(mi) = 0;
 }
-
 
 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
 {
@@ -380,7 +618,7 @@ int x86_mcinfo_add(struct mc_info *mi, v
        end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
 
        if (end1 < end2)
-               return -ENOSPC; /* No space. Can't add entry. */
+               return x86_mcerr("mcinfo_add: no more sparc", -ENOSPC);
 
        /* there's enough space. add entry. */
        memcpy(mic_index, mic, mic->size);
@@ -388,7 +626,6 @@ int x86_mcinfo_add(struct mc_info *mi, v
 
        return 0;
 }
-
 
 /* Dump machine check information in a format,
  * mcelog can parse. This is used only when
@@ -404,7 +641,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
        if (mic == NULL)
                return;
        mc_global = (struct mcinfo_global *)mic;
-       if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+       if (mc_global->mc_flags & MC_FLAG_MCE) {
                printk(XENLOG_WARNING
                        "CPU%d: Machine Check Exception: %16"PRIx64"\n",
                        mc_global->mc_coreid, mc_global->mc_gstatus);
@@ -424,7 +661,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
                        goto next;
 
                mc_bank = (struct mcinfo_bank *)mic;
-       
+
                printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
                        mc_bank->mc_bank,
                        mc_bank->mc_status);
@@ -440,8 +677,6 @@ next:
                        break;
        } while (1);
 }
-
-
 
 static void do_mc_get_cpu_info(void *v)
 {
@@ -533,183 +768,141 @@ void x86_mc_get_cpu_info(unsigned cpu, u
        }
 }
 
+#if BITS_PER_LONG == 64
+
+#define        ID2COOKIE(id)   ((mctelem_cookie_t)(id))
+#define        COOKIE2ID(c) ((uint64_t)(c))
+
+#elif BITS_PER_LONG == 32
+
+#define        ID2COOKIE(id)   ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
+#define        COOKIE2ID(c)    ((uint64_t)(uint32_t)(c))
+
+#elif defined(BITS_PER_LONG)
+#error BITS_PER_LONG has unexpected value
+#else
+#error BITS_PER_LONG definition absent
+#endif
+
 /* Machine Check Architecture Hypercall */
 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
 {
        long ret = 0;
        struct xen_mc curop, *op = &curop;
        struct vcpu *v = current;
-       struct domain *domU;
        struct xen_mc_fetch *mc_fetch;
-       struct xen_mc_notifydomain *mc_notifydomain;
        struct xen_mc_physcpuinfo *mc_physcpuinfo;
-       struct mc_info *mi;
-       uint32_t flags;
-       uint32_t fetch_idx;
-        uint16_t vcpuid;
-       /* Use a different lock for the notify hypercall in order to allow
-        * a DomU to fetch mc data while Dom0 notifies another DomU. */
-       static DEFINE_SPINLOCK(mc_lock);
-       static DEFINE_SPINLOCK(mc_notify_lock);
+       uint32_t flags, cmdflags;
        int nlcpu;
        xen_mc_logical_cpu_t *log_cpus = NULL;
+       mctelem_cookie_t mctc;
+       mctelem_class_t which;
 
        if ( copy_from_guest(op, u_xen_mc, 1) )
-               return -EFAULT;
+               return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
 
        if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
-               return -EACCES;
-
-       switch ( op->cmd ) {
+               return x86_mcerr("do_mca: interface version mismatch", -EACCES);
+
+       switch (op->cmd) {
        case XEN_MC_fetch:
-               /* This hypercall is for any domain */
                mc_fetch = &op->u.mc_fetch;
-
-               switch (mc_fetch->flags) {
-               case XEN_MC_CORRECTABLE:
-                       /* But polling mode is Dom0 only, because
-                        * correctable errors are reported to Dom0 only */
-                       if ( !IS_PRIV(v->domain) )
-                               return -EPERM;
+               cmdflags = mc_fetch->flags;
+
+               /* This hypercall is for Dom0 only */
+               if (!IS_PRIV(v->domain) )
+                       return x86_mcerr(NULL, -EPERM);
+
+               switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
+               case XEN_MC_NONURGENT:
+                       which = MC_NONURGENT;
                        break;
 
-               case XEN_MC_TRAP:
+               case XEN_MC_URGENT:
+                       which = MC_URGENT;
                        break;
+
                default:
-                       return -EFAULT;
+                       return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
                }
 
                flags = XEN_MC_OK;
-               spin_lock(&mc_lock);
-
-               if ( IS_PRIV(v->domain) ) {
-                       /* this must be Dom0. So a notify hypercall
-                        * can't have happened before. */
-                       mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+
+               if (cmdflags & XEN_MC_ACK) {
+                       mctelem_cookie_t cookie = ID2COOKIE(mc_fetch->fetch_id);
+                       mctelem_ack(which, cookie);
                } else {
-                       /* Hypercall comes from an unprivileged domain */
-                       domU = v->domain;
-                       if (guest_has_trap_callback(dom0, 0, 
TRAP_machine_check)) {
-                               /* Dom0 must have notified this DomU before
-                                * via the notify hypercall. */
-                               mi = x86_mcinfo_getnotifiedptr(&fetch_idx, 
domU, v);
+                       if (guest_handle_is_null(mc_fetch->data))
+                               return x86_mcerr("do_mca fetch: guest buffer "
+                                   "invalid", -EINVAL);
+
+                       if ((mctc = mctelem_consume_oldest_begin(which))) {
+                               struct mc_info *mcip = mctelem_dataptr(mctc);
+                               if (copy_to_guest(mc_fetch->data, mcip, 1)) {
+                                       ret = -EFAULT;
+                                       flags |= XEN_MC_FETCHFAILED;
+                                       mc_fetch->fetch_id = 0;
+                               } else {
+                                       mc_fetch->fetch_id = COOKIE2ID(mctc);
+                               }
+                               mctelem_consume_oldest_end(mctc);
                        } else {
-                               /* Xen notified the DomU. */
-                               mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, 
v);
+                               /* There is no data */
+                               flags |= XEN_MC_NODATA;
+                               mc_fetch->fetch_id = 0;
                        }
-               }
-
-               if (mi) {
-                       memcpy(&mc_fetch->mc_info, mi,
-                               sizeof(struct mc_info));
-               } else {
-                       /* There is no data for a bogus DomU command. */
-                       flags |= XEN_MC_NODATA;
-                       memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
-               }
-
-               mc_fetch->flags = flags;
-               mc_fetch->fetch_idx = fetch_idx;
-
-               if ( copy_to_guest(u_xen_mc, op, 1) )
-                       ret = -EFAULT;
-
-               spin_unlock(&mc_lock);
+
+                       mc_fetch->flags = flags;
+                       if (copy_to_guest(u_xen_mc, op, 1) != 0)
+                               ret = -EFAULT;
+               }
+
                break;
 
        case XEN_MC_notifydomain:
-               /* This hypercall is for Dom0 only */
+               return x86_mcerr("do_mca notify unsupported", -EINVAL);
+
+       case XEN_MC_physcpuinfo:
                if ( !IS_PRIV(v->domain) )
-                       return -EPERM;
-
-               spin_lock(&mc_notify_lock);
-
-               mc_notifydomain = &op->u.mc_notifydomain;
-               domU = get_domain_by_id(mc_notifydomain->mc_domid);
-               vcpuid = mc_notifydomain->mc_vcpuid;
-
-               if ((domU == NULL) || (domU == dom0)) {
-                       /* It's not possible to notify a non-existent domain
-                        * or the dom0. */
-                       spin_unlock(&mc_notify_lock);
-                       return -EACCES;
-               }
-
-               if (vcpuid >= MAX_VIRT_CPUS) {
-                       /* It's not possible to notify a vcpu, Xen can't
-                        * assign to a domain. */
-                       spin_unlock(&mc_notify_lock);
-                       return -EACCES;
-               }
-
-               mc_notifydomain->flags = XEN_MC_OK;
-
-               mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
-               if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
-                       /* The error telemetry is not for the guest, Dom0
-                        * wants to notify. */
-                       mc_notifydomain->flags |= XEN_MC_NOMATCH;
-               } else if ( guest_has_trap_callback(domU, vcpuid,
-                                               TRAP_machine_check) )
-               {
-                       /* Send notification */
-                       if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
-                               mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
-               } else
-                       mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
-
-#ifdef DEBUG
-               /* sanity check - these two flags are mutually exclusive */
-               if ((flags & XEN_MC_CANNOTHANDLE) && (flags & 
XEN_MC_NOTDELIVERED))
-                       BUG();
-#endif
-
-               if ( copy_to_guest(u_xen_mc, op, 1) )
-                       ret = -EFAULT;
-
-               if (ret == 0) {
-                       x86_mcinfo_marknotified(mc_notifydomain);
-               }
-
-               spin_unlock(&mc_notify_lock);
-               break;
-
-       case XEN_MC_physcpuinfo:
-              if ( !IS_PRIV(v->domain) )
-                      return -EPERM;
- 
-              mc_physcpuinfo = &op->u.mc_physcpuinfo;
-              nlcpu = num_online_cpus();
- 
-              if (!guest_handle_is_null(mc_physcpuinfo->info)) {
-                      if (mc_physcpuinfo->ncpus <= 0)
-                              return -EINVAL;
-                      nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
-                      log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
-                      if (log_cpus == NULL)
-                              return -ENOMEM;
- 
-                      if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
-                          1, 1) != 0) {
-                              xfree(log_cpus);
-                              return -EIO;
-                      }
-              }
- 
-              mc_physcpuinfo->ncpus = nlcpu;
- 
-              if (copy_to_guest(u_xen_mc, op, 1)) {
-                      if (log_cpus != NULL)
-                              xfree(log_cpus);
-                      return -EFAULT;
-              }
- 
-              if (!guest_handle_is_null(mc_physcpuinfo->info)) {
-                      if (copy_to_guest(mc_physcpuinfo->info,
-                          log_cpus, nlcpu))
-                              ret = -EFAULT;
-                      xfree(log_cpus);
-              }
+                       return x86_mcerr("do_mca cpuinfo", -EPERM);
+
+               mc_physcpuinfo = &op->u.mc_physcpuinfo;
+               nlcpu = num_online_cpus();
+
+               if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+                       if (mc_physcpuinfo->ncpus <= 0)
+                               return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
+                                   -EINVAL);
+                       nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
+                       log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
+                       if (log_cpus == NULL)
+                               return x86_mcerr("do_mca cpuinfo", -ENOMEM);
+
+                       if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
+                           1, 1) != 0) {
+                               xfree(log_cpus);
+                               return x86_mcerr("do_mca cpuinfo", -EIO);
+                       }
+               }
+
+               mc_physcpuinfo->ncpus = nlcpu;
+
+               if (copy_to_guest(u_xen_mc, op, 1)) {
+                       if (log_cpus != NULL)
+                               xfree(log_cpus);
+                       return x86_mcerr("do_mca cpuinfo", -EFAULT);
+               }
+
+               if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+                       if (copy_to_guest(mc_physcpuinfo->info,
+                           log_cpus, nlcpu))
+                               ret = -EFAULT;
+                       xfree(log_cpus);
+               }
+               break;
+
+       default:
+               return x86_mcerr("do_mca: bad command", -EINVAL);
        }
 
        return ret;
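
For reference, here is a minimal sketch of how a Dom0-side consumer might drive the
reworked fetch/ack interface.  do_xen_mca() is a hypothetical hypercall wrapper
(whatever Dom0 kernel or tools plumbing is available) and error handling is elided;
the flag values, the guest data handle and the fetch_id round-trip are the ones
defined by this patch.

/* Illustrative sketch only: fetch one urgent telemetry record and ack it. */
static void dom0_fetch_and_ack(void)
{
	struct xen_mc mc;
	struct mc_info mi;

	memset(&mc, 0, sizeof(mc));
	mc.cmd = XEN_MC_fetch;
	mc.interface_version = XEN_MCA_INTERFACE_VERSION;
	mc.u.mc_fetch.flags = XEN_MC_URGENT;
	set_xen_guest_handle(mc.u.mc_fetch.data, &mi);

	if (do_xen_mca(&mc) != 0 ||			/* hypothetical wrapper */
	    (mc.u.mc_fetch.flags & XEN_MC_NODATA))
		return;					/* nothing committed */

	/* ... walk the mcinfo_* records in mi and log them to stable storage ... */

	/* Acknowledge so Xen can release the telemetry entry.  fetch_id
	 * still holds the cookie returned by the fetch above. */
	mc.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK;
	(void)do_xen_mca(&mc);
}
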
diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -1,38 +1,98 @@
+#ifndef _MCE_H
+
+#define _MCE_H
+
 #include <xen/init.h>
+#include <xen/smp.h>
 #include <asm/types.h>
 #include <asm/traps.h>
 #include <asm/atomic.h>
 #include <asm/percpu.h>
 
+#include "x86_mca.h"
+#include "mctelem.h"
 
 /* Init functions */
-void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c);
 
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+int winchip_mcheck_init(struct cpuinfo_x86 *c);
+int intel_mcheck_init(struct cpuinfo_x86 *c);
 
 void intel_mcheck_timer(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_mcheck_init(struct cpuinfo_x86 *c);
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
-
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-
-/* Function pointer used in the handlers to collect additional information
- * provided by newer CPU families/models without the need to duplicate
- * the whole handler resulting in various handlers each with its own
- * tweaks and bugs */
-extern int (*mc_callback_bank_extended)(struct mc_info *mi,
-               uint16_t bank, uint64_t status);
-
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
 
 int mce_available(struct cpuinfo_x86 *c);
+int mce_firstbank(struct cpuinfo_x86 *c);
 /* Helper functions used for collecting error telemetry */
 struct mc_info *x86_mcinfo_getptr(void);
-void x86_mcinfo_clear(struct mc_info *mi);
-int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
-void x86_mcinfo_dump(struct mc_info *mi);
 void mc_panic(char *s);
 void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *,
                         uint32_t *, uint32_t *, uint32_t *, uint32_t *);
+
+
+/* Register a handler for machine check exceptions. */
+typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long);
+extern void x86_mce_vector_register(x86_mce_vector_t);
+
+/* Common generic MCE handler that implementations may nominate
+ * via x86_mce_vector_register. */
+extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
+
+/* Utility function to "logout" all architectural MCA telemetry from the MCA
+ * banks of the current processor.  A cookie is returned which may be
+ * uses to reference the data so logged (the cookie can be NULL if
+ * no logout structures were available).  The caller can also pass a pointer
+ * to a structure which will be completed with some summary information
+ * of the MCA data observed in the logout operation. */
+
+enum mca_source {
+       MCA_MCE_HANDLER,
+       MCA_POLLER,
+       MCA_CMCI_HANDLER,
+       MCA_RESET
+};
+
+enum mca_extinfo {
+       MCA_EXTINFO_LOCAL,
+       MCA_EXTINFO_GLOBAL,
+       MCA_EXTINFO_IGNORED
+};
+
+struct mca_summary {
+       uint32_t        errcnt; /* number of banks with valid errors */
+       int             ripv;   /* meaningful on #MC */
+       int             eipv;   /* meaningful on #MC */
+       uint32_t        uc;     /* bitmask of banks with UC */
+       uint32_t        pcc;    /* bitmask of banks with PCC */
+};
+
+extern cpu_banks_t mca_allbanks;
+
+extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
+    struct mca_summary *);
+
+/* Register a callback to be made during bank telemetry logout.
+ * This callback is only available to those machine check handlers
+ * that call the common mcheck_cmn_handler or use the common
+ * telemetry logout function mcheck_mca_logout in error polling.
+ *
+ * This can be used to collect additional information (typically non-
+ * architectural) provided by newer CPU families/models without the need
+ * to duplicate the whole handler resulting in various handlers each with
+ * its own tweaks and bugs.  The callback receives a struct mc_info pointer
+ * which it can use with x86_mcinfo_add to add additional telemetry,
+ * the current MCA bank number we are reading telemetry from, and the
+ * MCi_STATUS value for that bank.
+ */
+typedef enum mca_extinfo (*x86_mce_callback_t)
+    (struct mc_info *, uint16_t, uint64_t);
+extern void x86_mce_callback_register(x86_mce_callback_t);
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+#endif /* _MCE_H */
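
As a usage sketch of the two registration hooks declared above: a family-specific
init routine nominates the common handler as its #MC vector and hangs its
extended-telemetry collector off the logout path.  Everything named famx_* below,
and MSR_FAMX_EXTRA, are invented for illustration; the pattern itself is the one
the Intel and AMD init paths in this patch follow.

/* Illustrative only: a made-up family "famx" wiring into the common code. */
#define MSR_FAMX_EXTRA 0xc0000999	/* placeholder, not a real MSR */

static enum mca_extinfo
famx_bank_callback(struct mc_info *mi, uint16_t bank, uint64_t status)
{
	struct mcinfo_extended mc_ext;

	if (mi == NULL || !(status & MCi_STATUS_VAL))
		return MCA_EXTINFO_IGNORED;

	memset(&mc_ext, 0, sizeof(mc_ext));
	mc_ext.common.type = MC_TYPE_EXTENDED;
	mc_ext.common.size = sizeof(mc_ext);
	mc_ext.mc_msrs = 1;
	mc_ext.mc_msr[0].reg = MSR_FAMX_EXTRA;
	rdmsrl(MSR_FAMX_EXTRA, mc_ext.mc_msr[0].value);

	x86_mcinfo_add(mi, &mc_ext);
	return MCA_EXTINFO_LOCAL;
}

static void famx_machine_check(struct cpu_user_regs *regs, long error_code)
{
	mcheck_cmn_handler(regs, error_code, mca_allbanks);
}

int famx_mcheck_init(struct cpuinfo_x86 *c)
{
	x86_mce_vector_register(famx_machine_check);
	x86_mce_callback_register(famx_bank_callback);
	return 1;
}
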
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c 
b/xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -14,6 +14,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_ow
 
 static int nr_intel_ext_msrs = 0;
 static int cmci_support = 0;
+static int firstbank;
 
 #ifdef CONFIG_X86_MCE_THERMAL
 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -115,222 +116,51 @@ static void intel_init_thermal(struct cp
 }
 #endif /* CONFIG_X86_MCE_THERMAL */
 
-static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
-{
-    if (nr_intel_ext_msrs == 0)
-        return;
+static enum mca_extinfo
+intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
+{
+    struct mcinfo_extended mc_ext;
+
+    if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
+        return MCA_EXTINFO_IGNORED;
 
     /* this function will called when CAP(9).MCG_EXT_P = 1 */
-    memset(mc_ext, 0, sizeof(struct mcinfo_extended));
-    mc_ext->common.type = MC_TYPE_EXTENDED;
-    mc_ext->common.size = sizeof(mc_ext);
-    mc_ext->mc_msrs = 10;
-
-    mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
-    rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
-    mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
-    rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
-    mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
-    rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
-
-    mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
-    rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
-    mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
-    rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
-    mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
-    rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
-
-    mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
-    rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
-    mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
-    rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
-    mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
-    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
-    mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
-    rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
-}
-
-/* machine_check_poll might be called by following types:
- * 1. called when do mcheck_init.
- * 2. called in cmci interrupt handler
- * 3. called in polling handler
- * It will generate a new mc_info item if found CE/UC errors. DOM0 is the 
- * consumer.
- */
-static struct mc_info *machine_check_poll(int calltype)
-{
-    struct mc_info *mi = NULL;
-    int exceptions = (read_cr4() & X86_CR4_MCE);
-    int i, nr_unit = 0, uc = 0, pcc = 0;
-    uint64_t status, addr;
-    struct mcinfo_global mcg;
-    struct mcinfo_extended mce;
-    unsigned int cpu;
-    struct domain *d;
-
-    cpu = smp_processor_id();
-
-    memset(&mcg, 0, sizeof(mcg));
-    mcg.common.type = MC_TYPE_GLOBAL;
-    mcg.common.size = sizeof(mcg);
-    /* If called from cpu-reset check, don't need to fill them.
-     * If called from cmci context, we'll try to fill domid by memory addr
-     */
-    mcg.mc_domid = -1;
-    mcg.mc_vcpuid = -1;
-    if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
-        mcg.mc_flags = MC_FLAG_POLLED;
-    else if (calltype == MC_FLAG_CMCI)
-        mcg.mc_flags = MC_FLAG_CMCI;
-    x86_mc_get_cpu_info(
-        cpu, &mcg.mc_socketid, &mcg.mc_coreid,
-        &mcg.mc_core_threadid, &mcg.mc_apicid, NULL, NULL, NULL);
-    rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
-
-    for ( i = 0; i < nr_mce_banks; i++ ) {
-        struct mcinfo_bank mcb;
-        /* For CMCI, only owners checks the owned MSRs */
-        if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) &&
-             (calltype & MC_FLAG_CMCI) )
-            continue;
-        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
-        if (! (status & MCi_STATUS_VAL) )
-            continue;
-        /*
-         * Uncorrected events are handled by the exception
-         * handler when it is enabled. But when the exception
-         * is disabled such as when mcheck_init, log everything.
-         */
-        if ((status & MCi_STATUS_UC) && exceptions)
-            continue;
-
-        if (status & MCi_STATUS_UC)
-            uc = 1;
-        if (status & MCi_STATUS_PCC)
-            pcc = 1;
-
-        if (!mi) {
-            mi = x86_mcinfo_getptr();
-            if (!mi) {
-                printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
-                return NULL;
-            }
-            x86_mcinfo_clear(mi);
-        }
-        memset(&mcb, 0, sizeof(mcb));
-        mcb.common.type = MC_TYPE_BANK;
-        mcb.common.size = sizeof(mcb);
-        mcb.mc_bank = i;
-        mcb.mc_status = status;
-        if (status & MCi_STATUS_MISCV)
-            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
-        if (status & MCi_STATUS_ADDRV) {
-            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
-            d = maddr_get_owner(addr);
-            if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) 
)
-                mcb.mc_domid = d->domain_id;
-        }
-        if (cmci_support)
-            rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
-        if (calltype == MC_FLAG_CMCI)
-            rdtscll(mcb.mc_tsc);
-        x86_mcinfo_add(mi, &mcb);
-        nr_unit++;
-        add_taint(TAINT_MACHINE_CHECK);
-        /* Clear state for this bank */
-        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
-        printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%"PRIx64"]\n", 
-                i, cpu, status);
-        printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
-                "thread[%d]\n", cpu, mcg.mc_socketid, 
-                mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
- 
-    }
-    /* if pcc = 1, uc must be 1 */
-    if (pcc)
-        mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
-    else if (uc)
-        mcg.mc_flags |= MC_FLAG_RECOVERABLE;
-    else /* correctable */
-        mcg.mc_flags |= MC_FLAG_CORRECTABLE;
-
-    if (nr_unit && nr_intel_ext_msrs && 
-                    (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
-        intel_get_extended_msrs(&mce);
-        x86_mcinfo_add(mi, &mce);
-    }
-    if (nr_unit) 
-        x86_mcinfo_add(mi, &mcg);
-    /* Clear global state */
-    return mi;
-}
-
-static fastcall void intel_machine_check(struct cpu_user_regs * regs, long 
error_code)
-{
-    /* MACHINE CHECK Error handler will be sent in another patch,
-     * simply copy old solutions here. This code will be replaced
-     * by upcoming machine check patches
-     */
-
-    int recover=1;
-    u32 alow, ahigh, high, low;
-    u32 mcgstl, mcgsth;
-    int i;
-   
-    rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-    if (mcgstl & (1<<0))       /* Recoverable ? */
-        recover=0;
-    
-    printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
-           smp_processor_id(), mcgsth, mcgstl);
-    
-    for (i=0; i<nr_mce_banks; i++) {
-        rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
-        if (high & (1<<31)) {
-            if (high & (1<<29))
-                recover |= 1;
-            if (high & (1<<25))
-                recover |= 2;
-            printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
-            high &= ~(1<<31);
-            if (high & (1<<27)) {
-                rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-                printk ("[%08x%08x]", ahigh, alow);
-            }
-            if (high & (1<<26)) {
-                rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-                printk (" at %08x%08x", ahigh, alow);
-            }
-            printk ("\n");
-        }
-    }
-    
-    if (recover & 2)
-        mc_panic ("CPU context corrupt");
-    if (recover & 1)
-        mc_panic ("Unable to continue");
-    
-    printk(KERN_EMERG "Attempting to continue.\n");
-    /* 
-     * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
-     * recoverable/continuable.This will allow BIOS to look at the MSRs
-     * for errors if the OS could not log the error.
-     */
-    for (i=0; i<nr_mce_banks; i++) {
-        u32 msr;
-        msr = MSR_IA32_MC0_STATUS+i*4;
-        rdmsr (msr, low, high);
-        if (high&(1<<31)) {
-            /* Clear it */
-            wrmsr(msr, 0UL, 0UL);
-            /* Serialize */
-            wmb();
-            add_taint(TAINT_MACHINE_CHECK);
-        }
-    }
-    mcgstl &= ~(1<<2);
-    wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+    memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
+    mc_ext.common.type = MC_TYPE_EXTENDED;
+    mc_ext.common.size = sizeof(mc_ext);
+    mc_ext.mc_msrs = 10;
+
+    mc_ext.mc_msr[0].reg = MSR_IA32_MCG_EAX;
+    rdmsrl(MSR_IA32_MCG_EAX, mc_ext.mc_msr[0].value);
+    mc_ext.mc_msr[1].reg = MSR_IA32_MCG_EBX;
+    rdmsrl(MSR_IA32_MCG_EBX, mc_ext.mc_msr[1].value);
+    mc_ext.mc_msr[2].reg = MSR_IA32_MCG_ECX;
+    rdmsrl(MSR_IA32_MCG_ECX, mc_ext.mc_msr[2].value);
+
+    mc_ext.mc_msr[3].reg = MSR_IA32_MCG_EDX;
+    rdmsrl(MSR_IA32_MCG_EDX, mc_ext.mc_msr[3].value);
+    mc_ext.mc_msr[4].reg = MSR_IA32_MCG_ESI;
+    rdmsrl(MSR_IA32_MCG_ESI, mc_ext.mc_msr[4].value);
+    mc_ext.mc_msr[5].reg = MSR_IA32_MCG_EDI;
+    rdmsrl(MSR_IA32_MCG_EDI, mc_ext.mc_msr[5].value);
+
+    mc_ext.mc_msr[6].reg = MSR_IA32_MCG_EBP;
+    rdmsrl(MSR_IA32_MCG_EBP, mc_ext.mc_msr[6].value);
+    mc_ext.mc_msr[7].reg = MSR_IA32_MCG_ESP;
+    rdmsrl(MSR_IA32_MCG_ESP, mc_ext.mc_msr[7].value);
+    mc_ext.mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
+    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext.mc_msr[8].value);
+    mc_ext.mc_msr[9].reg = MSR_IA32_MCG_EIP;
+    rdmsrl(MSR_IA32_MCG_EIP, mc_ext.mc_msr[9].value);
+
+    x86_mcinfo_add(mci, &mc_ext);
+
+    return MCA_EXTINFO_GLOBAL;
+}
+
+static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
+{
+       mcheck_cmn_handler(regs, error_code, mca_allbanks);
 }
 
 static DEFINE_SPINLOCK(cmci_discover_lock);
@@ -369,6 +199,8 @@ static void cmci_discover(void)
     unsigned long flags;
     int i;
     struct mc_info *mi = NULL;
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
 
     printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
 
@@ -385,12 +217,20 @@ static void cmci_discover(void)
      * MCi_status (error_count bit 38~52) is not cleared,
      * the CMCI interrupt will never be triggered again.
      */
-    mi = machine_check_poll(MC_FLAG_CMCI);
-    if (mi) {
-        x86_mcinfo_dump(mi);
-        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+    mctc = mcheck_mca_logout(
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+    if (bs.errcnt && mctc != NULL) {
+        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+            mctelem_commit(mctc);
             send_guest_global_virq(dom0, VIRQ_MCA);
-    }
+        } else {
+            x86_mcinfo_dump(mctelem_dataptr(mctc));
+            mctelem_dismiss(mctc);
+        }
+    } else if (mctc != NULL)
+        mctelem_dismiss(mctc);
 
     printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", 
            smp_processor_id(), 
@@ -487,17 +327,26 @@ fastcall void smp_cmci_interrupt(struct 
 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
 {
     struct mc_info *mi = NULL;
-    int cpu = smp_processor_id();
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
 
     ack_APIC_irq();
     irq_enter();
-    printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
-    mi = machine_check_poll(MC_FLAG_CMCI);
-    if (mi) {
-        x86_mcinfo_dump(mi);
-        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+    mctc = mcheck_mca_logout(
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+    if (bs.errcnt && mctc != NULL) {
+        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+            mctelem_commit(mctc);
             send_guest_global_virq(dom0, VIRQ_MCA);
-    }
+        } else {
+            x86_mcinfo_dump(mctelem_dataptr(mctc));
+            mctelem_dismiss(mctc);
+        }
+    } else if (mctc != NULL)
+        mctelem_dismiss(mctc);
+
     irq_exit();
 }
 
@@ -527,28 +376,28 @@ static void mce_cap_init(struct cpuinfo_
         printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
             smp_processor_id(), nr_intel_ext_msrs);
     }
-    /* for most of p6 family, bank 0 is an alias bios MSR.
-     * But after model>1a, bank 0 is available*/
-    if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
-            && c->x86_model < 0x1A)
-        firstbank = 1;
-    else
-        firstbank = 0;
+    firstbank = mce_firstbank(c);
 }
 
 static void mce_init(void)
 {
     u32 l, h;
     int i;
-    struct mc_info *mi;
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
+
     clear_in_cr4(X86_CR4_MCE);
+
     /* log the machine checks left over from the previous reset.
      * This also clears all registers*/
 
-    mi = machine_check_poll(MC_FLAG_RESET);
+    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs);
+
     /* in the boot up stage, don't inject to DOM0, but print out */
-    if (mi)
-        x86_mcinfo_dump(mi);
+    if (bs.errcnt && mctc != NULL) {
+        x86_mcinfo_dump(mctelem_dataptr(mctc));
+        mctelem_dismiss(mctc);
+    }
 
     set_in_cr4(X86_CR4_MCE);
     rdmsr (MSR_IA32_MCG_CAP, l, h);
@@ -573,71 +422,19 @@ static void mce_init(void)
 }
 
 /* p4/p6 family have similar MCA initialization process */
-void intel_mcheck_init(struct cpuinfo_x86 *c)
+int intel_mcheck_init(struct cpuinfo_x86 *c)
 {
     mce_cap_init(c);
     printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
             smp_processor_id());
+
     /* machine check is available */
-    machine_check_vector = intel_machine_check;
+    x86_mce_vector_register(intel_machine_check);
+    x86_mce_callback_register(intel_get_extended_msrs);
+
     mce_init();
     mce_intel_feature_init(c);
     mce_set_owner();
-}
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll faster. When the poller finds no more 
- * errors, poll slower
-*/
-static struct timer mce_timer;
-
-#define MCE_PERIOD 4000
-#define MCE_MIN    2000
-#define MCE_MAX    32000
-
-static u64 period = MCE_PERIOD;
-static int adjust = 0;
-
-static void mce_intel_checkregs(void *info)
-{
-    struct mc_info *mi;
-
-    if( !mce_available(&current_cpu_data))
-        return;
-    mi = machine_check_poll(MC_FLAG_POLLED);
-    if (mi)
-    {
-        x86_mcinfo_dump(mi);
-        adjust++;
-        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
-            send_guest_global_virq(dom0, VIRQ_MCA);
-    }
-}
-
-static void mce_intel_work_fn(void *data)
-{
-    on_each_cpu(mce_intel_checkregs, data, 1, 1);
-    if (adjust) {
-        period = period / (adjust + 1);
-        printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval "
-               "to %"PRIu64"\n", period);
-    }
-    else {
-        period *= 2;
-    }
-    if (period > MCE_MAX) 
-        period = MCE_MAX;
-    if (period < MCE_MIN)
-        period = MCE_MIN;
-    set_timer(&mce_timer, NOW() + MILLISECS(period));
-    adjust = 0;
-}
-
-void intel_mcheck_timer(struct cpuinfo_x86 *c)
-{
-    printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
-    init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
-    set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
-}
-
+
+    return 1;
+}
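
The CMCI discovery path, the CMCI interrupt handler and mce_init's reset scan above
all share the same producer-side shape.  Condensed into one hedged sketch -
poll_and_report() is not a function in this patch, just the pattern spelled out:

/* Illustrative only: log out the given banks, then commit-and-notify or dismiss. */
static void poll_and_report(cpu_banks_t banks)
{
	mctelem_cookie_t mctc;
	struct mca_summary bs;

	mctc = mcheck_mca_logout(MCA_POLLER, banks, &bs);

	if (mctc == NULL)
		return;				/* no telemetry entry was available */

	if (bs.errcnt == 0) {
		mctelem_dismiss(mctc);		/* nothing valid was logged */
		return;
	}

	if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
		mctelem_commit(mctc);		/* hand off; mctc must not be used again */
		send_guest_global_virq(dom0, VIRQ_MCA);
	} else {
		x86_mcinfo_dump(mctelem_dataptr(mctc));
		mctelem_dismiss(mctc);
	}
}
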
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c 
b/xen/arch/x86/cpu/mcheck/mctelem.c
new file mode 100644
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+/*
+ * mctelem.c - x86 Machine Check Telemetry Transport
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+struct mctelem_ent {
+       struct mctelem_ent *mcte_next;  /* next in chronological order */
+       struct mctelem_ent *mcte_prev;  /* previous in chronological order */
+       uint32_t mcte_flags;            /* See MCTE_F_* below */
+       uint32_t mcte_refcnt;           /* Reference count */
+       void *mcte_data;                /* corresponding data payload */
+};
+
+#define        MCTE_F_HOME_URGENT              0x0001U /* free to urgent freelist */
+#define        MCTE_F_HOME_NONURGENT           0x0002U /* free to nonurgent freelist */
+#define        MCTE_F_CLASS_URGENT             0x0004U /* in use - urgent errors */
+#define        MCTE_F_CLASS_NONURGENT          0x0008U /* in use - nonurgent errors */
+#define        MCTE_F_STATE_FREE               0x0010U /* on a freelist */
+#define        MCTE_F_STATE_UNCOMMITTED        0x0020U /* reserved; on no list */
+#define        MCTE_F_STATE_COMMITTED          0x0040U /* on a committed list */
+#define        MCTE_F_STATE_PROCESSING         0x0080U /* on a processing list */
+
+#define        MCTE_F_MASK_HOME        (MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT)
+#define        MCTE_F_MASK_CLASS       (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
+#define        MCTE_F_MASK_STATE       (MCTE_F_STATE_FREE | \
+                               MCTE_F_STATE_UNCOMMITTED | \
+                               MCTE_F_STATE_COMMITTED | \
+                               MCTE_F_STATE_PROCESSING)
+
+#define        MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME)
+
+#define        MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
+#define        MCTE_SET_CLASS(tep, new) do { \
+    (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
+    (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)
+
+#define        MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
+#define        MCTE_TRANSITION_STATE(tep, old, new) do { \
+    BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
+    (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
+    (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
+
+#define        MC_URGENT_NENT          10
+#define        MC_NONURGENT_NENT       20
+
+#define        MC_NCLASSES             (MC_NONURGENT + 1)
+
+#define        COOKIE2MCTE(c)          ((struct mctelem_ent *)(c))
+#define        MCTE2COOKIE(tep)        ((mctelem_cookie_t)(tep))
+
+static struct mc_telem_ctl {
+       /* Linked lists that thread the array members together.
+        *
+        * The free lists are singly-linked via mcte_next, and we allocate
+        * from them by atomically unlinking an element from the head.
+        * Consumed entries are returned to the head of the free list.
+        * When an entry is reserved off the free list it is not linked
+        * on any list until it is committed or dismissed.
+        *
+        * The committed list grows at the head and we do not maintain a
+        * tail pointer; insertions are performed atomically.  The head
+        * thus has the most-recently committed telemetry, i.e. the
+        * list is in reverse chronological order.  The committed list
+        * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
+        * When we move telemetry from the committed list to the processing
+        * list we atomically unlink the committed list and keep a pointer
+        * to the head of that list;  we then traverse the list following
+        * mcte_prev and fill in mcte_next to doubly-link the list, and then
+        * append the tail of the list onto the processing list.  If we panic
+        * during this manipulation of the committed list we still have
+        * the pointer to its head so we can recover all entries during
+        * the panic flow (albeit in reverse chronological order).
+        *
+        * The processing list is updated in a controlled context, and
+        * we can lock it for updates.  The head of the processing list
+        * always has the oldest telemetry, and we append (as above)
+        * at the tail of the processing list. */
+       struct mctelem_ent *mctc_free[MC_NCLASSES];
+       struct mctelem_ent *mctc_committed[MC_NCLASSES];
+       struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
+       struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
+       /*
+        * Telemetry array
+        */
+       struct mctelem_ent *mctc_elems;
+} mctctl;
+
+/* Lock protecting all processing lists */
+static DEFINE_SPINLOCK(processing_lock);
+
+static void *cmpxchgptr(void *ptr, void *old, void *new)
+{
+       unsigned long *ulp = (unsigned long *)ptr;
+       unsigned long a = (unsigned long)old;
+       unsigned long b = (unsigned long)new;
+
+       return (void *)cmpxchg(ulp, a, b);
+}
+
+/* Free an entry to its native free list; the entry must not be linked on
+ * any list.
+ */
+static void mctelem_free(struct mctelem_ent *tep)
+{
+       mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent **freelp;
+       struct mctelem_ent *oldhead;
+
+       BUG_ON(tep->mcte_refcnt != 0);
+       BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
+
+       tep->mcte_prev = NULL;
+       freelp = &mctctl.mctc_free[target];
+       for (;;) {
+               oldhead = *freelp;
+               tep->mcte_next = oldhead;
+               wmb();
+               if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
+                       break;
+       }
+}
+
+/* Increment the reference count of an entry that is not linked on to
+ * any list and which only the caller has a pointer to.
+ */
+static void mctelem_hold(struct mctelem_ent *tep)
+{
+       tep->mcte_refcnt++;
+}
+
+/* Increment the reference count on an entry that is linked at the head of
+ * a processing list.  The caller is responsible for locking the list.
+ */
+static void mctelem_processing_hold(struct mctelem_ent *tep)
+{
+       int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+
+       BUG_ON(tep != mctctl.mctc_processing_head[which]);
+       tep->mcte_refcnt++;
+}
+
+/* Decrement the reference count on an entry that is linked at the head of
+ * a processing list.  The caller is responsible for locking the list.
+ */
+static void mctelem_processing_release(struct mctelem_ent *tep)
+{
+       int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+
+       BUG_ON(tep != mctctl.mctc_processing_head[which]);
+       if (--tep->mcte_refcnt == 0) {
+               MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
+               mctctl.mctc_processing_head[which] = tep->mcte_next;
+               mctelem_free(tep);
+       }
+}
+
+void mctelem_init(int reqdatasz)
+{
+       static int called = 0;
+       static int datasz = 0, realdatasz = 0;
+       char *datarr;
+       int i;
+       
+       BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);
+
+       /* Called from mcheck_init for all processors; initialize for the
+        * first call only (no race here since the boot cpu completes
+        * init before others start up). */
+       if (++called == 1) {
+               realdatasz = reqdatasz;
+               datasz = (reqdatasz & ~0xf) + 0x10;     /* 16 byte roundup */
+       } else {
+               BUG_ON(reqdatasz != realdatasz);
+               return;
+       }
+
+       if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
+           MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL ||
+           (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) *
+           datasz)) == NULL) {
+               if (mctctl.mctc_elems)
+                       xfree(mctctl.mctc_elems);
+               printk("Allocations for MCA telemetry failed\n");
+               return;
+       }
+
+       for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
+               struct mctelem_ent *tep, **tepp;
+
+               tep = mctctl.mctc_elems + i;
+               tep->mcte_flags = MCTE_F_STATE_FREE;
+               tep->mcte_refcnt = 0;
+               tep->mcte_data = datarr + i * datasz;
+
+               if (i < MC_URGENT_NENT) {
+                       tepp = &mctctl.mctc_free[MC_URGENT];
+                       tep->mcte_flags |= MCTE_F_HOME_URGENT;
+               } else {
+                       tepp = &mctctl.mctc_free[MC_NONURGENT];
+                       tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
+               }
+
+               tep->mcte_next = *tepp;
+               tep->mcte_prev = NULL;
+               *tepp = tep;
+       }
+}
+
+/* incremented non-atomically when reserve fails */
+static int mctelem_drop_count;
+
+/* Reserve a telemetry entry, or return NULL if none available.
+ * If we return an entry then the caller must subsequently call exactly one of
+ * mctelem_unreserve or mctelem_commit for that entry.
+ */
+mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
+{
+       struct mctelem_ent **freelp;
+       struct mctelem_ent *oldhead, *newhead;
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+
+       freelp = &mctctl.mctc_free[target];
+       for (;;) {
+               if ((oldhead = *freelp) == NULL) {
+                       if (which == MC_URGENT && target == MC_URGENT) {
+                               /* raid the non-urgent freelist */
+                               target = MC_NONURGENT;
+                               freelp = &mctctl.mctc_free[target];
+                               continue;
+                       } else {
+                               mctelem_drop_count++;
+                               return (NULL);
+                       }
+               }
+
+               newhead = oldhead->mcte_next;
+               if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
+                       struct mctelem_ent *tep = oldhead;
+
+                       mctelem_hold(tep);
+                       MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
+                       tep->mcte_next = NULL;
+                       tep->mcte_prev = NULL;
+                       if (which == MC_URGENT)
+                               MCTE_SET_CLASS(tep, URGENT);
+                       else
+                               MCTE_SET_CLASS(tep, NONURGENT);
+                       return MCTE2COOKIE(tep);
+               }
+       }
+}
+
+void *mctelem_dataptr(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       return tep->mcte_data;
+}
+
+/* Release a previously reserved entry back to the freelist without
+ * submitting it for logging.  The entry must not be linked on to any
+ * list - that's how mctelem_reserve handed it out.
+ */
+void mctelem_dismiss(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       tep->mcte_refcnt--;
+       MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
+       mctelem_free(tep);
+}
+
+/* Commit an entry with completed telemetry for logging.  The caller must
+ * not reference the entry after this call.  Note that we add entries
+ * at the head of the committed list, so that list therefore has entries
+ * in reverse chronological order.
+ */
+void mctelem_commit(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+       struct mctelem_ent **commlp;
+       struct mctelem_ent *oldhead;
+       mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+
+       BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
+       MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
+
+       commlp = &mctctl.mctc_committed[target];
+       for (;;) {
+               oldhead = *commlp;
+               tep->mcte_prev = oldhead;
+               wmb();
+               if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
+                       break;
+       }
+}
+
+/* Move telemetry from committed list to processing list, reversing the
+ * list into chronological order.  The processing list has been
+ * locked by the caller, and may be non-empty.  We append the
+ * reversed committed list on to the tail of the processing list.
+ * The committed list may grow even while we run, so use atomic
+ * operations to swap NULL to the freelist head.
+ *
+ * Note that "chronological order" means the order in which producers
+ * won additions to the processing list, which may not reflect the
+ * strict chronological order of the associated events if events are
+ * closely spaced in time and contend for the processing list at once.
+ */
+
+static struct mctelem_ent *dangling[MC_NCLASSES];
+
+static void mctelem_append_processing(mctelem_class_t which)
+{
+       mctelem_class_t target = which == MC_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
+       struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
+       struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
+       struct mctelem_ent *tep, *ltep;
+
+       /* Check for an empty list; no race since we hold the processing lock */
+       if (*commlp == NULL)
+               return;
+
+       /* Atomically unlink the committed list, and keep a pointer to
+        * the list we unlink in a well-known location so it can be
+        * picked up in panic code should we panic between this unlink
+        * and the append to the processing list. */
+       for (;;) {
+               dangling[target] = *commlp;
+               wmb();
+               if (cmpxchgptr(commlp, dangling[target], NULL) ==
+                   dangling[target])
+                       break;
+       }
+
+       if (dangling[target] == NULL)
+               return;
+
+       /* Traverse the list following the previous pointers (reverse
+        * chronological order).  For each entry fill in the next pointer
+        * and transition the element state.  */
+       for (tep = dangling[target], ltep = NULL; tep != NULL;
+           tep = tep->mcte_prev) {
+               MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
+               tep->mcte_next = ltep;
+               ltep = tep;
+       }
+
+       /* ltep points to the head of a chronologically ordered linked
+        * list of telemetry entries ending at the most recent entry
+        * dangling[target] if mcte_next is followed; tack this on to
+        * the processing list.
+        */
+       if (*proclhp == NULL) {
+               *proclhp = ltep;
+               *procltp = dangling[target];
+       } else {
+               (*procltp)->mcte_next = ltep;
+               ltep->mcte_prev = *procltp;
+               *procltp = dangling[target];
+       }
+       wmb();
+       dangling[target] = NULL;
+       wmb();
+}
+
+mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
+{
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent *tep;
+
+       spin_lock(&processing_lock);
+       mctelem_append_processing(target);
+       if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
+               spin_unlock(&processing_lock);
+               return NULL;
+       }
+
+       mctelem_processing_hold(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+       return MCTE2COOKIE(tep);
+}
+
+void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       spin_lock(&processing_lock);
+       mctelem_processing_release(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+}
+
+void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
+{
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       if (tep == NULL)
+               return;
+
+       spin_lock(&processing_lock);
+       if (tep == mctctl.mctc_processing_head[target])
+               mctelem_processing_release(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+}
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.h 
b/xen/arch/x86/cpu/mcheck/mctelem.h
new file mode 100644
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mctelem.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#ifndef _MCTELEM_H
+
+#define        _MCTELEM_H
+
+#include <xen/init.h>
+#include <xen/smp.h>
+#include <asm/traps.h>
+
+/* Helper functions used for collecting error telemetry.
+ *
+ * mctelem_init preallocates a number of data areas for use during
+ * machine check data "logout".  Two classes are distinguished -
+ * urgent uses, intended for use from machine check exception handlers,
+ * and non-urgent uses intended for use from error pollers.
+ * Associated with each logout entry of whatever class is a data area
+ * sized per the single argument to mctelem_init.  mctelem_init should be
+ * called from MCA init code before anybody has the chance to change the
+ * machine check vector with x86_mce_vector_register or to use mcheck_mca_logout.
+ *
+ * To reserve an entry of a given class for use in logout, call
+ * mctelem_reserve (or use the common handler functions which do all this
+ * for you).  This returns an opaque cookie, or NULL if no elements are
+ * available.  Elements are reserved with an atomic operation so no deadlock
+ * will occur if, for example, a machine check exception interrupts a
+ * scheduled error poll.  The implementation will raid free non-urgent
+ * entries if all urgent entries are in use when an urgent request is received.
+ * Once an entry is reserved the caller must eventually perform exactly
+ * one of two actions: mctelem_commit or mctelem_dismiss.
+ *
+ * On mctelem_commit the entry is appended to a processing list; mctelem_dismiss
+ * frees the element without processing.  After either call the cookie
+ * must not be referenced again.
+ *
+ * To consume committed telemetry call mctelem_consume_oldest_begin
+ * which will return a cookie referencing the oldest (first committed)
+ * entry of the requested class.  Access the associated data using
+ * mctelem_dataptr and when finished use mctelem_consume_oldest_end (in the
+ * begin .. end bracket you are guaranteed that the entry cannot be freed
+ * even if it is ack'd elsewhere).  Once the ultimate consumer of the
+ * telemetry has processed it to stable storage it should acknowledge
+ * the telemetry quoting the cookie id, at which point we will free
+ * the element from the processing list.
+ */
+
+typedef struct mctelem_cookie *mctelem_cookie_t;
+
+typedef enum mctelem_class {
+       MC_URGENT,
+       MC_NONURGENT
+} mctelem_class_t;
+
+extern void mctelem_init(int);
+extern mctelem_cookie_t mctelem_reserve(mctelem_class_t);
+extern void *mctelem_dataptr(mctelem_cookie_t);
+extern void mctelem_commit(mctelem_cookie_t);
+extern void mctelem_dismiss(mctelem_cookie_t);
+extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
+extern void mctelem_consume_oldest_end(mctelem_cookie_t);
+extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
+
+#endif
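
To make the lifecycle described in the comment above concrete, here is a hedged
sketch of both halves of the transaction.  fill_telemetry() and
log_to_stable_storage() are stand-ins for real work, not functions from this patch.

extern int fill_telemetry(void *data);		/* stand-in */
extern void log_to_stable_storage(void *data);	/* stand-in */

/* Producer side, e.g. an exception handler or error poller. */
static void producer_example(void)
{
	mctelem_cookie_t c = mctelem_reserve(MC_URGENT);

	if (c == NULL)
		return;				/* out of entries: data dropped */

	if (fill_telemetry(mctelem_dataptr(c)))
		mctelem_commit(c);		/* make it visible to consumers */
	else
		mctelem_dismiss(c);		/* back to the freelist */
}

/* Consumer side, e.g. driven by the XEN_MC_fetch hypercall. */
static void consumer_example(void)
{
	mctelem_cookie_t c = mctelem_consume_oldest_begin(MC_URGENT);

	if (c == NULL)
		return;				/* nothing committed yet */

	log_to_stable_storage(mctelem_dataptr(c));
	mctelem_consume_oldest_end(c);

	/* Later, once the data is known to be safely stored: */
	mctelem_ack(MC_URGENT, c);
}
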
diff --git a/xen/arch/x86/cpu/mcheck/non-fatal.c 
b/xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c
@@ -14,46 +14,76 @@
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/errno.h>
+#include <xen/event.h>
+#include <xen/sched.h>
 #include <asm/processor.h> 
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
-int firstbank = 0;
+
+static cpu_banks_t bankmask;
 static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(8000)
+#define MCE_PERIOD_MIN MILLISECS(2000)
+#define MCE_PERIOD_MAX MILLISECS(16000)
+
+static uint64_t period = MCE_PERIOD;
+static int adjust = 0;
+static int variable_period = 1;
 
 static void mce_checkregs (void *info)
 {
-       u32 low, high;
-       int i;
+       mctelem_cookie_t mctc;
+       struct mca_summary bs;
+       static uint64_t dumpcount = 0;
 
-       for (i=firstbank; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
+       mctc = mcheck_mca_logout(MCA_POLLER, bankmask, &bs);
 
-               if (high & (1<<31)) {
-                       printk(KERN_INFO "MCE: The hardware reports a non "
-                               "fatal, correctable incident occurred on "
-                               "CPU %d.\n",
-                               smp_processor_id());
-                       printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+       if (bs.errcnt && mctc != NULL) {
+               adjust++;
 
-                       /* Scrub the error so we don't pick it up in MCE_RATE 
seconds time. */
-                       wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+               /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+                * Otherwise, if dom0 has had plenty of time to register
+                * the virq handler but still hasn't, then dump telemetry
+                * to the Xen console.  The call count may be incremented
+                * on multiple cpus at once and is indicative only - just
+                * a simple-minded attempt to avoid spamming the console
+                * for corrected errors in early startup.
+                */
 
-                       /* Serialize */
-                       wmb();
-                       add_taint(TAINT_MACHINE_CHECK);
+               if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+                       mctelem_commit(mctc);
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               } else if (++dumpcount >= 10) {
+                       x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+                       mctelem_dismiss(mctc);
+               } else {
+                       mctelem_dismiss(mctc);
                }
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
        }
 }
 
 static void mce_work_fn(void *data)
 { 
        on_each_cpu(mce_checkregs, NULL, 1, 1);
-       set_timer(&mce_timer, NOW() + MCE_PERIOD);
+
+       if (variable_period) {
+               if (adjust)
+                       period /= (adjust + 1);
+               else
+                       period *= 2;
+               if (period > MCE_PERIOD_MAX)
+                       period = MCE_PERIOD_MAX;
+               if (period < MCE_PERIOD_MIN)
+                       period = MCE_PERIOD_MIN;
+       }
+
+       set_timer(&mce_timer, NOW() + period);
+       adjust = 0;
 }
 
 static int __init init_nonfatal_mce_checker(void)
@@ -63,13 +93,17 @@ static int __init init_nonfatal_mce_chec
        /* Check for MCE support */
        if (!mce_available(c))
                return -ENODEV;
+
+       memcpy(&bankmask, &mca_allbanks, sizeof (cpu_banks_t));
+       if (mce_firstbank(c) == 1)
+               clear_bit(0, bankmask);
+
        /*
         * Check for non-fatal errors every MCE_RATE s
         */
        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                if (c->x86 == 6) { /* K7 */
-                       firstbank = 1;
                        init_timer(&mce_timer, mce_work_fn, NULL, 0);
                        set_timer(&mce_timer, NOW() + MCE_PERIOD);
                        break;
@@ -80,15 +114,14 @@ static int __init init_nonfatal_mce_chec
                break;
 
        case X86_VENDOR_INTEL:
-               /* p5 family is different. P4/P6 and latest CPUs shares the
-                * same polling methods
-               */
+               /*
+                * The P5 family is different. P4/P6 and latest CPUs share the
+                * same polling methods.
+                */
                if ( c->x86 != 5 )
                {
-                       /* some CPUs or banks don't support cmci, we need to 
-                        * enable this feature anyway
-                        */
-                       intel_mcheck_timer(c);
+                       init_timer(&mce_timer, mce_work_fn, NULL, 0);
+                       set_timer(&mce_timer, NOW() + MCE_PERIOD);
                }
                break;
        }
diff --git a/xen/arch/x86/cpu/mcheck/p5.c b/xen/arch/x86/cpu/mcheck/p5.c
--- a/xen/arch/x86/cpu/mcheck/p5.c
+++ b/xen/arch/x86/cpu/mcheck/p5.c
@@ -16,7 +16,7 @@
 #include "x86_mca.h"
 
 /* Machine check handler for Pentium class Intel */
-static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long 
error_code)
+static void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
 {
        u32 loaddr, hi, lotype;
        rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
@@ -28,19 +28,14 @@ static fastcall void pentium_machine_che
 }
 
 /* Set up machine check reporting for processors with Intel style MCE */
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        
-       /*Check for MCE support */
-       if( !cpu_has(c, X86_FEATURE_MCE) )
-               return; 
-
        /* Default P5 to off as its often misconnected */
        if(mce_disabled != -1)
-               return;
-       machine_check_vector = pentium_machine_check;
-       wmb();
+               return 0;
+       x86_mce_vector_register(pentium_machine_check);
 
        /* Read registers before enabling */
        rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
@@ -50,4 +45,6 @@ void intel_p5_mcheck_init(struct cpuinfo
        /* Enable MCE */
        set_in_cr4(X86_CR4_MCE);
        printk(KERN_INFO "Intel old style machine check reporting enabled on 
CPU#%d.\n", smp_processor_id());
+
+       return 1;
 }
diff --git a/xen/arch/x86/cpu/mcheck/winchip.c 
b/xen/arch/x86/cpu/mcheck/winchip.c
--- a/xen/arch/x86/cpu/mcheck/winchip.c
+++ b/xen/arch/x86/cpu/mcheck/winchip.c
@@ -16,22 +16,24 @@
 #include "mce.h"
 
 /* Machine check handler for WinChip C6 */
-static fastcall void winchip_machine_check(struct cpu_user_regs * regs, long 
error_code)
+static void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
 {
        printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
        add_taint(TAINT_MACHINE_CHECK);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
-void winchip_mcheck_init(struct cpuinfo_x86 *c)
+int winchip_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 lo, hi;
-       machine_check_vector = winchip_machine_check;
+
        wmb();
+       x86_mce_vector_register(winchip_machine_check);
        rdmsr(MSR_IDT_FCR1, lo, hi);
        lo|= (1<<2);    /* Enable EIERRINT (int 18 MCE) */
        lo&= ~(1<<4);   /* Enable MCE */
        wrmsr(MSR_IDT_FCR1, lo, hi);
        set_in_cr4(X86_CR4_MCE);
        printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+       return (1);
 }
diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -16,6 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+
+#ifndef X86_MCA_H
+
+#define X86_MCA_H
 
 
 /* The MCA/MCE MSRs should not be used anywhere else.
@@ -73,6 +77,9 @@
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
+/* Bitfield of MSR_K8_HWCR register */
+#define K8_HWCR_MCi_STATUS_WREN                (1ULL << 18)
+
 /*Intel Specific bitfield*/
 #define CMCI_THRESHOLD                 0x2
 
@@ -87,3 +94,4 @@ extern unsigned int nr_mce_banks;
 extern unsigned int nr_mce_banks;
 extern int firstbank;
 
+#endif /* X86_MCA_H */
diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h
--- a/xen/include/asm-x86/traps.h
+++ b/xen/include/asm-x86/traps.h
@@ -28,7 +28,7 @@ struct softirq_trap {
 
 struct cpu_user_regs;
 
-extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+extern void machine_check_vector(struct cpu_user_regs *regs, long error_code);
  
 /**
  * guest_has_trap_callback
diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h
+++ b/xen/include/public/arch-x86/xen-mca.h
@@ -56,13 +56,20 @@
 /* Hypercall */
 #define __HYPERVISOR_mca __HYPERVISOR_arch_0
 
-#define XEN_MCA_INTERFACE_VERSION 0x03000002
-
-/* IN: Dom0 calls hypercall from MC event handler. */
-#define XEN_MC_CORRECTABLE  0x0
-/* IN: Dom0/DomU calls hypercall from MC trap handler. */
-#define XEN_MC_TRAP         0x1
-/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+/*
+ * The xen-unstable repo has interface version 0x03000001; our interface
+ * is incompatible with that and any future minor revisions, so we
+ * choose a different version number range that is numerically less
+ * than that used in xen-unstable.
+ */
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc002
+
+/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
+#define XEN_MC_NONURGENT  0x0001
+/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
+#define XEN_MC_URGENT     0x0002
+/* IN: Dom0 acknowledges previously-fetched telemetry */
+#define XEN_MC_ACK        0x0004
 
 /* OUT: All is ok */
 #define XEN_MC_OK           0x0
@@ -110,6 +117,7 @@ struct mcinfo_common {
 #define MC_FLAG_POLLED         (1 << 3)
 #define MC_FLAG_RESET          (1 << 4)
 #define MC_FLAG_CMCI           (1 << 5)
+#define MC_FLAG_MCE            (1 << 6)
 /* contains global x86 mc information */
 struct mcinfo_global {
     struct mcinfo_common common;
@@ -174,6 +182,7 @@ struct mc_info {
     uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
 };
 typedef struct mc_info mc_info_t;
+DEFINE_XEN_GUEST_HANDLE(mc_info_t);
 
 #define __MC_MSR_ARRAYSIZE 8
 #define __MC_NMSRS 1
@@ -274,14 +283,14 @@ DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_c
 #define XEN_MC_fetch            1
 struct xen_mc_fetch {
     /* IN/OUT variables. */
-    uint32_t flags;
-
-/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
-/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+    uint32_t flags;    /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+                           XEN_MC_ACK if ack'ing an earlier fetch */
+                       /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
+                          XEN_MC_NODATA, XEN_MC_NOMATCH */
+    uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */
 
     /* OUT variables. */
-    uint32_t fetch_idx;  /* only useful for Dom0 for the notify hypercall */
-    struct mc_info mc_info;
+    XEN_GUEST_HANDLE(mc_info_t) data;
 };
 typedef struct xen_mc_fetch xen_mc_fetch_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
@@ -296,7 +305,6 @@ struct xen_mc_notifydomain {
     uint16_t mc_domid;    /* The unprivileged domain to notify. */
     uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
                            * Usually echo'd value from the fetch hypercall. */
-    uint32_t fetch_idx;   /* echo'd value from the fetch hypercall. */
 
     /* IN/OUT variables. */
     uint32_t flags;
@@ -316,15 +324,16 @@ struct xen_mc_physcpuinfo {
        XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
 };
 
+typedef union {
+    struct xen_mc_fetch        mc_fetch;
+    struct xen_mc_notifydomain mc_notifydomain;
+    struct xen_mc_physcpuinfo  mc_physcpuinfo;
+} xen_mc_arg_t;
+
 struct xen_mc {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
-    union {
-        struct xen_mc_fetch        mc_fetch;
-        struct xen_mc_notifydomain mc_notifydomain;
-        struct xen_mc_physcpuinfo  mc_physcpuinfo;
-        uint8_t pad[MCINFO_HYPERCALLSIZE];
-    } u;
+    xen_mc_arg_t u;
 };
 typedef struct xen_mc xen_mc_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
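
For reference, here is a rough dom0-side sketch (not part of the patch) of how an agent might drive the reworked fetch/ack interface. Only the structures and flags come from xen-mca.h above; the do_mca() hypercall wrapper, buffer handling and error handling are assumptions for illustration.

/*
 * Hypothetical dom0 sketch of the new fetch/ack flow.  do_mca() is an
 * assumed wrapper around the __HYPERVISOR_mca hypercall; the xen-mca.h
 * definitions shown above (plus <string.h>/<stdint.h>) are assumed to
 * be available.
 */
extern long do_mca(struct xen_mc *mc);         /* assumed hypercall wrapper */

static struct mc_info telem;                   /* filled in by the hypervisor */

static int fetch_and_ack_nonurgent(void)
{
        struct xen_mc mc;
        uint64_t id;

        memset(&mc, 0, sizeof(mc));
        mc.cmd = XEN_MC_fetch;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
        set_xen_guest_handle(mc.u.mc_fetch.data, &telem);

        if (do_mca(&mc) != 0 || mc.u.mc_fetch.flags != XEN_MC_OK)
                return -1;                     /* fetch failed or no data */

        /* ... decode the mcinfo_* records in telem here ... */

        /* Ack the record so the hypervisor can reclaim the telemetry slot. */
        id = mc.u.mc_fetch.fetch_id;
        memset(&mc, 0, sizeof(mc));
        mc.cmd = XEN_MC_fetch;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
        mc.u.mc_fetch.fetch_id = id;

        return do_mca(&mc) != 0 ? -1 : 0;
}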
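
A second, minimal sketch of what a vendor-specific init routine looks like under the common-code scheme. The vendor name and handler body are made up; the registration call and the int return convention follow the p5/winchip changes above.

/*
 * Hypothetical vendor init following the new convention: register the MCE
 * handler through the common code rather than writing machine_check_vector
 * directly, and return non-zero once a handler has been installed.
 */
static void myvendor_machine_check(struct cpu_user_regs *regs, long error_code)
{
        printk(KERN_EMERG "CPU%d: Machine Check Exception.\n",
               smp_processor_id());
        add_taint(TAINT_MACHINE_CHECK);
}

int myvendor_mcheck_init(struct cpuinfo_x86 *c)
{
        x86_mce_vector_register(myvendor_machine_check);
        set_in_cr4(X86_CR4_MCE);               /* enable machine check exceptions */
        printk(KERN_INFO "myvendor machine check reporting enabled on CPU#%d.\n",
               smp_processor_id());
        return 1;
}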
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 

