xen-changelog

[Xen-changelog] [xen-unstable] x86: MCA support.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86: MCA support.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Fri, 04 Jul 2008 16:20:08 -0700
Delivery-date: Fri, 04 Jul 2008 16:21:13 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1215185264 -3600
# Node ID a49673cd23d2548d57af716cc85fcaeef48b55fc
# Parent  d133d452cb718a07b42f86edc8ce838ef6734de9
x86: MCA support.
Signed-off-by: Christoph Egger <Christoph.Egger@xxxxxxx>
---
 xen/arch/x86/cpu/mcheck/Makefile       |    3 
 xen/arch/x86/cpu/mcheck/amd_f10.c      |  131 +++++++
 xen/arch/x86/cpu/mcheck/amd_k8.c       |  324 ++++++++++++++++++
 xen/arch/x86/cpu/mcheck/amd_nonfatal.c |  303 +++++++++++++++++
 xen/arch/x86/cpu/mcheck/k7.c           |    7 
 xen/arch/x86/cpu/mcheck/mce.c          |  566 ++++++++++++++++++++++++++++++---
 xen/arch/x86/cpu/mcheck/mce.h          |   26 +
 xen/arch/x86/cpu/mcheck/non-fatal.c    |   30 +
 xen/arch/x86/cpu/mcheck/x86_mca.h      |   72 ++++
 xen/arch/x86/nmi.c                     |    4 
 xen/arch/x86/traps.c                   |  125 ++++++-
 xen/arch/x86/x86_32/asm-offsets.c      |    6 
 xen/arch/x86/x86_32/entry.S            |   36 +-
 xen/arch/x86/x86_32/traps.c            |    9 
 xen/arch/x86/x86_64/asm-offsets.c      |    6 
 xen/arch/x86/x86_64/compat/entry.S     |   35 +-
 xen/arch/x86/x86_64/compat/traps.c     |    9 
 xen/arch/x86/x86_64/entry.S            |   35 +-
 xen/arch/x86/x86_64/traps.c            |    9 
 xen/common/domain.c                    |    4 
 xen/common/event_channel.c             |   15 
 xen/include/Makefile                   |    1 
 xen/include/asm-x86/event.h            |    7 
 xen/include/asm-x86/mm.h               |    3 
 xen/include/asm-x86/traps.h            |   50 ++
 xen/include/public/arch-x86/xen-mca.h  |  279 ++++++++++++++++
 xen/include/public/arch-x86/xen.h      |    4 
 xen/include/xen/event.h                |    3 
 xen/include/xen/sched.h                |   15 
 29 files changed, 2024 insertions(+), 93 deletions(-)

diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile  Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/Makefile  Fri Jul 04 16:27:44 2008 +0100
@@ -1,4 +1,7 @@ obj-y += k7.o
+obj-y += amd_nonfatal.o
 obj-y += k7.o
+obj-y += amd_k8.o
+obj-y += amd_f10.o
 obj-y += mce.o
 obj-y += non-fatal.o
 obj-y += p4.o
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/amd_f10.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,131 @@
+/*
+ * MCA implementation for AMD Family10 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ */
+
+/* Family10 MCA documentation published at
+ *
+ * BIOS and Kernel Developer's Guide
+ * For AMD Family 10h Processors
+ * Publication # 31116 Revision: 1.08
+ * Issue Date: June 10, 2007
+ */
+
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+{
+       struct mcinfo_extended mc_ext;
+
+       /* Family 0x10 introduced additional MSRs that belong to the
+        * northbridge bank (4). */
+       if (bank != 4)
+               return 0;
+
+       if (!(status & MCi_STATUS_VAL))
+               return 0;
+
+       if (!(status & MCi_STATUS_MISCV))
+               return 0;
+
+       memset(&mc_ext, 0, sizeof(mc_ext));
+       mc_ext.common.type = MC_TYPE_EXTENDED;
+       mc_ext.common.size = sizeof(mc_ext);
+       mc_ext.mc_msrs = 3;
+
+       mc_ext.mc_msr[0].reg = MSR_F10_MC4_MISC1;
+       mc_ext.mc_msr[1].reg = MSR_F10_MC4_MISC2;
+       mc_ext.mc_msr[2].reg = MSR_F10_MC4_MISC3;
+
+       rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
+       rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
+       rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
+       
+       x86_mcinfo_add(mi, &mc_ext);
+       return 1;
+}
+
+
+extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
+
+/* AMD Family10 machine check */
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
+{ 
+       uint64_t value;
+       uint32_t i;
+       int cpu_nr;
+
+       machine_check_vector = k8_machine_check;
+       mc_callback_bank_extended = amd_f10_handler;
+       cpu_nr = smp_processor_id();
+       wmb();
+
+       rdmsrl(MSR_IA32_MCG_CAP, value);
+       if (value & MCG_CTL_P)  /* Control register present ? */
+               wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+       nr_mce_banks = value & MCG_CAP_COUNT;
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               switch (i) {
+               case 4: /* Northbridge */
+                       /* Enable error reporting of all errors,
+                        * enable error checking and
+                        * disable sync flooding */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+
+                       /* XXX: We should write the value 0x1087821UL into
+                        * register F3x180 here, which sits in
+                        * the PCI extended configuration space.
+                        * Since this is not possible here, we can only hope
+                        * that Dom0 is doing that.
+                        */
+                       break;
+
+               default:
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+                       wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+                       break;
+               }
+       }
+
+       set_in_cr4(X86_CR4_MCE);
+       printk("CPU%i: AMD Family10h machine check reporting enabled.\n", 
cpu_nr);
+}
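As an aside for readers of this patch: amd_f10_handler() above is never called directly; mce.c invokes it through the mc_callback_bank_extended pointer for every bank that holds valid telemetry. A minimal standalone sketch of that dispatch pattern follows; all names other than the concept are invented for illustration, and the real struct is left opaque:

/* Sketch only - not part of the patch. Compiles with any C99 compiler. */
#include <stdio.h>
#include <stdint.h>

struct mc_info;    /* opaque here; the real layout lives in xen-mca.h */

/* The hook: family-specific code registers a collector at init time. */
static int (*bank_callback)(struct mc_info *, uint16_t, uint64_t);

static int f10_nb_collector(struct mc_info *mi, uint16_t bank, uint64_t status)
{
    (void)mi; (void)status;
    if (bank != 4)
        return 0;  /* only the northbridge bank carries extra MSRs */
    printf("bank %u: would read MC4_MISC1..MISC3 here\n", bank);
    return 1;
}

int main(void)
{
    bank_callback = f10_nb_collector;  /* done once in *_mcheck_init() */

    /* The generic handler walks all banks and calls the hook per bank. */
    for (uint16_t bank = 0; bank < 6; bank++)
        if (bank_callback)
            bank_callback(NULL, bank, 0);
    return 0;
}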
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/amd_k8.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c  Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,324 @@
+/*
+ * MCA implementation for AMD K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+
+#include <asm/processor.h>
+#include <asm/shared.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+/* Machine Check Handler for AMD K8 family series */
+void k8_machine_check(struct cpu_user_regs *regs, long error_code)
+{
+       struct vcpu *vcpu = current;
+       struct domain *curdom;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv, uc;
+       uint32_t i;
+       unsigned int cpu_nr;
+       uint32_t xen_impacted = 0;
+#define DOM_NORMAL     0
+#define DOM0_TRAP      1
+#define DOMU_TRAP      2
+#define DOMU_KILLED    4
+       uint32_t dom_state = DOM_NORMAL;
+
+       /* This handler runs as an interrupt gate. So IPIs from the
+        * polling service routine are deferred until we finish.
+        */
+
+       /* Disable interrupts for the _vcpu_. It must not be re-scheduled onto
+        * another physical CPU, or the impacted process in the guest would
+        * otherwise continue running with corrupted data. */
+       vcpu_schedule_lock_irq(vcpu);
+
+       mc_data = x86_mcinfo_getptr();
+       cpu_nr = smp_processor_id();
+       curdom = vcpu->domain;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       /* Quick check, who is impacted */
+       xen_impacted = is_idle_domain(curdom);
+
+       /* Dom0 */
+       x86_mcinfo_clear(mc_data);
+       x86_mcinfo_add(mc_data, &mc_global);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               /* An error happened in this bank.
+                * This is expected to be an uncorrectable error,
+                * since correctable errors get polled.
+                */
+               uc = status & MCi_STATUS_UC;
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
+                       
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL)
+                               mc_info.mc_domid = d->domain_id;
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+
+               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               wmb();
+               add_taint(TAINT_MACHINE_CHECK);
+       }
+
+       status = mc_global.mc_gstatus;
+
+       /* Clear MCIP, or the cpu enters shutdown state
+        * in case another MCE occurs. */
+       status &= ~MCG_STATUS_MCIP;
+       wrmsrl(MSR_IA32_MCG_STATUS, status);
+       wmb();
+
+       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
+        * The thread started here:
+        * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
+        */
+
+       /* MCG_STATUS_RIPV:
+        * When this bit is not set, the instruction pointer on the stack
+        * to resume at is not valid. If Xen was interrupted, we panic anyway
+        * right below. Otherwise it is up to the guest to figure out whether
+        * the guest kernel or guest userland is affected, and it should kill
+        * either itself or the affected process.
+        */
+
+       /* MCG_STATUS_EIPV:
+        * Evaluation of EIPV is the job of the guest.
+        */
+
+       if (xen_impacted) {
+               /* Now we are going to panic anyway. Allow interrupts, so that
+                * printk on serial console can work. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* Uh, that means a machine check exception
+                * occurred inside Xen. */
+               printk("Machine check exception occurred in Xen.\n");
+
+               /* If MCG_STATUS_EIPV indicates that the IP on the stack is
+                * related to the error, then it makes sense to print a stack
+                * trace. That can be useful for more detailed error analysis
+                * and/or error case studies to figure out if we can clear
+                * xen_impacted and kill a DomU instead
+                * (i.e. if a guest-only control structure is affected, but then
+                * we must ensure the bad pages are not re-used again).
+                */
+               if (status & MCG_STATUS_EIPV) {
+                       printk("MCE: Instruction Pointer is related to the 
error. "
+                               "Therefore, print the execution state.\n");
+                       show_execution_state(regs);
+               }
+               x86_mcinfo_dump(mc_data);
+               panic("End of MCE. Use mcelog to decode above error codes.\n");
+       }
+
+       /* If Dom0 registered a machine check handler, which is only possible
+        * with a PV MCA driver, then ... */
+       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
+               dom_state = DOM0_TRAP;
+
+               /* ... deliver machine check trap to Dom0. */
+               send_guest_trap(dom0, 0, TRAP_machine_check);
+
+               /* Xen may tell Dom0 now to notify the DomU.
+                * But this will happen through a hypercall. */
+       } else
+               /* Dom0 did not register a machine check handler, but if DomU
+                * did so, then... */
+               if ( guest_has_trap_callback(curdom, vcpu->vcpu_id,
+                                            TRAP_machine_check) ) {
+                       dom_state = DOMU_TRAP;
+
+                       /* ... deliver machine check trap to DomU */
+                       send_guest_trap(curdom, vcpu->vcpu_id,
+                                       TRAP_machine_check);
+       } else {
+               /* Hmm... no one feels responsible for handling the error.
+                * So do a quick check whether a DomU is impacted or not.
+                */
+               if (curdom == dom0) {
+                       /* Dom0 is impacted. Since no one can handle
+                        * this error, panic! */
+                       x86_mcinfo_dump(mc_data);
+                       panic("MCE occurred in Dom0, which can't handle it\n");
+
+                       /* UNREACHED */
+               } else {
+                       dom_state = DOMU_KILLED;
+
+                       /* Enable interrupts. This basically results in
+                        * calling sti on the *physical* cpu. But after
+                        * domain_crash() the vcpu pointer is invalid.
+                        * Therefore, we must unlock the irqs before killing
+                        * it. */
+                       vcpu_schedule_unlock_irq(vcpu);
+
+                       /* DomU is impacted. Kill it and continue. */
+                       domain_crash(curdom);
+               }
+       }
+
+
+       switch (dom_state) {
+       case DOM0_TRAP:
+       case DOMU_TRAP:
+               /* Enable interrupts. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* guest softirqs and event callbacks are scheduled
+                * immediately after this handler exits. */
+               break;
+       case DOMU_KILLED:
+               /* Nothing to do here. */
+               break;
+       default:
+               BUG();
+       }
+}
+
+
+/* AMD K8 machine check */
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+{
+       uint64_t value;
+       uint32_t i;
+       int cpu_nr;
+
+       machine_check_vector = k8_machine_check;
+       cpu_nr = smp_processor_id();
+       wmb();
+
+       rdmsrl(MSR_IA32_MCG_CAP, value);
+       if (value & MCG_CTL_P)  /* Control register present ? */
+               wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+       nr_mce_banks = value & MCG_CAP_COUNT;
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               switch (i) {
+               case 4: /* Northbridge */
+                       /* Enable error reporting of all errors,
+                        * enable error checking and
+                        * disable sync flooding */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+                       break;
+
+               default:
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+                       wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+                       break;
+               }
+       }
+
+       set_in_cr4(X86_CR4_MCE);
+       printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+}
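A note on the "MSR_IA32_MC0_STATUS + 4 * i" arithmetic used throughout k8_machine_check() and the init functions: each MCA bank occupies four consecutive MSRs (CTL, STATUS, ADDR, MISC) starting at MSR 0x400, so bank i's registers are reached with a fixed stride of 4. A small standalone sketch that only prints the derived addresses (the MSR numbers are the architectural ones):

/* Sketch only - prints the per-bank MSR addresses implied by the stride. */
#include <stdio.h>

#define MSR_IA32_MC0_CTL    0x400
#define MSR_IA32_MC0_STATUS 0x401
#define MSR_IA32_MC0_ADDR   0x402
#define MSR_IA32_MC0_MISC   0x403

int main(void)
{
    for (unsigned int i = 0; i < 5; i++)
        printf("bank %u: CTL=%#x STATUS=%#x ADDR=%#x MISC=%#x\n", i,
               MSR_IA32_MC0_CTL + 4 * i, MSR_IA32_MC0_STATUS + 4 * i,
               MSR_IA32_MC0_ADDR + 4 * i, MSR_IA32_MC0_MISC + 4 * i);
    /* Bank 4, the K8 northbridge, thus lands at MSRs 0x410-0x413. */
    return 0;
}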
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,303 @@
+/*
+ * MCA implementation for AMD CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/timer.h>
+#include <xen/event.h>
+#include <asm/processor.h> 
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+static struct timer mce_timer;
+
+#define MCE_PERIOD MILLISECS(15000)
+#define MCE_MIN    MILLISECS(2000)
+#define MCE_MAX    MILLISECS(30000)
+
+static s_time_t period = MCE_PERIOD;
+static int hw_threshold = 0;
+static int adjust = 0;
+
+/* The polling service routine:
+ * Collects information about correctable errors and notifies
+ * Dom0 via an event.
+ */
+void mce_amd_checkregs(void *info)
+{
+       struct vcpu *vcpu = current;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv;
+       unsigned int i;
+       unsigned int event_enabled;
+       unsigned int cpu_nr;
+       int error_found;
+
+       /* We don't need a slot yet. Only allocate one on error. */
+       mc_data = NULL;
+
+       cpu_nr = smp_processor_id();
+       event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+       error_found = 0;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               if (mc_data == NULL) {
+                       /* Now we need a slot to fill in error telemetry. */
+                       mc_data = x86_mcinfo_getptr();
+                       BUG_ON(mc_data == NULL);
+                       x86_mcinfo_clear(mc_data);
+                       x86_mcinfo_add(mc_data, &mc_global);
+               }
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               /* Increase polling frequency */
+               error_found = 1;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL)
+                               mc_info.mc_domid = d->domain_id;
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+               x86_mcinfo_add(mc_data, &mc_info);
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+               wmb();
+       }
+
+       if (error_found > 0) {
+               /* If Dom0 enabled the VIRQ_MCA event, then ... */
+               if (event_enabled)
+                       /* ... notify it. */
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               else
+                       /* ... or dump it */
+                       x86_mcinfo_dump(mc_data);
+       }
+
+       adjust += error_found;
+}
+
+/* Polling service routine invoker:
+ * Adjusts the poll frequency at runtime. No error means a low polling
+ * frequency, an error means a higher polling frequency.
+ * It uses the hw threshold register introduced in AMD K8 RevF to detect
+ * multiple correctable errors between two polls. In that case,
+ * the polling frequency is raised above normal.
+ */
+static void mce_amd_work_fn(void *data)
+{
+       on_each_cpu(mce_amd_checkregs, data, 1, 1);
+
+       if (adjust > 0) {
+               if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+                       /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+                       printk("MCE: polling routine found correctable error. "
+                               " Use mcelog to parse above error output.\n");
+               }
+       }
+
+       if (hw_threshold) {
+               uint64_t value;
+               uint32_t counter;
+
+               rdmsrl(MSR_IA32_MC4_MISC, value);
+               /* Only the error counter field is of interest
+                * Bit field is described in AMD K8 BKDG chapter 6.4.5.5
+                */
+               counter = (value & 0xFFF00000000ULL) >> 32U;
+
+               /* HW does not count *all* kinds of correctable errors.
+                * Thus it is possible that the polling routine finds a
+                * correctable error even though the HW reports nothing.
+                * However, the other way around is not possible (= BUG).
+                */
+               if (counter > 0) {
+                       /* The HW reported correctable errors. If the polling
+                        * routine did not find any, that is a bug
+                        * (see comment above). */
+                       BUG_ON(adjust == 0);
+                       /* Subtract 1 to avoid double-counting the error
+                        * already found by the polling service routine. */
+                       adjust += (counter - 1);
+
+                       /* Restart counter */
+                       /* No interrupt, reset counter value */
+                       value &= ~(0x60FFF00000000ULL);
+                       /* Counter enable */
+                       value |= (1ULL << 51);
+                       wrmsrl(MSR_IA32_MC4_MISC, value);
+                       wmb();
+               }
+       }
+
+       if (adjust > 0) {
+               /* Increase polling frequency */
+               adjust++; /* adjust == 1 must have an effect */
+               period /= adjust;
+       } else {
+               /* Decrease polling frequency */
+               period *= 2;
+       }
+       if (period > MCE_MAX) {
+               /* limit: Poll at least every 30s */
+               period = MCE_MAX;
+       }
+       if (period < MCE_MIN) {
+               /* limit: Poll at most every 2s.
+                * When this limit is reached, an uncorrectable error
+                * is expected to happen if Dom0 does nothing.
+                */
+               period = MCE_MIN;
+       }
+
+       set_timer(&mce_timer, NOW() + period);
+       adjust = 0;
+}
+
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
+{
+       if (c->x86_vendor != X86_VENDOR_AMD)
+               return;
+
+       /* Assume we are on K8 or newer AMD CPU here */
+
+       /* The threshold bitfields in MSR_IA32_MC4_MISC have
+        * been introduced along with the SVME feature bit. */
+       if (cpu_has(c, X86_FEATURE_SVME)) {
+               uint64_t value;
+
+               /* hw threshold registers present */
+               hw_threshold = 1;
+               rdmsrl(MSR_IA32_MC4_MISC, value);
+
+               if (value & (1ULL << 61)) { /* Locked bit */
+                       /* Locked by BIOS. Not available for use */
+                       hw_threshold = 0;
+               }
+               if (!(value & (1ULL << 63))) { /* Valid bit */
+                       /* No CtrP present */
+                       hw_threshold = 0;
+               } else {
+                       if (!(value & (1ULL << 62))) { /* Counter Bit */
+                               /* No counter field present */
+                               hw_threshold = 0;
+                       }
+               }
+
+               if (hw_threshold) {
+                       /* No interrupt, reset counter value */
+                       value &= ~(0x60FFF00000000ULL);
+                       /* Counter enable */
+                       value |= (1ULL << 51);
+                       wrmsrl(MSR_IA32_MC4_MISC, value);
+                       /* serialize */
+                       wmb();
+                       printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
+               }
+       }
+
+       init_timer(&mce_timer, mce_amd_work_fn, NULL, 0);
+       set_timer(&mce_timer, NOW() + period);
+
+       return;
+}
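The period adaptation in mce_amd_work_fn() above boils down to: divide the interval when errors were found (adjust is incremented once more, so even a single error shortens the period), double it on a quiet round, and clamp to [MCE_MIN, MCE_MAX]. A standalone sketch of just that arithmetic, with times in milliseconds instead of Xen's s_time_t:

/* Sketch only - models the period adjustment, not the MSR handling. */
#include <stdio.h>

#define MCE_PERIOD 15000
#define MCE_MIN     2000
#define MCE_MAX    30000

static long adjust_period(long period, int errors_found)
{
    if (errors_found > 0)
        period /= (errors_found + 1);   /* +1 so one error has an effect */
    else
        period *= 2;                    /* quiet round: back off */
    if (period > MCE_MAX) period = MCE_MAX;
    if (period < MCE_MIN) period = MCE_MIN;
    return period;
}

int main(void)
{
    long p = MCE_PERIOD;
    int bursts[] = { 0, 1, 3, 0, 0 };   /* made-up error counts per round */
    for (int i = 0; i < 5; i++) {
        p = adjust_period(p, bursts[i]);
        printf("round %d: %d error(s) -> next poll in %ldms\n",
               i, bursts[i], p);
    }
    return 0;
}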
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c      Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/k7.c      Fri Jul 04 16:27:44 2008 +0100
@@ -66,8 +66,8 @@ static fastcall void k7_machine_check(st
 }
 
 
-/* AMD K7 machine check is Intel like */
-void amd_mcheck_init(struct cpuinfo_x86 *c)
+/* AMD K7 machine check */
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        int i;
@@ -75,7 +75,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
        machine_check_vector = k7_machine_check;
        wmb();
 
-       printk (KERN_INFO "Intel machine check architecture supported.\n");
        rdmsr (MSR_IA32_MCG_CAP, l, h);
        if (l & (1<<8)) /* Control register present ? */
                wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
@@ -90,6 +89,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
        }
 
        set_in_cr4 (X86_CR4_MCE);
-       printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+       printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
                smp_processor_id());
 }
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Fri Jul 04 16:27:44 2008 +0100
@@ -8,73 +8,151 @@
 #include <xen/kernel.h>
 #include <xen/config.h>
 #include <xen/smp.h>
+#include <xen/errno.h>
 
 #include <asm/processor.h> 
 #include <asm/system.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 int mce_disabled = 0;
-int nr_mce_banks;
+unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
+/* XXX For now a fixed array is used. Later this should be changed
+ * to a dynamically allocated array whose size is calculated from the
+ * number of physical cpus present in the machine.
+ * The more physical cpus are available, the more entries you need.
+ */
+#define MAX_MCINFO     10
+
+struct mc_machine_notify {
+       struct mc_info mc;
+       uint32_t fetch_idx;
+       uint32_t valid;
+};
+
+struct mc_machine {
+
+       /* Array structure used for collecting machine check error telemetry. */
+       struct mc_info mc[MAX_MCINFO];
+
+       /* We handle multiple machine check reports lockless by
+        * iterating through the array using the producer/consumer concept.
+        */
+       /* Producer array index to fill with machine check error data.
+        * Index must be increased atomically. */
+       uint32_t error_idx;
+
+       /* Consumer array index to fetch machine check error data from.
+        * Index must be increased atomically. */
+       uint32_t fetch_idx;
+
+       /* Array holding the indices of the mc array that allow
+        * a Dom0 to notify a DomU to re-fetch the same machine check error
+        * data. The notification and refetch also use their own
+        * producer/consumer mechanism, because Dom0 may decide not to report
+        * every error to the impacted DomU.
+        */
+       struct mc_machine_notify notify[MAX_MCINFO];
+
+       /* Array index to get fetch_idx from.
+        * Index must be increased atomically. */
+       uint32_t notifyproducer_idx;
+       uint32_t notifyconsumer_idx;
+};
+
+/* Global variable with machine check information. */
+struct mc_machine mc_data;
+
 /* Handle unconfigured int18 (should never happen) */
-static fastcall void unexpected_machine_check(struct cpu_user_regs * regs, long error_code)
+static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-       printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
-}
+       printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+               smp_processor_id());
+}
+
 
 /* Call the installed machine check handler for this CPU setup. */
-void fastcall (*machine_check_vector)(struct cpu_user_regs *, long error_code) = unexpected_machine_check;
+void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+/* Init machine check callback handler.
+ * It is used to collect additional information provided by newer
+ * CPU families/models without the need to duplicate the whole handler.
+ * This avoids having many handlers doing almost the same thing, each
+ * with its own tweaks and bugs. */
+int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
+
+
+static void amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+
+       switch (ci->x86) {
+       case 6:
+               amd_k7_mcheck_init(ci);
+               break;
+
+       case 0xf:
+               amd_k8_mcheck_init(ci);
+               break;
+
+       case 0x10:
+               amd_f10_mcheck_init(ci);
+               break;
+
+       default:
+               /* Assume that machine check support is available.
+                * The minimum provided support is at least the K8. */
+               amd_k8_mcheck_init(ci);
+       }
+}
 
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
-       if (mce_disabled==1)
+       if (mce_disabled == 1) {
+               printk(XENLOG_INFO "MCE support disabled by bootparam\n");
                return;
+       }
+
+       if (!cpu_has(c, X86_FEATURE_MCE)) {
+               printk(XENLOG_INFO "CPU%i: No machine check support 
available\n",
+                       smp_processor_id());
+               return;
+       }
+
+       memset(&mc_data, 0, sizeof(struct mc_machine));
 
        switch (c->x86_vendor) {
-               case X86_VENDOR_AMD:
-                       amd_mcheck_init(c);
-                       break;
-
-               case X86_VENDOR_INTEL:
+       case X86_VENDOR_AMD:
+               amd_mcheck_init(c);
+               break;
+
+       case X86_VENDOR_INTEL:
 #ifndef CONFIG_X86_64
-                       if (c->x86==5)
-                               intel_p5_mcheck_init(c);
-                       if (c->x86==6)
-                               intel_p6_mcheck_init(c);
+               if (c->x86==5)
+                       intel_p5_mcheck_init(c);
+               if (c->x86==6)
+                       intel_p6_mcheck_init(c);
 #endif
-                       if (c->x86==15)
-                               intel_p4_mcheck_init(c);
-                       break;
+               if (c->x86==15)
+                       intel_p4_mcheck_init(c);
+               break;
 
 #ifndef CONFIG_X86_64
-               case X86_VENDOR_CENTAUR:
-                       if (c->x86==5)
-                               winchip_mcheck_init(c);
-                       break;
+       case X86_VENDOR_CENTAUR:
+               if (c->x86==5)
+                       winchip_mcheck_init(c);
+               break;
 #endif
 
-               default:
-                       break;
-       }
-}
-
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-       old_cr4 = read_cr4();
-       clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-       if (old_cr4 & X86_CR4_MCE)
-               set_in_cr4(X86_CR4_MCE);
-}
+       default:
+               break;
+       }
+}
+
 
 static void __init mcheck_disable(char *str)
 {
@@ -88,3 +166,411 @@ static void __init mcheck_enable(char *s
 
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
+
+
+#include <xen/guest_access.h>
+#include <asm/traps.h>
+
+struct mc_info *x86_mcinfo_getptr(void)
+{
+       struct mc_info *mi;
+       uint32_t entry, next;
+
+       for (;;) {
+               entry = mc_data.error_idx;
+               smp_rmb();
+               next = entry + 1;
+               if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
+                       break;
+       }
+
+       mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
+       BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
+
+       return mi;
+}
+
+static int x86_mcinfo_matches_guest(const struct mc_info *mi,
+                       const struct domain *d, const struct vcpu *v)
+{
+       struct mcinfo_common *mic;
+       struct mcinfo_global *mig;
+
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+       mig = (struct mcinfo_global *)mic;
+       if (mig == NULL)
+               return 0;
+
+       if (d->domain_id != mig->mc_domid)
+               return 0;
+
+       if (v->vcpu_id != mig->mc_vcpuid)
+               return 0;
+
+       return 1;
+}
+
+
+#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
+
+static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
+                               const struct domain *d, const struct vcpu *v)
+{
+       struct mc_info *mi;
+
+       /* This function is called from the fetch hypercall with
+        * the mc_lock spinlock held. Thus, no need for locking here.
+        */
+       mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
+       if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
+               /* Bogus domU command detected. */
+               *fetch_idx = 0;
+               return NULL;
+       }
+
+       *fetch_idx = mc_data.fetch_idx;
+       mc_data.fetch_idx++;
+       BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
+
+       return mi;
+}
+
+
+static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
+{
+       struct mc_machine_notify *mn;
+       struct mcinfo_common *mic = NULL;
+       struct mcinfo_global *mig;
+       struct domain *d;
+       int i;
+
+       /* This function is called from the notifier hypercall with
+        * the mc_notify_lock spinlock held. Thus, no need for locking here.
+        */
+
+       /* First invalidate entries for guests that disappeared after
+        * notification (e.g. shutdown/crash). This step prevents the
+        * notification array from filling up with stale/leaking entries.
+        */
+       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+               BUG_ON(mic == NULL);
+               mig = (struct mcinfo_global *)mic;
+               d = get_domain_by_id(mig->mc_domid);
+               if (d == NULL) {
+                       /* Domain does not exist. */
+                       mn->valid = 0;
+               }
+               if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
+                       mc_data.notifyconsumer_idx++;
+       }
+
+       /* Now put in the error telemetry. Since all error data fetchable
+        * by domUs are uncorrectable errors, they are very important.
+        * So we dump them before overwriting them. If a guest takes that long,
+        * we can assume something bad already happened (crash, hang, etc.).
+        */
+       mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
+
+       if (mn->valid) {
+               struct mcinfo_common *mic = NULL;
+               struct mcinfo_global *mig;
+
+               /* To not lose the information, we dump it. */
+               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+               BUG_ON(mic == NULL);
+               mig = (struct mcinfo_global *)mic;
+               printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
+                       "fetch machine check error telemetry. But Domain ID "
+                       "did not do that in time.\n",
+                       mig->mc_domid);
+               x86_mcinfo_dump(&mn->mc);
+       }
+
+       memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
+               sizeof(struct mc_info));
+       mn->fetch_idx = mc_notifydomain->fetch_idx;
+       mn->valid = 1;
+
+       mc_data.notifyproducer_idx++;
+
+       /* By design there can never be more notifies than machine check errors.
+        * If that ever happens, then we hit a bug. */
+       BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
+       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+}
+
+static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
+                               const struct domain *d, const struct vcpu *v)
+{
+       struct mc_machine_notify *mn = NULL;
+       uint32_t i;
+       int found;
+
+       /* This function is called from the fetch hypercall with
+        * the mc_notify_lock spinlock held. Thus, no need for locking here.
+        */
+
+       /* The notifier data is filled in the order guests get notified, but
+        * guests may fetch them in a different order. That's why we need
+        * the game with valid/invalid entries. */
+       found = 0;
+       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+               if (!mn->valid) {
+                       if (i == mc_data.notifyconsumer_idx)
+                               mc_data.notifyconsumer_idx++;
+                       continue;
+               }
+               if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (!found) {
+               /* This domain has never been notified. This must be
+                * a bogus domU command. */
+               *fetch_idx = 0;
+               return NULL;
+       }
+
+       BUG_ON(mn == NULL);
+       *fetch_idx = mn->fetch_idx;
+       mn->valid = 0;
+
+       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+       return &mn->mc;
+}
+
+
+void x86_mcinfo_clear(struct mc_info *mi)
+{
+       memset(mi, 0, sizeof(struct mc_info));
+       x86_mcinfo_nentries(mi) = 0;
+}
+
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
+{
+       int i;
+       unsigned long end1, end2;
+       struct mcinfo_common *mic, *mic_base, *mic_index;
+
+       mic = (struct mcinfo_common *)mcinfo;
+       mic_index = mic_base = x86_mcinfo_first(mi);
+
+       /* go to first free entry */
+       for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
+               mic_index = x86_mcinfo_next(mic_index);
+       }
+
+       /* check if there is enough space */
+       end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
+       end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
+
+       if (end1 < end2)
+               return -ENOSPC; /* No space. Can't add entry. */
+
+       /* there's enough space. add entry. */
+       memcpy(mic_index, mic, mic->size);
+       x86_mcinfo_nentries(mi)++;
+
+       return 0;
+}
+
+
+/* Dump machine check information in a format
+ * that mcelog can parse. This is used only when
+ * Dom0 does not take the notification. */
+void x86_mcinfo_dump(struct mc_info *mi)
+{
+       struct mcinfo_common *mic = NULL;
+       struct mcinfo_global *mc_global;
+       struct mcinfo_bank *mc_bank;
+
+       /* first print the global info */
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+       if (mic == NULL)
+               return;
+       mc_global = (struct mcinfo_global *)mic;
+       if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+               printk(XENLOG_WARNING
+                       "CPU%d: Machine Check Exception: %16"PRIx64"\n",
+                       mc_global->mc_coreid, mc_global->mc_gstatus);
+       } else {
+               printk(XENLOG_WARNING "MCE: The hardware reports a non "
+                       "fatal, correctable incident occured on "
+                       "CPU %d.\n",
+                       mc_global->mc_coreid);
+       }
+
+       /* then the bank information */
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
+       do {
+               if (mic == NULL)
+                       return;
+
+               if (mic->type == MC_TYPE_BANK) {
+                       mc_bank = (struct mcinfo_bank *)mic;
+
+                       printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
+                               mc_bank->mc_bank,
+                               mc_bank->mc_status);
+                       if (mc_bank->mc_status & MCi_STATUS_MISCV)
+                               printk("[%16"PRIx64"]", mc_bank->mc_misc);
+                       if (mc_bank->mc_status & MCi_STATUS_ADDRV)
+                               printk(" at %16"PRIx64, mc_bank->mc_addr);
+                       printk("\n");
+               }
+
+               /* Advance before re-testing the type: a bare "continue"
+                * here would skip this step and loop forever on a
+                * non-bank entry. */
+               mic = x86_mcinfo_next(mic);
+               if ((mic == NULL) || (mic->size == 0))
+                       break;
+       } while (1);
+}
+
+
+
+/* Machine Check Architecture Hypercall */
+long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
+{
+       long ret = 0;
+       struct xen_mc curop, *op = &curop;
+       struct vcpu *v = current;
+       struct domain *domU;
+       struct xen_mc_fetch *mc_fetch;
+       struct xen_mc_notifydomain *mc_notifydomain;
+       struct mc_info *mi;
+       uint32_t flags;
+       uint32_t fetch_idx;
+        uint16_t vcpuid;
+       /* Use a different lock for the notify hypercall in order to allow
+        * a DomU to fetch mc data while Dom0 notifies another DomU. */
+       static DEFINE_SPINLOCK(mc_lock);
+       static DEFINE_SPINLOCK(mc_notify_lock);
+
+       if ( copy_from_guest(op, u_xen_mc, 1) )
+               return -EFAULT;
+
+       if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
+               return -EACCES;
+
+       switch ( op->cmd ) {
+       case XEN_MC_fetch:
+               /* This hypercall is for any domain */
+               mc_fetch = &op->u.mc_fetch;
+
+               switch (mc_fetch->flags) {
+               case XEN_MC_CORRECTABLE:
+                       /* But polling mode is Dom0 only, because
+                        * correctable errors are reported to Dom0 only */
+                       if ( !IS_PRIV(v->domain) )
+                               return -EPERM;
+                       break;
+
+               case XEN_MC_TRAP:
+                       break;
+               default:
+                       return -EFAULT;
+               }
+
+               flags = XEN_MC_OK;
+               spin_lock(&mc_lock);
+
+               if ( IS_PRIV(v->domain) ) {
+                       /* this must be Dom0. So a notify hypercall
+                        * can't have happened before. */
+                       mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+               } else {
+                       /* Hypercall comes from an unprivileged domain */
+                       domU = v->domain;
+                       if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+                               /* Dom0 must have notified this DomU before
+                                * via the notify hypercall. */
+                               mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+                       } else {
+                               /* Xen notified the DomU. */
+                               mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+                       }
+               }
+
+               if (mi) {
+                       memcpy(&mc_fetch->mc_info, mi,
+                               sizeof(struct mc_info));
+               } else {
+                       /* There is no data for a bogus DomU command. */
+                       flags |= XEN_MC_NODATA;
+                       memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
+               }
+
+               mc_fetch->flags = flags;
+               mc_fetch->fetch_idx = fetch_idx;
+
+               if ( copy_to_guest(u_xen_mc, op, 1) )
+                       ret = -EFAULT;
+
+               spin_unlock(&mc_lock);
+               break;
+
+       case XEN_MC_notifydomain:
+               /* This hypercall is for Dom0 only */
+               if ( !IS_PRIV(v->domain) )
+                       return -EPERM;
+
+               spin_lock(&mc_notify_lock);
+
+               mc_notifydomain = &op->u.mc_notifydomain;
+               domU = get_domain_by_id(mc_notifydomain->mc_domid);
+               vcpuid = mc_notifydomain->mc_vcpuid;
+
+               if ((domU == NULL) || (domU == dom0)) {
+                       /* It's not possible to notify a non-existent domain
+                        * or the dom0. */
+                       spin_unlock(&mc_notify_lock);
+                       return -EACCES;
+               }
+
+               if (vcpuid >= MAX_VIRT_CPUS) {
+                       /* It's not possible to notify a vcpu that Xen
+                        * can't assign to a domain. */
+                       spin_unlock(&mc_notify_lock);
+                       return -EACCES;
+               }
+
+               mc_notifydomain->flags = XEN_MC_OK;
+
+               mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
+               if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
+                       /* The error telemetry is not for the guest
+                        * Dom0 wants to notify. */
+                       mc_notifydomain->flags |= XEN_MC_NOMATCH;
+               } else if ( guest_has_trap_callback(domU, vcpuid,
+                                               TRAP_machine_check) )
+               {
+                       /* Send notification */
+                       if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
+                               mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
+               } else
+                       mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
+
+#ifdef DEBUG
+               /* sanity check - these two flags are mutually exclusive */
+               if ((flags & XEN_MC_CANNOTHANDLE) && (flags & 
XEN_MC_NOTDELIVERED))
+                       BUG();
+#endif
+
+               if ( copy_to_guest(u_xen_mc, op, 1) )
+                       ret = -EFAULT;
+
+               if (ret == 0) {
+                       x86_mcinfo_marknotified(mc_notifydomain);
+               }
+
+               spin_unlock(&mc_notify_lock);
+               break;
+       }
+
+       return ret;
+}
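The lockless slot allocation in x86_mcinfo_getptr() above deserves spelling out: concurrent writers race on mc_data.error_idx with cmpxchg until one of them wins, and the winner owns ring slot "entry % MAX_MCINFO". A standalone sketch of the same idea, using C11 atomics in place of Xen's cmpxchg() (the ring payload is a plain int here just for demonstration):

/* Sketch only - the producer-index claim, outside the hypervisor. */
#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

#define MAX_MCINFO 10

static _Atomic uint32_t error_idx;
static int ring[MAX_MCINFO];

static int *claim_slot(void)
{
    uint32_t entry, next;
    do {
        entry = atomic_load(&error_idx);
        next = entry + 1;
        /* On failure another writer won the race; retry with a fresh read. */
    } while (!atomic_compare_exchange_weak(&error_idx, &entry, next));
    return &ring[entry % MAX_MCINFO];   /* ring wraps; consumers must keep up */
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        int *slot = claim_slot();
        *slot = i;
        printf("claimed slot %td\n", slot - ring);
    }
    return 0;
}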
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Fri Jul 04 16:27:44 2008 +0100
@@ -1,14 +1,30 @@
 #include <xen/init.h>
+#include <asm/traps.h>
 
-void amd_mcheck_init(struct cpuinfo_x86 *c);
+/* Init functions */
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
-/* Call the installed machine check handler for this CPU setup. */
-extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code);
+/* Function pointer used in the handlers to collect additional information
+ * provided by newer CPU families/models without the need to duplicate
+ * the whole handler resulting in various handlers each with its own
+ * tweaks and bugs */
+extern int (*mc_callback_bank_extended)(struct mc_info *mi,
+               uint16_t bank, uint64_t status);
 
+
+/* Helper functions used for collecting error telemetry */
+struct mc_info *x86_mcinfo_getptr(void);
+void x86_mcinfo_clear(struct mc_info *mi);
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+/* Global variables */
 extern int mce_disabled __initdata;
-extern int nr_mce_banks;
-
+extern unsigned int nr_mce_banks;
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c       Fri Jul 04 16:27:44 2008 +0100
@@ -68,19 +68,29 @@ static int __init init_nonfatal_mce_chec
        if (!cpu_has(c, X86_FEATURE_MCA))
                return -ENODEV;
 
-       /* Some Athlons misbehave when we frob bank 0 */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-               boot_cpu_data.x86 == 6)
-                       firstbank = 1;
-       else
-                       firstbank = 0;
-
        /*
         * Check for non-fatal errors every MCE_RATE s
         */
-       init_timer(&mce_timer, mce_work_fn, NULL, 0);
-       set_timer(&mce_timer, NOW() + MCE_PERIOD);
-       printk(KERN_INFO "Machine check exception polling timer started.\n");
+       switch (c->x86_vendor) {
+       case X86_VENDOR_AMD:
+               if (c->x86 == 6) { /* K7 */
+                       firstbank = 1;
+                       init_timer(&mce_timer, mce_work_fn, NULL, 0);
+                       set_timer(&mce_timer, NOW() + MCE_PERIOD);
+                       break;
+               }
+
+               /* Assume we are on K8 or newer AMD CPU here */
+               amd_nonfatal_mcheck_init(c);
+               break;
+
+       case X86_VENDOR_INTEL:
+               init_timer(&mce_timer, mce_work_fn, NULL, 0);
+               set_timer(&mce_timer, NOW() + MCE_PERIOD);
+               break;
+       }
+
+       printk(KERN_INFO "MCA: Machine check polling timer started.\n");
        return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/x86_mca.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,72 @@
+/*
+ * MCA implementation for AMD K7/K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* The MCA/MCE MSRs should not be used anywhere else.
+ * They are CPU family/model specific and are meant solely
+ * for use in machine check handling.
+ * So we define them here rather than in <asm/msr.h>.
+ */
+
+
+/* Bitfield of the MSR_IA32_MCG_CAP register */
+#define MCG_CAP_COUNT           0x00000000000000ffULL
+#define MCG_CTL_P               0x0000000000000100ULL
+/* Bits 9-63 are reserved */
+
+/* Bitfield of the MSR_IA32_MCG_STATUS register */
+#define MCG_STATUS_RIPV         0x0000000000000001ULL
+#define MCG_STATUS_EIPV         0x0000000000000002ULL
+#define MCG_STATUS_MCIP         0x0000000000000004ULL
+/* Bits 3-63 are reserved */
+
+/* Bitfield of MSR_K8_MCi_STATUS registers */
+/* MCA error code */
+#define MCi_STATUS_MCA          0x000000000000ffffULL
+/* model-specific error code */
+#define MCi_STATUS_MSEC         0x00000000ffff0000ULL
+/* Other information */
+#define MCi_STATUS_OTHER        0x01ffffff00000000ULL
+/* processor context corrupt */
+#define MCi_STATUS_PCC          0x0200000000000000ULL
+/* MSR_K8_MCi_ADDR register valid */
+#define MCi_STATUS_ADDRV        0x0400000000000000ULL
+/* MSR_K8_MCi_MISC register valid */
+#define MCi_STATUS_MISCV        0x0800000000000000ULL
+/* error condition enabled */
+#define MCi_STATUS_EN           0x1000000000000000ULL
+/* uncorrected error */
+#define MCi_STATUS_UC           0x2000000000000000ULL
+/* status register overflow */
+#define MCi_STATUS_OVER         0x4000000000000000ULL
+/* valid */
+#define MCi_STATUS_VAL          0x8000000000000000ULL
+
+/* Bitfield of the MCi_STATUS_OTHER field */
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED1      0x00001fff00000000ULL
+/* uncorrectable ECC error */
+#define MCi_STATUS_OTHER_UC_ECC         0x0000200000000000ULL
+/* correctable ECC error */
+#define MCi_STATUS_OTHER_C_ECC          0x0000400000000000ULL
+/* ECC syndrome of an ECC error */
+#define MCi_STATUS_OTHER_ECC_SYNDROME   0x007f800000000000ULL
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
+
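
For illustration, decoding a raw MCi_STATUS value with the masks above might
look like this (the sample value is made up):

    uint64_t status = 0x9400000000000175ULL;  /* hypothetical raw value */

    if (status & MCi_STATUS_VAL) {
        uint16_t mca_code   = (uint16_t)(status & MCi_STATUS_MCA);
        uint16_t model_code = (uint16_t)((status & MCi_STATUS_MSEC) >> 16);
        int uncorrected     = !!(status & MCi_STATUS_UC);
        /* MSR_K8_MCi_ADDR holds a usable address only if ADDRV is set. */
        int addr_valid      = !!(status & MCi_STATUS_ADDRV);
    }
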
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/nmi.c
--- a/xen/arch/x86/nmi.c        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/nmi.c        Fri Jul 04 16:27:44 2008 +0100
@@ -457,10 +457,10 @@ static void do_nmi_stats(unsigned char k
     if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
         return;
 
-    if ( v->nmi_pending || v->nmi_masked )
+    if ( v->nmi_pending || (v->trap_priority >= VCPU_TRAP_NMI) )
         printk("dom0 vpu0: NMI %s%s\n",
                v->nmi_pending ? "pending " : "",
-               v->nmi_masked  ? "masked " : "");
+               (v->trap_priority >= VCPU_TRAP_NMI)  ? "masked " : "");
     else
         printk("dom0 vcpu0: NMI neither pending nor masked\n");
 }
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/traps.c      Fri Jul 04 16:27:44 2008 +0100
@@ -61,6 +61,7 @@
 #include <asm/msr.h>
 #include <asm/shared.h>
 #include <asm/x86_emulate.h>
+#include <asm/traps.h>
 #include <asm/hvm/vpt.h>
 #include <public/arch-x86/cpuid.h>
 
@@ -486,6 +487,20 @@ static unsigned int check_guest_io_break
 }
 
 /*
+ * Called from asm to set up the MCE trapbounce info.
+ * Returns 0 if no callback is set up, else 1.
+ */
+asmlinkage int set_guest_machinecheck_trapbounce(void)
+{
+    struct vcpu *v = current;
+    struct trap_bounce *tb = &v->arch.trap_bounce;
+ 
+    do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
+    tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
+    return !null_trap_bounce(v, tb);
+}
+
+/*
  * Called from asm to set up the NMI trapbounce info.
  * Returns 0 if no callback is set up, else 1.
  */
@@ -904,8 +919,6 @@ asmlinkage void do_int3(struct cpu_user_
 
 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
 {
-    extern fastcall void (*machine_check_vector)(
-        struct cpu_user_regs *, long error_code);
     machine_check_vector(regs, regs->error_code);
 }
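
The indirection above means a family-specific init routine installs its own
handler at boot; schematically (the handler and init names here are
illustrative, not quoted from this hunk):

    /* In e.g. a K8 mcheck init routine, during CPU setup: */
    machine_check_vector = k8_machine_check;
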
 
@@ -2678,25 +2691,51 @@ asmlinkage void do_general_protection(st
     panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
 }
 
+static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
+
 static void nmi_mce_softirq(void)
 {
-    /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
-    vcpu_kick(dom0->vcpu[0]);
+    int cpu = smp_processor_id();
+    struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
+    cpumask_t affinity;
+
+    BUG_ON(st == NULL);
+    BUG_ON(st->vcpu == NULL);
+
+    /* Set the tmp value unconditionally, so that
+     * the check in the iret hypercall works. */
+    st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
+
+    if ((cpu != st->processor)
+       || (st->processor != st->vcpu->processor))
+    {
+        /* We are on a different physical cpu.
+         * Make sure to wake up the vcpu on the
+         * specified processor.
+         */
+        cpus_clear(affinity);
+        cpu_set(st->processor, affinity);
+        vcpu_set_affinity(st->vcpu, &affinity);
+
+        /* Affinity is restored in the iret hypercall. */
+    }
+
+    /* Only used to defer wakeup of domain/vcpu to
+     * a safe (non-NMI/MCE) context.
+     */
+    vcpu_kick(st->vcpu);
 }
 
 static void nmi_dom0_report(unsigned int reason_idx)
 {
-    struct domain *d;
-    struct vcpu   *v;
-
-    if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
+    struct domain *d = dom0;
+
+    if ( (d == NULL) || (d->vcpu[0] == NULL) )
         return;
 
     set_bit(reason_idx, nmi_reason(d));
 
-    /* Not safe to wake a vcpu here, or even to schedule a tasklet! */
-    if ( !test_and_set_bool(v->nmi_pending) )
-        raise_softirq(NMI_MCE_SOFTIRQ);
+    send_guest_trap(d, 0, TRAP_nmi);
 }
 
 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
@@ -3010,6 +3049,70 @@ long unregister_guest_nmi_callback(void)
     return 0;
 }
 
+int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
+{
+    struct vcpu *v;
+    struct trap_info *t;
+
+    BUG_ON(d == NULL);
+    BUG_ON(vcpuid >= MAX_VIRT_CPUS);
+
+    /* Sanity check - XXX should be more fine-grained. */
+    BUG_ON(trap_nr > TRAP_syscall);
+
+    v = d->vcpu[vcpuid];
+    t = &v->arch.guest_context.trap_ctxt[trap_nr];
+
+    return (t->address != 0);
+}
+
+
+int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
+{
+    struct vcpu *v;
+    struct softirq_trap *st;
+
+    BUG_ON(d == NULL);
+    BUG_ON(vcpuid >= MAX_VIRT_CPUS);
+    v = d->vcpu[vcpuid];
+
+    switch (trap_nr) {
+    case TRAP_nmi:
+        if ( !test_and_set_bool(v->nmi_pending) ) {
+               st = &per_cpu(softirq_trap, smp_processor_id());
+               st->domain = dom0;
+               st->vcpu = dom0->vcpu[0];
+               st->processor = st->vcpu->processor;
+
+               /* not safe to wake up a vcpu here */
+               raise_softirq(NMI_MCE_SOFTIRQ);
+               return 0;
+        }
+        break;
+
+    case TRAP_machine_check:
+
+        /* We are called by the machine check (exception or polling) handlers
+         * on the physical CPU that reported a machine check error. */
+
+        if ( !test_and_set_bool(v->mce_pending) ) {
+                st = &per_cpu(softirq_trap, smp_processor_id());
+                st->domain = d;
+                st->vcpu = v;
+                st->processor = v->processor;
+
+                /* not safe to wake up a vcpu here */
+                raise_softirq(NMI_MCE_SOFTIRQ);
+                return 0;
+        }
+        break;
+    }
+
+    /* delivery failed */
+    return -EIO;
+}
+
+
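For illustration, the intended calling pattern from a machine check handler
would be roughly the following (a sketch; the handler code itself lives in
mce.c and is not part of this hunk):

    /* Deliver the machine check to the impacted domain if it can take it. */
    if ( guest_has_trap_callback(d, vcpuid, TRAP_machine_check) )
    {
        if ( send_guest_trap(d, vcpuid, TRAP_machine_check) )
            /* non-zero (-EIO): an MCE is already pending on that vcpu */
            gdprintk(XENLOG_WARNING, "MCE delivery to dom%d failed\n",
                     d->domain_id);
    }
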
 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
 {
     struct trap_info cur;
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 16:27:44 2008 +0100
@@ -67,7 +67,11 @@ void __dummy__(void)
            arch.guest_context.kernel_sp);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
-    OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked);
+    OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
+    OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority);
+    OFFSET(VCPU_trap_priority, struct vcpu, trap_priority);
+    DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI);
+    DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE);
     DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
     BLANK();
 
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_32/entry.S       Fri Jul 04 16:27:44 2008 +0100
@@ -229,6 +229,8 @@ test_all_events:
         shl  $IRQSTAT_shift,%eax
         test %ecx,irq_stat(%eax,1)
         jnz  process_softirqs
+        testb $1,VCPU_mce_pending(%ebx)
+        jnz  process_mce
         testb $1,VCPU_nmi_pending(%ebx)
         jnz  process_nmi
 test_guest_events:
@@ -255,15 +257,35 @@ process_softirqs:
         jmp  test_all_events
 
         ALIGN
+/* %ebx: struct vcpu */
+process_mce:
+        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx)
+        jae  test_guest_events
+        sti
+        movb $0,VCPU_mce_pending(%ebx)
+        call set_guest_machinecheck_trapbounce
+        test %eax,%eax
+        jz   test_all_events
+        movw VCPU_trap_priority(%ebx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%ebx)       # iret hypercall
+        movw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx)
+        jmp process_trap
+
+        ALIGN
+/* %ebx: struct vcpu */
 process_nmi:
-        testb $1,VCPU_nmi_masked(%ebx)
-        jnz  test_guest_events
+        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx)
+        jae  test_guest_events
         sti
         movb $0,VCPU_nmi_pending(%ebx)
         call set_guest_nmi_trapbounce
         test %eax,%eax
         jz   test_all_events
-        movb $1,VCPU_nmi_masked(%ebx)
+        movw VCPU_trap_priority(%ebx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%ebx)       # iret hypercall
+        movw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx)
+        /* FALLTHROUGH */
+process_trap:
         leal VCPU_trap_bounce(%ebx),%edx
         call create_bounce_frame
         jmp  test_all_events
@@ -681,6 +703,10 @@ ENTRY(hypercall_table)
         .long do_sysctl             /* 35 */
         .long do_domctl
         .long do_kexec_op
+        .rept __HYPERVISOR_arch_0-((.-hypercall_table)/4)
+        .long do_ni_hypercall
+        .endr
+        .long do_mca                /* 48 */
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -724,6 +750,10 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
         .byte 2 /* do_kexec_op          */
+        .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
+        .byte 0 /* do_ni_hypercall      */
+        .endr
+        .byte 1 /* do_mca               */  /* 48 */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_32/traps.c       Fri Jul 04 16:27:44 2008 +0100
@@ -255,8 +255,13 @@ unsigned long do_iret(void)
             goto exit_and_crash;
     }
 
-    /* No longer in NMI context. */
-    v->nmi_masked = 0;
+    /* Restore affinity.  */
+    if ((v->trap_priority >= VCPU_TRAP_NMI)
+       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
+        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
+
+    /* Restore previous trap priority */
+    v->trap_priority = v->old_trap_priority;
 
     /* Restore upcall mask from supplied EFLAGS.IF. */
     vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 16:27:44 2008 +0100
@@ -92,7 +92,11 @@ void __dummy__(void)
     OFFSET(VCPU_kernel_ss, struct vcpu, arch.guest_context.kernel_ss);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
-    OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked);
+    OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
+    OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority);
+    OFFSET(VCPU_trap_priority, struct vcpu, trap_priority);
+    DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI);
+    DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE);
     DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
     DEFINE(_VGCF_syscall_disables_events,  _VGCF_syscall_disables_events);
     BLANK();
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/compat/entry.S
--- a/xen/arch/x86/x86_64/compat/entry.S        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/compat/entry.S        Fri Jul 04 16:27:44 2008 +0100
@@ -101,6 +101,8 @@ ENTRY(compat_test_all_events)
         leaq  irq_stat(%rip),%rcx
         testl $~0,(%rcx,%rax,1)
         jnz   compat_process_softirqs
+        testb $1,VCPU_mce_pending(%rbx)
+        jnz   compat_process_mce
         testb $1,VCPU_nmi_pending(%rbx)
         jnz   compat_process_nmi
 compat_test_guest_events:
@@ -129,15 +131,34 @@ compat_process_softirqs:
 
        ALIGN
 /* %rbx: struct vcpu */
+compat_process_mce:
+        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jae  compat_test_guest_events
+        sti
+        movb $0,VCPU_mce_pending(%rbx)
+        call set_guest_machinecheck_trapbounce
+        testl %eax,%eax
+        jz    compat_test_all_events
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw  $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jmp   compat_process_trap
+
+       ALIGN
+/* %rbx: struct vcpu */
 compat_process_nmi:
-        testb $1,VCPU_nmi_masked(%rbx)
-        jnz   compat_test_guest_events
+        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        jae   compat_test_guest_events
         sti
         movb  $0,VCPU_nmi_pending(%rbx)
         call  set_guest_nmi_trapbounce
         testl %eax,%eax
         jz    compat_test_all_events
-        movb  $1,VCPU_nmi_masked(%rbx)
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw  $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        /* FALLTHROUGH */
+compat_process_trap:
         leaq  VCPU_trap_bounce(%rbx),%rdx
         call  compat_create_bounce_frame
         jmp   compat_test_all_events
@@ -386,6 +407,10 @@ ENTRY(compat_hypercall_table)
         .quad do_sysctl                 /* 35 */
         .quad do_domctl
         .quad compat_kexec_op
+        .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8)
+        .quad compat_ni_hypercall
+        .endr
+        .quad do_mca                    /* 48 */
         .rept NR_hypercalls-((.-compat_hypercall_table)/8)
         .quad compat_ni_hypercall
         .endr
@@ -429,6 +454,10 @@ ENTRY(compat_hypercall_args_table)
         .byte 1 /* do_sysctl                */  /* 35 */
         .byte 1 /* do_domctl                */
         .byte 2 /* compat_kexec_op          */
+        .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table)
+        .byte 0 /* compat_ni_hypercall      */
+        .endr
+        .byte 1 /* do_mca                   */
         .rept NR_hypercalls-(.-compat_hypercall_args_table)
         .byte 0 /* compat_ni_hypercall      */
         .endr
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/compat/traps.c
--- a/xen/arch/x86/x86_64/compat/traps.c        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/compat/traps.c        Fri Jul 04 16:27:44 2008 +0100
@@ -121,8 +121,13 @@ unsigned int compat_iret(void)
     else
         regs->_esp += 16;
 
-    /* No longer in NMI context. */
-    v->nmi_masked = 0;
+    /* Restore affinity.  */
+    if ((v->trap_priority >= VCPU_TRAP_NMI)
+       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
+        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
+
+    /* Restore previous trap priority */
+    v->trap_priority = v->old_trap_priority;
 
     /* Restore upcall mask from supplied EFLAGS.IF. */
     vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/entry.S
--- a/xen/arch/x86/x86_64/entry.S       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/entry.S       Fri Jul 04 16:27:44 2008 +0100
@@ -205,6 +205,8 @@ test_all_events:
         leaq  irq_stat(%rip),%rcx
         testl $~0,(%rcx,%rax,1)
         jnz   process_softirqs
+        testb $1,VCPU_mce_pending(%rbx)
+        jnz   process_mce
         testb $1,VCPU_nmi_pending(%rbx)
         jnz   process_nmi
 test_guest_events:
@@ -231,15 +233,34 @@ process_softirqs:
 
         ALIGN
 /* %rbx: struct vcpu */
+process_mce:
+        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jae  test_guest_events
+        sti
+        movb $0,VCPU_mce_pending(%rbx)
+        call set_guest_machinecheck_trapbounce
+        test %eax,%eax
+        jz   test_all_events
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jmp  process_trap
+
+        ALIGN
+/* %rbx: struct vcpu */
 process_nmi:
-        testb $1,VCPU_nmi_masked(%rbx)
-        jnz  test_guest_events
+        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        jae  test_guest_events
         sti
         movb $0,VCPU_nmi_pending(%rbx)
         call set_guest_nmi_trapbounce
         test %eax,%eax
         jz   test_all_events
-        movb $1,VCPU_nmi_masked(%rbx)
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        /* FALLTHROUGH */
+process_trap:
         leaq VCPU_trap_bounce(%rbx),%rdx
         call create_bounce_frame
         jmp  test_all_events
@@ -671,6 +692,10 @@ ENTRY(hypercall_table)
         .quad do_sysctl             /* 35 */
         .quad do_domctl
         .quad do_kexec_op
+        .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8)
+        .quad do_ni_hypercall
+        .endr
+        .quad do_mca                /* 48 */
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -715,6 +740,10 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_domctl            */
         .byte 2 /* do_kexec             */
         .byte 1 /* do_xsm_op            */
+        .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
+        .byte 0 /* do_ni_hypercall      */
+        .endr
+        .byte 1 /* do_mca               */  /* 48 */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/traps.c       Fri Jul 04 16:27:44 2008 +0100
@@ -288,8 +288,13 @@ unsigned long do_iret(void)
         regs->rcx = iret_saved.rcx;
     }
 
-    /* No longer in NMI context. */
-    v->nmi_masked = 0;
+    /* Restore affinity.  */
+    if ((v->trap_priority >= VCPU_TRAP_NMI)
+       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
+        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
+
+    /* Restore previous trap priority */
+    v->trap_priority = v->old_trap_priority;
 
     /* Restore upcall mask from supplied EFLAGS.IF. */
     vcpu_info(v, evtchn_upcall_mask) = !(iret_saved.rflags & EF_IE);
diff -r d133d452cb71 -r a49673cd23d2 xen/common/domain.c
--- a/xen/common/domain.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/common/domain.c       Fri Jul 04 16:27:44 2008 +0100
@@ -654,7 +654,9 @@ void vcpu_reset(struct vcpu *v)
     v->is_polling      = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
-    v->nmi_masked      = 0;
+    v->mce_pending     = 0;
+    v->old_trap_priority = VCPU_TRAP_NONE;
+    v->trap_priority   = VCPU_TRAP_NONE;
     clear_bit(_VPF_blocked, &v->pause_flags);
 
     domain_unlock(v->domain);
diff -r d133d452cb71 -r a49673cd23d2 xen/common/event_channel.c
--- a/xen/common/event_channel.c        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/common/event_channel.c        Fri Jul 04 16:27:44 2008 +0100
@@ -587,6 +587,21 @@ void send_guest_vcpu_virq(struct vcpu *v
     evtchn_set_pending(v, port);
 }
 
+int guest_enabled_event(struct vcpu *v, int virq)
+{
+    int port;
+
+    if ( unlikely(v == NULL) )
+        return 0;
+
+    port = v->virq_to_evtchn[virq];
+    if ( port == 0 )
+        return 0;
+
+    /* virq is in use */
+    return 1;
+}
+
 void send_guest_global_virq(struct domain *d, int virq)
 {
     int port;
diff -r d133d452cb71 -r a49673cd23d2 xen/include/Makefile
--- a/xen/include/Makefile      Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/Makefile      Fri Jul 04 16:27:44 2008 +0100
@@ -20,6 +20,7 @@ headers-y := \
     compat/xen.h \
     compat/xencomm.h \
     compat/xenoprof.h
+headers-$(CONFIG_X86)     += compat/arch-x86/xen-mca.h
 headers-$(CONFIG_X86)     += compat/arch-x86/xen.h
 headers-$(CONFIG_X86)     += compat/arch-x86/xen-$(compat-arch-y).h
 headers-y                 += compat/arch-$(compat-arch-y).h compat/xlat.h
diff -r d133d452cb71 -r a49673cd23d2 xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/asm-x86/event.h       Fri Jul 04 16:27:44 2008 +0100
@@ -69,7 +69,12 @@ static inline void local_event_delivery_
 /* No arch specific virq definition now. Default to global. */
 static inline int arch_virq_is_global(int virq)
 {
-    return 1;
+    switch (virq) {
+    case VIRQ_MCA:
+        return 1;
+    default:
+        return 1;
+    }
 }
 
 #endif
diff -r d133d452cb71 -r a49673cd23d2 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/asm-x86/mm.h  Fri Jul 04 16:27:44 2008 +0100
@@ -141,6 +141,9 @@ static inline u32 pickle_domptr(struct d
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
+#define maddr_get_owner(ma)   (page_get_owner(maddr_to_page((ma))))
+#define vaddr_get_owner(va)   (page_get_owner(virt_to_page((va))))
+
 #define XENSHARE_writable 0
 #define XENSHARE_readonly 1
 extern void share_xen_page_with_guest(
diff -r d133d452cb71 -r a49673cd23d2 xen/include/asm-x86/traps.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/traps.h       Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2007, 2008 Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <Christoph.Egger@xxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef ASM_TRAP_H
+#define ASM_TRAP_H
+
+struct softirq_trap {
+       struct domain *domain;  /* domain to inject trap */
+       struct vcpu *vcpu;      /* vcpu to inject trap */
+       int processor;          /* physical cpu to inject trap */
+};
+
+struct cpu_user_regs;
+
+extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+ 
+/**
+ * guest_has_trap_callback
+ *
+ * returns true (non-zero) if the guest registered a trap handler
+ */
+extern int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
+                               unsigned int trap_nr);
+
+/**
+ * send_guest_trap
+ *
+ * delivers a trap to the guest, analogous to send_guest_global_virq
+ * returns 0 on successful delivery
+ */
+extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
+                               unsigned int trap_nr);
+
+#endif /* ASM_TRAP_H */
diff -r d133d452cb71 -r a49673cd23d2 xen/include/public/arch-x86/xen-mca.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/arch-x86/xen-mca.h     Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,279 @@
+/******************************************************************************
+ * arch-x86/xen-mca.h
+ * 
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <Christoph.Egger@xxxxxxx>
+ *
+ * Guest OS machine check interface to x86 Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Full MCA functionality has the following use cases from the guest side:
+ *
+ * Must-haves:
+ * 1. Dom0 and DomU register machine check trap callback handlers
+ *    (already done via "set_trap_table" hypercall)
+ * 2. Dom0 registers machine check event callback handler
+ *    (doable via EVTCHNOP_bind_virq)
+ * 3. Dom0 and DomU fetch machine check data
+ * 4. Dom0 wants Xen to notify a DomU
+ * 5. Dom0 gets DomU ID from physical address
+ * 6. Dom0 wants Xen to kill a DomU (already done for "xm destroy")
+ *
+ * Nice-to-haves:
+ * 7. Dom0 wants Xen to deactivate a physical CPU
+ *    This is better done as a separate task, physical CPU hotplugging,
+ *    and the hypercall(s) should be sysctls
+ * 8. Page migration proposed from the Xen NUMA work, where Dom0 can tell
+ *    Xen to move a DomU (or Dom0 itself) away from a faulty page
+ *    producing correctable errors.
+ * 9. Offlining a physical page:
+ *    Xen frees and never re-uses a given physical page.
+ * 10. Test facility: allow Dom0 to write values into machine check MSRs
+ *     and tell Xen to trigger a machine check
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+#define XEN_MCA_INTERFACE_VERSION 0x03000001
+
+/* IN: Dom0 calls hypercall from MC event handler. */
+#define XEN_MC_CORRECTABLE  0x0
+/* IN: Dom0/DomU calls hypercall from MC trap handler. */
+#define XEN_MC_TRAP         0x1
+/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+
+/* OUT: All is ok */
+#define XEN_MC_OK           0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED  0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA       0x2
+/* OUT: Between notification time and this hypercall another
+ *  (most likely correctable) error happened. The fetched data
+ *  does not match the original machine check data. */
+#define XEN_MC_NOMATCH      0x4
+
+/* OUT: DomU did not register MC NMI handler. Try something else. */
+#define XEN_MC_CANNOTHANDLE 0x8
+/* OUT: Notifying DomU failed. Retry later or try something else. */
+#define XEN_MC_NOTDELIVERED 0x10
+/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
+
+
+#ifndef __ASSEMBLY__
+
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
+/*
+ * Machine Check Architecture:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: correctable errors are reported via VIRQ_MCA.
+ * Dom0 and DomU: uncorrectable errors are reported via NMI handlers.
+ */
+#define MC_TYPE_GLOBAL          0
+#define MC_TYPE_BANK            1
+#define MC_TYPE_EXTENDED        2
+
+struct mcinfo_common {
+    uint16_t type;      /* structure type */
+    uint16_t size;      /* size of this struct in bytes */
+};
+
+
+#define MC_FLAG_CORRECTABLE     (1 << 0)
+#define MC_FLAG_UNCORRECTABLE   (1 << 1)
+
+/* contains global x86 mc information */
+struct mcinfo_global {
+    struct mcinfo_common common;
+
+    /* domain running at the time of the error (most likely the impacted one) */
+    uint16_t mc_domid;
+    uint32_t mc_socketid; /* physical socket of the physical core */
+    uint16_t mc_coreid; /* physical impacted core */
+    uint16_t mc_core_threadid; /* core thread of physical core */
+    uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
+    uint64_t mc_gstatus; /* global status */
+    uint32_t mc_flags;
+};
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+    struct mcinfo_common common;
+
+    uint16_t mc_bank; /* bank nr */
+    uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr, on Dom0
+                        * and only if mc_addr is valid. Never valid on DomU. */
+    uint64_t mc_status; /* bank status */
+    uint64_t mc_addr;   /* bank address, only valid
+                         * if addr bit is set in mc_status */
+    uint64_t mc_misc;
+};
+
+
+struct mcinfo_msr {
+    uint64_t reg;   /* MSR */
+    uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other
+ * or additional mc MSRs */ 
+struct mcinfo_extended {
+    struct mcinfo_common common;
+
+    /* You can fill up to five registers.
+     * If you need more, then use this structure
+     * multiple times. */
+
+    uint32_t mc_msrs; /* Number of MSRs with valid values. */
+    struct mcinfo_msr mc_msr[5];
+};
+
+#define MCINFO_HYPERCALLSIZE   1024
+#define MCINFO_MAXSIZE         768
+
+struct mc_info {
+    /* Number of mcinfo_* entries in mi_data */
+    uint32_t mi_nentries;
+
+    uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
+};
+typedef struct mc_info mc_info_t;
+
+
+
+/*
+ * OSes should use these rather than writing their own lookup functions,
+ * each with its own bugs and drawbacks.
+ * We use macros instead of static inline functions to allow guests
+ * to include this header in assembly files (*.S).
+ */
+/* Prototype:
+ *    uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi)    \
+    (_mi)->mi_nentries
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi)       \
+    (struct mcinfo_common *)((_mi)->mi_data)
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+    (struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)
+
+/* Prototype:
+ *    void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
+ */
+#define x86_mcinfo_lookup(_ret, _mi, _type)    \
+    do {                                                        \
+        uint32_t found, i;                                      \
+        struct mcinfo_common *_mic;                             \
+                                                                \
+        found = 0;                                              \
+        (_ret) = NULL;                                          \
+        if ((_mi) == NULL) break;                               \
+        _mic = x86_mcinfo_first(_mi);                           \
+        for (i = 0; i < x86_mcinfo_nentries(_mi); i++) {        \
+            if (_mic->type == (_type)) {                        \
+                found = 1;                                      \
+                break;                                          \
+            }                                                   \
+            _mic = x86_mcinfo_next(_mic);                       \
+        }                                                       \
+        (_ret) = found ? _mic : NULL;                           \
+    } while (0)
+
+
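As an illustrative use of the accessors above (assuming 'mi' points at a
struct mc_info obtained via the fetch hypercall defined below):

    struct mcinfo_common *mic;
    struct mcinfo_global *mc_global;

    x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
    if (mic != NULL) {
        mc_global = (struct mcinfo_global *)mic;
        /* mc_global->mc_domid is the domain that was running
         * when the error was reported. */
    }
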
+/* Usecase 1
+ * Register machine check trap callback handler
+ *    (already done via "set_trap_table" hypercall)
+ */
+
+/* Usecase 2
+ * Dom0 registers machine check event callback handler
+ * done by EVTCHNOP_bind_virq
+ */
+
+/* Usecase 3
+ * Fetch machine check data from hypervisor.
+ * Note, this hypercall is special, because both Dom0 and DomU must use this.
+ */
+#define XEN_MC_fetch            1
+struct xen_mc_fetch {
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+
+    /* OUT variables. */
+    uint32_t fetch_idx;  /* only useful for Dom0 for the notify hypercall */
+    struct mc_info mc_info;
+};
+typedef struct xen_mc_fetch xen_mc_fetch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
+
+
+/* Usecase 4
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain     2
+struct xen_mc_notifydomain {
+    /* IN variables. */
+    uint16_t mc_domid;    /* The unprivileged domain to notify. */
+    uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
+                           * Usually the value echoed from the fetch hypercall. */
+    uint32_t fetch_idx;   /* value echoed from the fetch hypercall. */
+
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
+};
+typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
+
+
+struct xen_mc {
+    uint32_t cmd;
+    uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+    union {
+        struct xen_mc_fetch        mc_fetch;
+        struct xen_mc_notifydomain mc_notifydomain;
+        uint8_t pad[MCINFO_HYPERCALLSIZE];
+    } u;
+};
+typedef struct xen_mc xen_mc_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
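
A guest-side invocation of the fetch operation could then look roughly like
this (a sketch: HYPERVISOR_mca() stands in for whatever wrapper the guest OS
provides for __HYPERVISOR_mca, and error handling is elided):

    struct xen_mc mc;

    memset(&mc, 0, sizeof(mc));
    mc.cmd = XEN_MC_fetch;
    mc.interface_version = XEN_MCA_INTERFACE_VERSION;
    mc.u.mc_fetch.flags = XEN_MC_CORRECTABLE; /* e.g. from a VIRQ_MCA handler */

    HYPERVISOR_mca(&mc);  /* hypothetical guest wrapper */

    /* XEN_MC_OK is 0, so success means none of the error flags are set. */
    if ( !(mc.u.mc_fetch.flags &
           (XEN_MC_FETCHFAILED | XEN_MC_NODATA | XEN_MC_NOMATCH)) )
    {
        /* mc.u.mc_fetch.mc_info now holds the telemetry. */
    }
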
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
diff -r d133d452cb71 -r a49673cd23d2 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/public/arch-x86/xen.h Fri Jul 04 16:27:44 2008 +0100
@@ -75,6 +75,10 @@ typedef unsigned long xen_pfn_t;
 
 /* Maximum number of virtual CPUs in multi-processor guests. */
 #define MAX_VIRT_CPUS 32
+
+
+/* Machine check support */
+#include "xen-mca.h"
 
 #ifndef __ASSEMBLY__
 
diff -r d133d452cb71 -r a49673cd23d2 xen/include/xen/event.h
--- a/xen/include/xen/event.h   Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/xen/event.h   Fri Jul 04 16:27:44 2008 +0100
@@ -50,6 +50,9 @@ void free_xen_event_channel(
 void free_xen_event_channel(
     struct vcpu *local_vcpu, int port);
 
+/* Query if event channel is in use by the guest */
+int guest_enabled_event(struct vcpu *v, int virq);
+
 /* Notify remote end of a Xen-attached event channel.*/
 void notify_via_xen_event_channel(int lport);
 
diff -r d133d452cb71 -r a49673cd23d2 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/xen/sched.h   Fri Jul 04 16:27:44 2008 +0100
@@ -112,10 +112,21 @@ struct vcpu
     bool_t           is_initialised;
     /* Currently running on a CPU? */
     bool_t           is_running;
+    /* MCE callback pending for this VCPU? */
+    bool_t           mce_pending;
     /* NMI callback pending for this VCPU? */
     bool_t           nmi_pending;
-    /* Avoid NMI reentry by allowing NMIs to be masked for short periods. */
-    bool_t           nmi_masked;
+
+    /* Higher-priority traps may interrupt lower-priority ones;
+     * lower-priority traps wait until higher-priority traps have finished.
+     * Note: This concept is known as the "system priority level" (spl)
+     * in the UNIX world. */
+    uint16_t         old_trap_priority;
+    uint16_t         trap_priority;
+#define VCPU_TRAP_NONE    0
+#define VCPU_TRAP_NMI     1
+#define VCPU_TRAP_MCE     2
+
     /* Require shutdown to be deferred for some asynchronous operation? */
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
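
In C terms, the gating that the modified entry.S paths implement is roughly
the following (a sketch mirroring the assembly, not code from this patch):

    /* A pending trap is delivered only if it outranks the trap currently
     * being handled; do_iret() later restores the saved priority. */
    if ( v->mce_pending && (v->trap_priority < VCPU_TRAP_MCE) )
    {
        v->old_trap_priority = v->trap_priority;  /* restored by do_iret() */
        v->trap_priority = VCPU_TRAP_MCE;
        /* ... bounce TRAP_machine_check into the guest ... */
    }
    else if ( v->nmi_pending && (v->trap_priority < VCPU_TRAP_NMI) )
    {
        v->old_trap_priority = v->trap_priority;
        v->trap_priority = VCPU_TRAP_NMI;
        /* ... bounce TRAP_nmi into the guest ... */
    }
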

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
