xen-changelog

[Xen-changelog] [xen-unstable] x86: MCA support.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86: MCA support.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Fri, 04 Jul 2008 16:20:08 -0700
Delivery-date: Fri, 04 Jul 2008 16:21:13 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1215185264 -3600
# Node ID a49673cd23d2548d57af716cc85fcaeef48b55fc
# Parent  d133d452cb718a07b42f86edc8ce838ef6734de9
x86: MCA support.
Signed-off-by: Christoph Egger <Christoph.Egger@xxxxxxx>
---
 xen/arch/x86/cpu/mcheck/Makefile       |    3 
 xen/arch/x86/cpu/mcheck/amd_f10.c      |  131 +++++++
 xen/arch/x86/cpu/mcheck/amd_k8.c       |  324 ++++++++++++++++++
 xen/arch/x86/cpu/mcheck/amd_nonfatal.c |  303 +++++++++++++++++
 xen/arch/x86/cpu/mcheck/k7.c           |    7 
 xen/arch/x86/cpu/mcheck/mce.c          |  566 ++++++++++++++++++++++++++++++---
 xen/arch/x86/cpu/mcheck/mce.h          |   26 +
 xen/arch/x86/cpu/mcheck/non-fatal.c    |   30 +
 xen/arch/x86/cpu/mcheck/x86_mca.h      |   72 ++++
 xen/arch/x86/nmi.c                     |    4 
 xen/arch/x86/traps.c                   |  125 ++++++-
 xen/arch/x86/x86_32/asm-offsets.c      |    6 
 xen/arch/x86/x86_32/entry.S            |   36 +-
 xen/arch/x86/x86_32/traps.c            |    9 
 xen/arch/x86/x86_64/asm-offsets.c      |    6 
 xen/arch/x86/x86_64/compat/entry.S     |   35 +-
 xen/arch/x86/x86_64/compat/traps.c     |    9 
 xen/arch/x86/x86_64/entry.S            |   35 +-
 xen/arch/x86/x86_64/traps.c            |    9 
 xen/common/domain.c                    |    4 
 xen/common/event_channel.c             |   15 
 xen/include/Makefile                   |    1 
 xen/include/asm-x86/event.h            |    7 
 xen/include/asm-x86/mm.h               |    3 
 xen/include/asm-x86/traps.h            |   50 ++
 xen/include/public/arch-x86/xen-mca.h  |  279 ++++++++++++++++
 xen/include/public/arch-x86/xen.h      |    4 
 xen/include/xen/event.h                |    3 
 xen/include/xen/sched.h                |   15 
 29 files changed, 2024 insertions(+), 93 deletions(-)

diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile  Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/Makefile  Fri Jul 04 16:27:44 2008 +0100
@@ -1,4 +1,7 @@ obj-y += k7.o
+obj-y += amd_nonfatal.o
 obj-y += k7.o
+obj-y += amd_k8.o
+obj-y += amd_f10.o
 obj-y += mce.o
 obj-y += non-fatal.o
 obj-y += p4.o
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/amd_f10.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,131 @@
+/*
+ * MCA implementation for AMD Family10 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ */
+
+/* Family10 MCA documentation published at
+ *
+ * BIOS and Kernel Developer's Guide
+ * For AMD Family 10h Processors
+ * Publication # 31116 Revision: 1.08
+ * Issue Date: June 10, 2007
+ */
+
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+{
+       struct mcinfo_extended mc_ext;
+
+       /* Family 0x10 introduced additional MSRs that belong to the
+        * northbridge bank (4). */
+       if (bank != 4)
+               return 0;
+
+       if (!(status & MCi_STATUS_VAL))
+               return 0;
+
+       if (!(status & MCi_STATUS_MISCV))
+               return 0;
+
+       memset(&mc_ext, 0, sizeof(mc_ext));
+       mc_ext.common.type = MC_TYPE_EXTENDED;
+       mc_ext.common.size = sizeof(mc_ext);
+       mc_ext.mc_msrs = 3;
+
+       mc_ext.mc_msr[0].reg = MSR_F10_MC4_MISC1;
+       mc_ext.mc_msr[1].reg = MSR_F10_MC4_MISC2;
+       mc_ext.mc_msr[2].reg = MSR_F10_MC4_MISC3;
+
+       rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
+       rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
+       rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
+       
+       x86_mcinfo_add(mi, &mc_ext);
+       return 1;
+}
+
+
+extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
+
+/* AMD Family10 machine check */
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
+{ 
+       uint64_t value;
+       uint32_t i;
+       int cpu_nr;
+
+       machine_check_vector = k8_machine_check;
+       mc_callback_bank_extended = amd_f10_handler;
+       cpu_nr = smp_processor_id();
+       wmb();
+
+       rdmsrl(MSR_IA32_MCG_CAP, value);
+       if (value & MCG_CTL_P)  /* Control register present ? */
+               wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+       nr_mce_banks = value & MCG_CAP_COUNT;
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               switch (i) {
+               case 4: /* Northbridge */
+                       /* Enable error reporting of all errors,
+                        * enable error checking and
+                        * disable sync flooding */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+
+                       /* XXX: We should write the value 0x1087821UL into
+                        * register F3x180 here, which sits in
+                        * the PCI extended configuration space.
+                        * Since this is not possible here, we can only hope
+                        * that Dom0 is doing that.
+                        */
+                       break;
+
+               default:
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+                       wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+                       break;
+               }
+       }
+
+       set_in_cr4(X86_CR4_MCE);
+       printk("CPU%i: AMD Family10h machine check reporting enabled.\n", 
cpu_nr);
+}
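As an aside for readers of this patch: amd_f10_handler() above is never called directly; mce.c invokes it through the mc_callback_bank_extended pointer for every bank that holds valid telemetry. A minimal standalone sketch of that dispatch pattern follows; all names other than the concept are invented for illustration, and the real struct is left opaque:

/* Sketch only - not part of the patch. Compiles with any C99 compiler. */
#include <stdio.h>
#include <stdint.h>

struct mc_info;    /* opaque here; the real layout lives in xen-mca.h */

/* The hook: family-specific code registers a collector at init time. */
static int (*bank_callback)(struct mc_info *, uint16_t, uint64_t);

static int f10_nb_collector(struct mc_info *mi, uint16_t bank, uint64_t status)
{
    (void)mi; (void)status;
    if (bank != 4)
        return 0;  /* only the northbridge bank carries extra MSRs */
    printf("bank %u: would read MC4_MISC1..MISC3 here\n", bank);
    return 1;
}

int main(void)
{
    bank_callback = f10_nb_collector;  /* done once in *_mcheck_init() */

    /* The generic handler walks all banks and calls the hook per bank. */
    for (uint16_t bank = 0; bank < 6; bank++)
        if (bank_callback)
            bank_callback(NULL, bank, 0);
    return 0;
}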
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/amd_k8.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c  Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,324 @@
+/*
+ * MCA implementation for AMD K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+
+#include <asm/processor.h>
+#include <asm/shared.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+/* Machine Check Handler for AMD K8 family series */
+void k8_machine_check(struct cpu_user_regs *regs, long error_code)
+{
+       struct vcpu *vcpu = current;
+       struct domain *curdom;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv, uc;
+       uint32_t i;
+       unsigned int cpu_nr;
+       uint32_t xen_impacted = 0;
+#define DOM_NORMAL     0
+#define DOM0_TRAP      1
+#define DOMU_TRAP      2
+#define DOMU_KILLED    4
+       uint32_t dom_state = DOM_NORMAL;
+
+       /* This handler runs as an interrupt gate. So IPIs from the
+        * polling service routine are deferred until we finish.
+        */
+
+       /* Disable interrupts for the _vcpu_. It must not be re-scheduled onto
+        * another physical CPU, or the impacted process in the guest would
+        * otherwise continue running with corrupted data. */
+       vcpu_schedule_lock_irq(vcpu);
+
+       mc_data = x86_mcinfo_getptr();
+       cpu_nr = smp_processor_id();
+       curdom = vcpu->domain;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       /* Quick check, who is impacted */
+       xen_impacted = is_idle_domain(curdom);
+
+       /* Dom0 */
+       x86_mcinfo_clear(mc_data);
+       x86_mcinfo_add(mc_data, &mc_global);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               /* An error happened in this bank.
+                * This is expected to be an uncorrectable error,
+                * since correctable errors get polled.
+                */
+               uc = status & MCi_STATUS_UC;
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
+                       
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL)
+                               mc_info.mc_domid = d->domain_id;
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+
+               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               wmb();
+               add_taint(TAINT_MACHINE_CHECK);
+       }
+
+       status = mc_global.mc_gstatus;
+
+       /* Clear MCIP, or the cpu enters shutdown state
+        * in case another MCE occurs. */
+       status &= ~MCG_STATUS_MCIP;
+       wrmsrl(MSR_IA32_MCG_STATUS, status);
+       wmb();
+
+       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
+        * The thread started here:
+        * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
+        */
+
+       /* MCG_STATUS_RIPV:
+        * When this bit is not set, the instruction pointer on the stack
+        * to resume at is not valid. If Xen was interrupted, we panic anyway
+        * right below. Otherwise it is up to the guest to figure out whether
+        * the guest kernel or guest userland is affected, and it should kill
+        * either itself or the affected process.
+        */
+
+       /* MCG_STATUS_EIPV:
+        * Evaluation of EIPV is the job of the guest.
+        */
+
+       if (xen_impacted) {
+               /* Now we are going to panic anyway. Allow interrupts, so that
+                * printk on serial console can work. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* Uh, that means a machine check exception
+                * occurred inside Xen. */
+               printk("Machine check exception occurred in Xen.\n");
+
+               /* If MCG_STATUS_EIPV indicates that the IP on the stack is
+                * related to the error, then it makes sense to print a stack
+                * trace. That can be useful for more detailed error analysis
+                * and/or error case studies to figure out if we can clear
+                * xen_impacted and kill a DomU instead
+                * (i.e. if a guest-only control structure is affected, but then
+                * we must ensure the bad pages are not re-used again).
+                */
+               if (status & MCG_STATUS_EIPV) {
+                       printk("MCE: Instruction Pointer is related to the 
error. "
+                               "Therefore, print the execution state.\n");
+                       show_execution_state(regs);
+               }
+               x86_mcinfo_dump(mc_data);
+               panic("End of MCE. Use mcelog to decode above error codes.\n");
+       }
+
+       /* If Dom0 registered a machine check handler, which is only possible
+        * with a PV MCA driver, then ... */
+       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
+               dom_state = DOM0_TRAP;
+
+               /* ... deliver machine check trap to Dom0. */
+               send_guest_trap(dom0, 0, TRAP_machine_check);
+
+               /* Xen may tell Dom0 now to notify the DomU.
+                * But this will happen through a hypercall. */
+       } else
+               /* Dom0 did not register a machine check handler, but if DomU
+                * did so, then... */
+               if ( guest_has_trap_callback(curdom, vcpu->vcpu_id,
+                                            TRAP_machine_check) ) {
+                       dom_state = DOMU_TRAP;
+
+                       /* ... deliver machine check trap to DomU */
+                       send_guest_trap(curdom, vcpu->vcpu_id,
+                                       TRAP_machine_check);
+       } else {
+               /* Hmm... no one feels responsible for handling the error.
+                * So do a quick check whether a DomU is impacted or not.
+                */
+               if (curdom == dom0) {
+                       /* Dom0 is impacted. Since no one can handle
+                        * this error, panic! */
+                       x86_mcinfo_dump(mc_data);
+                       panic("MCE occurred in Dom0, which can't handle it\n");
+
+                       /* UNREACHED */
+               } else {
+                       dom_state = DOMU_KILLED;
+
+                       /* Enable interrupts. This basically results in
+                        * calling sti on the *physical* cpu. But after
+                        * domain_crash() the vcpu pointer is invalid.
+                        * Therefore, we must unlock the irqs before killing
+                        * it. */
+                       vcpu_schedule_unlock_irq(vcpu);
+
+                       /* DomU is impacted. Kill it and continue. */
+                       domain_crash(curdom);
+               }
+       }
+
+
+       switch (dom_state) {
+       case DOM0_TRAP:
+       case DOMU_TRAP:
+               /* Enable interrupts. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* guest softirqs and event callbacks are scheduled
+                * immediately after this handler exits. */
+               break;
+       case DOMU_KILLED:
+               /* Nothing to do here. */
+               break;
+       default:
+               BUG();
+       }
+}
+
+
+/* AMD K8 machine check */
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+{
+       uint64_t value;
+       uint32_t i;
+       int cpu_nr;
+
+       machine_check_vector = k8_machine_check;
+       cpu_nr = smp_processor_id();
+       wmb();
+
+       rdmsrl(MSR_IA32_MCG_CAP, value);
+       if (value & MCG_CTL_P)  /* Control register present ? */
+               wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+       nr_mce_banks = value & MCG_CAP_COUNT;
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               switch (i) {
+               case 4: /* Northbridge */
+                       /* Enable error reporting of all errors,
+                        * enable error checking and
+                        * disable sync flooding */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+                       break;
+
+               default:
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+                       wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+                       break;
+               }
+       }
+
+       set_in_cr4(X86_CR4_MCE);
+       printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+}
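A note on the "MSR_IA32_MC0_STATUS + 4 * i" arithmetic used throughout k8_machine_check() and the init functions: each MCA bank occupies four consecutive MSRs (CTL, STATUS, ADDR, MISC) starting at MSR 0x400, so bank i's registers are reached with a fixed stride of 4. A small standalone sketch that only prints the derived addresses (the MSR numbers are the architectural ones):

/* Sketch only - prints the per-bank MSR addresses implied by the stride. */
#include <stdio.h>

#define MSR_IA32_MC0_CTL    0x400
#define MSR_IA32_MC0_STATUS 0x401
#define MSR_IA32_MC0_ADDR   0x402
#define MSR_IA32_MC0_MISC   0x403

int main(void)
{
    for (unsigned int i = 0; i < 5; i++)
        printf("bank %u: CTL=%#x STATUS=%#x ADDR=%#x MISC=%#x\n", i,
               MSR_IA32_MC0_CTL + 4 * i, MSR_IA32_MC0_STATUS + 4 * i,
               MSR_IA32_MC0_ADDR + 4 * i, MSR_IA32_MC0_MISC + 4 * i);
    /* Bank 4, the K8 northbridge, thus lands at MSRs 0x410-0x413. */
    return 0;
}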
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,303 @@
+/*
+ * MCA implementation for AMD CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/timer.h>
+#include <xen/event.h>
+#include <asm/processor.h> 
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+static struct timer mce_timer;
+
+#define MCE_PERIOD MILLISECS(15000)
+#define MCE_MIN    MILLISECS(2000)
+#define MCE_MAX    MILLISECS(30000)
+
+static s_time_t period = MCE_PERIOD;
+static int hw_threshold = 0;
+static int adjust = 0;
+
+/* The polling service routine:
+ * Collects information about correctable errors and notifies
+ * Dom0 via an event.
+ */
+void mce_amd_checkregs(void *info)
+{
+       struct vcpu *vcpu = current;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv;
+       unsigned int i;
+       unsigned int event_enabled;
+       unsigned int cpu_nr;
+       int error_found;
+
+       /* We don't need a slot yet. Only allocate one on error. */
+       mc_data = NULL;
+
+       cpu_nr = smp_processor_id();
+       event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+       error_found = 0;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               if (mc_data == NULL) {
+                       /* Now we need a slot to fill in error telemetry. */
+                       mc_data = x86_mcinfo_getptr();
+                       BUG_ON(mc_data == NULL);
+                       x86_mcinfo_clear(mc_data);
+                       x86_mcinfo_add(mc_data, &mc_global);
+               }
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               /* Increase polling frequency */
+               error_found = 1;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL)
+                               mc_info.mc_domid = d->domain_id;
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+               x86_mcinfo_add(mc_data, &mc_info);
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+               wmb();
+       }
+
+       if (error_found > 0) {
+               /* If Dom0 enabled the VIRQ_MCA event, then ... */
+               if (event_enabled)
+                       /* ... notify it. */
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               else
+                       /* ... or dump it */
+                       x86_mcinfo_dump(mc_data);
+       }
+
+       adjust += error_found;
+}
+
+/* Polling service routine invoker:
+ * Adjusts the poll frequency at runtime. No error means a low polling
+ * frequency, an error means a higher polling frequency.
+ * It uses the hw threshold register introduced in AMD K8 RevF to detect
+ * multiple correctable errors between two polls. In that case,
+ * the polling frequency is raised above normal.
+ */
+static void mce_amd_work_fn(void *data)
+{
+       on_each_cpu(mce_amd_checkregs, data, 1, 1);
+
+       if (adjust > 0) {
+               if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+                       /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+                       printk("MCE: polling routine found correctable error. "
+                               " Use mcelog to parse above error output.\n");
+               }
+       }
+
+       if (hw_threshold) {
+               uint64_t value;
+               uint32_t counter;
+
+               rdmsrl(MSR_IA32_MC4_MISC, value);
+               /* Only the error counter field is of interest
+                * Bit field is described in AMD K8 BKDG chapter 6.4.5.5
+                */
+               counter = (value & 0xFFF00000000ULL) >> 32U;
+
+               /* HW does not count *all* kinds of correctable errors.
+                * Thus it is possible that the polling routine finds a
+                * correctable error even though the HW reports nothing.
+                * However, the other way around is not possible (= BUG).
+                */
+               if (counter > 0) {
+                       /* The HW reported correctable errors. If the polling
+                        * routine did not find any, that is a bug
+                        * (see comment above). */
+                       BUG_ON(adjust == 0);
+                       /* Subtract 1 to avoid double-counting the error
+                        * already found by the polling service routine. */
+                       adjust += (counter - 1);
+
+                       /* Restart counter */
+                       /* No interrupt, reset counter value */
+                       value &= ~(0x60FFF00000000ULL);
+                       /* Counter enable */
+                       value |= (1ULL << 51);
+                       wrmsrl(MSR_IA32_MC4_MISC, value);
+                       wmb();
+               }
+       }
+
+       if (adjust > 0) {
+               /* Increase polling frequency */
+               adjust++; /* adjust == 1 must have an effect */
+               period /= adjust;
+       } else {
+               /* Decrease polling frequency */
+               period *= 2;
+       }
+       if (period > MCE_MAX) {
+               /* limit: Poll at least every 30s */
+               period = MCE_MAX;
+       }
+       if (period < MCE_MIN) {
+               /* limit: Poll at most every 2s.
+                * When this limit is reached, an uncorrectable error
+                * is expected to happen if Dom0 does nothing.
+                */
+               period = MCE_MIN;
+       }
+
+       set_timer(&mce_timer, NOW() + period);
+       adjust = 0;
+}
+
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
+{
+       if (c->x86_vendor != X86_VENDOR_AMD)
+               return;
+
+       /* Assume we are on K8 or newer AMD CPU here */
+
+       /* The threshold bitfields in MSR_IA32_MC4_MISC have
+        * been introduced along with the SVME feature bit. */
+       if (cpu_has(c, X86_FEATURE_SVME)) {
+               uint64_t value;
+
+               /* hw threshold registers present */
+               hw_threshold = 1;
+               rdmsrl(MSR_IA32_MC4_MISC, value);
+
+               if (value & (1ULL << 61)) { /* Locked bit */
+                       /* Locked by BIOS. Not available for use */
+                       hw_threshold = 0;
+               }
+               if (!(value & (1ULL << 63))) { /* Valid bit */
+                       /* No CtrP present */
+                       hw_threshold = 0;
+               } else {
+                       if (!(value & (1ULL << 62))) { /* Counter Bit */
+                               /* No counter field present */
+                               hw_threshold = 0;
+                       }
+               }
+
+               if (hw_threshold) {
+                       /* No interrupt, reset counter value */
+                       value &= ~(0x60FFF00000000ULL);
+                       /* Counter enable */
+                       value |= (1ULL << 51);
+                       wrmsrl(MSR_IA32_MC4_MISC, value);
+                       /* serialize */
+                       wmb();
+                       printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
+               }
+       }
+
+       init_timer(&mce_timer, mce_amd_work_fn, NULL, 0);
+       set_timer(&mce_timer, NOW() + period);
+
+       return;
+}
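The period adaptation in mce_amd_work_fn() above boils down to: divide the interval when errors were found (adjust is incremented once more, so even a single error shortens the period), double it on a quiet round, and clamp to [MCE_MIN, MCE_MAX]. A standalone sketch of just that arithmetic, with times in milliseconds instead of Xen's s_time_t:

/* Sketch only - models the period adjustment, not the MSR handling. */
#include <stdio.h>

#define MCE_PERIOD 15000
#define MCE_MIN     2000
#define MCE_MAX    30000

static long adjust_period(long period, int errors_found)
{
    if (errors_found > 0)
        period /= (errors_found + 1);   /* +1 so one error has an effect */
    else
        period *= 2;                    /* quiet round: back off */
    if (period > MCE_MAX) period = MCE_MAX;
    if (period < MCE_MIN) period = MCE_MIN;
    return period;
}

int main(void)
{
    long p = MCE_PERIOD;
    int bursts[] = { 0, 1, 3, 0, 0 };   /* made-up error counts per round */
    for (int i = 0; i < 5; i++) {
        p = adjust_period(p, bursts[i]);
        printf("round %d: %d error(s) -> next poll in %ldms\n",
               i, bursts[i], p);
    }
    return 0;
}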
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c      Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/k7.c      Fri Jul 04 16:27:44 2008 +0100
@@ -66,8 +66,8 @@ static fastcall void k7_machine_check(st
 }
 
 
-/* AMD K7 machine check is Intel like */
-void amd_mcheck_init(struct cpuinfo_x86 *c)
+/* AMD K7 machine check */
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        int i;
@@ -75,7 +75,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
        machine_check_vector = k7_machine_check;
        wmb();
 
-       printk (KERN_INFO "Intel machine check architecture supported.\n");
        rdmsr (MSR_IA32_MCG_CAP, l, h);
        if (l & (1<<8)) /* Control register present ? */
                wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
@@ -90,6 +89,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
        }
 
        set_in_cr4 (X86_CR4_MCE);
-       printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+       printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
                smp_processor_id());
 }
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Fri Jul 04 16:27:44 2008 +0100
@@ -8,73 +8,151 @@
 #include <xen/kernel.h>
 #include <xen/config.h>
 #include <xen/smp.h>
+#include <xen/errno.h>
 
 #include <asm/processor.h> 
 #include <asm/system.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 int mce_disabled = 0;
-int nr_mce_banks;
+unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
+/* XXX For now a fixed array is used. Later this should be changed
+ * to a dynamically allocated array whose size is calculated from the
+ * number of physical cpus present in the machine.
+ * The more physical cpus are available, the more entries you need.
+ */
+#define MAX_MCINFO     10
+
+struct mc_machine_notify {
+       struct mc_info mc;
+       uint32_t fetch_idx;
+       uint32_t valid;
+};
+
+struct mc_machine {
+
+       /* Array structure used for collecting machine check error telemetry. */
+       struct mc_info mc[MAX_MCINFO];
+
+       /* We handle multiple machine check reports lockless by
+        * iterating through the array using the producer/consumer concept.
+        */
+       /* Producer array index to fill with machine check error data.
+        * Index must be increased atomically. */
+       uint32_t error_idx;
+
+       /* Consumer array index to fetch machine check error data from.
+        * Index must be increased atomically. */
+       uint32_t fetch_idx;
+
+       /* Array holding the indices of the mc array that allow
+        * a Dom0 to notify a DomU to re-fetch the same machine check error
+        * data. The notification and refetch also use their own
+        * producer/consumer mechanism, because Dom0 may decide not to report
+        * every error to the impacted DomU.
+        */
+       struct mc_machine_notify notify[MAX_MCINFO];
+
+       /* Array index to get fetch_idx from.
+        * Index must be increased atomically. */
+       uint32_t notifyproducer_idx;
+       uint32_t notifyconsumer_idx;
+};
+
+/* Global variable with machine check information. */
+struct mc_machine mc_data;
+
 /* Handle unconfigured int18 (should never happen) */
-static fastcall void unexpected_machine_check(struct cpu_user_regs * regs, long error_code)
+static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-       printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
-}
+       printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+               smp_processor_id());
+}
+
 
 /* Call the installed machine check handler for this CPU setup. */
-void fastcall (*machine_check_vector)(struct cpu_user_regs *, long error_code) = unexpected_machine_check;
+void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+/* Init machine check callback handler.
+ * It is used to collect additional information provided by newer
+ * CPU families/models without the need to duplicate the whole handler.
+ * This avoids having many handlers doing almost the same thing, each
+ * with its own tweaks and bugs. */
+int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
+
+
+static void amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+
+       switch (ci->x86) {
+       case 6:
+               amd_k7_mcheck_init(ci);
+               break;
+
+       case 0xf:
+               amd_k8_mcheck_init(ci);
+               break;
+
+       case 0x10:
+               amd_f10_mcheck_init(ci);
+               break;
+
+       default:
+               /* Assume that machine check support is available.
+                * The minimum provided support is at least the K8. */
+               amd_k8_mcheck_init(ci);
+       }
+}
 
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
-       if (mce_disabled==1)
+       if (mce_disabled == 1) {
+               printk(XENLOG_INFO "MCE support disabled by bootparam\n");
                return;
+       }
+
+       if (!cpu_has(c, X86_FEATURE_MCE)) {
+               printk(XENLOG_INFO "CPU%i: No machine check support 
available\n",
+                       smp_processor_id());
+               return;
+       }
+
+       memset(&mc_data, 0, sizeof(struct mc_machine));
 
        switch (c->x86_vendor) {
-               case X86_VENDOR_AMD:
-                       amd_mcheck_init(c);
-                       break;
-
-               case X86_VENDOR_INTEL:
+       case X86_VENDOR_AMD:
+               amd_mcheck_init(c);
+               break;
+
+       case X86_VENDOR_INTEL:
 #ifndef CONFIG_X86_64
-                       if (c->x86==5)
-                               intel_p5_mcheck_init(c);
-                       if (c->x86==6)
-                               intel_p6_mcheck_init(c);
+               if (c->x86==5)
+                       intel_p5_mcheck_init(c);
+               if (c->x86==6)
+                       intel_p6_mcheck_init(c);
 #endif
-                       if (c->x86==15)
-                               intel_p4_mcheck_init(c);
-                       break;
+               if (c->x86==15)
+                       intel_p4_mcheck_init(c);
+               break;
 
 #ifndef CONFIG_X86_64
-               case X86_VENDOR_CENTAUR:
-                       if (c->x86==5)
-                               winchip_mcheck_init(c);
-                       break;
+       case X86_VENDOR_CENTAUR:
+               if (c->x86==5)
+                       winchip_mcheck_init(c);
+               break;
 #endif
 
-               default:
-                       break;
-       }
-}
-
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-       old_cr4 = read_cr4();
-       clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-       if (old_cr4 & X86_CR4_MCE)
-               set_in_cr4(X86_CR4_MCE);
-}
+       default:
+               break;
+       }
+}
+
 
 static void __init mcheck_disable(char *str)
 {
@@ -88,3 +166,411 @@ static void __init mcheck_enable(char *s
 
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
+
+
+#include <xen/guest_access.h>
+#include <asm/traps.h>
+
+struct mc_info *x86_mcinfo_getptr(void)
+{
+       struct mc_info *mi;
+       uint32_t entry, next;
+
+       for (;;) {
+               entry = mc_data.error_idx;
+               smp_rmb();
+               next = entry + 1;
+               if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
+                       break;
+       }
+
+       mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
+       BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
+
+       return mi;
+}
+
+static int x86_mcinfo_matches_guest(const struct mc_info *mi,
+                       const struct domain *d, const struct vcpu *v)
+{
+       struct mcinfo_common *mic;
+       struct mcinfo_global *mig;
+
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+       mig = (struct mcinfo_global *)mic;
+       if (mig == NULL)
+               return 0;
+
+       if (d->domain_id != mig->mc_domid)
+               return 0;
+
+       if (v->vcpu_id != mig->mc_vcpuid)
+               return 0;
+
+       return 1;
+}
+
+
+#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
+
+static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
+                               const struct domain *d, const struct vcpu *v)
+{
+       struct mc_info *mi;
+
+       /* This function is called from the fetch hypercall with
+        * the mc_lock spinlock held. Thus, no need for locking here.
+        */
+       mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
+       if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
+               /* Bogus domU command detected. */
+               *fetch_idx = 0;
+               return NULL;
+       }
+
+       *fetch_idx = mc_data.fetch_idx;
+       mc_data.fetch_idx++;
+       BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
+
+       return mi;
+}
+
+
+static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
+{
+       struct mc_machine_notify *mn;
+       struct mcinfo_common *mic = NULL;
+       struct mcinfo_global *mig;
+       struct domain *d;
+       int i;
+
+       /* This function is called from the notifier hypercall with
+        * the mc_notify_lock spinlock held. Thus, no need for locking here.
+        */
+
+       /* First invalidate entries for guests that disappeared after
+        * notification (e.g. shutdown/crash). This step prevents the
+        * notification array from filling up with stale/leaking entries.
+        */
+       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+               BUG_ON(mic == NULL);
+               mig = (struct mcinfo_global *)mic;
+               d = get_domain_by_id(mig->mc_domid);
+               if (d == NULL) {
+                       /* Domain does not exist. */
+                       mn->valid = 0;
+               }
+               if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
+                       mc_data.notifyconsumer_idx++;
+       }
+
+       /* Now put in the error telemetry. Since all error data fetchable
+        * by domUs are uncorrectable errors, they are very important.
+        * So we dump them before overwriting them. If a guest takes that long,
+        * we can assume something bad already happened (crash, hang, etc.).
+        */
+       mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
+
+       if (mn->valid) {
+               struct mcinfo_common *mic = NULL;
+               struct mcinfo_global *mig;
+
+               /* To not lose the information, we dump it. */
+               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+               BUG_ON(mic == NULL);
+               mig = (struct mcinfo_global *)mic;
+               printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
+                       "fetch machine check error telemetry. But Domain ID "
+                       "did not do that in time.\n",
+                       mig->mc_domid);
+               x86_mcinfo_dump(&mn->mc);
+       }
+
+       memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
+               sizeof(struct mc_info));
+       mn->fetch_idx = mc_notifydomain->fetch_idx;
+       mn->valid = 1;
+
+       mc_data.notifyproducer_idx++;
+
+       /* By design there can never be more notifies than machine check errors.
+        * If that ever happens, then we hit a bug. */
+       BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
+       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+}
+
+static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
+                               const struct domain *d, const struct vcpu *v)
+{
+       struct mc_machine_notify *mn = NULL;
+       uint32_t i;
+       int found;
+
+       /* This function is called from the fetch hypercall with
+        * the mc_notify_lock spinlock held. Thus, no need for locking here.
+        */
+
+       /* The notifier data is filled in the order guests get notified, but
+        * guests may fetch them in a different order. That's why we need
+        * the game with valid/invalid entries. */
+       found = 0;
+       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+               if (!mn->valid) {
+                       if (i == mc_data.notifyconsumer_idx)
+                               mc_data.notifyconsumer_idx++;
+                       continue;
+               }
+               if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (!found) {
+               /* This domain has never been notified. This must be
+                * a bogus domU command. */
+               *fetch_idx = 0;
+               return NULL;
+       }
+
+       BUG_ON(mn == NULL);
+       *fetch_idx = mn->fetch_idx;
+       mn->valid = 0;
+
+       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+       return &mn->mc;
+}
+
+
+void x86_mcinfo_clear(struct mc_info *mi)
+{
+       memset(mi, 0, sizeof(struct mc_info));
+       x86_mcinfo_nentries(mi) = 0;
+}
+
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
+{
+       int i;
+       unsigned long end1, end2;
+       struct mcinfo_common *mic, *mic_base, *mic_index;
+
+       mic = (struct mcinfo_common *)mcinfo;
+       mic_index = mic_base = x86_mcinfo_first(mi);
+
+       /* go to first free entry */
+       for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
+               mic_index = x86_mcinfo_next(mic_index);
+       }
+
+       /* check if there is enough space */
+       end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
+       end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
+
+       if (end1 < end2)
+               return -ENOSPC; /* No space. Can't add entry. */
+
+       /* there's enough space. add entry. */
+       memcpy(mic_index, mic, mic->size);
+       x86_mcinfo_nentries(mi)++;
+
+       return 0;
+}
+
+
+/* Dump machine check information in a format
+ * that mcelog can parse. This is used only when
+ * Dom0 does not take the notification. */
+void x86_mcinfo_dump(struct mc_info *mi)
+{
+       struct mcinfo_common *mic = NULL;
+       struct mcinfo_global *mc_global;
+       struct mcinfo_bank *mc_bank;
+
+       /* first print the global info */
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+       if (mic == NULL)
+               return;
+       mc_global = (struct mcinfo_global *)mic;
+       if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+               printk(XENLOG_WARNING
+                       "CPU%d: Machine Check Exception: %16"PRIx64"\n",
+                       mc_global->mc_coreid, mc_global->mc_gstatus);
+       } else {
+               printk(XENLOG_WARNING "MCE: The hardware reports a non "
+                       "fatal, correctable incident occured on "
+                       "CPU %d.\n",
+                       mc_global->mc_coreid);
+       }
+
+       /* then the bank information */
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
+       do {
+               if (mic == NULL)
+                       return;
+
+               if (mic->type == MC_TYPE_BANK) {
+                       mc_bank = (struct mcinfo_bank *)mic;
+
+                       printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
+                               mc_bank->mc_bank,
+                               mc_bank->mc_status);
+                       if (mc_bank->mc_status & MCi_STATUS_MISCV)
+                               printk("[%16"PRIx64"]", mc_bank->mc_misc);
+                       if (mc_bank->mc_status & MCi_STATUS_ADDRV)
+                               printk(" at %16"PRIx64, mc_bank->mc_addr);
+                       printk("\n");
+               }
+
+               /* Advance before re-testing the type: a bare "continue"
+                * here would skip this step and loop forever on a
+                * non-bank entry. */
+               mic = x86_mcinfo_next(mic);
+               if ((mic == NULL) || (mic->size == 0))
+                       break;
+       } while (1);
+}
+
+
+
+/* Machine Check Architecture Hypercall */
+long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
+{
+       long ret = 0;
+       struct xen_mc curop, *op = &curop;
+       struct vcpu *v = current;
+       struct domain *domU;
+       struct xen_mc_fetch *mc_fetch;
+       struct xen_mc_notifydomain *mc_notifydomain;
+       struct mc_info *mi;
+       uint32_t flags;
+       uint32_t fetch_idx;
+        uint16_t vcpuid;
+       /* Use a different lock for the notify hypercall in order to allow
+        * a DomU to fetch mc data while Dom0 notifies another DomU. */
+       static DEFINE_SPINLOCK(mc_lock);
+       static DEFINE_SPINLOCK(mc_notify_lock);
+
+       if ( copy_from_guest(op, u_xen_mc, 1) )
+               return -EFAULT;
+
+       if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
+               return -EACCES;
+
+       switch ( op->cmd ) {
+       case XEN_MC_fetch:
+               /* This hypercall is for any domain */
+               mc_fetch = &op->u.mc_fetch;
+
+               switch (mc_fetch->flags) {
+               case XEN_MC_CORRECTABLE:
+                       /* But polling mode is Dom0 only, because
+                        * correctable errors are reported to Dom0 only */
+                       if ( !IS_PRIV(v->domain) )
+                               return -EPERM;
+                       break;
+
+               case XEN_MC_TRAP:
+                       break;
+               default:
+                       return -EFAULT;
+               }
+
+               flags = XEN_MC_OK;
+               spin_lock(&mc_lock);
+
+               if ( IS_PRIV(v->domain) ) {
+                       /* this must be Dom0. So a notify hypercall
+                        * can't have happened before. */
+                       mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+               } else {
+                       /* Hypercall comes from an unprivileged domain */
+                       domU = v->domain;
+                       if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+                               /* Dom0 must have notified this DomU before
+                                * via the notify hypercall. */
+                               mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+                       } else {
+                               /* Xen notified the DomU. */
+                               mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+                       }
+               }
+
+               if (mi) {
+                       memcpy(&mc_fetch->mc_info, mi,
+                               sizeof(struct mc_info));
+               } else {
+                       /* There is no data for a bogus DomU command. */
+                       flags |= XEN_MC_NODATA;
+                       memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
+               }
+
+               mc_fetch->flags = flags;
+               mc_fetch->fetch_idx = fetch_idx;
+
+               if ( copy_to_guest(u_xen_mc, op, 1) )
+                       ret = -EFAULT;
+
+               spin_unlock(&mc_lock);
+               break;
+
+       case XEN_MC_notifydomain:
+               /* This hypercall is for Dom0 only */
+               if ( !IS_PRIV(v->domain) )
+                       return -EPERM;
+
+               spin_lock(&mc_notify_lock);
+
+               mc_notifydomain = &op->u.mc_notifydomain;
+               domU = get_domain_by_id(mc_notifydomain->mc_domid);
+               vcpuid = mc_notifydomain->mc_vcpuid;
+
+               if ((domU == NULL) || (domU == dom0)) {
+                       /* It's not possible to notify a non-existent domain
+                        * or the dom0. */
+                       spin_unlock(&mc_notify_lock);
+                       return -EACCES;
+               }
+
+               if (vcpuid >= MAX_VIRT_CPUS) {
+                       /* It's not possible to notify a vcpu that Xen
+                        * can't assign to a domain. */
+                       spin_unlock(&mc_notify_lock);
+                       return -EACCES;
+               }
+
+               mc_notifydomain->flags = XEN_MC_OK;
+
+               mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
+               if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
+                       /* The error telemetry is not for the guest
+                        * Dom0 wants to notify. */
+                       mc_notifydomain->flags |= XEN_MC_NOMATCH;
+               } else if ( guest_has_trap_callback(domU, vcpuid,
+                                               TRAP_machine_check) )
+               {
+                       /* Send notification */
+                       if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
+                               mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
+               } else
+                       mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
+
+#ifdef DEBUG
+               /* sanity check - these two flags are mutually exclusive */
+               if ((flags & XEN_MC_CANNOTHANDLE) && (flags & 
XEN_MC_NOTDELIVERED))
+                       BUG();
+#endif
+
+               if ( copy_to_guest(u_xen_mc, op, 1) )
+                       ret = -EFAULT;
+
+               if (ret == 0) {
+                       x86_mcinfo_marknotified(mc_notifydomain);
+               }
+
+               spin_unlock(&mc_notify_lock);
+               break;
+       }
+
+       return ret;
+}
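The lockless slot allocation in x86_mcinfo_getptr() above deserves spelling out: concurrent writers race on mc_data.error_idx with cmpxchg until one of them wins, and the winner owns ring slot "entry % MAX_MCINFO". A standalone sketch of the same idea, using C11 atomics in place of Xen's cmpxchg() (the ring payload is a plain int here just for demonstration):

/* Sketch only - the producer-index claim, outside the hypervisor. */
#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

#define MAX_MCINFO 10

static _Atomic uint32_t error_idx;
static int ring[MAX_MCINFO];

static int *claim_slot(void)
{
    uint32_t entry, next;
    do {
        entry = atomic_load(&error_idx);
        next = entry + 1;
        /* On failure another writer won the race; retry with a fresh read. */
    } while (!atomic_compare_exchange_weak(&error_idx, &entry, next));
    return &ring[entry % MAX_MCINFO];   /* ring wraps; consumers must keep up */
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        int *slot = claim_slot();
        *slot = i;
        printf("claimed slot %td\n", slot - ring);
    }
    return 0;
}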
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Fri Jul 04 16:27:44 2008 +0100
@@ -1,14 +1,30 @@
 #include <xen/init.h>
+#include <asm/traps.h>
 
-void amd_mcheck_init(struct cpuinfo_x86 *c);
+/* Init functions */
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
-/* Call the installed machine check handler for this CPU setup. */
-extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code);
+/* Function pointer used in the handlers to collect additional information
+ * provided by newer CPU families/models without the need to duplicate
+ * the whole handler resulting in various handlers each with its own
+ * tweaks and bugs */
+extern int (*mc_callback_bank_extended)(struct mc_info *mi,
+               uint16_t bank, uint64_t status);
 
+
+/* Helper functions used for collecting error telemetry */
+struct mc_info *x86_mcinfo_getptr(void);
+void x86_mcinfo_clear(struct mc_info *mi);
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+/* Global variables */
 extern int mce_disabled __initdata;
-extern int nr_mce_banks;
-
+extern unsigned int nr_mce_banks;
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c       Fri Jul 04 16:27:44 2008 +0100
@@ -68,19 +68,29 @@ static int __init init_nonfatal_mce_chec
        if (!cpu_has(c, X86_FEATURE_MCA))
                return -ENODEV;
 
-       /* Some Athlons misbehave when we frob bank 0 */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-               boot_cpu_data.x86 == 6)
-                       firstbank = 1;
-       else
-                       firstbank = 0;
-
        /*
         * Check for non-fatal errors every MCE_RATE s
         */
-       init_timer(&mce_timer, mce_work_fn, NULL, 0);
-       set_timer(&mce_timer, NOW() + MCE_PERIOD);
-       printk(KERN_INFO "Machine check exception polling timer started.\n");
+       switch (c->x86_vendor) {
+       case X86_VENDOR_AMD:
+               if (c->x86 == 6) { /* K7 */
+                       firstbank = 1;
+                       init_timer(&mce_timer, mce_work_fn, NULL, 0);
+                       set_timer(&mce_timer, NOW() + MCE_PERIOD);
+                       break;
+               }
+
+               /* Assume we are on K8 or newer AMD CPU here */
+               amd_nonfatal_mcheck_init(c);
+               break;
+
+       case X86_VENDOR_INTEL:
+               init_timer(&mce_timer, mce_work_fn, NULL, 0);
+               set_timer(&mce_timer, NOW() + MCE_PERIOD);
+               break;
+       }
+
+       printk(KERN_INFO "MCA: Machine check polling timer started.\n");
        return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/cpu/mcheck/x86_mca.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,72 @@
+/*
+ * MCA implementation for AMD K7/K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* The MCA/MCE MSRs should not be used anywhere else.
+ * They are CPU family/model specific and are meant solely
+ * for use in machine check handling.
+ * So we define them here rather than in <asm/msr.h>.
+ */
+
+
+/* Bitfield of the MSR_IA32_MCG_CAP register */
+#define MCG_CAP_COUNT           0x00000000000000ffULL
+#define MCG_CTL_P               0x0000000000000100ULL
+/* Bits 9-63 are reserved */
+
+/* Bitfield of the MSR_IA32_MCG_STATUS register */
+#define MCG_STATUS_RIPV         0x0000000000000001ULL
+#define MCG_STATUS_EIPV         0x0000000000000002ULL
+#define MCG_STATUS_MCIP         0x0000000000000004ULL
+/* Bits 3-63 are reserved */
+
+/* Bitfield of MSR_K8_MCi_STATUS registers */
+/* MCA error code */
+#define MCi_STATUS_MCA          0x000000000000ffffULL
+/* model-specific error code */
+#define MCi_STATUS_MSEC         0x00000000ffff0000ULL
+/* Other information */
+#define MCi_STATUS_OTHER        0x01ffffff00000000ULL
+/* processor context corrupt */
+#define MCi_STATUS_PCC          0x0200000000000000ULL
+/* MSR_K8_MCi_ADDR register valid */
+#define MCi_STATUS_ADDRV        0x0400000000000000ULL
+/* MSR_K8_MCi_MISC register valid */
+#define MCi_STATUS_MISCV        0x0800000000000000ULL
+/* error condition enabled */
+#define MCi_STATUS_EN           0x1000000000000000ULL
+/* uncorrected error */
+#define MCi_STATUS_UC           0x2000000000000000ULL
+/* status register overflow */
+#define MCi_STATUS_OVER         0x4000000000000000ULL
+/* valid */
+#define MCi_STATUS_VAL          0x8000000000000000ULL
+
+/* Bitfield of the MCi_STATUS_OTHER field */
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED1      0x00001fff00000000ULL
+/* uncorrectable ECC error */
+#define MCi_STATUS_OTHER_UC_ECC         0x0000200000000000ULL
+/* correctable ECC error */
+#define MCi_STATUS_OTHER_C_ECC          0x0000400000000000ULL
+/* ECC syndrome of an ECC error */
+#define MCi_STATUS_OTHER_ECC_SYNDROME   0x007f800000000000ULL
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
+
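
For illustration, decoding a raw MCi_STATUS value with the masks above might
look like this (the sample value is made up):

    uint64_t status = 0x9400000000000175ULL;  /* hypothetical raw value */

    if (status & MCi_STATUS_VAL) {
        uint16_t mca_code   = (uint16_t)(status & MCi_STATUS_MCA);
        uint16_t model_code = (uint16_t)((status & MCi_STATUS_MSEC) >> 16);
        int uncorrected     = !!(status & MCi_STATUS_UC);
        /* MSR_K8_MCi_ADDR holds a usable address only if ADDRV is set. */
        int addr_valid      = !!(status & MCi_STATUS_ADDRV);
    }
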
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/nmi.c
--- a/xen/arch/x86/nmi.c        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/nmi.c        Fri Jul 04 16:27:44 2008 +0100
@@ -457,10 +457,10 @@ static void do_nmi_stats(unsigned char k
     if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
         return;
 
-    if ( v->nmi_pending || v->nmi_masked )
+    if ( v->nmi_pending || (v->trap_priority >= VCPU_TRAP_NMI) )
         printk("dom0 vpu0: NMI %s%s\n",
                v->nmi_pending ? "pending " : "",
-               v->nmi_masked  ? "masked " : "");
+               (v->trap_priority >= VCPU_TRAP_NMI)  ? "masked " : "");
     else
         printk("dom0 vcpu0: NMI neither pending nor masked\n");
 }
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/traps.c      Fri Jul 04 16:27:44 2008 +0100
@@ -61,6 +61,7 @@
 #include <asm/msr.h>
 #include <asm/shared.h>
 #include <asm/x86_emulate.h>
+#include <asm/traps.h>
 #include <asm/hvm/vpt.h>
 #include <public/arch-x86/cpuid.h>
 
@@ -486,6 +487,20 @@ static unsigned int check_guest_io_break
 }
 
 /*
+ * Called from asm to set up the MCE trapbounce info.
+ * Returns 0 if no callback is set up, else 1.
+ */
+asmlinkage int set_guest_machinecheck_trapbounce(void)
+{
+    struct vcpu *v = current;
+    struct trap_bounce *tb = &v->arch.trap_bounce;
+ 
+    do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
+    tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
+    return !null_trap_bounce(v, tb);
+}
+
+/*
  * Called from asm to set up the NMI trapbounce info.
  * Returns 0 if no callback is set up, else 1.
  */
@@ -904,8 +919,6 @@ asmlinkage void do_int3(struct cpu_user_
 
 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
 {
-    extern fastcall void (*machine_check_vector)(
-        struct cpu_user_regs *, long error_code);
     machine_check_vector(regs, regs->error_code);
 }
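
The indirection above means a family-specific init routine installs its own
handler at boot; schematically (the handler and init names here are
illustrative, not quoted from this hunk):

    /* In e.g. a K8 mcheck init routine, during CPU setup: */
    machine_check_vector = k8_machine_check;
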
 
@@ -2678,25 +2691,51 @@ asmlinkage void do_general_protection(st
     panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
 }
 
+static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
+
 static void nmi_mce_softirq(void)
 {
-    /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
-    vcpu_kick(dom0->vcpu[0]);
+    int cpu = smp_processor_id();
+    struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
+    cpumask_t affinity;
+
+    BUG_ON(st == NULL);
+    BUG_ON(st->vcpu == NULL);
+
+    /* Set the tmp value unconditionally, so that
+     * the check in the iret hypercall works. */
+    st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
+
+    if ((cpu != st->processor)
+       || (st->processor != st->vcpu->processor))
+    {
+        /* We are on a different physical cpu.
+         * Make sure to wake up the vcpu on the
+         * specified processor.
+         */
+        cpus_clear(affinity);
+        cpu_set(st->processor, affinity);
+        vcpu_set_affinity(st->vcpu, &affinity);
+
+        /* Affinity is restored in the iret hypercall. */
+    }
+
+    /* Only used to defer wakeup of domain/vcpu to
+     * a safe (non-NMI/MCE) context.
+     */
+    vcpu_kick(st->vcpu);
 }
 
 static void nmi_dom0_report(unsigned int reason_idx)
 {
-    struct domain *d;
-    struct vcpu   *v;
-
-    if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
+    struct domain *d = dom0;
+
+    if ( (d == NULL) || (d->vcpu[0] == NULL) )
         return;
 
     set_bit(reason_idx, nmi_reason(d));
 
-    /* Not safe to wake a vcpu here, or even to schedule a tasklet! */
-    if ( !test_and_set_bool(v->nmi_pending) )
-        raise_softirq(NMI_MCE_SOFTIRQ);
+    send_guest_trap(d, 0, TRAP_nmi);
 }
 
 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
@@ -3010,6 +3049,70 @@ long unregister_guest_nmi_callback(void)
     return 0;
 }
 
+int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
+{
+    struct vcpu *v;
+    struct trap_info *t;
+
+    BUG_ON(d == NULL);
+    BUG_ON(vcpuid >= MAX_VIRT_CPUS);
+
+    /* Sanity check - XXX should be more fine-grained. */
+    BUG_ON(trap_nr > TRAP_syscall);
+
+    v = d->vcpu[vcpuid];
+    t = &v->arch.guest_context.trap_ctxt[trap_nr];
+
+    return (t->address != 0);
+}
+
+
+int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
+{
+    struct vcpu *v;
+    struct softirq_trap *st;
+
+    BUG_ON(d == NULL);
+    BUG_ON(vcpuid >= MAX_VIRT_CPUS);
+    v = d->vcpu[vcpuid];
+
+    switch (trap_nr) {
+    case TRAP_nmi:
+        if ( !test_and_set_bool(v->nmi_pending) ) {
+               st = &per_cpu(softirq_trap, smp_processor_id());
+               st->domain = dom0;
+               st->vcpu = dom0->vcpu[0];
+               st->processor = st->vcpu->processor;
+
+               /* not safe to wake up a vcpu here */
+               raise_softirq(NMI_MCE_SOFTIRQ);
+               return 0;
+        }
+        break;
+
+    case TRAP_machine_check:
+
+        /* We are called by the machine check (exception or polling) handlers
+         * on the physical CPU that reported a machine check error. */
+
+        if ( !test_and_set_bool(v->mce_pending) ) {
+                st = &per_cpu(softirq_trap, smp_processor_id());
+                st->domain = d;
+                st->vcpu = v;
+                st->processor = v->processor;
+
+                /* not safe to wake up a vcpu here */
+                raise_softirq(NMI_MCE_SOFTIRQ);
+                return 0;
+        }
+        break;
+    }
+
+    /* delivery failed */
+    return -EIO;
+}
+
+
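For illustration, the intended calling pattern from a machine check handler
would be roughly the following (a sketch; the handler code itself lives in
mce.c and is not part of this hunk):

    /* Deliver the machine check to the impacted domain if it can take it. */
    if ( guest_has_trap_callback(d, vcpuid, TRAP_machine_check) )
    {
        if ( send_guest_trap(d, vcpuid, TRAP_machine_check) )
            /* non-zero (-EIO): an MCE is already pending on that vcpu */
            gdprintk(XENLOG_WARNING, "MCE delivery to dom%d failed\n",
                     d->domain_id);
    }
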
 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
 {
     struct trap_info cur;
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 16:27:44 2008 +0100
@@ -67,7 +67,11 @@ void __dummy__(void)
            arch.guest_context.kernel_sp);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
-    OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked);
+    OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
+    OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority);
+    OFFSET(VCPU_trap_priority, struct vcpu, trap_priority);
+    DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI);
+    DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE);
     DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
     BLANK();
 
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_32/entry.S       Fri Jul 04 16:27:44 2008 +0100
@@ -229,6 +229,8 @@ test_all_events:
         shl  $IRQSTAT_shift,%eax
         test %ecx,irq_stat(%eax,1)
         jnz  process_softirqs
+        testb $1,VCPU_mce_pending(%ebx)
+        jnz  process_mce
         testb $1,VCPU_nmi_pending(%ebx)
         jnz  process_nmi
 test_guest_events:
@@ -255,15 +257,35 @@ process_softirqs:
         jmp  test_all_events
 
         ALIGN
+/* %ebx: struct vcpu */
+process_mce:
+        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx)
+        jae  test_guest_events
+        sti
+        movb $0,VCPU_mce_pending(%ebx)
+        call set_guest_machinecheck_trapbounce
+        test %eax,%eax
+        jz   test_all_events
+        movw VCPU_trap_priority(%ebx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%ebx)       # iret hypercall
+        movw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx)
+        jmp process_trap
+
+        ALIGN
+/* %ebx: struct vcpu */
 process_nmi:
-        testb $1,VCPU_nmi_masked(%ebx)
-        jnz  test_guest_events
+        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx)
+        jae  test_guest_events
         sti
         movb $0,VCPU_nmi_pending(%ebx)
         call set_guest_nmi_trapbounce
         test %eax,%eax
         jz   test_all_events
-        movb $1,VCPU_nmi_masked(%ebx)
+        movw VCPU_trap_priority(%ebx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%ebx)       # iret hypercall
+        movw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx)
+        /* FALLTHROUGH */
+process_trap:
         leal VCPU_trap_bounce(%ebx),%edx
         call create_bounce_frame
         jmp  test_all_events
@@ -681,6 +703,10 @@ ENTRY(hypercall_table)
         .long do_sysctl             /* 35 */
         .long do_domctl
         .long do_kexec_op
+        .rept __HYPERVISOR_arch_0-((.-hypercall_table)/4)
+        .long do_ni_hypercall
+        .endr
+        .long do_mca                /* 48 */
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -724,6 +750,10 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
         .byte 2 /* do_kexec_op          */
+        .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
+        .byte 0 /* do_ni_hypercall      */
+        .endr
+        .byte 1 /* do_mca               */  /* 48 */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_32/traps.c       Fri Jul 04 16:27:44 2008 +0100
@@ -255,8 +255,13 @@ unsigned long do_iret(void)
             goto exit_and_crash;
     }
 
-    /* No longer in NMI context. */
-    v->nmi_masked = 0;
+    /* Restore affinity.  */
+    if ((v->trap_priority >= VCPU_TRAP_NMI)
+       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
+        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
+
+    /* Restore previous trap priority */
+    v->trap_priority = v->old_trap_priority;
 
     /* Restore upcall mask from supplied EFLAGS.IF. */
     vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 16:27:44 2008 +0100
@@ -92,7 +92,11 @@ void __dummy__(void)
     OFFSET(VCPU_kernel_ss, struct vcpu, arch.guest_context.kernel_ss);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
-    OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked);
+    OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
+    OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority);
+    OFFSET(VCPU_trap_priority, struct vcpu, trap_priority);
+    DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI);
+    DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE);
     DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
     DEFINE(_VGCF_syscall_disables_events,  _VGCF_syscall_disables_events);
     BLANK();
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/compat/entry.S
--- a/xen/arch/x86/x86_64/compat/entry.S        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/compat/entry.S        Fri Jul 04 16:27:44 2008 +0100
@@ -101,6 +101,8 @@ ENTRY(compat_test_all_events)
         leaq  irq_stat(%rip),%rcx
         testl $~0,(%rcx,%rax,1)
         jnz   compat_process_softirqs
+        testb $1,VCPU_mce_pending(%rbx)
+        jnz   compat_process_mce
         testb $1,VCPU_nmi_pending(%rbx)
         jnz   compat_process_nmi
 compat_test_guest_events:
@@ -129,15 +131,34 @@ compat_process_softirqs:
 
        ALIGN
 /* %rbx: struct vcpu */
+compat_process_mce:
+        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jae  compat_test_guest_events
+        sti
+        movb $0,VCPU_mce_pending(%rbx)
+        call set_guest_machinecheck_trapbounce
+        testl %eax,%eax
+        jz    compat_test_all_events
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw  $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jmp   compat_process_trap
+
+       ALIGN
+/* %rbx: struct vcpu */
 compat_process_nmi:
-        testb $1,VCPU_nmi_masked(%rbx)
-        jnz   compat_test_guest_events
+        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        jae   compat_test_guest_events
         sti
         movb  $0,VCPU_nmi_pending(%rbx)
         call  set_guest_nmi_trapbounce
         testl %eax,%eax
         jz    compat_test_all_events
-        movb  $1,VCPU_nmi_masked(%rbx)
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw  $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        /* FALLTHROUGH */
+compat_process_trap:
         leaq  VCPU_trap_bounce(%rbx),%rdx
         call  compat_create_bounce_frame
         jmp   compat_test_all_events
@@ -386,6 +407,10 @@ ENTRY(compat_hypercall_table)
         .quad do_sysctl                 /* 35 */
         .quad do_domctl
         .quad compat_kexec_op
+        .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8)
+        .quad compat_ni_hypercall
+        .endr
+        .quad do_mca                    /* 48 */
         .rept NR_hypercalls-((.-compat_hypercall_table)/8)
         .quad compat_ni_hypercall
         .endr
@@ -429,6 +454,10 @@ ENTRY(compat_hypercall_args_table)
         .byte 1 /* do_sysctl                */  /* 35 */
         .byte 1 /* do_domctl                */
         .byte 2 /* compat_kexec_op          */
+        .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table)
+        .byte 0 /* compat_ni_hypercall      */
+        .endr
+        .byte 1 /* do_mca                   */
         .rept NR_hypercalls-(.-compat_hypercall_args_table)
         .byte 0 /* compat_ni_hypercall      */
         .endr
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/compat/traps.c
--- a/xen/arch/x86/x86_64/compat/traps.c        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/compat/traps.c        Fri Jul 04 16:27:44 2008 +0100
@@ -121,8 +121,13 @@ unsigned int compat_iret(void)
     else
         regs->_esp += 16;
 
-    /* No longer in NMI context. */
-    v->nmi_masked = 0;
+    /* Restore affinity.  */
+    if ((v->trap_priority >= VCPU_TRAP_NMI)
+       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
+        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
+
+    /* Restore previous trap priority */
+    v->trap_priority = v->old_trap_priority;
 
     /* Restore upcall mask from supplied EFLAGS.IF. */
     vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/entry.S
--- a/xen/arch/x86/x86_64/entry.S       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/entry.S       Fri Jul 04 16:27:44 2008 +0100
@@ -205,6 +205,8 @@ test_all_events:
         leaq  irq_stat(%rip),%rcx
         testl $~0,(%rcx,%rax,1)
         jnz   process_softirqs
+        testb $1,VCPU_mce_pending(%rbx)
+        jnz   process_mce
         testb $1,VCPU_nmi_pending(%rbx)
         jnz   process_nmi
 test_guest_events:
@@ -231,15 +233,34 @@ process_softirqs:
 
         ALIGN
 /* %rbx: struct vcpu */
+process_mce:
+        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jae  test_guest_events
+        sti
+        movb $0,VCPU_mce_pending(%rbx)
+        call set_guest_machinecheck_trapbounce
+        test %eax,%eax
+        jz   test_all_events
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
+        jmp  process_trap
+
+        ALIGN
+/* %rbx: struct vcpu */
 process_nmi:
-        testb $1,VCPU_nmi_masked(%rbx)
-        jnz  test_guest_events
+        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        jae  test_guest_events
         sti
         movb $0,VCPU_nmi_pending(%rbx)
         call set_guest_nmi_trapbounce
         test %eax,%eax
         jz   test_all_events
-        movb $1,VCPU_nmi_masked(%rbx)
+        movw VCPU_trap_priority(%rbx),%dx           # save priority for the
+        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
+        movw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
+        /* FALLTHROUGH */
+process_trap:
         leaq VCPU_trap_bounce(%rbx),%rdx
         call create_bounce_frame
         jmp  test_all_events
@@ -671,6 +692,10 @@ ENTRY(hypercall_table)
         .quad do_sysctl             /* 35 */
         .quad do_domctl
         .quad do_kexec_op
+        .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8)
+        .quad do_ni_hypercall
+        .endr
+        .quad do_mca                /* 48 */
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -715,6 +740,10 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_domctl            */
         .byte 2 /* do_kexec             */
         .byte 1 /* do_xsm_op            */
+        .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
+        .byte 0 /* do_ni_hypercall      */
+        .endr
+        .byte 1 /* do_mca               */  /* 48 */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
diff -r d133d452cb71 -r a49673cd23d2 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/arch/x86/x86_64/traps.c       Fri Jul 04 16:27:44 2008 +0100
@@ -288,8 +288,13 @@ unsigned long do_iret(void)
         regs->rcx = iret_saved.rcx;
     }
 
-    /* No longer in NMI context. */
-    v->nmi_masked = 0;
+    /* Restore affinity.  */
+    if ((v->trap_priority >= VCPU_TRAP_NMI)
+       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
+        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
+
+    /* Restore previous trap priority */
+    v->trap_priority = v->old_trap_priority;
 
     /* Restore upcall mask from supplied EFLAGS.IF. */
     vcpu_info(v, evtchn_upcall_mask) = !(iret_saved.rflags & EF_IE);
diff -r d133d452cb71 -r a49673cd23d2 xen/common/domain.c
--- a/xen/common/domain.c       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/common/domain.c       Fri Jul 04 16:27:44 2008 +0100
@@ -654,7 +654,9 @@ void vcpu_reset(struct vcpu *v)
     v->is_polling      = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
-    v->nmi_masked      = 0;
+    v->mce_pending     = 0;
+    v->old_trap_priority = VCPU_TRAP_NONE;
+    v->trap_priority   = VCPU_TRAP_NONE;
     clear_bit(_VPF_blocked, &v->pause_flags);
 
     domain_unlock(v->domain);
diff -r d133d452cb71 -r a49673cd23d2 xen/common/event_channel.c
--- a/xen/common/event_channel.c        Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/common/event_channel.c        Fri Jul 04 16:27:44 2008 +0100
@@ -587,6 +587,21 @@ void send_guest_vcpu_virq(struct vcpu *v
     evtchn_set_pending(v, port);
 }
 
+int guest_enabled_event(struct vcpu *v, int virq)
+{
+    int port;
+
+    if ( unlikely(v == NULL) )
+        return 0;
+
+    port = v->virq_to_evtchn[virq];
+    if ( port == 0 )
+        return 0;
+
+    /* virq is in use */
+    return 1;
+}
+
 void send_guest_global_virq(struct domain *d, int virq)
 {
     int port;
diff -r d133d452cb71 -r a49673cd23d2 xen/include/Makefile
--- a/xen/include/Makefile      Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/Makefile      Fri Jul 04 16:27:44 2008 +0100
@@ -20,6 +20,7 @@ headers-y := \
     compat/xen.h \
     compat/xencomm.h \
     compat/xenoprof.h
+headers-$(CONFIG_X86)     += compat/arch-x86/xen-mca.h
 headers-$(CONFIG_X86)     += compat/arch-x86/xen.h
 headers-$(CONFIG_X86)     += compat/arch-x86/xen-$(compat-arch-y).h
 headers-y                 += compat/arch-$(compat-arch-y).h compat/xlat.h
diff -r d133d452cb71 -r a49673cd23d2 xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h       Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/asm-x86/event.h       Fri Jul 04 16:27:44 2008 +0100
@@ -69,7 +69,12 @@ static inline void local_event_delivery_
 /* No arch specific virq definition now. Default to global. */
 static inline int arch_virq_is_global(int virq)
 {
-    return 1;
+    switch (virq) {
+    case VIRQ_MCA:
+        return 1;
+    default:
+        return 1;
+    }
 }
 
 #endif
diff -r d133d452cb71 -r a49673cd23d2 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/asm-x86/mm.h  Fri Jul 04 16:27:44 2008 +0100
@@ -141,6 +141,9 @@ static inline u32 pickle_domptr(struct d
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
+#define maddr_get_owner(ma)   (page_get_owner(maddr_to_page((ma))))
+#define vaddr_get_owner(va)   (page_get_owner(virt_to_page((va))))
+
 #define XENSHARE_writable 0
 #define XENSHARE_readonly 1
 extern void share_xen_page_with_guest(
diff -r d133d452cb71 -r a49673cd23d2 xen/include/asm-x86/traps.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/traps.h       Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2007, 2008 Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <Christoph.Egger@xxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef ASM_TRAP_H
+#define ASM_TRAP_H
+
+struct softirq_trap {
+       struct domain *domain;  /* domain to inject trap */
+       struct vcpu *vcpu;      /* vcpu to inject trap */
+       int processor;          /* physical cpu to inject trap */
+};
+
+struct cpu_user_regs;
+
+extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+ 
+/**
+ * guest_has_trap_callback
+ *
+ * returns true (non-zero) if the guest registered a trap handler
+ */
+extern int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
+                               unsigned int trap_nr);
+
+/**
+ * send_guest_trap
+ *
+ * delivers a trap to the guest, analogous to send_guest_global_virq
+ * returns 0 on successful delivery
+ */
+extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
+                               unsigned int trap_nr);
+
+#endif /* ASM_TRAP_H */
diff -r d133d452cb71 -r a49673cd23d2 xen/include/public/arch-x86/xen-mca.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/arch-x86/xen-mca.h     Fri Jul 04 16:27:44 2008 +0100
@@ -0,0 +1,279 @@
+/******************************************************************************
+ * arch-x86/xen-mca.h
+ * 
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <Christoph.Egger@xxxxxxx>
+ *
+ * Guest OS machine check interface to x86 Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Full MCA functionality has the following use cases from the guest side:
+ *
+ * Must-haves:
+ * 1. Dom0 and DomU register machine check trap callback handlers
+ *    (already done via "set_trap_table" hypercall)
+ * 2. Dom0 registers machine check event callback handler
+ *    (doable via EVTCHNOP_bind_virq)
+ * 3. Dom0 and DomU fetch machine check data
+ * 4. Dom0 wants Xen to notify a DomU
+ * 5. Dom0 gets DomU ID from physical address
+ * 6. Dom0 wants Xen to kill a DomU (already done for "xm destroy")
+ *
+ * Nice-to-haves:
+ * 7. Dom0 wants Xen to deactivate a physical CPU
+ *    This is better done as a separate task, physical CPU hotplugging,
+ *    and the hypercall(s) should be sysctls
+ * 8. Page migration proposed from the Xen NUMA work, where Dom0 can tell
+ *    Xen to move a DomU (or Dom0 itself) away from a faulty page
+ *    producing correctable errors.
+ * 9. Offlining a physical page:
+ *    Xen frees and never re-uses a given physical page.
+ * 10. Test facility: allow Dom0 to write values into machine check MSRs
+ *     and tell Xen to trigger a machine check
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+#define XEN_MCA_INTERFACE_VERSION 0x03000001
+
+/* IN: Dom0 calls hypercall from MC event handler. */
+#define XEN_MC_CORRECTABLE  0x0
+/* IN: Dom0/DomU calls hypercall from MC trap handler. */
+#define XEN_MC_TRAP         0x1
+/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+
+/* OUT: All is ok */
+#define XEN_MC_OK           0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED  0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA       0x2
+/* OUT: Between notification time and this hypercall another
+ *  (most likely correctable) error happened. The fetched data
+ *  does not match the original machine check data. */
+#define XEN_MC_NOMATCH      0x4
+
+/* OUT: DomU did not register MC NMI handler. Try something else. */
+#define XEN_MC_CANNOTHANDLE 0x8
+/* OUT: Notifying DomU failed. Retry later or try something else. */
+#define XEN_MC_NOTDELIVERED 0x10
+/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
+
+
+#ifndef __ASSEMBLY__
+
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
+/*
+ * Machine Check Architecture:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: correctable errors are reported via VIRQ_MCA.
+ * Dom0 and DomU: uncorrectable errors are reported via NMI handlers.
+ */
+#define MC_TYPE_GLOBAL          0
+#define MC_TYPE_BANK            1
+#define MC_TYPE_EXTENDED        2
+
+struct mcinfo_common {
+    uint16_t type;      /* structure type */
+    uint16_t size;      /* size of this struct in bytes */
+};
+
+
+#define MC_FLAG_CORRECTABLE     (1 << 0)
+#define MC_FLAG_UNCORRECTABLE   (1 << 1)
+
+/* contains global x86 mc information */
+struct mcinfo_global {
+    struct mcinfo_common common;
+
+    /* domain running at the time of the error (most likely the impacted one) */
+    uint16_t mc_domid;
+    uint32_t mc_socketid; /* physical socket of the physical core */
+    uint16_t mc_coreid; /* physical impacted core */
+    uint16_t mc_core_threadid; /* core thread of physical core */
+    uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
+    uint64_t mc_gstatus; /* global status */
+    uint32_t mc_flags;
+};
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+    struct mcinfo_common common;
+
+    uint16_t mc_bank; /* bank nr */
+    uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr, on Dom0
+                        * and only if mc_addr is valid. Never valid on DomU. */
+    uint64_t mc_status; /* bank status */
+    uint64_t mc_addr;   /* bank address, only valid
+                         * if addr bit is set in mc_status */
+    uint64_t mc_misc;
+};
+
+
+struct mcinfo_msr {
+    uint64_t reg;   /* MSR */
+    uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other
+ * or additional mc MSRs */ 
+struct mcinfo_extended {
+    struct mcinfo_common common;
+
+    /* You can fill up to five registers.
+     * If you need more, then use this structure
+     * multiple times. */
+
+    uint32_t mc_msrs; /* Number of MSRs with valid values. */
+    struct mcinfo_msr mc_msr[5];
+};
+
+#define MCINFO_HYPERCALLSIZE   1024
+#define MCINFO_MAXSIZE         768
+
+struct mc_info {
+    /* Number of mcinfo_* entries in mi_data */
+    uint32_t mi_nentries;
+
+    uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
+};
+typedef struct mc_info mc_info_t;
+
+
+
+/*
+ * OSes should use these rather than writing their own lookup functions,
+ * each with its own bugs and drawbacks.
+ * We use macros instead of static inline functions to allow guests
+ * to include this header in assembly files (*.S).
+ */
+/* Prototype:
+ *    uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi)    \
+    (_mi)->mi_nentries
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi)       \
+    (struct mcinfo_common *)((_mi)->mi_data)
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+    (struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)
+
+/* Prototype:
+ *    void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
+ */
+#define x86_mcinfo_lookup(_ret, _mi, _type)    \
+    do {                                                        \
+        uint32_t found, i;                                      \
+        struct mcinfo_common *_mic;                             \
+                                                                \
+        found = 0;                                              \
+        (_ret) = NULL;                                          \
+        if ((_mi) == NULL) break;                               \
+        _mic = x86_mcinfo_first(_mi);                           \
+        for (i = 0; i < x86_mcinfo_nentries(_mi); i++) {        \
+            if (_mic->type == (_type)) {                        \
+                found = 1;                                      \
+                break;                                          \
+            }                                                   \
+            _mic = x86_mcinfo_next(_mic);                       \
+        }                                                       \
+        (_ret) = found ? _mic : NULL;                           \
+    } while (0)
+
+
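As an illustrative use of the accessors above (assuming 'mi' points at a
struct mc_info obtained via the fetch hypercall defined below):

    struct mcinfo_common *mic;
    struct mcinfo_global *mc_global;

    x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
    if (mic != NULL) {
        mc_global = (struct mcinfo_global *)mic;
        /* mc_global->mc_domid is the domain that was running
         * when the error was reported. */
    }
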
+/* Usecase 1
+ * Register machine check trap callback handler
+ *    (already done via "set_trap_table" hypercall)
+ */
+
+/* Usecase 2
+ * Dom0 registers machine check event callback handler
+ * done by EVTCHNOP_bind_virq
+ */
+
+/* Usecase 3
+ * Fetch machine check data from hypervisor.
+ * Note, this hypercall is special, because both Dom0 and DomU must use this.
+ */
+#define XEN_MC_fetch            1
+struct xen_mc_fetch {
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+
+    /* OUT variables. */
+    uint32_t fetch_idx;  /* only useful for Dom0 for the notify hypercall */
+    struct mc_info mc_info;
+};
+typedef struct xen_mc_fetch xen_mc_fetch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
+
+
+/* Usecase 4
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain     2
+struct xen_mc_notifydomain {
+    /* IN variables. */
+    uint16_t mc_domid;    /* The unprivileged domain to notify. */
+    uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
+                           * Usually the value echoed from the fetch hypercall. */
+    uint32_t fetch_idx;   /* value echoed from the fetch hypercall. */
+
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
+};
+typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
+
+
+struct xen_mc {
+    uint32_t cmd;
+    uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+    union {
+        struct xen_mc_fetch        mc_fetch;
+        struct xen_mc_notifydomain mc_notifydomain;
+        uint8_t pad[MCINFO_HYPERCALLSIZE];
+    } u;
+};
+typedef struct xen_mc xen_mc_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
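
A guest-side invocation of the fetch operation could then look roughly like
this (a sketch: HYPERVISOR_mca() stands in for whatever wrapper the guest OS
provides for __HYPERVISOR_mca, and error handling is elided):

    struct xen_mc mc;

    memset(&mc, 0, sizeof(mc));
    mc.cmd = XEN_MC_fetch;
    mc.interface_version = XEN_MCA_INTERFACE_VERSION;
    mc.u.mc_fetch.flags = XEN_MC_CORRECTABLE; /* e.g. from a VIRQ_MCA handler */

    HYPERVISOR_mca(&mc);  /* hypothetical guest wrapper */

    /* XEN_MC_OK is 0, so success means none of the error flags are set. */
    if ( !(mc.u.mc_fetch.flags &
           (XEN_MC_FETCHFAILED | XEN_MC_NODATA | XEN_MC_NOMATCH)) )
    {
        /* mc.u.mc_fetch.mc_info now holds the telemetry. */
    }
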
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
diff -r d133d452cb71 -r a49673cd23d2 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/public/arch-x86/xen.h Fri Jul 04 16:27:44 2008 +0100
@@ -75,6 +75,10 @@ typedef unsigned long xen_pfn_t;
 
 /* Maximum number of virtual CPUs in multi-processor guests. */
 #define MAX_VIRT_CPUS 32
+
+
+/* Machine check support */
+#include "xen-mca.h"
 
 #ifndef __ASSEMBLY__
 
diff -r d133d452cb71 -r a49673cd23d2 xen/include/xen/event.h
--- a/xen/include/xen/event.h   Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/xen/event.h   Fri Jul 04 16:27:44 2008 +0100
@@ -50,6 +50,9 @@ void free_xen_event_channel(
 void free_xen_event_channel(
     struct vcpu *local_vcpu, int port);
 
+/* Query if event channel is in use by the guest */
+int guest_enabled_event(struct vcpu *v, int virq);
+
 /* Notify remote end of a Xen-attached event channel.*/
 void notify_via_xen_event_channel(int lport);
 
diff -r d133d452cb71 -r a49673cd23d2 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Fri Jul 04 13:02:31 2008 +0100
+++ b/xen/include/xen/sched.h   Fri Jul 04 16:27:44 2008 +0100
@@ -112,10 +112,21 @@ struct vcpu
     bool_t           is_initialised;
     /* Currently running on a CPU? */
     bool_t           is_running;
+    /* MCE callback pending for this VCPU? */
+    bool_t           mce_pending;
     /* NMI callback pending for this VCPU? */
     bool_t           nmi_pending;
-    /* Avoid NMI reentry by allowing NMIs to be masked for short periods. */
-    bool_t           nmi_masked;
+
+    /* Higher-priority traps may interrupt lower-priority ones;
+     * lower-priority traps wait until higher-priority traps have finished.
+     * Note: This concept is known as the "system priority level" (spl)
+     * in the UNIX world. */
+    uint16_t         old_trap_priority;
+    uint16_t         trap_priority;
+#define VCPU_TRAP_NONE    0
+#define VCPU_TRAP_NMI     1
+#define VCPU_TRAP_MCE     2
+
     /* Require shutdown to be deferred for some asynchronous operation? */
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
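
In C terms, the gating that the modified entry.S paths implement is roughly
the following (a sketch mirroring the assembly, not code from this patch):

    /* A pending trap is delivered only if it outranks the trap currently
     * being handled; do_iret() later restores the saved priority. */
    if ( v->mce_pending && (v->trap_priority < VCPU_TRAP_MCE) )
    {
        v->old_trap_priority = v->trap_priority;  /* restored by do_iret() */
        v->trap_priority = VCPU_TRAP_MCE;
        /* ... bounce TRAP_machine_check into the guest ... */
    }
    else if ( v->nmi_pending && (v->trap_priority < VCPU_TRAP_NMI) )
    {
        v->old_trap_priority = v->trap_priority;
        v->trap_priority = VCPU_TRAP_NMI;
        /* ... bounce TRAP_nmi into the guest ... */
    }
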

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
