WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] mca: Fix several issues for MCA UCR error handling

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] mca: Fix several issues for MCA UCR error handling
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Tue, 22 Sep 2009 01:25:34 -0700
Delivery-date: Tue, 22 Sep 2009 01:27:04 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1253605052 -3600
# Node ID 8c4685fc198ef4b5ea8accf30cb0b6b828cef54f
# Parent  bcb6b95b30b13efa9635f8b8e1b7ff57c50dae3d
mca: Fix several issues for MCA UCR error handling

This patch is for fixing several issues for MCA UCR error handling on
latest Intel platforms, including:
1) For UCR errors, the MCA error code is 0xC0 ~ 0xCF instead of just 0xC0
2) Synchronization issues when clearing the error-finding flag and the
global MCIP flag. Without this fix, in some cases the MCIP flag can't be cleared.

Signed-off-by: Liping Ke <liping.ke@xxxxxxxxx>
---
 xen/arch/x86/cpu/mcheck/mce_intel.c |   73 +++++++++++++++---------------------
 1 files changed, 32 insertions(+), 41 deletions(-)

diff -r bcb6b95b30b1 -r 8c4685fc198e xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Sep 22 08:36:40 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Sep 22 08:37:32 2009 +0100
@@ -45,7 +45,6 @@ static atomic_t found_error = ATOMIC_INI
 
 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
-static int mce_barrier_last(struct mce_softirq_barrier *);
 
 #ifdef CONFIG_X86_MCE_THERMAL
 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -339,7 +338,7 @@ void intel_UCR_handler(struct mcinfo_ban
     unsigned long mfn, gfn;
     uint32_t status;
 
-    printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n");
+    printk(KERN_DEBUG "MCE: Enter UCR recovery action\n");
     result->result = MCA_NEED_RESET;
     if (bank->mc_addr != 0) {
          mfn = bank->mc_addr >> PAGE_SHIFT;
@@ -430,8 +429,10 @@ static int mce_action(mctelem_cookie_t m
         /* TODO: Add recovery actions here, such as page-offline, etc */
         memset(&mca_res, 0x0f, sizeof(mca_res));
         for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
-            if ( (mc_bank->mc_status & 0xffff) == 
-                        intel_recovery_handler[i].mca_code ) {
+            if ( ((mc_bank->mc_status & 0xffff) ==
+                        intel_recovery_handler[i].mca_code) ||
+                  ((mc_bank->mc_status & 0xfff0) ==
+                        intel_recovery_handler[i].mca_code)) {
                 /* For SRAR, OVER = 1 should have caused reset
                  * For SRAO, OVER = 1 skip recovery action, continue execution
                  */
@@ -439,10 +440,10 @@ static int mce_action(mctelem_cookie_t m
                     intel_recovery_handler[i].recovery_handler
                                 (mc_bank, mc_global, NULL, &mca_res);
                 else {
-                   if (!mc_global->mc_gstatus & MCG_STATUS_RIPV)
+                   if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
                        mca_res.result = MCA_NEED_RESET;
                    else
-                       mca_res.result = MCA_NO_ACTION; 
+                       mca_res.result = MCA_NO_ACTION;
                 }
                 if (mca_res.result & MCA_OWNER)
                     mc_bank->mc_domid = mca_res.owner;
@@ -458,13 +459,14 @@ static int mce_action(mctelem_cookie_t m
                                 "recover action, RIPV=1, let it be.\n");
                 break;
             }
-            /* For SRAR, no defined recovery action should have caused reset
-             * in MCA Handler
-             */
-            if ( i >= INTEL_MAX_RECOVERY )
-                printk(KERN_DEBUG "MCE: No software recovery action found for "
-                                "this SRAO error\n");
         }
+        /* For SRAR, no defined recovery action should have caused reset
+         * in MCA Handler
+         */
+        if ( i >= INTEL_MAX_RECOVERY )
+            printk(KERN_DEBUG "MCE: No software recovery action found for "
+                            "this SRAO error\n");
+
     }
     return 1;
 }
@@ -622,16 +624,6 @@ static void mce_barrier_exit(struct mce_
       }
 }
 
-static int mce_barrier_last(struct mce_softirq_barrier *bar)
-{
-    int gen = atomic_read(&bar->ingen);
-    if ( atomic_read(&bar->ingen) == gen &&
-        atomic_read(&bar->val) == 1 ) {
-        return 1;
-    }
-    return 0;
-}
-
 #if 0
 static void mce_barrier(struct mce_softirq_barrier *bar)
 {
@@ -645,7 +637,7 @@ static void intel_machine_check(struct c
     uint64_t gstatus;
     mctelem_cookie_t mctc = NULL;
     struct mca_summary bs;
-    cpu_banks_t clear_bank; 
+    cpu_banks_t clear_bank;
 
     mce_spin_lock(&mce_logout_lock);
 
@@ -677,9 +669,11 @@ static void intel_machine_check(struct c
         }
         atomic_set(&found_error, 1);
 
-        printk(KERN_DEBUG "MCE: clear_bank map %lx\n", 
-                *((unsigned long*)clear_bank));
+        printk(KERN_DEBUG "MCE: clear_bank map %lx on CPU%d\n",
+                *((unsigned long*)clear_bank), smp_processor_id());
         mcheck_mca_clearbanks(clear_bank);
+       /* Print MCE error */
+        x86_mcinfo_dump(mctelem_dataptr(mctc));
 
     } else {
         if (mctc != NULL)
@@ -692,29 +686,26 @@ static void intel_machine_check(struct c
      */
     mce_barrier_enter(&mce_trap_bar);
     /* According to latest MCA OS writer guide, if no error bank found
-     * on all cpus, something unexpected happening, we can't do any 
+     * on all cpus, something unexpected happening, we can't do any
      * recovery job but to reset the system.
      */
     if (atomic_read(&found_error) == 0)
         mc_panic("Unexpected condition for the MCE handler, need reset\n");
-    if (mce_barrier_last(&mce_trap_bar)) {
-        printk(KERN_DEBUG "Choose one CPU to clear error finding flag\n ");
+    mce_barrier_exit(&mce_trap_bar);
+
+    /* Clear error finding flags after all cpus finishes above judgement */
+    mce_barrier_enter(&mce_trap_bar);
+    if (atomic_read(&found_error)) {
+        printk(KERN_DEBUG "MCE: Choose one CPU "
+                       "to clear error finding flag\n ");
         atomic_set(&found_error, 0);
     }
+    mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+    if ((gstatus & MCG_STATUS_MCIP) != 0) {
+        printk(KERN_DEBUG "MCE: Clear MCIP@ last step");
+        mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
+    }
     mce_barrier_exit(&mce_trap_bar);
-
-    /*
-     * Clear MCIP if it wasn't already. There is a small
-     * chance that more than 1 CPU will end up doing this,
-     * but that's OK.
-     */
-    if (bs.errcnt) {
-        mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
-        if ((gstatus & MCG_STATUS_MCIP) != 0)
-            mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
-        /* Print MCE error */
-        x86_mcinfo_dump(mctelem_dataptr(mctc));
-    }
 
     raise_softirq(MACHINE_CHECK_SOFTIRQ);
 }

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] mca: Fix several issues for MCA UCR error handling, Xen patchbot-unstable <=