[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 3/6 ] Make mce_action action be usable for both delayed handler and urgent handler


  • To: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
  • From: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>
  • Date: Wed, 9 Jun 2010 22:32:22 +0800
  • Accept-language: en-US
  • Acceptlanguage: en-US
  • Cc: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxx>
  • Delivery-date: Wed, 09 Jun 2010 07:41:53 -0700
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>
  • Thread-index: AcsH4JLPCinaKtkNS9OlS5qm3SP2rQ==
  • Thread-topic: [PATCH 3/6 ] Make mce_action action be usable for both delayed handler and urgent handler

Make mce_action usable for both the delayed handler and the urgent handler

Originally mce_action was called only for the delayed handler. Change it so
that it can be used for both the delayed handler and the urgent handler. Wrap
it with mce_delayed_action for the delayed handler.

Change the return value to make it clearer.

Change the MCA handler match from an mca_code value to a function, to be more
flexible, and change the handler interface to take a struct mca_binfo so that
more information can be passed.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

mce_intel.c |  210 ++++++++++++++++++++++++++++++++++++------------------------
 x86_mca.h   |   15 +++-
 2 files changed, 140 insertions(+), 85 deletions(-)

diff -r fb1adcc25acb xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Wed Jun 09 09:01:49 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Wed Jun 09 14:39:37 2010 +0800
@@ -154,89 +154,124 @@ static void mce_barrier_enter(struct mce
 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
 
-static void intel_UCR_handler(struct mcinfo_bank *bank,
-             struct mcinfo_global *global,
-             struct mcinfo_extended *extension,
-             struct mca_handle_result *result);
-#define INTEL_MAX_RECOVERY 2
-struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
-            {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
+struct mca_error_handler *mce_dhandlers, *mce_uhandlers;
+int mce_dhandler_num, mce_uhandler_num;
+
+enum mce_result
+{
+    MCER_NOERROR,
+    MCER_RECOVERED,
+    /* Not recoverd, but can continue */
+    MCER_CONTINUE,
+    MCER_RESET,
+};
+
+/* Maybe called in MCE context, no lock, no printk */
+static enum mce_result mce_action(struct cpu_user_regs *regs,
+                      mctelem_cookie_t mctc)
+{
+    struct mc_info *local_mi;
+    enum mce_result ret = MCER_NOERROR;
+    uint32_t i;
+    struct mcinfo_common *mic = NULL;
+    struct mca_handle_result mca_res;
+    struct mca_binfo binfo;
+    struct mca_error_handler *handlers = mce_dhandlers;
+    int handler_num = mce_dhandler_num;
+
+    /* When in mce context, regs is valid */
+    if (regs)
+    {
+        handler_num = mce_uhandler_num;
+        handlers = mce_uhandlers;
+    }
+
+    /* At least a default handler should be registerd */
+    ASSERT(handler_num);
+
+    local_mi = (struct mc_info*)mctelem_dataptr(mctc);
+    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
+    if (mic == NULL) {
+        printk(KERN_ERR "MCE: get local buffer entry failed\n ");
+        return MCER_CONTINUE;
+    }
+
+    memset(&binfo, 0, sizeof(binfo));
+    binfo.mig = (struct mcinfo_global *)mic;
+    binfo.mi = local_mi;
+
+    /* Processing bank information */
+    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
+
+    for ( ; ret != MCER_RESET && mic && mic->size;
+          mic = x86_mcinfo_next(mic) )
+    {
+        if (mic->type != MC_TYPE_BANK) {
+            continue;
+        }
+        binfo.mib = (struct mcinfo_bank*)mic;
+        binfo.bank = binfo.mib->mc_bank;
+        memset(&mca_res, 0x0f, sizeof(mca_res));
+        for ( i = 0; i < handler_num; i++ ) {
+            if (handlers[i].owned_error(binfo.mib->mc_status))
+            {
+                handlers[i].recovery_handler(binfo.bank, &binfo, &mca_res);
+
+                if (mca_res.result & MCA_OWNER)
+                    binfo.mib->mc_domid = mca_res.owner;
+
+                if (mca_res.result == MCA_NEED_RESET)
+                    ret = MCER_RESET;
+                else if (mca_res.result == MCA_RECOVERED)
+                {
+                    if (ret < MCER_RECOVERED)
+                        ret = MCER_RECOVERED;
+                }
+                else if (mca_res.result == MCA_NO_ACTION)
+                {
+                    if (ret < MCER_CONTINUE)
+                        ret = MCER_CONTINUE;
+                }
+                break;
+            }
+        }
+        ASSERT(i != handler_num);
+    }
+
+    return ret;
+}
 
 /*
  * Called from mctelem_process_deferred. Return 1 if the telemetry
  * should be committed for dom0 consumption, 0 if it should be
  * dismissed.
  */
-static int mce_action(mctelem_cookie_t mctc)
-{
-    struct mc_info *local_mi;
-    uint32_t i;
-    struct mcinfo_common *mic = NULL;
-    struct mcinfo_global *mc_global;
-    struct mcinfo_bank *mc_bank;
-    struct mca_handle_result mca_res;
-
-    local_mi = (struct mc_info*)mctelem_dataptr(mctc);
-    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
-    if (mic == NULL) {
-        printk(KERN_ERR "MCE: get local buffer entry failed\n ");
-        return 0;
-    }
-
-    mc_global = (struct mcinfo_global *)mic;
-
-    /* Processing bank information */
-    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
-
-    for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
-        if (mic->type != MC_TYPE_BANK) {
-            continue;
-        }
-        mc_bank = (struct mcinfo_bank*)mic;
-
-        /* TODO: Add recovery actions here, such as page-offline, etc */
-        memset(&mca_res, 0x0f, sizeof(mca_res));
-        for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
-            if ( ((mc_bank->mc_status & 0xffff) ==
-                        intel_recovery_handler[i].mca_code) ||
-                  ((mc_bank->mc_status & 0xfff0) ==
-                        intel_recovery_handler[i].mca_code)) {
-                /* For SRAR, OVER = 1 should have caused reset
-                 * For SRAO, OVER = 1 skip recovery action, continue execution
-                 */
-                if (!(mc_bank->mc_status & MCi_STATUS_OVER))
-                    intel_recovery_handler[i].recovery_handler
-                                (mc_bank, mc_global, NULL, &mca_res);
-                else {
-                   if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
-                       mca_res.result = MCA_NEED_RESET;
-                   else
-                       mca_res.result = MCA_NO_ACTION;
-                }
-                if (mca_res.result & MCA_OWNER)
-                    mc_bank->mc_domid = mca_res.owner;
-                if (mca_res.result == MCA_NEED_RESET)
-                    /* DOMID_XEN*/
-                    mc_panic("MCE: Software recovery failed for the UCR "
-                                "error\n");
-                else if (mca_res.result == MCA_RECOVERED)
-                    mce_printk(MCE_VERBOSE, "MCE: The UCR error is"
-                                "successfully recovered by software!\n");
-                else if (mca_res.result == MCA_NO_ACTION)
-                    mce_printk(MCE_VERBOSE, "MCE: Overwrite SRAO error can't"
-                                "do recover action, RIPV=1, let it be.\n");
-                break;
-            }
-        }
-        /* For SRAR, no defined recovery action should have caused reset
-         * in MCA Handler
-         */
-        if ( i >= INTEL_MAX_RECOVERY )
-            mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
-                            " found for this SRAO error\n");
-
-    }
-    return 1;
+static int mce_delayed_action(mctelem_cookie_t mctc)
+{
+    enum mce_result result;
+    int ret = 0;
+
+    result = mce_action(NULL, mctc);
+
+    switch (result)
+    {
+    case MCER_RESET:
+        panic("MCE: Software recovery failed for the UCR\n");
+        break;
+    case MCER_RECOVERED:
+        dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
+        ret  = 1;
+        break;
+    case MCER_CONTINUE:
+        dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
+            "system is tainted\n");
+        ret = 1;
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+    return ret;
 }
 
 /* Softirq Handler for this MCE# processing */
@@ -274,7 +309,7 @@ static void mce_softirq(void)
          * vMCE MSRs virtualization buffer
          */
         for_each_online_cpu(workcpu) {
-            mctelem_process_deferred(workcpu, mce_action);
+            mctelem_process_deferred(workcpu, mce_delayed_action);
         }
 
         /* Step2: Send Log to DOM0 through vIRQ */
@@ -466,11 +501,18 @@ intel_get_extended_msrs(struct mcinfo_gl
     return mc_ext;
 }
 
-static void intel_UCR_handler(struct mcinfo_bank *bank,
-             struct mcinfo_global *global,
-             struct mcinfo_extended *extension,
+#define INTEL_MAX_RECOVERY 2
+static int is_async_memerr(uint64_t status)
+{
+    return (status & 0xFFFF) == 0x17A || (status & 0xFFF0) == 0xC0;
+}
+
+static void intel_memerr_dhandler(int bnum,
+             struct mca_binfo *binfo,
              struct mca_handle_result *result)
 {
+    struct mcinfo_bank *bank = binfo->mib;
+    struct mcinfo_global *global = binfo->mig;
     struct domain *d;
     unsigned long mfn, gfn;
     uint32_t status;
@@ -545,6 +587,9 @@ static void intel_UCR_handler(struct mci
     }
 }
 
+struct mca_error_handler intel_mce_dhandlers[] =
+            {{is_async_memerr, intel_memerr_dhandler}};
+
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
     uint64_t gstatus;
@@ -1007,6 +1052,9 @@ static void intel_init_mce(void)
     x86_mce_vector_register(intel_machine_check);
     mce_recoverable_register(intel_recoverable_scan);
     mce_need_clearbank_register(intel_need_clearbank_scan);
+
+    mce_dhandlers = intel_mce_dhandlers;
+    mce_dhandler_num = sizeof(intel_mce_dhandlers)/sizeof(struct mca_error_handler);
 }
 
 static int intel_init_mca_banks(void)
diff -r fb1adcc25acb xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Wed Jun 09 09:01:49 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Wed Jun 09 14:39:26 2010 +0800
@@ -151,6 +151,15 @@ struct mca_handle_result
     struct recovery_action *action;
 };
 
+/* Keep bank so that we can get status even if mib is NULL */
+struct mca_binfo {
+    int bank;
+    struct mcinfo_global *mig;
+    struct mcinfo_bank *mib;
+    struct mc_info *mi;
+    struct cpu_user_regs *regs;
+};
+
 extern void (*mca_prehandler)( struct cpu_user_regs *regs,
                         struct mca_handle_result *result);
 
@@ -161,10 +170,8 @@ struct mca_error_handler
      * a seperate function to decode the corresponding actions
      * for the particular mca error later.
     */
-    uint16_t mca_code;
-    void (*recovery_handler)( struct mcinfo_bank *bank,
-                    struct mcinfo_global *global,
-                    struct mcinfo_extended *extension,
+    int (*owned_error)(uint64_t status);
+    void (*recovery_handler)(int bank, struct mca_binfo *binfo,
                     struct mca_handle_result *result);
 };
 


Attachment: mce_action.patch
Description: mce_action.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.