[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] VT-d: honor APEI firmware-first mode in XSA-59 workaround code



When firmware-first mode is being indicated by firmware, we shouldn't
be modifying AER registers - these are considered to be owned by
firmware in that case. Violating this is being reported to result in
SMI storms. While circumventing the workaround means re-exposing
affected hosts to the XSA-59 issues, this in any event seems better
than not booting at all. Respective messages are being issued to the
log, so the situation can be diagnosed.

The basic building blocks were taken from Linux 3.15-rc. Note that
this includes a block of code enclosed in #ifdef CONFIG_X86_MCE - we
don't define that symbol, and that code also wouldn't build without
suitable machine check side code added; that should happen eventually,
but isn't subject of this change.

Reported-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Reported-by: Malcolm Crossley <malcolm.crossley@xxxxxxxxxx>
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
Tested-by: Malcolm Crossley <malcolm.crossley@xxxxxxxxxx>
---
This is unchanged (other than the Tested-by tag added) from the v2 RFC
posting.

--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -754,6 +754,8 @@ int __init acpi_boot_init(void)
 
        erst_init();
 
+       acpi_hest_init();
+
        acpi_table_parse(ACPI_SIG_BGRT, acpi_invalidate_bgrt);
 
        return 0;
--- a/xen/drivers/acpi/apei/Makefile
+++ b/xen/drivers/acpi/apei/Makefile
@@ -1,3 +1,4 @@
 obj-y += erst.o
+obj-y += hest.o
 obj-y += apei-base.o
 obj-y += apei-io.o
--- /dev/null
+++ b/xen/drivers/acpi/apei/hest.c
@@ -0,0 +1,200 @@
+/*
+ * APEI Hardware Error Souce Table support
+ *
+ * HEST describes error sources in detail; communicates operational
+ * parameters (i.e. severity levels, masking bits, and threshold
+ * values) to Linux as necessary. It also allows the BIOS to report
+ * non-standard error sources to Linux (for example, chipset-specific
+ * error registers).
+ *
+ * For more information about HEST, please refer to ACPI Specification
+ * version 4.0, section 17.3.2.
+ *
+ * Copyright 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/errno.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+#include "apei-internal.h"
+
+#define HEST_PFX "HEST: "
+
+static bool_t hest_disable;
+boolean_param("hest_disable", hest_disable);
+
+/* HEST table parsing */
+
+static struct acpi_table_hest *__read_mostly hest_tab;
+
+static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
+       [ACPI_HEST_TYPE_IA32_CHECK] = -1,       /* need further calculation */
+       [ACPI_HEST_TYPE_IA32_CORRECTED_CHECK] = -1,
+       [ACPI_HEST_TYPE_IA32_NMI] = sizeof(struct acpi_hest_ia_nmi),
+       [ACPI_HEST_TYPE_AER_ROOT_PORT] = sizeof(struct acpi_hest_aer_root),
+       [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
+       [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
+       [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+};
+
+static int hest_esrc_len(const struct acpi_hest_header *hest_hdr)
+{
+       u16 hest_type = hest_hdr->type;
+       int len;
+
+       if (hest_type >= ACPI_HEST_TYPE_RESERVED)
+               return 0;
+
+       len = hest_esrc_len_tab[hest_type];
+
+       if (hest_type == ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) {
+               const struct acpi_hest_ia_corrected *cmc =
+                       container_of(hest_hdr,
+                                    const struct acpi_hest_ia_corrected,
+                                    header);
+
+               len = sizeof(*cmc) + cmc->num_hardware_banks *
+                     sizeof(struct acpi_hest_ia_error_bank);
+       } else if (hest_type == ACPI_HEST_TYPE_IA32_CHECK) {
+               const struct acpi_hest_ia_machine_check *mc =
+                       container_of(hest_hdr,
+                                    const struct acpi_hest_ia_machine_check,
+                                    header);
+
+               len = sizeof(*mc) + mc->num_hardware_banks *
+                     sizeof(struct acpi_hest_ia_error_bank);
+       }
+       BUG_ON(len == -1);
+
+       return len;
+};
+
+int apei_hest_parse(apei_hest_func_t func, void *data)
+{
+       struct acpi_hest_header *hest_hdr;
+       int i, rc, len;
+
+       if (hest_disable || !hest_tab)
+               return -EINVAL;
+
+       hest_hdr = (struct acpi_hest_header *)(hest_tab + 1);
+       for (i = 0; i < hest_tab->error_source_count; i++) {
+               len = hest_esrc_len(hest_hdr);
+               if (!len) {
+                       printk(XENLOG_WARNING HEST_PFX
+                              "Unknown or unused hardware error source "
+                              "type: %d for hardware error source: %d\n",
+                              hest_hdr->type, hest_hdr->source_id);
+                       return -EINVAL;
+               }
+               if ((void *)hest_hdr + len >
+                   (void *)hest_tab + hest_tab->header.length) {
+                       printk(XENLOG_WARNING HEST_PFX
+                              "Table contents overflow for hardware error 
source: %d\n",
+                              hest_hdr->source_id);
+                       return -EINVAL;
+               }
+
+               rc = func(hest_hdr, data);
+               if (rc)
+                       return rc;
+
+               hest_hdr = (void *)hest_hdr + len;
+       }
+
+       return 0;
+}
+
+/*
+ * Check if firmware advertises firmware first mode. We need FF bit to be set
+ * along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
+                                void *data)
+{
+#ifdef CONFIG_X86_MCE
+       unsigned int i;
+       const struct acpi_hest_ia_corrected *cmc;
+       const struct acpi_hest_ia_error_bank *mc_bank;
+
+       if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+               return 0;
+
+       cmc = container_of(hest_hdr, const struct acpi_hest_ia_corrected, 
header);
+       if (!cmc->enabled)
+               return 0;
+
+       /*
+        * We expect HEST to provide a list of MC banks that report errors
+        * in firmware first mode. Otherwise, return non-zero value to
+        * indicate that we are done parsing HEST.
+        */
+       if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || 
!cmc->num_hardware_banks)
+               return 1;
+
+       printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for corrected 
errors.\n");
+
+       mc_bank = (const struct acpi_hest_ia_error_bank *)(cmc + 1);
+       for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+               mce_disable_bank(mc_bank->bank_number);
+#else
+# define acpi_disable_cmcff 1
+#endif
+
+       return 1;
+}
+
+void __init acpi_hest_init(void)
+{
+       acpi_status status;
+       acpi_physical_address hest_addr;
+       acpi_native_uint hest_len;
+
+       if (acpi_disabled)
+               return;
+
+       if (hest_disable) {
+               printk(XENLOG_INFO HEST_PFX "Table parsing disabled.\n");
+               return;
+       }
+
+       status = acpi_get_table_phys(ACPI_SIG_HEST, 0, &hest_addr, &hest_len);
+       if (status == AE_NOT_FOUND)
+               goto err;
+       if (ACPI_FAILURE(status)) {
+               printk(XENLOG_ERR HEST_PFX "Failed to get table, %s\n",
+                      acpi_format_exception(status));
+               goto err;
+       }
+       map_pages_to_xen((unsigned long)__va(hest_addr), PFN_DOWN(hest_addr),
+                        PFN_UP(hest_addr + hest_len) - PFN_DOWN(hest_addr),
+                        PAGE_HYPERVISOR);
+       hest_tab = __va(hest_addr);
+
+       if (!acpi_disable_cmcff)
+               apei_hest_parse(hest_parse_cmc, NULL);
+
+       printk(XENLOG_INFO HEST_PFX "Table parsing has been initialized\n");
+       return;
+err:
+       hest_disable = 1;
+}
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1069,6 +1069,106 @@ void __hwdom_init setup_hwdom_pci_device
     spin_unlock(&pcidevs_lock);
 }
 
+#ifdef CONFIG_ACPI
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+static int hest_match_pci(const struct acpi_hest_aer_common *p,
+                          const struct pci_dev *pdev)
+{
+    return ACPI_HEST_SEGMENT(p->bus) == pdev->seg &&
+           ACPI_HEST_BUS(p->bus)     == pdev->bus &&
+           p->device                 == PCI_SLOT(pdev->devfn) &&
+           p->function               == PCI_FUNC(pdev->devfn);
+}
+
+static bool_t hest_match_type(const struct acpi_hest_header *hest_hdr,
+                              const struct pci_dev *pdev)
+{
+    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus,
+                                           PCI_SLOT(pdev->devfn),
+                                           PCI_FUNC(pdev->devfn),
+                                           PCI_CAP_ID_EXP);
+    u8 pcie = MASK_EXTR(pci_conf_read16(pdev->seg, pdev->bus,
+                                        PCI_SLOT(pdev->devfn),
+                                        PCI_FUNC(pdev->devfn),
+                                        pos + PCI_EXP_FLAGS),
+                        PCI_EXP_FLAGS_TYPE);
+
+    switch ( hest_hdr->type )
+    {
+    case ACPI_HEST_TYPE_AER_ROOT_PORT:
+        return pcie == PCI_EXP_TYPE_ROOT_PORT;
+    case ACPI_HEST_TYPE_AER_ENDPOINT:
+        return pcie == PCI_EXP_TYPE_ENDPOINT;
+    case ACPI_HEST_TYPE_AER_BRIDGE:
+        return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CLASS_DEVICE) ==
+               PCI_CLASS_BRIDGE_PCI;
+    }
+
+    return 0;
+}
+
+struct aer_hest_parse_info {
+    const struct pci_dev *pdev;
+    bool_t firmware_first;
+};
+
+static bool_t hest_source_is_pcie_aer(const struct acpi_hest_header *hest_hdr)
+{
+    if ( hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE )
+        return 1;
+    return 0;
+}
+
+static int aer_hest_parse(const struct acpi_hest_header *hest_hdr, void *data)
+{
+    struct aer_hest_parse_info *info = data;
+    const struct acpi_hest_aer_common *p;
+    bool_t ff;
+
+    if ( !hest_source_is_pcie_aer(hest_hdr) )
+        return 0;
+
+    p = (const struct acpi_hest_aer_common *)(hest_hdr + 1);
+    ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST);
+
+    /*
+     * If no specific device is supplied, determine whether
+     * FIRMWARE_FIRST is set for *any* PCIe device.
+     */
+    if ( !info->pdev )
+    {
+        info->firmware_first |= ff;
+        return 0;
+    }
+
+    /* Otherwise, check the specific device */
+    if ( p->flags & ACPI_HEST_GLOBAL ?
+         hest_match_type(hest_hdr, info->pdev) :
+         hest_match_pci(p, info->pdev) )
+    {
+        info->firmware_first = ff;
+        return 1;
+    }
+
+    return 0;
+}
+
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *pdev)
+{
+    struct aer_hest_parse_info info = { .pdev = pdev };
+
+    return pci_find_cap_offset(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CAP_ID_EXP) &&
+           apei_hest_parse(aer_hest_parse, &info) >= 0 &&
+           info.firmware_first;
+}
+#endif
+
 static int _dump_pci_devices(struct pci_seg *pseg, void *arg)
 {
     struct pci_dev *pdev;
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -386,9 +386,11 @@ void pci_vtd_quirk(const struct pci_dev 
     int dev = PCI_SLOT(pdev->devfn);
     int func = PCI_FUNC(pdev->devfn);
     int pos;
-    u32 val;
+    bool_t ff;
+    u32 val, val2;
     u64 bar;
     paddr_t pa;
+    const char *action;
 
     if ( pci_conf_read16(seg, bus, dev, func, PCI_VENDOR_ID) !=
          PCI_VENDOR_ID_INTEL )
@@ -438,7 +440,10 @@ void pci_vtd_quirk(const struct pci_dev 
                 pos = pci_find_next_ext_capability(seg, bus, pdev->devfn, pos,
                                                    PCI_EXT_CAP_ID_VNDR);
             }
+            ff = 0;
         }
+        else
+            ff = pcie_aer_get_firmware_first(pdev);
         if ( !pos )
         {
             printk(XENLOG_WARNING "%04x:%02x:%02x.%u without AER 
capability?\n",
@@ -447,18 +452,26 @@ void pci_vtd_quirk(const struct pci_dev 
         }
 
         val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
-                         val | PCI_ERR_UNC_UNSUP);
-        val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
-                         val | PCI_ERR_COR_ADV_NFAT);
+        val2 = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
+        if ( (val & PCI_ERR_UNC_UNSUP) && (val2 & PCI_ERR_COR_ADV_NFAT) )
+            action = "Found masked";
+        else if ( !ff )
+        {
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
+                             val | PCI_ERR_UNC_UNSUP);
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
+                             val2 | PCI_ERR_COR_ADV_NFAT);
+            action = "Masked";
+        }
+        else
+            action = "Must not mask";
 
         /* XPUNCERRMSK Send Completion with Unsupported Request */
         val = pci_conf_read32(seg, bus, dev, func, 0x20c);
         pci_conf_write32(seg, bus, dev, func, 0x20c, val | (1 << 4));
 
-        printk(XENLOG_INFO "Masked UR signaling on %04x:%02x:%02x.%u\n",
-               seg, bus, dev, func);
+        printk(XENLOG_INFO "%s UR signaling on %04x:%02x:%02x.%u\n",
+               action, seg, bus, dev, func);
         break;
 
     case 0x100: case 0x104: case 0x108: /* Sandybridge */
--- a/xen/include/acpi/actbl1.h
+++ b/xen/include/acpi/actbl1.h
@@ -445,6 +445,14 @@ struct acpi_hest_aer_common {
 #define ACPI_HEST_FIRMWARE_FIRST        (1)
 #define ACPI_HEST_GLOBAL                (1<<1)
 
+/*
+ * Macros to access the bus/segment numbers in Bus field above:
+ *  Bus number is encoded in bits 7:0
+ *  Segment number is encoded in bits 23:8
+ */
+#define ACPI_HEST_BUS(bus)              ((bus) & 0xFF)
+#define ACPI_HEST_SEGMENT(bus)          (((bus) >> 8) & 0xFFFF)
+
 /* Hardware Error Notification */
 
 struct acpi_hest_notify {
--- a/xen/include/acpi/apei.h
+++ b/xen/include/acpi/apei.h
@@ -12,6 +12,9 @@
 
 #define FIX_APEI_RANGE_MAX 64
 
+typedef int (*apei_hest_func_t)(const struct acpi_hest_header *, void *);
+int apei_hest_parse(apei_hest_func_t, void *);
+
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
 int erst_get_next_record_id(u64 *record_id);
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -61,6 +61,7 @@ int acpi_boot_init (void);
 int acpi_boot_table_init (void);
 int acpi_numa_init (void);
 int erst_init(void);
+void acpi_hest_init(void);
 
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_table_handler handler);
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -144,6 +144,8 @@ int pci_find_next_ext_capability(int seg
 const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
                       unsigned int *dev, unsigned int *func);
 
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *);
+
 struct pirq;
 int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);


Attachment: VT-d-mask-UR-honor-firmware-first.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.