[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC XEN PATCH v4 05/41] x86/mm: exclude PMEM regions from initial frametable



No specification defines that PMEM regions cannot appear in margins
between RAM regions. If that does happen, init_frametable() will need
to allocate RAM for the part of frametable of PMEM regions. However,
PMEM regions can be very large (several terabytes or more), so
init_frametable() may fail.

Because Xen does not use PMEM at the boot time, we can defer the
actual resource allocation of frametable of PMEM regions. At the boot
time, all pages of frametable of PMEM regions appearing between RAM
regions are mapped one RAM page filled with 0xff.

Any attempt, whichs write to those frametable pages before the their
actual resource is allocated, implies bugs in Xen. Therefore, the
read-only mapping is used here to make those bugs explicit.

Signed-off-by: Haozhong Zhang <haozhong.zhang@xxxxxxxxx>
---
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>
Cc: Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
Cc: Jan Beulich <jbeulich@xxxxxxxx>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Cc: Stefano Stabellini <sstabellini@xxxxxxxxxx>
Cc: Tim Deegan <tim@xxxxxxx>
Cc: Wei Liu <wei.liu2@xxxxxxxxxx>
---
 xen/arch/x86/mm.c         | 118 +++++++++++++++++++++++++++++++++++++++++-----
 xen/arch/x86/setup.c      |   4 ++
 xen/drivers/acpi/Makefile |   2 +
 xen/drivers/acpi/nfit.c   | 116 +++++++++++++++++++++++++++++++++++++++++++++
 xen/include/acpi/actbl1.h |  43 +++++++++++++++++
 xen/include/xen/acpi.h    |   7 +++
 6 files changed, 279 insertions(+), 11 deletions(-)
 create mode 100644 xen/drivers/acpi/nfit.c

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 155e42569b..b2046ca2f0 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -83,6 +83,9 @@
  * an application-supplied buffer).
  */
 
+#ifdef CONFIG_NVDIMM_PMEM
+#include <xen/acpi.h>
+#endif
 #include <xen/init.h>
 #include <xen/kernel.h>
 #include <xen/lib.h>
@@ -188,34 +191,127 @@ static int __init parse_mmio_relax(const char *s)
 }
 custom_param("mmio-relax", parse_mmio_relax);
 
-static void __init init_frametable_chunk(void *start, void *end)
+static void __init init_frametable_ram_chunk(unsigned long s, unsigned long e)
 {
-    unsigned long s = (unsigned long)start;
-    unsigned long e = (unsigned long)end;
-    unsigned long step;
+    unsigned long cur, step;
     mfn_t mfn;
 
-    ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
-    for ( ; s < e; s += step << PAGE_SHIFT )
+    for ( cur = s; cur < e; cur += step << PAGE_SHIFT )
     {
         step = 1UL << (cpu_has_page1gb &&
-                       !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
+                       !(cur & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
                        L3_PAGETABLE_SHIFT - PAGE_SHIFT :
                        L2_PAGETABLE_SHIFT - PAGE_SHIFT);
         /*
          * The hardcoded 4 below is arbitrary - just pick whatever you think
          * is reasonable to waste as a trade-off for using a large page.
          */
-        while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
+        while ( step && cur + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
             step >>= PAGETABLE_ORDER;
         mfn = alloc_boot_pages(step, step);
-        map_pages_to_xen(s, mfn_x(mfn), step, PAGE_HYPERVISOR);
+        map_pages_to_xen(cur, mfn_x(mfn), step, PAGE_HYPERVISOR);
     }
 
-    memset(start, 0, end - start);
-    memset(end, -1, s - e);
+    memset((void *)s, 0, e - s);
+    memset((void *)e, -1, cur - e);
 }
 
+#ifdef CONFIG_NVDIMM_PMEM
+static void __init init_frametable_pmem_chunk(unsigned long s, unsigned long e)
+{
+    static mfn_t pmem_init_frametable_mfn = INVALID_MFN_INITIALIZER;
+
+    ASSERT(!((s | e) & (PAGE_SIZE - 1)));
+
+    if ( mfn_eq(pmem_init_frametable_mfn, INVALID_MFN) )
+    {
+        pmem_init_frametable_mfn = alloc_boot_pages(1, 1);
+        if ( mfn_eq(pmem_init_frametable_mfn, INVALID_MFN) )
+            panic("Not enough memory for pmem initial frame table page");
+        memset(mfn_to_virt(mfn_x(pmem_init_frametable_mfn)), -1, PAGE_SIZE);
+    }
+
+    while ( s < e )
+    {
+        /*
+         * The real frame table entries of a pmem region will be
+         * created when the pmem region is registered to hypervisor.
+         * Any write attempt to the initial entries of that pmem
+         * region implies potential hypervisor bugs. In order to make
+         * those bugs explicit, map those initial entries as read-only.
+         */
+        map_pages_to_xen(s, mfn_x(pmem_init_frametable_mfn),
+                         1, PAGE_HYPERVISOR_RO);
+        s += PAGE_SIZE;
+    }
+}
+#endif /* CONFIG_NVDIMM_PMEM */
+
+static void __init init_frametable_chunk(void *start, void *end)
+{
+    unsigned long s = (unsigned long)start;
+    unsigned long e = (unsigned long)end;
+#ifdef CONFIG_NVDIMM_PMEM
+    unsigned long pmem_smfn, pmem_emfn;
+    unsigned long pmem_spage = s, pmem_epage = s;
+    unsigned long pmem_page_aligned;
+    bool found = false;
+#endif /* CONFIG_NVDIMM_PMEM */
+
+    ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
+
+#ifndef CONFIG_NVDIMM_PMEM
+    init_frametable_ram_chunk(s, e);
+#else
+    while ( s < e )
+    {
+        /* No previous found pmem region overlaps with s ~ e. */
+        if ( s >= (pmem_epage & PAGE_MASK) )
+        {
+            found = acpi_nfit_boot_search_pmem(
+                mfn_x(page_to_mfn((struct page_info *)s)),
+                mfn_x(page_to_mfn((struct page_info *)e)),
+                &pmem_smfn, &pmem_emfn);
+            if ( found )
+            {
+                pmem_spage = (unsigned long)mfn_to_page(_mfn(pmem_smfn));
+                pmem_epage = (unsigned long)mfn_to_page(_mfn(pmem_emfn));
+            }
+        }
+
+        /* No pmem region found in s ~ e. */
+        if ( s >= (pmem_epage & PAGE_MASK) )
+        {
+            init_frametable_ram_chunk(s, e);
+            break;
+        }
+
+        if ( s < pmem_spage )
+        {
+            init_frametable_ram_chunk(s, pmem_spage);
+            pmem_page_aligned = (pmem_spage + PAGE_SIZE - 1) & PAGE_MASK;
+            if ( pmem_page_aligned > pmem_epage )
+                memset((void *)pmem_epage, -1, pmem_page_aligned - pmem_epage);
+            s = pmem_page_aligned;
+        }
+        else
+        {
+            pmem_page_aligned = pmem_epage & PAGE_MASK;
+            if ( pmem_page_aligned > s )
+                init_frametable_pmem_chunk(s, pmem_page_aligned);
+            if ( pmem_page_aligned < pmem_epage )
+            {
+                init_frametable_ram_chunk(pmem_page_aligned,
+                                          min(pmem_page_aligned + PAGE_SIZE, 
e));
+                memset((void *)pmem_page_aligned, -1,
+                       pmem_epage - pmem_page_aligned);
+            }
+            s = (pmem_epage + PAGE_SIZE - 1) & PAGE_MASK;
+        }
+    }
+#endif
+ }
+
 void __init init_frametable(void)
 {
     unsigned int sidx, eidx, nidx;
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 32bb02e3a5..d7c41e2e5d 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1358,6 +1358,10 @@ void __init noreturn __start_xen(unsigned long mbi_p)
     BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
     BUILD_BUG_ON(MACH2PHYS_VIRT_END   != RO_MPT_VIRT_END);
 
+#ifdef CONFIG_NVDIMM_PMEM
+    acpi_nfit_boot_init();
+#endif
+
     init_frametable();
 
     if ( !acpi_boot_table_init_done )
diff --git a/xen/drivers/acpi/Makefile b/xen/drivers/acpi/Makefile
index 444b11d583..c8bb869cb8 100644
--- a/xen/drivers/acpi/Makefile
+++ b/xen/drivers/acpi/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_HAS_CPUFREQ) += pmstat.o
 
 obj-$(CONFIG_X86) += hwregs.o
 obj-$(CONFIG_X86) += reboot.o
+
+obj-$(CONFIG_NVDIMM_PMEM) += nfit.o
diff --git a/xen/drivers/acpi/nfit.c b/xen/drivers/acpi/nfit.c
new file mode 100644
index 0000000000..e099378ee0
--- /dev/null
+++ b/xen/drivers/acpi/nfit.c
@@ -0,0 +1,116 @@
+/*
+ * xen/drivers/acpi/nfit.c
+ *
+ * Copyright (C) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/acpi.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+
+/*
+ * GUID of a byte addressable persistent memory region
+ * (ref. ACPI 6.2, Section 5.2.25.2)
+ */
+static const uint8_t nfit_spa_pmem_guid[] =
+{
+    0x79, 0xd3, 0xf0, 0x66, 0xf3, 0xb4, 0x74, 0x40,
+    0xac, 0x43, 0x0d, 0x33, 0x18, 0xb7, 0x8c, 0xdb,
+};
+
+struct acpi_nfit_desc {
+    struct acpi_table_nfit *acpi_table;
+};
+
+static struct acpi_nfit_desc nfit_desc;
+
+void __init acpi_nfit_boot_init(void)
+{
+    acpi_status status;
+    acpi_physical_address nfit_addr;
+    acpi_native_uint nfit_len;
+
+    status = acpi_get_table_phys(ACPI_SIG_NFIT, 0, &nfit_addr, &nfit_len);
+    if ( ACPI_FAILURE(status) )
+        return;
+
+    nfit_desc.acpi_table = (struct acpi_table_nfit *)__va(nfit_addr);
+    map_pages_to_xen((unsigned long)nfit_desc.acpi_table, PFN_DOWN(nfit_addr),
+                     PFN_UP(nfit_addr + nfit_len) - PFN_DOWN(nfit_addr),
+                     PAGE_HYPERVISOR);
+}
+
+/**
+ * Search pmem regions overlapped with the specified address range.
+ *
+ * Parameters:
+ *  @smfn, @emfn: the start and end MFN of address range to search
+ *  @ret_smfn, @ret_emfn: return the address range of the first pmem region
+ *                        in above range
+ *
+ * Return:
+ *  Return true if a pmem region is overlapped with @smfn - @emfn. The
+ *  start and end MFN of the lowest pmem region are returned via
+ *  @ret_smfn and @ret_emfn respectively.
+ *
+ *  Return false if no pmem region is overlapped with @smfn - @emfn.
+ */
+bool __init acpi_nfit_boot_search_pmem(unsigned long smfn, unsigned long emfn,
+                                       unsigned long *ret_smfn,
+                                       unsigned long *ret_emfn)
+{
+    struct acpi_table_nfit *nfit_table = nfit_desc.acpi_table;
+    uint32_t hdr_offset = sizeof(*nfit_table);
+    unsigned long saddr = pfn_to_paddr(smfn), eaddr = pfn_to_paddr(emfn);
+    unsigned long ret_saddr = 0, ret_eaddr = 0;
+
+    if ( !nfit_table )
+        return false;
+
+    while ( hdr_offset < nfit_table->header.length )
+    {
+        struct acpi_nfit_header *hdr = (void *)nfit_table + hdr_offset;
+        struct acpi_nfit_system_address *spa;
+        unsigned long pmem_saddr, pmem_eaddr;
+
+        hdr_offset += hdr->length;
+
+        if ( hdr->type != ACPI_NFIT_TYPE_SYSTEM_ADDRESS )
+            continue;
+
+        spa = (struct acpi_nfit_system_address *)hdr;
+        if ( memcmp(spa->range_guid, nfit_spa_pmem_guid, 16) )
+            continue;
+
+        pmem_saddr = spa->address;
+        pmem_eaddr = pmem_saddr + spa->length;
+        if ( pmem_saddr >= eaddr || pmem_eaddr <= saddr )
+            continue;
+
+        if ( ret_saddr < pmem_saddr )
+            continue;
+        ret_saddr = pmem_saddr;
+        ret_eaddr = pmem_eaddr;
+    }
+
+    if ( ret_saddr == ret_eaddr )
+        return false;
+
+    *ret_smfn = paddr_to_pfn(ret_saddr);
+    *ret_emfn = paddr_to_pfn(ret_eaddr);
+
+    return true;
+}
diff --git a/xen/include/acpi/actbl1.h b/xen/include/acpi/actbl1.h
index e1991362dc..94d8d7775c 100644
--- a/xen/include/acpi/actbl1.h
+++ b/xen/include/acpi/actbl1.h
@@ -71,6 +71,7 @@
 #define ACPI_SIG_SBST           "SBST" /* Smart Battery Specification Table */
 #define ACPI_SIG_SLIT           "SLIT" /* System Locality Distance Information 
Table */
 #define ACPI_SIG_SRAT           "SRAT" /* System Resource Affinity Table */
+#define ACPI_SIG_NFIT           "NFIT" /* NVDIMM Firmware Interface Table */
 
 /*
  * All tables must be byte-packed to match the ACPI specification, since
@@ -903,6 +904,48 @@ struct acpi_msct_proximity {
        u64 memory_capacity;    /* In bytes */
 };
 
+/*******************************************************************************
+ *
+ * NFIT - NVDIMM Interface Table (ACPI 6.0+)
+ *               Version 1
+ *
+ 
******************************************************************************/
+
+struct acpi_table_nfit {
+       struct acpi_table_header header;        /* Common ACPI table header */
+       u32 reserved;                                           /* Reserved, 
must be zero */
+};
+
+/* Subtable header for NFIT */
+
+struct acpi_nfit_header {
+       u16 type;
+       u16 length;
+};
+
+/* Values for subtable type in struct acpi_nfit_header */
+enum acpi_nfit_type {
+       ACPI_NFIT_TYPE_SYSTEM_ADDRESS = 0,
+       ACPI_NFIT_TYPE_MEMORY_MAP = 1,
+};
+
+/*
+ * NFIT Subtables
+ */
+
+/* 0: System Physical Address Range Structure */
+struct acpi_nfit_system_address {
+       struct acpi_nfit_header header;
+       u16 range_index;
+       u16 flags;
+       u32 reserved;           /* Reseved, must be zero */
+       u32 proximity_domain;
+       u8      range_guid[16];
+       u64 address;
+       u64 length;
+       u64 memory_mapping;
+};
+
 
/*******************************************************************************
  *
  * SBST - Smart Battery Specification Table
diff --git a/xen/include/xen/acpi.h b/xen/include/xen/acpi.h
index 9409350f05..1bd8f9f4e4 100644
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -180,4 +180,11 @@ void acpi_reboot(void);
 void acpi_dmar_zap(void);
 void acpi_dmar_reinstate(void);
 
+#ifdef CONFIG_NVDIMM_PMEM
+void acpi_nfit_boot_init(void);
+bool acpi_nfit_boot_search_pmem(unsigned long smfn, unsigned long emfn,
+                                unsigned long *ret_smfn,
+                                unsigned long *ret_emfn);
+#endif /* CONFIG_NVDIMM_PMEM */
+
 #endif /*_LINUX_ACPI_H*/
-- 
2.15.1


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.