WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 2/2] hvm: NUMA guest: inject NUMA topology into the guest (resend)

To: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 2/2] hvm: NUMA guest: inject NUMA topology into the guest (resend)
From: Andre Przywara <andre.przywara@xxxxxxx>
Date: Fri, 11 Jul 2008 16:13:27 +0200
Cc:
Delivery-date: Fri, 11 Jul 2008 07:14:53 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5.0.10 (X11/20070409)
This patch extends the hvm_info_table to store the number of guest nodes
and will create a suitable ACPI SRAT table to describe the used guest
NUMA topology.

Rediffed to apply against staging 18036.

Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
diff -r f70a956b987f tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h   Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h   Fri Jul 11 16:02:39 2008 +0200
@@ -356,6 +356,61 @@
 };
 
 /*
+ * System Resource Affinity Table header definition (SRAT).
+ */
+struct acpi_20_srat {
+    struct acpi_header header;
+    uint32_t table_revision;
+    uint32_t reserved2[2];
+};
+
+#define ACPI_SRAT_TABLE_REVISION 1
+
+/*
+ * System Resource Affinity Table structure types.
+ */
+#define ACPI_PROCESSOR_AFFIN           0x00
+#define ACPI_MEMORY_AFFIN              0x01
+
+struct acpi_20_srat_processor {
+    uint8_t type;
+    uint8_t length;
+    uint8_t domain;
+    uint8_t apic_id;
+    uint32_t flags;
+    uint8_t sapic_id;
+    uint8_t domain_hi[3];
+    uint32_t reserved;
+};
+
+/*
+ * Local APIC Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0)
+
+struct acpi_20_srat_memory {
+    uint8_t type;
+    uint8_t length;
+    uint8_t domain;
+    uint8_t domain_hi[3];      /* this is ACPI 3.0, reserved in 2.0 */
+    uint16_t reserved;
+    uint32_t base_address_lo;
+    uint32_t base_address_hi;
+    uint32_t length_lo;
+    uint32_t length_hi;
+    uint32_t reserved2;
+    uint32_t flags;
+    uint32_t reserved3[2];
+};
+
+/*
+ * Memory Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_MEM_AFFIN_ENABLED (1 << 0)
+#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1)
+#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2)  /* this is ACPI 3.0 */
+
+/*
  * Table Signatures.
  */
 #define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -366,6 +421,7 @@
 #define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
 #define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
 #define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
 
 /*
  * Table revision numbers.
@@ -378,6 +434,7 @@
 #define ACPI_2_0_TCPA_REVISION 0x02
 #define ACPI_2_0_HPET_REVISION 0x01
 #define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_2_0_SRAT_REVISION 0x01
 
 #pragma pack ()
 
diff -r f70a956b987f tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c     Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/build.c     Fri Jul 11 16:02:39 2008 +0200
@@ -20,6 +20,9 @@
 #include "ssdt_tpm.h"
 #include "../config.h"
 #include "../util.h"
+#include "../e820.h"
+
+#define ONEMB 0x100000
 
 #define align16(sz)        (((sz) + 15) & ~15)
 #define fixed_strcpy(d, s) strncpy((d), (s), sizeof(d))
@@ -45,6 +48,140 @@
 
     p = table;
     p[checksum_offset] = -sum;
+}
+
+static int vcpu_to_numa_node (int vcpu_id, int nr_vcpus)
+{
+int div,mod;
+
+    div=nr_vcpus / get_numanodes();
+    mod=nr_vcpus % get_numanodes();
+
+    if ( vcpu_id < mod * (div + 1)) return vcpu_id / (div + 1);
+    return ( ( vcpu_id - (mod * (div + 1)) ) / div ) + mod;
+}
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+static uint64_t guessmemsize (void)
+{
+    uint64_t ret = 0;
+    struct e820entry *map = HVM_E820;
+    int i;
+
+    for ( i = 0; i < *HVM_E820_NR ; i++)
+    {
+        if (map[i].addr == ONEMB )
+            ret+=map[i].size + PAGE_SIZE * 3 + ONEMB;
+        if (map[i].addr == (1ULL << 32))
+            ret+=map[i].size;
+    }
+    return ret;
+}
+
+int construct_srat(struct acpi_20_srat *srat)
+{
+    struct acpi_20_srat_processor *processor;
+    struct acpi_20_srat_memory    *memory;
+    struct e820entry *map = HVM_E820;
+    int i, offset = 0;
+    uint64_t hvm_node_mem;
+
+    memset(srat, 0, sizeof(*srat));
+    srat->header.signature    = ACPI_2_0_SRAT_SIGNATURE;
+    srat->header.revision     = ACPI_2_0_SRAT_REVISION;
+    fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    srat->header.oem_revision = ACPI_OEM_REVISION;
+    srat->header.creator_id   = ACPI_CREATOR_ID;
+    srat->header.creator_revision = ACPI_CREATOR_REVISION;
+    srat->table_revision      = ACPI_SRAT_TABLE_REVISION;
+    offset += sizeof(*srat);
+
+    processor = (struct acpi_20_srat_processor *)(srat + 1);
+    for ( i = 0; i < get_vcpu_nr(); i++ )
+    {
+        memset(processor, 0, sizeof(*processor));
+        processor->type    = ACPI_PROCESSOR_AFFIN;
+        processor->length  = sizeof(*processor);
+        processor->domain  = vcpu_to_numa_node (i, get_vcpu_nr());
+        processor->apic_id = LAPIC_ID(i);
+        processor->flags   = ACPI_LOCAL_APIC_AFFIN_ENABLED;
+        processor->sapic_id= 0;
+        offset += sizeof(*processor);
+        processor++;
+    }
+
+ /*
+  * Equally distribute the memory on all NUMA nodes. Round up the size
+  * of available memory to whole megabytes, as (at least) Linux cannot cope
+  * with uneven NUMA node boundaries. The remaining part of memory will be
+  * assigned to the last NUMA node. The mapping of the first MB is copied
+  * from the E820 map and assigned to node 0
+  */
+    hvm_node_mem = guessmemsize()+ONEMB-1;
+    hvm_node_mem = hvm_node_mem >> 20;
+ /* 64bit/32bit does not work because of missing libgcc */
+    hvm_node_mem = (uint32_t)hvm_node_mem / get_numanodes();
+    hvm_node_mem = hvm_node_mem << 20;
+
+    memory = (struct acpi_20_srat_memory *)(processor);
+    for ( i = 0; i < *HVM_E820_NR; i++ )
+    {
+        if ( map[i].type != E820_RAM ) continue;
+        if ( map[i].addr >= ONEMB ) break;
+
+        memset(memory, 0, sizeof(*memory));
+        memory->type        = ACPI_MEMORY_AFFIN;
+        memory->length      = sizeof(*memory);
+        memory->domain      = 0;
+        memory->base_address_lo = map[i].addr & 0xFFFFFFFFL;
+        memory->base_address_hi = map[i].addr >> 32;
+        memory->length_lo   = map[i].size & 0xFFFFFFFFL;
+        memory->length_hi   = map[i].size >> 32;
+        memory->flags       = ACPI_MEM_AFFIN_ENABLED;
+
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    for ( i = 0; i < get_numanodes(); i++ )
+    {
+        memset(memory, 0, sizeof(*memory));
+        memory->type        = ACPI_MEMORY_AFFIN;
+        memory->length      = sizeof(*memory);
+        memory->domain      = i;
+        if ( i == 0 )
+        {
+            memory->base_address_lo = ONEMB;
+            memory->base_address_hi = 0;
+            memory->length_lo   = ( hvm_node_mem  - ONEMB ) & 0xFFFFFFFFL;
+            memory->length_hi   = ( hvm_node_mem  - ONEMB ) >> 32;
+        } else
+        if ( i == get_numanodes()-1 )
+        {
+            memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL;
+            memory->base_address_hi = (i * hvm_node_mem) >> 32;
+            memory->length_lo   = (guessmemsize()-hvm_node_mem*i) & 0xFFFFFFFFL;
+            memory->length_hi   = (guessmemsize()-hvm_node_mem*i) >> 32;
+        } else
+        {
+            memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL;
+            memory->base_address_hi = (i * hvm_node_mem) >> 32;
+            memory->length_lo   = hvm_node_mem & 0xFFFFFFFFL;
+            memory->length_hi   = hvm_node_mem >> 32;
+        }
+        memory->flags       = ACPI_MEM_AFFIN_ENABLED;
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    srat->header.length = offset;
+    set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+    return align16(offset);
 }
 
 static int uart_exists(uint16_t uart_base)
@@ -188,6 +325,7 @@
 static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
 {
     int offset = 0, nr_tables = 0;
+    struct acpi_20_srat *srat;
     struct acpi_20_madt *madt;
     struct acpi_20_hpet *hpet;
     struct acpi_20_tcpa *tcpa;
@@ -200,6 +338,14 @@
         madt = (struct acpi_20_madt *)&buf[offset];
         offset += construct_madt(madt);
         table_ptrs[nr_tables++] = (unsigned long)madt;
+    }
+
+    /* SRAT. */
+    if ( get_numanodes() > 0 )
+    {
+        srat = (struct acpi_20_srat *)&buf[offset];
+        offset += construct_srat(srat);
+        table_ptrs[nr_tables++] = (unsigned long)srat;
     }
 
     /* HPET. */
diff -r f70a956b987f tools/firmware/hvmloader/util.c
--- a/tools/firmware/hvmloader/util.c   Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.c   Fri Jul 11 16:02:39 2008 +0200
@@ -594,6 +594,12 @@
     return (t ? t->nr_vcpus : 1);
 }
 
+int get_numanodes(void)
+{
+    struct hvm_info_table *t = get_hvm_info_table();
+    return (t ? t->numanodes : 1);
+}
+
 int get_acpi_enabled(void)
 {
     struct hvm_info_table *t = get_hvm_info_table();
diff -r f70a956b987f tools/firmware/hvmloader/util.h
--- a/tools/firmware/hvmloader/util.h   Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.h   Fri Jul 11 16:02:39 2008 +0200
@@ -104,6 +104,7 @@
 
 /* HVM-builder info. */
 int get_vcpu_nr(void);
+int get_numanodes(void);
 int get_acpi_enabled(void);
 int get_apic_mode(void);
 
diff -r f70a956b987f tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 16:02:39 2008 +0200
@@ -861,6 +861,18 @@
 
 #endif /* __i386__ || __x86_64__ */
 
+static unsigned hweight_long (unsigned long value)
+{
+int ret=0;
+
+    while (value>0)
+    {
+        if (value&1) ++ret;
+        value>>=1;
+    }
+    return ret;
+}
+
 static PyObject *pyxc_hvm_build(XcObject *self,
                                 PyObject *args,
                                 PyObject *kwds)
@@ -900,6 +912,7 @@
     va_hvm->acpi_enabled = acpi;
     va_hvm->apic_mode    = apic;
     va_hvm->nr_vcpus     = vcpus;
+    va_hvm->numanodes    = hweight_long(nodemask);
     for ( i = 0, sum = 0; i < va_hvm->length; i++ )
         sum += ((uint8_t *)va_hvm)[i];
     va_hvm->checksum = -sum;
diff -r f70a956b987f xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h   Fri Jul 11 16:02:11 2008 +0200
+++ b/xen/include/public/hvm/hvm_info_table.h   Fri Jul 11 16:02:39 2008 +0200
@@ -36,6 +36,7 @@
     uint8_t     acpi_enabled;
     uint8_t     apic_mode;
     uint32_t    nr_vcpus;
+    uint32_t    numanodes;
 };
 
 #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 2/2] hvm: NUMA guest: inject NUMA topology into the guest (resend), Andre Przywara <=