This patch extends the hvm_info_table to store the number of guest NUMA
nodes and makes hvmloader build a matching ACPI SRAT table that describes
the guest's NUMA topology.
Rediffed to apply against staging 18036.
Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>
--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
diff -r f70a956b987f tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h Fri Jul 11 16:02:39 2008 +0200
@@ -356,6 +356,61 @@
};
/*
+ * System Resource Affinity Table header definition (SRAT).
+ */
+struct acpi_20_srat { /* fixed part of the SRAT; construct_srat() writes the affinity entries directly behind it */
+ struct acpi_header header; /* common ACPI table header (signature, length, checksum, ...) */
+ uint32_t table_revision; /* construct_srat() stores ACPI_SRAT_TABLE_REVISION here */
+ uint32_t reserved2[2];
+};
+
+#define ACPI_SRAT_TABLE_REVISION 1
+
+/*
+ * System Resource Affinity Table structure types.
+ */
+#define ACPI_PROCESSOR_AFFIN 0x00
+#define ACPI_MEMORY_AFFIN 0x01
+
+struct acpi_20_srat_processor { /* SRAT entry tying one local APIC to a NUMA node */
+ uint8_t type; /* ACPI_PROCESSOR_AFFIN */
+ uint8_t length; /* size of this entry in bytes */
+ uint8_t domain; /* NUMA node number of the processor */
+ uint8_t apic_id; /* local APIC ID of the processor */
+ uint32_t flags; /* ACPI_LOCAL_APIC_AFFIN_* */
+ uint8_t sapic_id; /* written as 0 by construct_srat() */
+ uint8_t domain_hi[3]; /* presumably the upper domain bits, as in the memory entry -- TODO confirm */
+ uint32_t reserved;
+};
+
+/*
+ * Local APIC Affinity Flags. All other bits are reserved and must be 0.
+ */
+#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0)
+
+struct acpi_20_srat_memory { /* SRAT entry assigning one memory range to a NUMA node */
+ uint8_t type; /* ACPI_MEMORY_AFFIN */
+ uint8_t length; /* size of this entry in bytes */
+ uint8_t domain; /* NUMA node number owning the range */
+ uint8_t domain_hi[3]; /* this is ACPI 3.0, reserved in 2.0 */
+ uint16_t reserved;
+ uint32_t base_address_lo; /* bits 0-31 of the range start address */
+ uint32_t base_address_hi; /* bits 32-63 of the range start address */
+ uint32_t length_lo; /* bits 0-31 of the range size */
+ uint32_t length_hi; /* bits 32-63 of the range size */
+ uint32_t reserved2;
+ uint32_t flags; /* ACPI_MEM_AFFIN_* */
+ uint32_t reserved3[2];
+};
+
+/*
+ * Memory Affinity Flags. All other bits are reserved and must be 0.
+ */
+#define ACPI_MEM_AFFIN_ENABLED (1 << 0)
+#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1)
+#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2) /* this is ACPI 3.0 */
+
+/*
* Table Signatures.
*/
#define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -366,6 +421,7 @@
#define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
#define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
#define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
/*
* Table revision numbers.
@@ -378,6 +434,7 @@
#define ACPI_2_0_TCPA_REVISION 0x02
#define ACPI_2_0_HPET_REVISION 0x01
#define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_2_0_SRAT_REVISION 0x01
#pragma pack ()
diff -r f70a956b987f tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/build.c Fri Jul 11 16:02:39 2008 +0200
@@ -20,6 +20,9 @@
#include "ssdt_tpm.h"
#include "../config.h"
#include "../util.h"
+#include "../e820.h"
+
+#define ONEMB 0x100000
#define align16(sz) (((sz) + 15) & ~15)
#define fixed_strcpy(d, s) strncpy((d), (s), sizeof(d))
@@ -45,6 +48,140 @@
p = table;
p[checksum_offset] = -sum;
+}
+
+/* Map vcpu_id to a NUMA node in contiguous blocks; the first (nr_vcpus % nodes) nodes hold one VCPU more. */
+static int vcpu_to_numa_node (int vcpu_id, int nr_vcpus)
+{
+    int nodes = get_numanodes(), div, mod;
+    if ( nodes <= 0 ) return 0; /* no nodes reported: avoid dividing by zero */
+    div = nr_vcpus / nodes; /* VCPUs on each of the smaller nodes */
+    mod = nr_vcpus % nodes; /* number of nodes holding div + 1 VCPUs */
+    if ( vcpu_id < mod * (div + 1) ) return vcpu_id / (div + 1);
+    return ( ( vcpu_id - (mod * (div + 1)) ) / div ) + mod;
+}
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+static uint64_t guessmemsize (void) /* Estimates the guest memory size from the HVM E820 map. */
+{
+ uint64_t ret = 0;
+ struct e820entry *map = HVM_E820;
+ int i;
+/* Only entries starting at exactly 1MB or exactly 4GB contribute; NOTE(review): the entry type is not checked. */
+ for ( i = 0; i < *HVM_E820_NR ; i++)
+ {
+ if (map[i].addr == ONEMB )
+ ret+=map[i].size + PAGE_SIZE * 3 + ONEMB; /* adds the first MB and three pages; NOTE(review): presumably pages held back from this entry -- confirm */
+ if (map[i].addr == (1ULL << 32))
+ ret+=map[i].size;
+ }
+ return ret;
+}
+
+/*
+ * Build the SRAT at srat: one processor affinity entry per VCPU, node 0
+ * entries for the E820 RAM below 1MB, then one memory entry per NUMA node.
+ * Returns the 16-byte aligned table size, or 0 if no NUMA node is reported.
+ */
+int construct_srat(struct acpi_20_srat *srat)
+{
+    struct acpi_20_srat_processor *processor;
+    struct acpi_20_srat_memory *memory;
+    struct e820entry *map = HVM_E820;
+    int i, offset = 0;
+    int nr_vcpus = get_vcpu_nr(), nr_nodes = get_numanodes();
+    uint64_t total_mem = guessmemsize();
+    uint64_t hvm_node_mem, base, len;
+
+    if ( nr_nodes <= 0 ) return 0; /* the per-node split would divide by zero */
+
+    memset(srat, 0, sizeof(*srat));
+    srat->header.signature = ACPI_2_0_SRAT_SIGNATURE;
+    srat->header.revision = ACPI_2_0_SRAT_REVISION;
+    fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    srat->header.oem_revision = ACPI_OEM_REVISION;
+    srat->header.creator_id = ACPI_CREATOR_ID;
+    srat->header.creator_revision = ACPI_CREATOR_REVISION;
+    srat->table_revision = ACPI_SRAT_TABLE_REVISION;
+    offset += sizeof(*srat);
+
+    /* Processor entries start directly behind the fixed SRAT header. */
+    processor = (struct acpi_20_srat_processor *)(srat + 1);
+    for ( i = 0; i < nr_vcpus; i++ )
+    {
+        memset(processor, 0, sizeof(*processor));
+        processor->type = ACPI_PROCESSOR_AFFIN;
+        processor->length = sizeof(*processor);
+        processor->domain = vcpu_to_numa_node(i, nr_vcpus);
+        processor->apic_id = LAPIC_ID(i);
+        processor->flags = ACPI_LOCAL_APIC_AFFIN_ENABLED;
+        offset += sizeof(*processor);
+        processor++;
+    }
+
+    /*
+     * Equally distribute the memory on all NUMA nodes. Round up the size
+     * of available memory to whole megabytes, as (at least) Linux cannot cope
+     * with uneven NUMA node boundaries. The remaining part of memory will be
+     * assigned to the last NUMA node. The mapping of the first MB is copied
+     * from the E820 map and assigned to node 0.
+     */
+    hvm_node_mem = (total_mem + ONEMB - 1) >> 20;
+    /* 64bit/32bit does not work because of missing libgcc */
+    hvm_node_mem = (uint32_t)hvm_node_mem / nr_nodes;
+    hvm_node_mem = hvm_node_mem << 20;
+
+    memory = (struct acpi_20_srat_memory *)(processor);
+    for ( i = 0; i < *HVM_E820_NR; i++ )
+    {
+        if ( map[i].type != E820_RAM ) continue;
+        if ( map[i].addr >= ONEMB ) break;
+        memset(memory, 0, sizeof(*memory));
+        memory->type = ACPI_MEMORY_AFFIN;
+        memory->length = sizeof(*memory);
+        memory->domain = 0;
+        memory->base_address_lo = map[i].addr & 0xFFFFFFFFL;
+        memory->base_address_hi = map[i].addr >> 32;
+        memory->length_lo = map[i].size & 0xFFFFFFFFL;
+        memory->length_hi = map[i].size >> 32;
+        memory->flags = ACPI_MEM_AFFIN_ENABLED;
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    for ( i = 0; i < nr_nodes; i++ )
+    {
+        base = i * hvm_node_mem;
+        len = hvm_node_mem;
+        if ( i == 0 )
+        {
+            /* The first MB is already covered by the E820 entries above. */
+            base = ONEMB;
+            len -= ONEMB;
+        }
+        else if ( i == nr_nodes - 1 )
+            len = total_mem - base; /* last node takes the remainder */
+
+        memset(memory, 0, sizeof(*memory));
+        memory->type = ACPI_MEMORY_AFFIN;
+        memory->length = sizeof(*memory);
+        memory->domain = i;
+        memory->base_address_lo = base & 0xFFFFFFFFL;
+        memory->base_address_hi = base >> 32;
+        memory->length_lo = len & 0xFFFFFFFFL;
+        memory->length_hi = len >> 32;
+        memory->flags = ACPI_MEM_AFFIN_ENABLED;
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    srat->header.length = offset;
+    set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+    return align16(offset);
}
static int uart_exists(uint16_t uart_base)
@@ -188,6 +325,7 @@
static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
{
int offset = 0, nr_tables = 0;
+ struct acpi_20_srat *srat;
struct acpi_20_madt *madt;
struct acpi_20_hpet *hpet;
struct acpi_20_tcpa *tcpa;
@@ -200,6 +338,14 @@
madt = (struct acpi_20_madt *)&buf[offset];
offset += construct_madt(madt);
table_ptrs[nr_tables++] = (unsigned long)madt;
+ }
+
+ /* SRAT. */
+ if ( get_numanodes() > 0 )
+ {
+ srat = (struct acpi_20_srat *)&buf[offset];
+ offset += construct_srat(srat);
+ table_ptrs[nr_tables++] = (unsigned long)srat;
}
/* HPET. */
diff -r f70a956b987f tools/firmware/hvmloader/util.c
--- a/tools/firmware/hvmloader/util.c Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.c Fri Jul 11 16:02:39 2008 +0200
@@ -594,6 +594,12 @@
return (t ? t->nr_vcpus : 1);
}
+int get_numanodes(void) /* Returns the numanodes field of the HVM info table. */
+{
+ struct hvm_info_table *t = get_hvm_info_table();
+ return (t ? t->numanodes : 1); /* no info table available: report a single node */
+}
+
int get_acpi_enabled(void)
{
struct hvm_info_table *t = get_hvm_info_table();
diff -r f70a956b987f tools/firmware/hvmloader/util.h
--- a/tools/firmware/hvmloader/util.h Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.h Fri Jul 11 16:02:39 2008 +0200
@@ -104,6 +104,7 @@
/* HVM-builder info. */
int get_vcpu_nr(void);
+int get_numanodes(void);
int get_acpi_enabled(void);
int get_apic_mode(void);
diff -r f70a956b987f tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 16:02:39 2008 +0200
@@ -861,6 +861,18 @@
#endif /* __i386__ || __x86_64__ */
+static unsigned hweight_long (unsigned long value) /* Returns the number of bits set in value. */
+{
+int ret=0;
+/* Test the lowest bit, then shift it out, until no set bit is left. */
+ while (value>0)
+ {
+ if (value&1) ++ret;
+ value>>=1;
+ }
+ return ret;
+}
+
static PyObject *pyxc_hvm_build(XcObject *self,
PyObject *args,
PyObject *kwds)
@@ -900,6 +912,7 @@
va_hvm->acpi_enabled = acpi;
va_hvm->apic_mode = apic;
va_hvm->nr_vcpus = vcpus;
+ va_hvm->numanodes = hweight_long(nodemask);
for ( i = 0, sum = 0; i < va_hvm->length; i++ )
sum += ((uint8_t *)va_hvm)[i];
va_hvm->checksum = -sum;
diff -r f70a956b987f xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h Fri Jul 11 16:02:11 2008 +0200
+++ b/xen/include/public/hvm/hvm_info_table.h Fri Jul 11 16:02:39 2008 +0200
@@ -36,6 +36,7 @@
uint8_t acpi_enabled;
uint8_t apic_mode;
uint32_t nr_vcpus;
+ uint32_t numanodes;
};
#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|