vNUMA: Build ACPI NUMA tables for HVMs

diff --git a/tools/firmware/hvmloader/acpi/acpi2_0.h b/tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h
@@ -283,6 +283,66 @@ struct acpi_20_madt {
     uint32_t flags;
 };
 
+/*
+ * System Resource Affinity Table header definition (SRAT) (Version 3.0)
+ * X2APIC_CPU_AFFINITY is defined in version 4.0
+ */
+struct acpi_30_srat {
+    struct acpi_header header;  /* Common ACPI table header */
+    uint32_t table_revision;    /* Must be value '1' */
+    uint32_t reserved[2];       /* Reserved, must be zero */
+};
+#define ACPI_30_SRAT_TABLE_REVISION 0x1
+
+/* Values for type (in SRAT subtables) */
+enum acpi_30_srat_type {
+    ACPI_30_SRAT_TYPE_CPU_AFFINITY = 0,
+    ACPI_30_SRAT_TYPE_MEMORY_AFFINITY = 1,
+    ACPI_30_SRAT_TYPE_RESERVED = 2      /* 2 and greater are reserved */
+};
+
+/* type(0): Processor Local APIC/SAPIC Affinity */
+struct acpi_30_srat_cpu_affinity {
+    uint8_t  type;
+    uint8_t  length;
+    uint8_t  proximity_domain_lo;
+    uint8_t  apic_id;
+    uint32_t flags;
+    uint8_t  local_sapic_eid;
+    uint8_t  proximity_domain_hi[3];
+    uint32_t reserved;              /* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_CPU_USE_AFFINITY  (1)     /* 00: Use affinity structure */
+
+/* type(1): Memory Affinity */
+struct acpi_30_srat_mem_affinity {
+    uint8_t  type;
+    uint8_t  length;
+    uint32_t proximity_domain;
+    uint16_t reserved;              /* Reserved, must be zero */
+    uint64_t base_address;
+    uint64_t size;
+    uint32_t reserved1;
+    uint32_t flags;
+    uint64_t reserved2;             /* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_MEM_ENABLED       (1)     /* 00: Memory region is enabled */
+#define ACPI_30_SRAT_MEM_HOT_PLUGGABLE (1<<1)  /* 01: Mem is hot pluggable */
+#define ACPI_30_SRAT_MEM_NON_VOLATILE  (1<<2)  /* 02: Mem is non-volatile */
+
+/*
+ * System Locality Information Table header definition (SLIT) (Version 1.0)
+ */
+struct acpi_10_slit {
+    struct acpi_header header;
+    uint64_t locality_count;
+    uint8_t  entry[1];
+};
 
 /*
  * HPET Description Table
@@ -367,6 +427,8 @@ struct acpi_20_madt_intsrcovr {
 #define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
 #define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
 #define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_3_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
+#define ACPI_1_0_SLIT_SIGNATURE ASCII32('S','L','I','T')
 
 /*
  * Table revision numbers.
@@ -379,6 +441,8 @@ struct acpi_20_madt_intsrcovr {
 #define ACPI_2_0_TCPA_REVISION 0x02
 #define ACPI_2_0_HPET_REVISION 0x01
 #define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_3_0_SRAT_REVISION 0x01
+#define ACPI_1_0_SLIT_REVISION 0x01
 
 #pragma pack ()
 
diff --git a/tools/firmware/hvmloader/acpi/build.c b/tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
@@ -149,6 +149,114 @@ static int construct_madt(struct acpi_20
     return align16(offset);
 }
 
+static int
+construct_srat_cpu_affinity(struct acpi_30_srat_cpu_affinity *cpu_srat)
+{
+    struct acpi_30_srat_cpu_affinity *cpu_srat_iter;
+    int vnode, vcpu;
+    struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+    uint8_t *numa_vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(numa_info);
+
+    for ( vnode = 0, cpu_srat_iter = cpu_srat;
+          vnode < numa_info->nr_vnodes; vnode++ )
+    {
+        for ( vcpu = 0; vcpu < numa_info->nr_vcpus; vcpu++ )
+        {
+            if ( numa_vcpu_to_vnode[vcpu] == vnode )
+            {
+                memset(cpu_srat_iter, 0, sizeof(*cpu_srat_iter));
+                cpu_srat_iter->type = ACPI_30_SRAT_TYPE_CPU_AFFINITY;
+                cpu_srat_iter->length = sizeof(*cpu_srat);
+                cpu_srat_iter->proximity_domain_lo = vnode;
+                cpu_srat_iter->apic_id = LAPIC_ID(vcpu);
+                cpu_srat_iter->flags = ACPI_30_SRAT_CPU_USE_AFFINITY;
+                cpu_srat_iter++;
+            }
+        }
+    }
+    /* Return the length of the sub-table. */
+    return ((uint8_t *)cpu_srat_iter - (uint8_t *)cpu_srat);
+}
+
+static int
+construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
+{
+    int vnode;
+    struct acpi_30_srat_mem_affinity *mem_srat_iter = mem_srat;
+    struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+    struct xen_vnode_info *numa_vnode_info = NUMA_INFO_VNODE_INFO(numa_info);
+
+    for ( vnode = 0; vnode < numa_info->nr_vnodes; vnode++ )
+    {
+        struct xen_vnode_info *vnode_info = &numa_vnode_info[vnode];
+        memset(mem_srat_iter, 0, sizeof(*mem_srat_iter));
+        mem_srat_iter->type = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
+        mem_srat_iter->length = sizeof(*mem_srat_iter);
+        mem_srat_iter->proximity_domain = vnode;
+        mem_srat_iter->base_address = (uint64_t)vnode_info->start << PAGE_SHIFT;
+        mem_srat_iter->size =
+            (uint64_t)(vnode_info->end - vnode_info->start) << PAGE_SHIFT;
+        mem_srat_iter->flags = ACPI_30_SRAT_MEM_ENABLED;
+        mem_srat_iter++;
+    }
+    /* Return the length of the sub-table. */
+    return ((uint8_t *)mem_srat_iter - (uint8_t *)mem_srat);
+}
+
+static int construct_srat(struct acpi_30_srat *srat)
+{
+    int offset;
+
+    memset(srat, 0, sizeof(*srat));
+    srat->header.signature = ACPI_3_0_SRAT_SIGNATURE;
+    srat->header.revision  = ACPI_3_0_SRAT_REVISION;
+    fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    srat->header.oem_revision = ACPI_OEM_REVISION;
+    srat->header.creator_id   = ACPI_CREATOR_ID;
+    srat->header.creator_revision = ACPI_CREATOR_REVISION;
+    srat->table_revision = ACPI_30_SRAT_TABLE_REVISION;
+    offset = sizeof(*srat);
+
+    offset += construct_srat_cpu_affinity((struct acpi_30_srat_cpu_affinity *)
+                                          ((uint8_t *)srat + offset));
+
+    offset += construct_srat_mem_affinity((struct acpi_30_srat_mem_affinity *)
+                                          ((uint8_t *)srat + offset));
+
+    srat->header.length = offset;
+    set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+    return offset;
+}
+
+static int construct_slit(struct acpi_10_slit *slit)
+{
+    int offset, i, nr_vnodes;
+    struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+    uint8_t *numa_vnode_distance = NUMA_INFO_VNODE_DISTANCE(numa_info);
+
+    memset(slit, 0, sizeof(*slit));
+    slit->header.signature = ACPI_1_0_SLIT_SIGNATURE;
+    slit->header.revision  = ACPI_1_0_SLIT_REVISION;
+    fixed_strcpy(slit->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(slit->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    slit->header.oem_revision = ACPI_OEM_REVISION;
+    slit->header.creator_id   = ACPI_CREATOR_ID;
+    slit->header.creator_revision = ACPI_CREATOR_REVISION;
+    slit->locality_count = numa_info->nr_vnodes;
+
+    nr_vnodes = numa_info->nr_vnodes;
+    for ( i = 0; i < (nr_vnodes * nr_vnodes); i++ )
+        slit->entry[i] = numa_vnode_distance[i];
+
+    offset = sizeof(*slit) + (nr_vnodes * nr_vnodes) - 1;
+    slit->header.length = offset;
+    set_checksum(slit, offsetof(struct acpi_header, checksum), offset);
+
+    return offset;
+}
+
 static int construct_hpet(struct acpi_20_hpet *hpet)
 {
     int offset;
@@ -177,6 +285,8 @@ static int construct_secondary_tables(ui
     struct acpi_20_madt *madt;
     struct acpi_20_hpet *hpet;
     struct acpi_20_tcpa *tcpa;
+    struct acpi_30_srat *srat;
+    struct acpi_10_slit *slit;
     static const uint16_t tis_signature[] = {0x0001, 0x0001, 0x0001};
     uint16_t *tis_hdr;
     void *lasa;
@@ -189,6 +299,18 @@ static int construct_secondary_tables(ui
         table_ptrs[nr_tables++] = (unsigned long)madt;
     }
 
+    /* SRAT/SLIT. */
+    if ( hvm_info->numa_enabled &&
+         hvm_info->numa_info[0].version == XEN_DOM_NUMA_INTERFACE_VERSION )
+    {
+        srat = (struct acpi_30_srat *)&buf[offset];
+        offset += construct_srat(srat);
+        table_ptrs[nr_tables++] = (unsigned long)srat;
+        slit = (struct acpi_10_slit *)&buf[offset];
+        offset += construct_slit(slit);
+        table_ptrs[nr_tables++] = (unsigned long)slit;
+    }
+
     /* HPET. */
     if ( hpet_exists(ACPI_HPET_ADDRESS) )
     {
diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -11,6 +11,7 @@
 #include "xg_private.h"
 #include "xc_private.h"
 #include "xc_dom_numa.h"
+#include "xc_cpumap.h"
 
 #include
 #include
@@ -32,7 +33,62 @@
 #define NR_SPECIAL_PAGES 4
 #define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
 
-static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
+static int build_hvm_numa_info(struct hvm_info_table *hvm_info,
+                               xc_domain_numa_layout_t *dlayout)
+{
+    int i, j;
+    uint64_t vnode_pgstart;
+    struct xen_domain_numa_info *ninfo;
+    struct xen_vnode_info *ninfo_vnode_info;
+    uint8_t *ninfo_vcpu_to_vnode, *ninfo_vnode_distance;
+
+    ninfo = &hvm_info->numa_info[0];
+    ninfo->version = dlayout->version;
+    ninfo->type = dlayout->type;
+    ninfo->nr_vcpus = dlayout->nr_vcpus;
+    ninfo->nr_vnodes = dlayout->nr_vnodes;
+
+    ninfo_vnode_info = NUMA_INFO_VNODE_INFO(ninfo);
+    ninfo_vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(ninfo);
+    ninfo_vnode_distance = NUMA_INFO_VNODE_DISTANCE(ninfo);
+
+    for ( i = 0; i < ninfo->nr_vcpus; i++ )
+        ninfo_vcpu_to_vnode[i] = XEN_INVALID_NODE;
+
+    for ( i = 0, vnode_pgstart = 0; i < ninfo->nr_vnodes; i++ )
+    {
+        uint64_t vnode_pgend;
+        struct xenctl_cpumap vnode_vcpumap;
+        xc_vnode_data_t *vnode_data = &dlayout->vnode_data[i];
+        xc_cpumask_t *vnode_vcpumask = &vnode_data->vcpu_mask;
+        struct xen_vnode_info *vnode_info = &ninfo_vnode_info[i];
+
+        vnode_info->mnode_id = vnode_data->mnode_id;
+        vnode_pgend = vnode_pgstart + vnode_data->nr_pages;
+        /* Account for the hole in the memory map below 4GB. */
+        if ( (vnode_pgstart < hvm_info->low_mem_pgend) &&
+             (vnode_pgend >= hvm_info->low_mem_pgend) )
+            vnode_pgend += ((1ull << 32) - HVM_BELOW_4G_RAM_END) >> PAGE_SHIFT;
+
+        vnode_info->start = vnode_pgstart;
+        vnode_info->end = vnode_pgend;
+        vnode_pgstart = vnode_pgend;
+
+        xc_cpumap_from_cpumask(&vnode_vcpumap, vnode_vcpumask);
+        xc_for_each_cpu(j, vnode_vcpumap)
+            ninfo_vcpu_to_vnode[j] = i;
+    }
+
+    for ( i = 0; i < ninfo->nr_vnodes; i++ )
+        for ( j = 0; j < ninfo->nr_vnodes; j++ )
+            ninfo_vnode_distance[(i * ninfo->nr_vnodes) + j] =
+                dlayout->vnode_distance[(i * ninfo->nr_vnodes) + j];
+
+    return NUMA_INFO_SIZE(ninfo);
+}
+
+static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
+                           xc_domain_numa_layout_t *dom_layout)
 {
     struct hvm_info_table *hvm_info = (struct hvm_info_table *)
         (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
@@ -62,6 +118,12 @@ static void build_hvm_inf
     hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
     hvm_info->reserved_mem_pgstart = special_pfn(0);
 
+    if ( dom_layout && (dom_layout->type == XEN_DOM_NUMA_SPLIT) )
+    {
+        hvm_info->numa_enabled = 1;
+        hvm_info->length += build_hvm_numa_info(hvm_info, dom_layout);
+    }
+
     /* Finish with the checksum. */
     for ( i = 0, sum = 0; i < hvm_info->length; i++ )
         sum += ((uint8_t *)hvm_info)[i];
@@ -408,8 +470,8 @@ out:
     return rc;
 }
 
-static int
-setup_guest_special_pages(xc_interface *xch, uint32_t dom, uint64_t memsize)
+static int setup_guest_special_pages(xc_interface *xch, uint32_t dom,
+                    uint64_t memsize, xc_domain_numa_layout_t *dom_layout)
 {
     void *hvm_info_page;
     uint32_t *ident_pt;
@@ -419,7 +481,7 @@ setup_guest_special_pages(xc_interface *
                     xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
                     HVM_INFO_PFN)) == NULL )
         goto error_out;
-    build_hvm_info(hvm_info_page, memsize);
+    build_hvm_info(hvm_info_page, memsize, dom_layout);
     munmap(hvm_info_page, PAGE_SIZE);
 
     /* Allocate and clear special pages. */
@@ -509,7 +571,7 @@ static int setup_guest(xc_interface *xch
     if ( rc < 0 )
         goto error_out;
 
-    rc = setup_guest_special_pages(xch, dom, v_end);
+    rc = setup_guest_special_pages(xch, dom, v_end, dom_layout);
     if ( rc < 0 )
         goto error_out;
 
diff --git a/xen/include/public/hvm/hvm_info_table.h b/xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h
+++ b/xen/include/public/hvm/hvm_info_table.h
@@ -25,12 +25,14 @@
 #ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
 #define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
 
+#include "../dom_numa.h"
+
 #define HVM_INFO_PFN         0x09F
 #define HVM_INFO_OFFSET      0x800
 #define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
 
 /* Maximum we can support with current vLAPIC ID mapping. */
-#define HVM_MAX_VCPUS        128
+#define HVM_MAX_VCPUS        XEN_MAX_VCPUS
 
 struct hvm_info_table {
     char        signature[8]; /* "HVM INFO" */
@@ -70,6 +72,12 @@ struct hvm_info_table {
 
     /* Bitmap of which CPUs are online at boot time. */
     uint8_t     vcpu_online[(HVM_MAX_VCPUS + 7)/8];
+
+    /* Domain NUMA memory distribution. The size of this structure should be
+     * obtained using the macro XEN_DOMAIN_NUMA_INFO_SIZE(numa_info).
+     */
+    uint8_t     numa_enabled; /* numa_info is populated only if numa_enabled != 0 */
+    struct xen_domain_numa_info numa_info[0];
 };
 
 #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
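
--
Note (not part of the patch): the table-length arithmetic used by construct_srat() and
construct_slit() above can be sanity-checked standalone. The sketch below copies the
packed struct layouts introduced in the acpi2_0.h hunk; the nr_vcpus/nr_vnodes values
are made-up example numbers, not anything taken from the patch.

/* Standalone sanity check of the SRAT/SLIT length arithmetic. */
#include <stdint.h>
#include <stdio.h>

#pragma pack(1)
struct acpi_header {            /* Common ACPI table header (36 bytes) */
    uint32_t signature;
    uint32_t length;
    uint8_t  revision;
    uint8_t  checksum;
    char     oem_id[6];
    char     oem_table_id[8];
    uint32_t oem_revision;
    uint32_t creator_id;
    uint32_t creator_revision;
};
struct acpi_30_srat {
    struct acpi_header header;
    uint32_t table_revision;
    uint32_t reserved[2];
};
struct acpi_30_srat_cpu_affinity {
    uint8_t  type;
    uint8_t  length;
    uint8_t  proximity_domain_lo;
    uint8_t  apic_id;
    uint32_t flags;
    uint8_t  local_sapic_eid;
    uint8_t  proximity_domain_hi[3];
    uint32_t reserved;
};
struct acpi_30_srat_mem_affinity {
    uint8_t  type;
    uint8_t  length;
    uint32_t proximity_domain;
    uint16_t reserved;
    uint64_t base_address;
    uint64_t size;
    uint32_t reserved1;
    uint32_t flags;
    uint64_t reserved2;
};
struct acpi_10_slit {
    struct acpi_header header;
    uint64_t locality_count;
    uint8_t  entry[1];
};
#pragma pack()

int main(void)
{
    unsigned int nr_vnodes = 2, nr_vcpus = 4;   /* illustrative guest only */

    /* construct_srat(): SRAT header + one CPU affinity entry per vcpu +
     * one memory affinity entry per vnode. */
    unsigned long srat_len = sizeof(struct acpi_30_srat) +
        nr_vcpus * sizeof(struct acpi_30_srat_cpu_affinity) +
        nr_vnodes * sizeof(struct acpi_30_srat_mem_affinity);

    /* construct_slit(): entry[1] is already counted inside the struct,
     * hence the trailing -1. */
    unsigned long slit_len =
        sizeof(struct acpi_10_slit) + (nr_vnodes * nr_vnodes) - 1;

    printf("SRAT length = %lu bytes (48 + 4*16 + 2*40 = 192)\n", srat_len);
    printf("SLIT length = %lu bytes (44 + 2*2 = 48)\n", slit_len);
    return 0;
}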