[Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
Rewrite the mapcache to be purely per-vCPU instead of partly per-vCPU
and partly per-domain.
This patch is needed to address performance issues when we start relying
on the mapcache, e.g., when there is no direct map. Currently, the
per-domain lock on the mapcache is a bottleneck on multi-core systems,
causing performance degradation and even functional regressions. This
patch makes the mapping structure per-vCPU and completely lockless.
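To illustrate the idea outside of Xen, below is a minimal, host-compilable
sketch of the per-vCPU, lockless bookkeeping. The field names mirror the
diff further down, but NOTINUSE, slot_mfn, vcache_map/vcache_unmap and the
main() harness are purely illustrative: the PTE write and the TLB flush are
replaced by a plain array, and the local_irq_save/restore pair is omitted,
so this models the slot/maphash logic only, not the hypervisor code.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MAPHASH_ENTRIES        8
#define MAPCACHE_VCPU_ENTRIES  32
#define MAPHASH_HASHFN(mfn)    ((mfn) & (MAPHASH_ENTRIES - 1))
#define NOTINUSE               (~0u)

struct mapcache_vcpu {
    unsigned long hash_mfn[MAPHASH_ENTRIES];       /* recently used MFNs */
    uint32_t hash_idx[MAPHASH_ENTRIES];            /* slot backing each hash entry */
    uint8_t  refcnt[MAPCACHE_VCPU_ENTRIES];        /* per-slot reference counts */
    unsigned long inuse;                           /* one bit per slot */
    unsigned long slot_mfn[MAPCACHE_VCPU_ENTRIES]; /* stands in for the L1 PTEs */
};

/* Map an MFN: hit the maphash if possible, otherwise claim a free slot. */
static unsigned int vcache_map(struct mapcache_vcpu *vc, unsigned long mfn)
{
    unsigned int h = MAPHASH_HASHFN(mfn), idx;

    if (vc->hash_mfn[h] == mfn) {
        idx = vc->hash_idx[h];                     /* hash hit: reuse the slot */
    } else {
        for (idx = 0; idx < MAPCACHE_VCPU_ENTRIES; idx++)
            if (!(vc->inuse & (1UL << idx)))
                break;
        assert(idx < MAPCACHE_VCPU_ENTRIES);
        vc->inuse |= 1UL << idx;
        vc->slot_mfn[idx] = mfn;                   /* the real code writes the PTE here */
        /* Evict the old hash occupant if nothing still references it. */
        if (vc->hash_idx[h] != NOTINUSE && !vc->refcnt[vc->hash_idx[h]])
            vc->inuse &= ~(1UL << vc->hash_idx[h]);
        vc->hash_mfn[h] = mfn;
        vc->hash_idx[h] = idx;
    }
    vc->refcnt[idx]++;
    return idx;
}

/* Unmap a slot: drop the refcount; free it unless the maphash still holds it. */
static void vcache_unmap(struct mapcache_vcpu *vc, unsigned int idx)
{
    unsigned long mfn = vc->slot_mfn[idx];

    vc->refcnt[idx]--;
    if (vc->hash_mfn[MAPHASH_HASHFN(mfn)] != mfn && !vc->refcnt[idx])
        vc->inuse &= ~(1UL << idx);
}

int main(void)
{
    struct mapcache_vcpu vc;

    memset(&vc, 0, sizeof(vc));
    memset(vc.hash_mfn, 0xff, sizeof(vc.hash_mfn));   /* never valid to map */
    memset(vc.hash_idx, 0xff, sizeof(vc.hash_idx));   /* all NOTINUSE */

    unsigned int a = vcache_map(&vc, 0x1234);
    unsigned int b = vcache_map(&vc, 0x1234);         /* hash hit: same slot */
    assert(a == b && vc.refcnt[a] == 2);
    vcache_unmap(&vc, a);
    vcache_unmap(&vc, b);
    assert(vc.inuse & (1UL << a));                    /* still cached by the maphash */
    return 0;
}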
Functional regression:
When a domain runs on more than 64 cores, FreeBSD 10 panics frequently
due to occasional simultaneous set_singleshot_timer hypercalls from too
many cores. Some cores block waiting on map_domain_page and eventually
fail to set a timer whose deadline is still in the future. FreeBSD cannot
handle this and panics. Later FreeBSD releases fixed this by handling
-ETIME, but the degradation in timer performance remains a big issue.
Performance regression:
Many benchmarks show a performance drop at large core counts.
I ran Geekbench on a 32-vCPU guest:
perf drop    old      new
single       0.04%    0.18%
multi        2.43%    0.08%
Removing the per-domain lock in the mapcache brings multi-core
performance almost on par with using the direct map for mappings.
There should still be room for further optimisation, but this is already
a big improvement over the old mapcache.
Note that entries in the maphash also occupy inuse slots. With the old
16 slots per vCPU and a maphash capacity of 8, only another 8 are
available, which is not enough for a nested page table walk. The number
of per-vCPU slots in config.h is therefore increased to 32.
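As a standalone illustration of that arithmetic (assuming 4 paging levels,
mirroring the BUILD_BUG_ON added to mapcache_vcpu_init() below):

#define MAPHASH_ENTRIES        8
#define CONFIG_PAGING_LEVELS   4
#define MAPCACHE_VCPU_ENTRIES  32   /* previously CONFIG_PAGING_LEVELS^2 == 16 */

/*
 * Old layout: 16 - 8 =  8 free slots  < 16 needed for a nested walk.
 * New layout: 32 - 8 = 24 free slots >= 16, so the assertion holds.
 */
_Static_assert(MAPCACHE_VCPU_ENTRIES - MAPHASH_ENTRIES >=
               CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS,
               "not enough per-vCPU mapcache slots");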
Signed-off-by: Hongyan Xia <hongyxia@xxxxxxxxxx>
---
xen/arch/x86/domain.c | 5 +-
xen/arch/x86/domain_page.c | 229 +++++++++++------------------------
xen/include/asm-x86/config.h | 2 +-
xen/include/asm-x86/domain.h | 30 +----
4 files changed, 80 insertions(+), 186 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f53ae5ff86..a278aa4678 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -445,6 +445,9 @@ void arch_vcpu_destroy(struct vcpu *v)
xfree(v->arch.msrs);
v->arch.msrs = NULL;
+ xfree(v->arch.pv.mapcache);
+ v->arch.pv.mapcache = NULL;
+
if ( is_hvm_vcpu(v) )
hvm_vcpu_destroy(v);
else
@@ -633,8 +636,6 @@ int arch_domain_create(struct domain *d,
}
else if ( is_pv_domain(d) )
{
- mapcache_domain_init(d);
-
if ( (rc = pv_domain_initialise(d)) != 0 )
goto fail;
}
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index dd32712d2f..52971d2ecc 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -69,12 +69,11 @@ void __init mapcache_override_current(struct vcpu *v)
void *map_domain_page(mfn_t mfn)
{
- unsigned long flags;
- unsigned int idx, i;
+ unsigned long flags, *phashmfn;
+ unsigned int idx, glb_idx, *phashidx, ohashidx;
struct vcpu *v;
- struct mapcache_domain *dcache;
struct mapcache_vcpu *vcache;
- struct vcpu_maphash_entry *hashent;
+ void *ret;
#ifdef NDEBUG
if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
@@ -82,104 +81,59 @@ void *map_domain_page(mfn_t mfn)
#endif
v = mapcache_current_vcpu();
- if ( !v || !is_pv_vcpu(v) )
+ if ( !v || !is_pv_vcpu(v) || !v->arch.pv.mapcache )
return mfn_to_virt(mfn_x(mfn));
- dcache = &v->domain->arch.pv.mapcache;
- vcache = &v->arch.pv.mapcache;
- if ( !dcache->inuse )
- return mfn_to_virt(mfn_x(mfn));
+ vcache = v->arch.pv.mapcache;
+ phashmfn = &vcache->hash_mfn[MAPHASH_HASHFN(mfn_x(mfn))];
+ phashidx = &vcache->hash_idx[MAPHASH_HASHFN(mfn_x(mfn))];
perfc_incr(map_domain_page_count);
local_irq_save(flags);
- hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
- if ( hashent->mfn == mfn_x(mfn) )
+ ohashidx = *phashidx;
+ if ( *phashmfn != mfn_x(mfn) )
{
- idx = hashent->idx;
- ASSERT(idx < dcache->entries);
- hashent->refcnt++;
- ASSERT(hashent->refcnt);
- ASSERT(mfn_eq(l1e_get_mfn(MAPCACHE_L1ENT(idx)), mfn));
- goto out;
- }
+ idx = find_first_zero_bit(vcache->inuse, MAPCACHE_VCPU_ENTRIES);
+ BUG_ON(idx >= MAPCACHE_VCPU_ENTRIES);
- spin_lock(&dcache->lock);
+ ASSERT(vcache->refcnt[idx] == 0);
+ __set_bit(idx, vcache->inuse);
- /* Has some other CPU caused a wrap? We must flush if so. */
- if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
- {
- vcache->shadow_epoch = dcache->epoch;
- if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
- {
- perfc_incr(domain_page_tlb_flush);
- flush_tlb_local();
- }
- }
+ glb_idx = idx + v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
+ l1e_write(&MAPCACHE_L1ENT(glb_idx),
+ l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
+ ret = (void *)MAPCACHE_VIRT_START + pfn_to_paddr(glb_idx);
+ flush_tlb_one_local(ret);
+
+ if ( ohashidx != MAPHASHENT_NOTINUSE && !vcache->refcnt[ohashidx] )
+ __clear_bit(ohashidx, vcache->inuse);
- idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
- if ( unlikely(idx >= dcache->entries) )
+ *phashmfn = mfn_x(mfn);
+ *phashidx = idx;
+ }
+ else
{
- unsigned long accum = 0, prev = 0;
-
- /* /First/, clean the garbage map and update the inuse list. */
- for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
- {
- accum |= prev;
- dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
- prev = ~dcache->inuse[i];
- }
-
- if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) )
- idx = find_first_zero_bit(dcache->inuse, dcache->entries);
- else
- {
- /* Replace a hash entry instead. */
- i = MAPHASH_HASHFN(mfn_x(mfn));
- do {
- hashent = &vcache->hash[i];
- if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
- {
- idx = hashent->idx;
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == hashent->mfn);
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
- hashent->idx = MAPHASHENT_NOTINUSE;
- hashent->mfn = ~0UL;
- break;
- }
- if ( ++i == MAPHASH_ENTRIES )
- i = 0;
- } while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
- }
- BUG_ON(idx >= dcache->entries);
-
- /* /Second/, flush TLBs. */
- perfc_incr(domain_page_tlb_flush);
- flush_tlb_local();
- vcache->shadow_epoch = ++dcache->epoch;
- dcache->tlbflush_timestamp = tlbflush_current_time();
+ idx = ohashidx;
+ glb_idx = idx + v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
+ ret = (void *)MAPCACHE_VIRT_START + pfn_to_paddr(glb_idx);
}
- set_bit(idx, dcache->inuse);
- dcache->cursor = idx + 1;
-
- spin_unlock(&dcache->lock);
-
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
+ vcache->refcnt[idx]++;
+ ASSERT(vcache->refcnt[idx]);
+ ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(glb_idx)) == mfn_x(mfn));
- out:
local_irq_restore(flags);
- return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx);
+ return ret;
}
void unmap_domain_page(const void *ptr)
{
- unsigned int idx;
+ unsigned int idx, glb_idx;
struct vcpu *v;
- struct mapcache_domain *dcache;
- unsigned long va = (unsigned long)ptr, mfn, flags;
- struct vcpu_maphash_entry *hashent;
+ struct mapcache_vcpu *vcache;
+ unsigned long va = (unsigned long)ptr, mfn, hashmfn, flags;
if ( va >= DIRECTMAP_VIRT_START )
return;
@@ -189,73 +143,21 @@ void unmap_domain_page(const void *ptr)
v = mapcache_current_vcpu();
ASSERT(v && is_pv_vcpu(v));
- dcache = &v->domain->arch.pv.mapcache;
- ASSERT(dcache->inuse);
-
- idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
- mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
- hashent = &v->arch.pv.mapcache.hash[MAPHASH_HASHFN(mfn)];
+ vcache = v->arch.pv.mapcache;
+ ASSERT(vcache);
+ glb_idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
+ idx = glb_idx - v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
local_irq_save(flags);
- if ( hashent->idx == idx )
- {
- ASSERT(hashent->mfn == mfn);
- ASSERT(hashent->refcnt);
- hashent->refcnt--;
- }
- else if ( !hashent->refcnt )
- {
- if ( hashent->idx != MAPHASHENT_NOTINUSE )
- {
- /* /First/, zap the PTE. */
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) ==
- hashent->mfn);
- l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
- /* /Second/, mark as garbage. */
- set_bit(hashent->idx, dcache->garbage);
- }
-
- /* Add newly-freed mapping to the maphash. */
- hashent->mfn = mfn;
- hashent->idx = idx;
- }
- else
- {
- /* /First/, zap the PTE. */
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
- /* /Second/, mark as garbage. */
- set_bit(idx, dcache->garbage);
- }
-
- local_irq_restore(flags);
-}
-
-int mapcache_domain_init(struct domain *d)
-{
- struct mapcache_domain *dcache = &d->arch.pv.mapcache;
- unsigned int bitmap_pages;
+ mfn = l1e_get_pfn(MAPCACHE_L1ENT(glb_idx));
+ hashmfn = vcache->hash_mfn[MAPHASH_HASHFN(mfn)];
- ASSERT(is_pv_domain(d));
+ vcache->refcnt[idx]--;
+ if ( hashmfn != mfn && !vcache->refcnt[idx] )
+ __clear_bit(idx, vcache->inuse);
-#ifdef NDEBUG
- if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
- return 0;
-#endif
-
- BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
- MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
- bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
- dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
- dcache->garbage = dcache->inuse +
- (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
-
- spin_lock_init(&dcache->lock);
-
- return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- 2 * bitmap_pages + 1,
- NIL(l1_pgentry_t *), NULL);
+ local_irq_restore(flags);
}
int mapcache_vcpu_init(struct vcpu *v)
@@ -264,39 +166,48 @@ int mapcache_vcpu_init(struct vcpu *v)
struct mapcache_domain *dcache = &d->arch.pv.mapcache;
unsigned long i;
unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
- unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
- if ( !is_pv_vcpu(v) || !dcache->inuse )
+ if ( !is_pv_vcpu(v) )
return 0;
+ BUILD_BUG_ON(MAPCACHE_VIRT_END > ARG_XLAT_VIRT_START);
+
if ( ents > dcache->entries )
{
+ int rc;
+
+ ASSERT(ents * PAGE_SIZE <= (PERDOMAIN_SLOT_MBYTES << 20));
+
/* Populate page tables. */
- int rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
+ rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
NIL(l1_pgentry_t *), NULL);
- /* Populate bit maps. */
- if ( !rc )
- rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- nr, NULL, NIL(struct page_info *));
- if ( !rc )
- rc = create_perdomain_mapping(d, (unsigned long)dcache->garbage,
- nr, NULL, NIL(struct page_info *));
-
if ( rc )
return rc;
dcache->entries = ents;
}
- /* Mark all maphash entries as not in use. */
BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
+ /* MAPHASH_ENTRIES has to be power-of-two to make hashing work. */
+ BUILD_BUG_ON(MAPHASH_ENTRIES & (MAPHASH_ENTRIES - 1));
+ /*
+ * Since entries in the maphash also occupy inuse slots, we have to make
+ * sure MAPCACHE_VCPU_ENTRIES is large enough to accommodate both the
+ * maphash and a nested page table walk.
+ */
+ BUILD_BUG_ON(MAPCACHE_VCPU_ENTRIES - MAPHASH_ENTRIES <
+ CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS);
+
+ v->arch.pv.mapcache = xzalloc(struct mapcache_vcpu);
+ if ( !v->arch.pv.mapcache )
+ return -ENOMEM;
+
+ /* Mark all maphash entries as not in use. */
for ( i = 0; i < MAPHASH_ENTRIES; i++ )
{
- struct vcpu_maphash_entry *hashent = &v->arch.pv.mapcache.hash[i];
-
- hashent->mfn = ~0UL; /* never valid to map */
- hashent->idx = MAPHASHENT_NOTINUSE;
+ v->arch.pv.mapcache->hash_mfn[i] = ~0UL;
+ v->arch.pv.mapcache->hash_idx[i] = MAPHASHENT_NOTINUSE;
}
return 0;
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index d0cfbb70a8..4b2217151b 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -296,7 +296,7 @@ extern unsigned long xen_phys_start;
(GDT_VIRT_START(v) + (64*1024))
/* map_domain_page() map cache. The second per-domain-mapping sub-area. */
-#define MAPCACHE_VCPU_ENTRIES (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
+#define MAPCACHE_VCPU_ENTRIES 32
#define MAPCACHE_ENTRIES (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
#define MAPCACHE_VIRT_START PERDOMAIN_VIRT_SLOT(1)
#define MAPCACHE_VIRT_END (MAPCACHE_VIRT_START + \
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index a3ae5d9a20..367bba7110 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -40,35 +40,17 @@ struct trap_bounce {
#define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
#define MAPHASHENT_NOTINUSE ((u32)~0U)
struct mapcache_vcpu {
- /* Shadow of mapcache_domain.epoch. */
- unsigned int shadow_epoch;
-
- /* Lock-free per-VCPU hash of recently-used mappings. */
- struct vcpu_maphash_entry {
- unsigned long mfn;
- uint32_t idx;
- uint32_t refcnt;
- } hash[MAPHASH_ENTRIES];
+ unsigned long hash_mfn[MAPHASH_ENTRIES];
+ uint32_t hash_idx[MAPHASH_ENTRIES];
+
+ uint8_t refcnt[MAPCACHE_VCPU_ENTRIES];
+ unsigned long inuse[BITS_TO_LONGS(MAPCACHE_VCPU_ENTRIES)];
};
struct mapcache_domain {
- /* The number of array entries, and a cursor into the array. */
unsigned int entries;
- unsigned int cursor;
-
- /* Protects map_domain_page(). */
- spinlock_t lock;
-
- /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
- unsigned int epoch;
- u32 tlbflush_timestamp;
-
- /* Which mappings are in use, and which are garbage to reap next epoch? */
- unsigned long *inuse;
- unsigned long *garbage;
};
-int mapcache_domain_init(struct domain *);
int mapcache_vcpu_init(struct vcpu *);
void mapcache_override_current(struct vcpu *);
@@ -473,7 +455,7 @@ struct arch_domain
struct pv_vcpu
{
/* map_domain_page() mapping cache. */
- struct mapcache_vcpu mapcache;
+ struct mapcache_vcpu *mapcache;
unsigned int vgc_flags;
--
2.17.1