[Xen-devel] [PATCH RFC 24/44] x86/mapcache: Reimplement map_domain_page() from scratch
There are two reasons:
1) To stop using the per-domain range for the mapcache
2) To make map_domain_page() safe to use during context switches
The new implementation is entirely percpu and rather simpler. See the
comment at the top of domain_page.c for a description of the algorithm.
A side effect of the new implementation is that we can get rid of struct
mapcache_{vcpu,domain} entirely, and mapcache_override_current().
Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
TODO: Consider whether to try to unmap lazily, utilising other TLB flush
scenarios rather than forcing an invlpg on each unmap.
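
For illustration, a minimal userspace sketch of the allocation scheme the new
comment at the top of domain_page.c describes: one inuse bitmap word per CPU,
addresses handed out by slot index, and an unmapped guard page between
neighbouring slots. The constants and names below are simplified placeholders,
not the patch code, and the sketch omits the pagetable and TLB work:

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT     12
    #define NR_SLOTS       16          /* e.g. 4 * 4 paging levels */
    #define MAPCACHE_BASE  0x1000000UL /* placeholder base address */

    static unsigned long inuse;        /* one bit per slot */

    /* Slot idx owns page (idx * 2 + 1); even pages stay unmapped as guards. */
    static unsigned long idx_to_linear(unsigned int idx)
    {
        return MAPCACHE_BASE + ((unsigned long)(idx * 2 + 1) << PAGE_SHIFT);
    }

    static unsigned int linear_to_idx(unsigned long linear)
    {
        return ((linear - MAPCACHE_BASE) >> PAGE_SHIFT) / 2;
    }

    static unsigned long map_slot(void)
    {
        unsigned int idx;

        for ( idx = 0; idx < NR_SLOTS; idx++ )
            if ( !(inuse & (1UL << idx)) )
            {
                inuse |= 1UL << idx;       /* claim the slot... */
                return idx_to_linear(idx); /* ...and hand out its address */
            }

        assert(!"mapcache exhausted");     /* mirrors the BUG_ON() */
        return 0;
    }

    static void unmap_slot(unsigned long linear)
    {
        unsigned int idx = linear_to_idx(linear);

        assert(inuse & (1UL << idx));
        inuse &= ~(1UL << idx);
    }

    int main(void)
    {
        unsigned long a = map_slot(), b = map_slot();

        printf("%#lx %#lx\n", a, b); /* two pages apart: a guard in between */
        unmap_slot(b);
        unmap_slot(a);
        return 0;
    }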
---
xen/arch/x86/domain.c | 6 -
xen/arch/x86/domain_page.c | 353 +++++++++++++------------------------------
xen/arch/x86/pv/dom0_build.c | 3 -
xen/include/asm-x86/config.h | 7 -
xen/include/asm-x86/domain.h | 42 -----
5 files changed, 106 insertions(+), 305 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 93e81c0..3d9e7fb 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -324,10 +324,6 @@ int vcpu_initialise(struct vcpu *v)
v->arch.flags = TF_kernel_mode;
- rc = mapcache_vcpu_init(v);
- if ( rc )
- return rc;
-
if ( !is_idle_domain(d) )
{
paging_vcpu_init(v);
@@ -478,8 +474,6 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags,
d->arch.emulation_flags = emflags;
}
- mapcache_domain_init(d);
-
HYPERVISOR_COMPAT_VIRT_START(d) =
is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 8f2bcd4..c17ff66 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -18,291 +18,131 @@
#include <asm/hardirq.h>
#include <asm/setup.h>
-static DEFINE_PER_CPU(struct vcpu *, override);
+/*
+ * Global mapcache entries are implemented using the vmap() infrastructure.
+ *
+ * Local mapcache entries are implemented with a percpu linear range, starting
+ * at PERCPU_MAPCACHE_START. The maximum number of concurrent mappings we
+ * expect to use (NR_MAPCACHE_SLOTS) is for a nested pagewalk. Being a small
+ * number, allocations are tracked with a simple bitmap (inuse).
+ *
+ * There is plenty of linear address space to use, so addresses are handed out
+ * by index into the inuse bitmap, with unmapped guard pages in between, to
+ * help catch bounds errors in the code using the mappings.
+ *
+ * It is *not* safe to pass local mapcache mappings to other CPUs to use.
+ */
-static inline struct vcpu *mapcache_current_vcpu(void)
-{
- /* In the common case we use the mapcache of the running VCPU. */
- struct vcpu *v = this_cpu(override) ?: current;
+struct mapcache_info {
+#define NR_MAPCACHE_SLOTS (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
+ unsigned long inuse;
+};
+static DEFINE_PER_CPU(struct mapcache_info, mapcache_info);
- /*
- * When current isn't properly set up yet, this is equivalent to
- * running in an idle vCPU (callers must check for NULL).
- */
- if ( v == INVALID_VCPU )
- return NULL;
+static unsigned long mapcache_idx_to_linear(unsigned int idx)
+{
+ return PERCPU_MAPCACHE_START + pfn_to_paddr(idx * 2 + 1);
+}
- /*
- * When using efi runtime page tables, we have the equivalent of the idle
- * domain's page tables but current may point at another domain's VCPU.
- * Return NULL as though current is not properly set up yet.
- */
- if ( efi_rs_using_pgtables() )
- return NULL;
+static unsigned int mapcache_linear_to_idx(unsigned long linear)
+{
+ return paddr_to_pfn(linear - PERCPU_MAPCACHE_START) / 2;
+}
- /*
- * If guest_table is NULL, and we are running a paravirtualised guest,
- * then it means we are running on the idle domain's page table and must
- * therefore use its mapcache.
- */
- if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
- {
- /* If we really are idling, perform lazy context switch now. */
- if ( (v = idle_vcpu[smp_processor_id()]) == current )
- sync_local_execstate();
- /* We must now be running on the idle page table. */
- ASSERT(read_cr3() == this_cpu(percpu_idle_pt));
- }
+static l1_pgentry_t *mapcache_l1e(unsigned long linear)
+{
+ l1_pgentry_t *l1t = (l1_pgentry_t *)PERCPU_MAPCACHE_L1ES;
- return v;
+ return &l1t[l1_table_offset(linear)];
}
-void __init mapcache_override_current(struct vcpu *v)
+/*
+ * Look up a mapcache entry, based on a linear address, ASSERT()ing that it is
+ * bounded sensibly and in use.
+ */
+static l1_pgentry_t *lookup_inuse_mapcache_entry(
+ unsigned long linear, unsigned int *p_idx)
{
- this_cpu(override) = v;
-}
+ unsigned int idx;
+ l1_pgentry_t *pl1e;
-#define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
-#define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
-#define MAPCACHE_L1ENT(idx) \
- __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+ ASSERT(linear >= PERCPU_MAPCACHE_START && linear < PERCPU_MAPCACHE_END);
+
+ idx = mapcache_linear_to_idx(linear);
+ ASSERT(idx < NR_MAPCACHE_SLOTS);
+ ASSERT(test_bit(idx, &this_cpu(mapcache_info).inuse));
+
+ if ( p_idx )
+ *p_idx = idx;
+
+ pl1e = mapcache_l1e(linear);
+ ASSERT(l1e_get_flags(*pl1e) & _PAGE_PRESENT);
+
+ return pl1e;
+}
void *map_domain_page(mfn_t mfn)
{
- unsigned long flags;
- unsigned int idx, i;
- struct vcpu *v;
- struct mapcache_domain *dcache;
- struct mapcache_vcpu *vcache;
- struct vcpu_maphash_entry *hashent;
+ unsigned long flags, linear;
+ unsigned int idx;
+ struct mapcache_info *mci = &this_cpu(mapcache_info);
+ l1_pgentry_t *pl1e;
#ifdef NDEBUG
if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
return mfn_to_virt(mfn_x(mfn));
#endif
- v = mapcache_current_vcpu();
- if ( !v || !is_pv_vcpu(v) )
+ if ( this_cpu(curr_extended_directmap) )
return mfn_to_virt(mfn_x(mfn));
- dcache = &v->domain->arch.pv_domain.mapcache;
- vcache = &v->arch.pv_vcpu.mapcache;
- if ( !dcache->inuse )
- return mfn_to_virt(mfn_x(mfn));
-
- perfc_incr(map_domain_page_count);
-
+ /*
+ * map_domain_page() is used from many contexts, including fault handlers.
+ * Disable interrupts to keep the inuse bitmap consistent with the l1t.
+ *
+ * Be aware! Any #PF inside this region will most likely recurse with the
+ * spurious pagefault handler until the BUG_ON() is hit.
+ */
local_irq_save(flags);
- hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
- if ( hashent->mfn == mfn_x(mfn) )
- {
- idx = hashent->idx;
- ASSERT(idx < dcache->entries);
- hashent->refcnt++;
- ASSERT(hashent->refcnt);
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == mfn_x(mfn));
- goto out;
- }
-
- spin_lock(&dcache->lock);
-
- /* Has some other CPU caused a wrap? We must flush if so. */
- if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
- {
- vcache->shadow_epoch = dcache->epoch;
- if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
- {
- perfc_incr(domain_page_tlb_flush);
- flush_tlb_local();
- }
- }
+ idx = find_first_zero_bit(&mci->inuse, NR_MAPCACHE_SLOTS);
+ BUG_ON(idx == NR_MAPCACHE_SLOTS);
- idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
- if ( unlikely(idx >= dcache->entries) )
- {
- unsigned long accum = 0, prev = 0;
-
- /* /First/, clean the garbage map and update the inuse list. */
- for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
- {
- accum |= prev;
- dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
- prev = ~dcache->inuse[i];
- }
-
- if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) )
- idx = find_first_zero_bit(dcache->inuse, dcache->entries);
- else
- {
- /* Replace a hash entry instead. */
- i = MAPHASH_HASHFN(mfn_x(mfn));
- do {
- hashent = &vcache->hash[i];
- if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
- {
- idx = hashent->idx;
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == hashent->mfn);
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
- hashent->idx = MAPHASHENT_NOTINUSE;
- hashent->mfn = ~0UL;
- break;
- }
- if ( ++i == MAPHASH_ENTRIES )
- i = 0;
- } while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
- }
- BUG_ON(idx >= dcache->entries);
-
- /* /Second/, flush TLBs. */
- perfc_incr(domain_page_tlb_flush);
- flush_tlb_local();
- vcache->shadow_epoch = ++dcache->epoch;
- dcache->tlbflush_timestamp = tlbflush_current_time();
- }
+ __set_bit(idx, &mci->inuse);
- set_bit(idx, dcache->inuse);
- dcache->cursor = idx + 1;
+ linear = mapcache_idx_to_linear(idx);
+ pl1e = mapcache_l1e(linear);
- spin_unlock(&dcache->lock);
+ ASSERT(!(l1e_get_flags(*pl1e) & _PAGE_PRESENT));
+ *pl1e = l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW);
+ barrier(); /* Ensure the pagetable is updated before enabling interrupts. */
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
-
- out:
local_irq_restore(flags);
- return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx);
+
+ return (void *)linear;
}
void unmap_domain_page(const void *ptr)
{
+ struct mapcache_info *mci = &this_cpu(mapcache_info);
+ unsigned long flags, linear = (unsigned long)ptr;
unsigned int idx;
- struct vcpu *v;
- struct mapcache_domain *dcache;
- unsigned long va = (unsigned long)ptr, mfn, flags;
- struct vcpu_maphash_entry *hashent;
+ l1_pgentry_t *pl1e;
- if ( va >= DIRECTMAP_VIRT_START )
+ if ( linear >= DIRECTMAP_VIRT_START )
return;
- ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-
- v = mapcache_current_vcpu();
- ASSERT(v && is_pv_vcpu(v));
-
- dcache = &v->domain->arch.pv_domain.mapcache;
- ASSERT(dcache->inuse);
-
- idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
- mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
- hashent = &v->arch.pv_vcpu.mapcache.hash[MAPHASH_HASHFN(mfn)];
+ pl1e = lookup_inuse_mapcache_entry(linear, &idx);
local_irq_save(flags);
- if ( hashent->idx == idx )
- {
- ASSERT(hashent->mfn == mfn);
- ASSERT(hashent->refcnt);
- hashent->refcnt--;
- }
- else if ( !hashent->refcnt )
- {
- if ( hashent->idx != MAPHASHENT_NOTINUSE )
- {
- /* /First/, zap the PTE. */
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) ==
- hashent->mfn);
- l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
- /* /Second/, mark as garbage. */
- set_bit(hashent->idx, dcache->garbage);
- }
-
- /* Add newly-freed mapping to the maphash. */
- hashent->mfn = mfn;
- hashent->idx = idx;
- }
- else
- {
- /* /First/, zap the PTE. */
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
- /* /Second/, mark as garbage. */
- set_bit(idx, dcache->garbage);
- }
+ *pl1e = l1e_empty();
+ asm volatile ( "invlpg %0" :: "m" (*(char *)ptr) : "memory" );
+ __clear_bit(idx, &mci->inuse);
local_irq_restore(flags);
}
-int mapcache_domain_init(struct domain *d)
-{
- struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
- unsigned int bitmap_pages;
-
- if ( !is_pv_domain(d) || is_idle_domain(d) )
- return 0;
-
-#ifdef NDEBUG
- if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
- return 0;
-#endif
-
- BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
- MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
- bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
- dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
- dcache->garbage = dcache->inuse +
- (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
-
- spin_lock_init(&dcache->lock);
-
- return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- 2 * bitmap_pages + 1,
- NIL(l1_pgentry_t *), NULL);
-}
-
-int mapcache_vcpu_init(struct vcpu *v)
-{
- struct domain *d = v->domain;
- struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
- unsigned long i;
- unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
- unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
-
- if ( !is_pv_vcpu(v) || !dcache->inuse )
- return 0;
-
- if ( ents > dcache->entries )
- {
- /* Populate page tables. */
- int rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
- NIL(l1_pgentry_t *), NULL);
-
- /* Populate bit maps. */
- if ( !rc )
- rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- nr, NULL, NIL(struct page_info *));
- if ( !rc )
- rc = create_perdomain_mapping(d, (unsigned long)dcache->garbage,
- nr, NULL, NIL(struct page_info *));
-
- if ( rc )
- return rc;
-
- dcache->entries = ents;
- }
-
- /* Mark all maphash entries as not in use. */
- BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
- for ( i = 0; i < MAPHASH_ENTRIES; i++ )
- {
- struct vcpu_maphash_entry *hashent = &v->arch.pv_vcpu.mapcache.hash[i];
-
- hashent->mfn = ~0UL; /* never valid to map */
- hashent->idx = MAPHASHENT_NOTINUSE;
- }
-
- return 0;
-}
-
void *map_domain_page_global(mfn_t mfn)
{
ASSERT(!in_irq() &&
@@ -345,10 +185,29 @@ unsigned long domain_page_map_to_mfn(const void *ptr)
BUG_ON(!pl1e);
}
else
- {
- ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
- pl1e = &__linear_l1_table[l1_linear_offset(va)];
- }
+ pl1e = lookup_inuse_mapcache_entry(va, NULL);
return l1e_get_pfn(*pl1e);
}
+
+static __init __maybe_unused void build_assertions(void)
+{
+ struct mapcache_info info;
+
+ /* NR_MAPCACHE_SLOTS within the bounds of the inuse bitmap? */
+ BUILD_BUG_ON(NR_MAPCACHE_SLOTS > (sizeof(info.inuse) * 8));
+
+ /* Enough linear address space, including guard pages? */
+ BUILD_BUG_ON((NR_MAPCACHE_SLOTS * 2) >
+ (PERCPU_MAPCACHE_END - PERCPU_MAPCACHE_START));
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
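
An aside on the unmap path above: the single-address TLB flush is the standard
invlpg inline asm idiom. A minimal sketch of the same construct in isolation
(illustrative only, not part of the patch):

    /*
     * Flush one linear address from the local TLB.  The "m" constraint gives
     * invlpg the address without generating a load, and the "memory" clobber
     * prevents the compiler reordering memory accesses around the flush.
     */
    static inline void flush_one_local(const void *linear)
    {
        asm volatile ( "invlpg %0" :: "m" (*(const char *)linear) : "memory" );
    }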
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 09c765a..3baf37b 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -698,7 +698,6 @@ int __init dom0_construct_pv(struct domain *d,
/* We run on dom0's page tables for the final part of the build process. */
write_ptbase(v);
- mapcache_override_current(v);
/* Copy the OS image and free temporary buffer. */
elf.dest_base = (void*)vkern_start;
@@ -717,7 +716,6 @@ int __init dom0_construct_pv(struct domain *d,
if ( (parms.virt_hypercall < v_start) ||
(parms.virt_hypercall >= v_end) )
{
- mapcache_override_current(NULL);
write_ptbase(current);
printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
rc = -1;
@@ -838,7 +836,6 @@ int __init dom0_construct_pv(struct domain *d,
xlat_start_info(si, XLAT_start_info_console_dom0);
/* Return to idle domain's page tables. */
- mapcache_override_current(NULL);
write_ptbase(current);
update_domain_wallclock_time(d);
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index a95f8c8..f78cbde 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -314,13 +314,6 @@ extern unsigned long xen_phys_start;
#define LDT_VIRT_START(v) \
(GDT_VIRT_START(v) + (64*1024))
-/* map_domain_page() map cache. The second per-domain-mapping sub-area. */
-#define MAPCACHE_VCPU_ENTRIES (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
-#define MAPCACHE_ENTRIES (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
-#define MAPCACHE_VIRT_START PERDOMAIN_VIRT_SLOT(1)
-#define MAPCACHE_VIRT_END (MAPCACHE_VIRT_START + \
- MAPCACHE_ENTRIES * PAGE_SIZE)
-
/* Argument translation area. The third per-domain-mapping sub-area. */
#define ARG_XLAT_VIRT_START PERDOMAIN_VIRT_SLOT(2)
/* Allow for at least one guard page (COMPAT_ARG_XLAT_SIZE being 2 pages): */
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index f699119..fa57c93 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -38,42 +38,6 @@ struct trap_bounce {
unsigned long eip;
};
-#define MAPHASH_ENTRIES 8
-#define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
-#define MAPHASHENT_NOTINUSE ((u32)~0U)
-struct mapcache_vcpu {
- /* Shadow of mapcache_domain.epoch. */
- unsigned int shadow_epoch;
-
- /* Lock-free per-VCPU hash of recently-used mappings. */
- struct vcpu_maphash_entry {
- unsigned long mfn;
- uint32_t idx;
- uint32_t refcnt;
- } hash[MAPHASH_ENTRIES];
-};
-
-struct mapcache_domain {
- /* The number of array entries, and a cursor into the array. */
- unsigned int entries;
- unsigned int cursor;
-
- /* Protects map_domain_page(). */
- spinlock_t lock;
-
- /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
- unsigned int epoch;
- u32 tlbflush_timestamp;
-
- /* Which mappings are in use, and which are garbage to reap next epoch? */
- unsigned long *inuse;
- unsigned long *garbage;
-};
-
-int mapcache_domain_init(struct domain *);
-int mapcache_vcpu_init(struct vcpu *);
-void mapcache_override_current(struct vcpu *);
-
/* x86/64: toggle guest between kernel and user modes. */
void toggle_guest_mode(struct vcpu *);
/* x86/64: toggle guest page tables between kernel and user modes. */
@@ -251,9 +215,6 @@ struct pv_domain
atomic_t nr_l4_pages;
- /* map_domain_page() mapping cache. */
- struct mapcache_domain mapcache;
-
struct cpuidmasks *cpuidmasks;
};
@@ -444,9 +405,6 @@ struct arch_domain
struct pv_vcpu
{
- /* map_domain_page() mapping cache. */
- struct mapcache_vcpu mapcache;
-
struct trap_info *trap_ctxt;
unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE];
--
2.1.4
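
For context, the calling convention of map_domain_page()/unmap_domain_page()
is unchanged by this patch; a hedged sketch of a typical caller follows
(copy_to_frame() is hypothetical, not an existing Xen function):

    /*
     * Hypothetical caller: map a frame, fill it, unmap.  With this patch the
     * returned address is percpu, so it must not be handed to another CPU.
     */
    static void copy_to_frame(mfn_t mfn, const void *src, size_t len)
    {
        void *dst = map_domain_page(mfn);

        memcpy(dst, src, len);
        unmap_domain_page(dst);
    }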
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel