
To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 5 of 9] Fine-grained concurrency control structure for the p2m
From: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>
Date: Thu, 27 Oct 2011 00:33:50 -0400
Cc: andres@xxxxxxxxxxxxxx, keir.xen@xxxxxxxxx, tim@xxxxxxx, olaf@xxxxxxxxx, adin@xxxxxxxxxxxxxx
Delivery-date: Thu, 27 Oct 2011 05:32:13 -0700
In-reply-to: <patchbomb.1319690025@xxxxxxxxxxxxxxxxxxx>
References: <patchbomb.1319690025@xxxxxxxxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mercurial-patchbomb/1.8.4
 xen/arch/x86/mm/hap/private.h |    1 +
 xen/arch/x86/mm/mm-locks.h    |   20 +-
 xen/arch/x86/mm/p2m-ept.c     |    1 +
 xen/arch/x86/mm/p2m-lock.h    |  613 ++++++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/mm/p2m-pod.c     |    1 +
 xen/arch/x86/mm/p2m-pt.c      |    1 +
 xen/arch/x86/mm/p2m.c         |   24 +-
 xen/include/asm-x86/p2m.h     |    3 +-
 8 files changed, 652 insertions(+), 12 deletions(-)


Introduce a fine-grained concurrency control structure for the p2m. This
allows for locking 2M-aligned chunks of the p2m at a time, exclusively.
Recursive locking is allowed. Global locking of the whole p2m is also
allowed for certain operations. Simple deadlock detection heuristics are
put in place.

Note that the patch creates backwards-compatible shortcuts that lock the p2m
globally, so it should remain functionally identical to what is currently in
place.
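
As a rough usage sketch (the accessors are those introduced in p2m-lock.h
below; the example callers and the entry manipulation they stand in for are
hypothetical), the intended pattern is to bracket p2m queries and updates with
the new get/put range accessors:

static void example_update_gfn(struct p2m_domain *p2m, unsigned long gfn)
{
    /* Lock only the 2M-aligned chunk containing gfn (an order-0 request). */
    get_p2m_gfn(p2m, gfn);

    /* ... query and/or modify the p2m entry for gfn here; leaf locks are
     * recursive, so a lookup followed by an update on the same range is fine ... */

    /* Callers must undo their own locking once done. */
    put_p2m_gfn(p2m, gfn);
}

static void example_global_op(struct p2m_domain *p2m)
{
    /* Big-hammer operations (e.g. log-dirty reset) take the whole p2m
     * exclusively; sub-range locking by this CPU then becomes a no-op. */
    get_p2m_global(p2m);
    /* ... walk or modify arbitrary ranges ... */
    put_p2m_global(p2m);
}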

Signed-off-by: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>

diff -r 981073d78f7f -r a23e1262b124 xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -21,6 +21,7 @@
 #define __HAP_PRIVATE_H__
 
 #include "../mm-locks.h"
+#include "../p2m-lock.h"
 
 /********************************************/
 /*          GUEST TRANSLATION FUNCS         */
diff -r 981073d78f7f -r a23e1262b124 xen/arch/x86/mm/mm-locks.h
--- a/xen/arch/x86/mm/mm-locks.h
+++ b/xen/arch/x86/mm/mm-locks.h
@@ -146,14 +146,22 @@ declare_mm_lock(nestedp2m)
 
 /* P2M lock (per-p2m-table)
  * 
- * This protects all updates to the p2m table.  Updates are expected to
- * be safe against concurrent reads, which do *not* require the lock. */
+ * This protects all updates to the p2m table.
+ * 
+ * In 64 bit mode we disable this because the lock becomes fine-grained,
+ * and several code paths cause inversion/deadlock:
+ *   -- PoD sweeps
+ *   -- mem_sharing_unshare_page
+ *   -- generally widespread recursive locking, which we don't support
+ *      (yet, I guess) on an "external" mm lock. */
 
+#ifndef __x86_64__
 declare_mm_lock(p2m)
-#define p2m_lock(p)           mm_lock(p2m, &(p)->lock)
-#define p2m_lock_recursive(p) mm_lock_recursive(p2m, &(p)->lock)
-#define p2m_unlock(p)         mm_unlock(&(p)->lock)
-#define p2m_locked_by_me(p)   mm_locked_by_me(&(p)->lock)
+#define _p2m_lock(p)           mm_lock(p2m, &(p)->lock)
+#define _p2m_lock_recursive(p) mm_lock_recursive(p2m, &(p)->lock)
+#define _p2m_unlock(p)         mm_unlock(&(p)->lock)
+#define _p2m_locked_by_me(p)   mm_locked_by_me(&(p)->lock)
+#endif /* __x86_64__ */
 
 /* PoD lock (per-p2m-table)
  * 
diff -r 981073d78f7f -r a23e1262b124 xen/arch/x86/mm/p2m-ept.c
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -33,6 +33,7 @@
 #include <xen/softirq.h>
 
 #include "mm-locks.h"
+#include "p2m-lock.h"
 
 #define atomic_read_ept_entry(__pepte)                              \
     ( (ept_entry_t) { .epte = atomic_read64(&(__pepte)->epte) } )
diff -r 981073d78f7f -r a23e1262b124 xen/arch/x86/mm/p2m-lock.h
--- /dev/null
+++ b/xen/arch/x86/mm/p2m-lock.h
@@ -0,0 +1,613 @@
+/******************************************************************************
+ * arch/x86/mm/p2m-lock.h
+ *
+ * Fine-grained locking of the p2m. Allow for concurrent updates to different
+ * regions of the p2m. Serially synchronize updates and lookups. Mutex 
+ * access on p2m entries while a CPU is using them.
+ *
+ * Copyright (c) 2011 Andres Lagar-Cavilla, GridCentric Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _XEN_P2M_LOCK_H
+#define _XEN_P2M_LOCK_H
+
+#include <xen/config.h>
+#include <xen/lib.h>
+/* See comment about space consideration for spinlocks below */
+#define NDEBUG
+#undef LOCK_PROFILE
+#include <xen/spinlock.h>
+#include <asm/atomic.h>
+#include <xen/xmalloc.h>
+#include <xen/paging.h>
+#include <asm/page.h>
+#include <asm/p2m.h>
+#include "mm-locks.h"
+
+/* Rationale:
+ *
+ * The motivating scenario is one in which you have at least three CPUs 
+ * operating on likely disjoint regions of the p2m: a paging utility, a sharing
+ * utility, and the domU vcpu. With yet another p2m-heavy utility (mem 
+ * access?), and/or a migrate/remus utility, the number of CPUs operating
+ * on disjoint regions increases. Not to mention multi-vcpu domUs.
+ *
+ * Therefore, p2m concurrency control is achieved through a hierarchical 
+ * tree of locks, to allow all these CPUs to work without bothering each other.
+ * (Without disallowing any other cases such as single-vcpu domU)
+ *
+ * Leaves in the tree of locks are represented by spinlocks.
+ *
+ * Inner nodes (or upper levels) are represented by a spinlock and a count.
+ * The count indicates how many CPUs are locking a node beneath.
+ *
+ * A cpu holds a leaf by grabbing the spinlock, and not letting go of it. On its
+ * way to the leaf, for each inner node, it grabs the spinlock, increases the
+ * count, and releases the spinlock.
+ *
+ * Leaf levels are recursive: the same CPU can lock them again.
+ *
+ * A cpu holds an inner node in exclusive mode by busy-waiting until the count
+ * is zero, grabbing the spinlock, and not letting go of it.
+ *
+ * Unlocks work by releasing the current spinlock, and working your way up:
+ * grab spinlock, decrease count, release.
+ *
+ * No locker can be preempted. For that reason, there are no atomic promotions:
+ * you would end up with promoters deadlocking on their way up the tree.
+ *
+ * Today, there are effectively two levels: the global lock (an inner node), and
+ * 2M locks, leaf locks for contiguous, aligned, 2M extents (akin to superpages).
+ *
+ * The global level can be held exclusively for big hammer operations such as
+ * log dirty (re)set.
+ *
+ * For non-global locking, the global lock is grabbed non-exclusively. At each
+ * 1G boundary we allocate, if we hadn't before, the corresponding set of 512
+ * 2M locks. Allocation of 2M locks is itself protected by a regular
+ * spinlock (this is rare enough). Allocation happens on demand because
+ * we can't really know a priori the "total" size of the p2m.
+ *
+ * It is expected that every query or modification to the p2m will lock the
+ * appropriate range. Leaves are recursive for this reason: commonly you query a
+ * range and then you modify it.
+ *
+ * Conversely, all callers of queries and modifications, once done, need to undo
+ * their locking.
+ *
+ * Because we mimic the page table structure of a 512-radix tree, we run into
+ * space considerations with the spinlocks in this tree. So we need to be careful
+ * about space.
+ *
+ * For 32bit code, we currently bail out and default to one big lock. Sorry Atom :(
+ *
+ * Also note that the p2m tree of locks is included in the ordering constraints
+ * enforced by mm-locks.h. It is treated as an "external" lock in that code.
+ *
+ */
+
+#define P2M_ORDER_GLOBAL    ~0U
+
+/* The 32 bit case serves as a concise summary of the external API */
+#ifndef __x86_64__
+/* For 32 bits we default to one big lock */
+typedef struct __p2m_lock {
+    mm_lock_t lock;
+} p2m_lock_t;
+
+static inline int p2m_lock_init(struct p2m_domain *p2m)
+{
+    p2m_lock_t *p2ml = xmalloc(p2m_lock_t);
+    if ( !p2ml )
+        return -ENOMEM;
+    mm_lock_init(&p2ml->lock);
+    p2m->lock = p2ml;
+    return 0;
+}
+
+static inline void get_p2m(struct p2m_domain *p2m, unsigned long gfn, unsigned int order)
+{
+    _p2m_lock(p2m->lock);
+}
+
+static inline void put_p2m(struct p2m_domain *p2m, unsigned long gfn, unsigned int order)
+{
+    _p2m_unlock(p2m->lock);
+}
+
+static inline void p2m_lock_destroy(struct p2m_domain *p2m)
+{
+    xfree(p2m->lock);
+    p2m->lock = NULL;
+}
+
+/* Backwards compatibility */
+#define p2m_lock(p)             _p2m_lock((p)->lock)
+#define p2m_lock_recursive(p)   _p2m_lock_recursive((p)->lock)
+#define p2m_locked_by_me(p)     _p2m_locked_by_me((p)->lock)
+#define p2m_unlock(p)           _p2m_unlock((p)->lock)
+
+#else /* __x86_64__ */
+/* If we were to have inner locks (say 1G locks), then the space considerations
+ * outlined below for leaf locks would also apply here. */
+typedef struct p2m_inner_lock {
+    spinlock_t lock;
+    atomic_t   count;
+} p2m_inner_lock_t;
+
+static inline void init_p2m_inner_lock(p2m_inner_lock_t *inner)
+{
+    spin_lock_init(&inner->lock);
+    _atomic_set(inner->count, 0);
+}
+
+/* We cannot risk reusing the code in common/spinlock.c, because it may
+ * have been compiled with LOCK_DEBUG or LOCK_PROFILE. This is unfortunate. */
+static inline void lock_p2m_inner(p2m_inner_lock_t *inner)
+{
+    spin_lock(&inner->lock);
+}
+
+static inline void unlock_p2m_inner(p2m_inner_lock_t *inner)
+{
+    spin_unlock(&inner->lock);
+}
+
+static inline void get_p2m_inner(p2m_inner_lock_t *inner)
+{
+    lock_p2m_inner(inner);
+    atomic_inc(&inner->count);
+    unlock_p2m_inner(inner);
+}
+
+static inline void put_p2m_inner(p2m_inner_lock_t *inner)
+{
+    lock_p2m_inner(inner);
+    atomic_dec(&inner->count);
+    unlock_p2m_inner(inner);
+}
+
+/* XXX Consider starvation here */
+static inline void get_p2m_inner_exclusive(p2m_inner_lock_t *inner)
+{
+    int count;
+retry:
+    while (1)
+    {
+        mb();
+        count = atomic_read(&inner->count);
+        if ( count == 0 )
+            break;
+        cpu_relax();
+    }
+
+    spin_lock(&inner->lock);
+    mb();
+    count = atomic_read(&inner->count);
+    if ( count )
+    {
+        spin_unlock(&inner->lock);
+        goto retry;
+    }
+    /* We leave holding the spinlock */
+}
+
+static inline void put_p2m_inner_exclusive(p2m_inner_lock_t *inner)
+{
+    spin_unlock(&inner->lock);
+}
+
+/* Because we operate under page-table sizing constraints, we need to be 
+ * extremely conscious about the space we're taking up. So we become somewhat 
+ * re-inventors of the wheel, and we disable many things. */
+typedef struct p2m_leaf_lock {
+    raw_spinlock_t raw;
+    u16 recurse_cpu:12;
+    u16 recurse_cnt:4;
+/* Padding to confine each inner lock to its own word */
+#define LEAF_PAD   4
+    uint8_t             pad[LEAF_PAD];
+} __attribute__((packed)) p2m_leaf_lock_t;
+
+/* BUILD_BUG_ON(sizeof(p2m_leaf_lock_t) != sizeof(unsigned long)); */
+
+static inline void init_p2m_leaf_lock(p2m_leaf_lock_t *lock)
+{
+    *lock = (p2m_leaf_lock_t) { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, { } };
+}
+
+static inline int __p2m_spin_trylock_recursive(p2m_leaf_lock_t *lock)
+{
+    int cpu = smp_processor_id();
+
+    if ( likely(lock->recurse_cpu != cpu) )
+    {
+        if ( !_raw_spin_trylock(&lock->raw) )
+            return 0;
+        preempt_disable();
+        lock->recurse_cpu = cpu;
+    }
+
+    lock->recurse_cnt++;
+    return 1;
+}
+
+static inline void lock_p2m_leaf(p2m_leaf_lock_t *lock)
+{
+    while ( !__p2m_spin_trylock_recursive(lock) )
+        cpu_relax();
+}
+
+static inline void unlock_p2m_leaf(p2m_leaf_lock_t *lock)
+{
+    if ( likely(--lock->recurse_cnt == 0) )
+    {
+        lock->recurse_cpu = 0xfffu;
+        preempt_enable();
+        _raw_spin_unlock(&lock->raw);
+    }
+}
+
+/* Deadlock book-keeping, see below */
+#define MAX_LOCK_DEPTH  16
+
+/* The lock structure */
+typedef struct __p2m_lock 
+{
+    /* To enforce ordering in mm-locks */
+    int unlock_level;
+    /* To protect on-demand allocation of locks 
+     * (yeah you heard that right) */
+    spinlock_t alloc_lock;
+    /* Global lock */
+    p2m_inner_lock_t global;
+    /* 2M locks. Allocate on demand: fun */
+    p2m_leaf_lock_t  **locks_2m;
+    /* Book-keeping for deadlock detection. Could be a per-cpu. */
+    unsigned long deadlock_guard[NR_CPUS][MAX_LOCK_DEPTH + 1];
+    uint8_t lock_depth[NR_CPUS];
+    /* Is anybody holding this exclusively */
+    unsigned int exclusive_holder;
+    /* Order of pages allocated for the first level of locks_2m */
+    uint8_t order;
+} p2m_lock_t;
+
+#define EXCLUSIVE_CPU_NULL  ~0U
+
+/* Some deadlock book-keeping. Say CPU A holds a lock on range A, CPU B holds a
+ * lock on range B. Now, CPU A wants to lock range B and vice-versa. Deadlock.
+ * We detect this by remembering the start of the current locked range.
+ * We keep a fairly small stack of guards (MAX_LOCK_DEPTH), because we don't
+ * anticipate a great deal of recursive locking: (a) recursive locking is rare,
+ * (b) it is evil, (c) only PoD seems to do it (is PoD therefore evil?) */
+
+#define DEADLOCK_NULL   ~0UL
+
+#define CURRENT_GUARD(l)    ((l)->deadlock_guard[current->processor] \
+                                [(l)->lock_depth[current->processor]])
+
+#define DEADLOCK_CHECK(cond, action, _f, _a...) \
+do {                                            \
+    if ( (cond) )                               \
+    {                                           \
+        printk(_f, ##_a);                       \
+        action;                                 \
+    }                                           \
+} while(0)
+
+static inline void push_guard(p2m_lock_t *p2ml, unsigned long gfn)
+{
+    int cpu = current->processor;
+
+    DEADLOCK_CHECK(((p2ml->lock_depth[cpu] + 1) > MAX_LOCK_DEPTH), 
+                    BUG(), "CPU %u exceeded deadlock depth\n", cpu);
+
+    p2ml->lock_depth[cpu]++;
+    p2ml->deadlock_guard[cpu][p2ml->lock_depth[cpu]] = gfn;
+}
+
+static inline void pop_guard(p2m_lock_t *p2ml)
+{
+    int cpu = current->processor;
+
+    DEADLOCK_CHECK((p2ml->lock_depth[cpu] == 0), BUG(),
+                    "CPU %u underflow deadlock depth\n", cpu);
+
+    p2ml->lock_depth[cpu]--;
+}
+
+static inline int p2m_lock_init(struct p2m_domain *p2m)
+{
+    unsigned int i;
+    p2m_lock_t *p2ml;
+
+    p2ml = xmalloc(p2m_lock_t);
+    if ( !p2ml ) 
+        return -ENOMEM;
+
+    memset(p2ml, 0, sizeof(p2m_lock_t));
+
+    spin_lock_init(&p2ml->alloc_lock);
+    init_p2m_inner_lock(&p2ml->global);
+
+    p2ml->locks_2m = alloc_xenheap_page();
+    if ( !p2ml->locks_2m )
+    {
+        xfree(p2ml);
+        return -ENOMEM;
+    }
+    memset(p2ml->locks_2m, 0, PAGE_SIZE);
+
+    for (i = 0; i < NR_CPUS; i++)
+        p2ml->deadlock_guard[i][0] = DEADLOCK_NULL;
+
+    p2ml->exclusive_holder = EXCLUSIVE_CPU_NULL;
+
+    p2m->lock = p2ml;
+    return 0;    
+}
+
+/* Conversion macros for aligned boundaries */
+#define gfn_to_superpage(g, o)      (((g) & (~((1 << (o)) - 1))) >> (o))
+#define gfn_to_1g_sp(gfn)           gfn_to_superpage(gfn, PAGE_ORDER_1G)
+#define gfn_to_2m_sp(gfn)           gfn_to_superpage(gfn, PAGE_ORDER_2M)
+#define gfn_1g_to_2m(gfn_1g)        ((gfn_1g) << (PAGE_ORDER_1G - PAGE_ORDER_2M))
+#define gfn_1g_to_last_2m(gfn_1g)   (gfn_1g_to_2m(gfn_1g) + \
+                                        ((1 << (PAGE_ORDER_1G - PAGE_ORDER_2M)) - 1))
+#define gfn_1g_to_4k(gfn_1g)        ((gfn_1g) << PAGE_ORDER_1G)
+#define gfn_1g_to_last_4k(gfn_1g)   (gfn_1g_to_4k(gfn_1g) + ((1 << PAGE_ORDER_1G) - 1))
+
+/* Global lock accessors. Global lock is our only "inner" node. */
+#define p2m_exclusive_locked_by_me(p)    \
+     ((p)->lock->exclusive_holder == current->processor)
+
+static inline void get_p2m_global_exclusive(struct p2m_domain *p2m)
+{
+    p2m_lock_t *p2ml = p2m->lock;
+    DEADLOCK_CHECK((CURRENT_GUARD(p2ml) != DEADLOCK_NULL), BUG(),
+                    "P2M DEADLOCK: cpu %u prev range start %lx trying global\n",
+                    (unsigned) current->processor, CURRENT_GUARD(p2ml)); 
+
+    get_p2m_inner_exclusive(&p2ml->global);
+    p2ml->exclusive_holder = current->processor;
+}
+
+static inline void put_p2m_global_exclusive(struct p2m_domain *p2m)
+{
+    p2m_lock_t *p2ml = p2m->lock;
+    p2ml->exclusive_holder = EXCLUSIVE_CPU_NULL;
+    put_p2m_inner_exclusive(&p2ml->global);
+}
+
+/* Not to be confused with shortcut for external use */
+static inline void __get_p2m_global(struct p2m_domain *p2m)
+{
+    get_p2m_inner(&p2m->lock->global);
+}
+
+/* Not to be confused with shortcut for external use */
+static inline void __put_p2m_global(struct p2m_domain *p2m)
+{
+    put_p2m_inner(&p2m->lock->global);
+}
+
+/* 2M lock accessors */
+static inline p2m_leaf_lock_t *__get_2m_lock(p2m_lock_t *p2ml,
+                            unsigned long gfn_1g, unsigned long gfn_2m)
+{
+    p2m_leaf_lock_t *lock_2m_l1;
+    BUG_ON(gfn_1g >= (1 << PAGETABLE_ORDER));
+    BUG_ON(gfn_2m >= (1 << PAGETABLE_ORDER));
+    lock_2m_l1 = p2ml->locks_2m[gfn_1g];
+    BUG_ON(lock_2m_l1 == NULL);
+    return (lock_2m_l1 + gfn_2m);
+}
+
+static inline void get_p2m_2m(struct p2m_domain *p2m, unsigned long gfn_1g,
+                                unsigned long gfn_2m)
+{
+    lock_p2m_leaf(__get_2m_lock(p2m->lock, gfn_1g, gfn_2m));
+}
+
+static inline void put_p2m_2m(struct p2m_domain *p2m, unsigned long gfn_1g,
+                                unsigned long gfn_2m)
+{
+    unlock_p2m_leaf(__get_2m_lock(p2m->lock, gfn_1g, gfn_2m));
+}
+
+/* Allocate 2M locks we may not have allocated yet for this 1G superpage */
+static inline int alloc_locks_2m(struct p2m_domain *p2m, unsigned long gfn_1g)
+{
+    p2m_lock_t *p2ml = p2m->lock;
+
+    /* With a single page for l1, we cover a gfn space of 512GB (39 bits)
+     * Given that current x86_64 processors physically address 40 bits,
+     * we're in no immediate danger of overflowing this table for a domU.
+     * If necessary, the l1 itself can grow subject to proper locking 
+     * on the p2ml->alloc_lock */
+
+    /* Quick test for common case */
+    if ( likely(p2ml->locks_2m[gfn_1g] != NULL) ) 
+        return 0;
+
+    spin_lock(&(p2ml->alloc_lock));
+
+    if ( likely(p2ml->locks_2m[gfn_1g] == NULL) )
+    {
+        unsigned long j;
+        p2m_leaf_lock_t *p = alloc_xenheap_page();
+        if ( !p ) 
+        {
+            spin_unlock(&(p2ml->alloc_lock));
+            return -ENOMEM;
+        }
+
+        for (j = 0; j < (1 << PAGETABLE_ORDER); j++)
+            init_p2m_leaf_lock(&p[j]);
+
+        p2ml->locks_2m[gfn_1g] = p;
+    }
+
+    spin_unlock(&(p2ml->alloc_lock));
+    return 0;
+}
+
+static inline unsigned long __get_last_gfn(unsigned long gfn, unsigned int order)
+{
+    /* Guard against wraparound of the gfn range */
+    unsigned long last_gfn = gfn + (1 << order) - 1;
+    BUG_ON(last_gfn < gfn);
+    return last_gfn;
+}
+
+static inline void get_p2m(struct p2m_domain *p2m, unsigned long gfn, unsigned int order)
+{
+    unsigned long last_gfn, first_1g, last_1g, first_2m, last_2m, i, j;
+    p2m_lock_t *p2ml = p2m->lock;
+
+    /* Holders of the p2m in exclusive mode can lock sub ranges. We make that a no-op.
+     * However, locking exclusively again is considered rude and tasteless. */
+    if ( (p2m_exclusive_locked_by_me(p2m)) && (order != P2M_ORDER_GLOBAL) )
+        return;
+        
+    DEADLOCK_CHECK(((CURRENT_GUARD(p2ml) != DEADLOCK_NULL) &&
+                    (CURRENT_GUARD(p2ml) > gfn)), WARN(),
+                    "P2M DEADLOCK: cpu %d prev range start %lx new range start %lx",
+                    current->processor, CURRENT_GUARD(p2ml), gfn);
+
+    preempt_disable();
+
+    if ( order == P2M_ORDER_GLOBAL ) {
+        get_p2m_global_exclusive(p2m);
+        goto get_p2m_out;
+    } 
+
+    __get_p2m_global(p2m);
+    /* We're non-preemptible. We've blocked exclusive (global) p2m locking. We
+     * will now (allocate and) lock all relevant 2M leaves */
+
+    last_gfn    = __get_last_gfn(gfn, order);
+    first_1g    = gfn_to_1g_sp(gfn);
+    last_1g     = gfn_to_1g_sp(last_gfn);
+
+    for (i = first_1g; i <= last_1g; i++) 
+    {
+        first_2m    = (gfn_1g_to_4k(i) > gfn) ? gfn_1g_to_2m(i) : gfn_to_2m_sp(gfn);
+        last_2m     = min(gfn_to_2m_sp(last_gfn), gfn_1g_to_last_2m(i));
+
+        if ( alloc_locks_2m(p2m, i) )
+        {
+            /* There really isn't much we can do at this point */
+            panic("Fine-grained p2m locking failed to alloc 2M locks"
+                  " for 1G page %lx, domain %hu\n", i, p2m->domain->domain_id);
+        }
+
+        for (j = first_2m; j <= last_2m; j++)
+        {
+            get_p2m_2m(p2m, i, j & ((1 << PAGETABLE_ORDER) - 1));
+        }
+    }
+
+get_p2m_out:
+    push_guard(p2ml, gfn);
+}
+
+/* Converse of the get method: we proactively unlock all relevant leaves here */
+static inline void put_p2m(struct p2m_domain *p2m, unsigned long gfn, unsigned int order)
+{
+    unsigned long last_gfn, first_1g, last_1g, first_2m, last_2m, i, j;
+    p2m_lock_t *p2ml = p2m->lock;
+
+    /* See comment about exclusive holders recursively locking sub-ranges in get_p2m */
+    if ( (p2m_exclusive_locked_by_me(p2m)) && (order != P2M_ORDER_GLOBAL) )
+        return;
+
+    if ( order == P2M_ORDER_GLOBAL )
+    {
+        put_p2m_global_exclusive(p2m);
+        goto cleanup;
+    }
+
+    last_gfn    = __get_last_gfn(gfn, order);
+
+    first_1g    = gfn_to_1g_sp(gfn);
+    last_1g     = gfn_to_1g_sp(last_gfn);
+
+    for (i = first_1g; i <= last_1g; i++) 
+    {
+        first_2m    = (gfn_1g_to_4k(i) > gfn) ? gfn_1g_to_2m(i) : gfn_to_2m_sp(gfn);
+        last_2m     = min(gfn_to_2m_sp(last_gfn), gfn_1g_to_last_2m(i));
+
+        for (j = first_2m; j <= last_2m; j++)
+        {
+            put_p2m_2m(p2m, i, j & ((1 << PAGETABLE_ORDER) - 1));
+        }
+    }
+
+    __put_p2m_global(p2m);
+    
+cleanup:
+    pop_guard(p2ml);
+    preempt_enable();
+}
+
+static inline void p2m_lock_destroy(struct p2m_domain *p2m)
+{
+    unsigned int i;
+    p2m_lock_t *p2ml = p2m->lock;
+
+    get_p2m_global_exclusive(p2m);
+
+    for (i = 0; i < (1 << PAGETABLE_ORDER); i++)
+        if ( p2ml->locks_2m[i] )
+            free_xenheap_page(p2ml->locks_2m[i]);
+
+    free_xenheap_page(p2ml->locks_2m);
+
+    put_p2m_global_exclusive(p2m);
+
+    xfree(p2ml);
+    p2m->lock = NULL;
+}
+
+/* Backwards compatibility */
+#define p2m_lock(p)             get_p2m((p), 0, P2M_ORDER_GLOBAL)
+#define p2m_unlock(p)           put_p2m((p), 0, P2M_ORDER_GLOBAL)
+#define p2m_locked_by_me(p)     p2m_exclusive_locked_by_me((p))
+/* There is no backwards compatibility for this, unless we make the 
+ * global lock recursive */
+#define p2m_lock_recursive(p)   ((void)0) 
+
+#endif /* __x86_64__ */
+
+/* Commonly-used shortcuts */
+#define get_p2m_global(p2m)     get_p2m((p2m), 0, P2M_ORDER_GLOBAL)
+#define put_p2m_global(p2m)     put_p2m((p2m), 0, P2M_ORDER_GLOBAL)
+
+#define get_p2m_gfn(p2m, gfn)   get_p2m((p2m), (gfn), 0)
+#define put_p2m_gfn(p2m, gfn)   put_p2m((p2m), (gfn), 0)
+
+#endif /* _XEN_P2M_LOCK_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 981073d78f7f -r a23e1262b124 xen/arch/x86/mm/p2m-pod.c
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -34,6 +34,7 @@
 #include <asm/hvm/svm/amd-iommu-proto.h>
 
 #include "mm-locks.h"
+#include "p2m-lock.h"
 
 /* Override macros from asm/page.h to make them work with mfn_t */
 #undef mfn_to_page
diff -r 981073d78f7f -r a23e1262b124 xen/arch/x86/mm/p2m-pt.c
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -39,6 +39,7 @@
 #include <asm/hvm/svm/amd-iommu-proto.h>
 
 #include "mm-locks.h"
+#include "p2m-lock.h"
 
 /* Override macros from asm/page.h to make them work with mfn_t */
 #undef mfn_to_page
diff -r 981073d78f7f -r a23e1262b124 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -38,6 +38,7 @@
 #include <asm/hvm/svm/amd-iommu-proto.h>
 
 #include "mm-locks.h"
+#include "p2m-lock.h"
 
 /* turn on/off 1GB host page table support for hap, default on */
 static bool_t __read_mostly opt_hap_1gb = 1;
@@ -69,9 +70,12 @@ boolean_param("hap_2mb", opt_hap_2mb);
 
 
 /* Init the datastructures for later use by the p2m code */
-static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
+static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
 {
-    mm_lock_init(&p2m->lock);
+    if (p2m_lock_init(p2m))
+    {
+        return -ENOMEM;
+    }
     mm_lock_init(&p2m->pod.lock);
     INIT_LIST_HEAD(&p2m->np2m_list);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
@@ -89,7 +93,7 @@ static void p2m_initialise(struct domain
     else
         p2m_pt_init(p2m);
 
-    return;
+    return 0;
 }
 
 static int
@@ -103,7 +107,11 @@ p2m_init_nestedp2m(struct domain *d)
         d->arch.nested_p2m[i] = p2m = xzalloc(struct p2m_domain);
         if (p2m == NULL)
             return -ENOMEM;
-        p2m_initialise(d, p2m);
+        if (p2m_initialise(d, p2m))
+        {
+            xfree(p2m);
+            return -ENOMEM;
+        }
         p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
         list_add(&p2m->np2m_list, &p2m_get_hostp2m(d)->np2m_list);
     }
@@ -118,7 +126,11 @@ int p2m_init(struct domain *d)
     p2m_get_hostp2m(d) = p2m = xzalloc(struct p2m_domain);
     if ( p2m == NULL )
         return -ENOMEM;
-    p2m_initialise(d, p2m);
+    if (p2m_initialise(d, p2m))
+    {
+        xfree(p2m);
+        return -ENOMEM;
+    }
 
     /* Must initialise nestedp2m unconditionally
      * since nestedhvm_enabled(d) returns false here.
@@ -331,6 +343,7 @@ static void p2m_teardown_nestedp2m(struc
     uint8_t i;
 
     for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m_lock_destroy(d->arch.nested_p2m[i]);
         xfree(d->arch.nested_p2m[i]);
         d->arch.nested_p2m[i] = NULL;
     }
@@ -338,6 +351,7 @@ static void p2m_teardown_nestedp2m(struc
 
 void p2m_final_teardown(struct domain *d)
 {
+    p2m_lock_destroy(d->arch.p2m); 
     /* Iterate over all p2m tables per domain */
     xfree(d->arch.p2m);
     d->arch.p2m = NULL;
diff -r 981073d78f7f -r a23e1262b124 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -187,9 +187,10 @@ typedef enum {
 #define p2m_is_broken(_t)   (p2m_to_mask(_t) & P2M_BROKEN_TYPES)
 
 /* Per-p2m-table state */
+struct __p2m_lock;
 struct p2m_domain {
     /* Lock that protects updates to the p2m */
-    mm_lock_t          lock;
+    struct __p2m_lock *lock;
 
     /* Shadow translated domain: p2m mapping */
     pagetable_t        phys_table;

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
