[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v2 1/2] gnttab: Introduce rwlock to protect updates to grant table state



Split grant table lock into two separate locks. One to protect
maptrack state and change the other into a rwlock.

The rwlock is used to prevent readers from accessing
inconsistent grant table state such as current
version, partially initialized active table pages,
etc.

Signed-off-by: Matt Wilson <msw@xxxxxxxxxx>
[chegger: ported to xen-staging, split into multiple commits]

v2:
  * Add arm part per request from Julien Grall

Signed-off-by: Christoph Egger <chegger@xxxxxxxxx>
Cc: Jan Beulich <jbeulich@xxxxxxxx>
Cc: Keir Fraser <keir@xxxxxxx>
Cc: Julien Grall <julien.grall@xxxxxxxxxx>
---
 docs/misc/grant-tables.txt    |   28 ++++++++-
 xen/arch/arm/mm.c             |    4 +-
 xen/arch/x86/mm.c             |    4 +-
 xen/common/grant_table.c      |  135 ++++++++++++++++++++++++-----------------
 xen/include/xen/grant_table.h |    9 ++-
 5 files changed, 117 insertions(+), 63 deletions(-)

diff --git a/docs/misc/grant-tables.txt b/docs/misc/grant-tables.txt
index 19db4ec..c9ae8f2 100644
--- a/docs/misc/grant-tables.txt
+++ b/docs/misc/grant-tables.txt
@@ -74,7 +74,33 @@ is complete.
  matching map track entry is then removed, as if unmap had been invoked.
  These are not used by the transfer mechanism.
   map->domid         : owner of the mapped frame
-  map->ref_and_flags : grant reference, ro/rw, mapped for host or device access
+  map->ref           : grant reference
+  map->flags         : ro/rw, mapped for host or device access
+
+********************************************************************************
+ Locking
+ ~~~~~~~
+ Xen uses several locks to serialize access to the internal grant table state.
+
+  grant_table->lock          : rwlock used to prevent readers from accessing
+                               inconsistent grant table state such as current
+                               version, partially initialized active table 
pages,
+                               etc.
+  grant_table->maptrack_lock : spinlock used to protect the maptrack state
+
+ The primary lock for the grant table is a read/write spinlock. All
+ functions that access members of struct grant_table must acquire a
+ read lock around critical sections. Any modification to the members
+ of struct grant_table (e.g., nr_status_frames, nr_grant_frames,
+ active frames, etc.) must only be made if the write lock is
+ held. These elements are read-mostly, and read critical sections can
+ be large, which makes a rwlock a good choice.
+
+ The maptrack state is protected by its own spinlock. Any access (read
+ or write) of struct grant_table members that have a "maptrack_"
+ prefix must be made while holding the maptrack lock. The maptrack
+ state can be rapidly modified under some workloads, and the critical
+ sections are very small, thus we use a spinlock to protect them.
 
 
********************************************************************************
 
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 7d4ba0c..2765683 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -1037,7 +1037,7 @@ int xenmem_add_to_physmap_one(
     switch ( space )
     {
     case XENMAPSPACE_grant_table:
-        spin_lock(&d->grant_table->lock);
+        write_lock(&d->grant_table->lock);
 
         if ( d->grant_table->gt_version == 0 )
             d->grant_table->gt_version = 1;
@@ -1067,7 +1067,7 @@ int xenmem_add_to_physmap_one(
 
         t = p2m_ram_rw;
 
-        spin_unlock(&d->grant_table->lock);
+        write_unlock(&d->grant_table->lock);
         break;
     case XENMAPSPACE_shared_info:
         if ( idx != 0 )
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 522c43d..37c13b1 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4565,7 +4565,7 @@ int xenmem_add_to_physmap_one(
                 mfn = virt_to_mfn(d->shared_info);
             break;
         case XENMAPSPACE_grant_table:
-            spin_lock(&d->grant_table->lock);
+            write_lock(&d->grant_table->lock);
 
             if ( d->grant_table->gt_version == 0 )
                 d->grant_table->gt_version = 1;
@@ -4587,7 +4587,7 @@ int xenmem_add_to_physmap_one(
                     mfn = virt_to_mfn(d->grant_table->shared_raw[idx]);
             }
 
-            spin_unlock(&d->grant_table->lock);
+            write_unlock(&d->grant_table->lock);
             break;
         case XENMAPSPACE_gmfn_range:
         case XENMAPSPACE_gmfn:
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 8fba923..018b5b6 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -227,23 +227,23 @@ double_gt_lock(struct grant_table *lgt, struct 
grant_table *rgt)
 {
     if ( lgt < rgt )
     {
-        spin_lock(&lgt->lock);
-        spin_lock(&rgt->lock);
+        spin_lock(&lgt->maptrack_lock);
+        spin_lock(&rgt->maptrack_lock);
     }
     else
     {
         if ( lgt != rgt )
-            spin_lock(&rgt->lock);
-        spin_lock(&lgt->lock);
+            spin_lock(&rgt->maptrack_lock);
+        spin_lock(&lgt->maptrack_lock);
     }
 }
 
 static inline void
 double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
 {
-    spin_unlock(&lgt->lock);
+    spin_unlock(&lgt->maptrack_lock);
     if ( lgt != rgt )
-        spin_unlock(&rgt->lock);
+        spin_unlock(&rgt->maptrack_lock);
 }
 
 static inline int
@@ -261,10 +261,10 @@ static inline void
 put_maptrack_handle(
     struct grant_table *t, int handle)
 {
-    spin_lock(&t->lock);
+    spin_lock(&t->maptrack_lock);
     maptrack_entry(t, handle).ref = t->maptrack_head;
     t->maptrack_head = handle;
-    spin_unlock(&t->lock);
+    spin_unlock(&t->maptrack_lock);
 }
 
 static inline int
@@ -276,7 +276,7 @@ get_maptrack_handle(
     struct grant_mapping *new_mt;
     unsigned int          new_mt_limit, nr_frames;
 
-    spin_lock(&lgt->lock);
+    spin_lock(&lgt->maptrack_lock);
 
     while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) )
     {
@@ -305,7 +305,7 @@ get_maptrack_handle(
                  nr_frames + 1);
     }
 
-    spin_unlock(&lgt->lock);
+    spin_unlock(&lgt->maptrack_lock);
 
     return handle;
 }
@@ -502,7 +502,7 @@ static int grant_map_exists(const struct domain *ld,
     const struct active_grant_entry *act;
     unsigned int ref, max_iter;
     
-    ASSERT(spin_is_locked(&rgt->lock));
+    ASSERT(rw_is_locked(&rgt->lock));
 
     max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
                    nr_grant_entries(rgt));
@@ -623,7 +623,7 @@ __gnttab_map_grant_ref(
     }
 
     rgt = rd->grant_table;
-    spin_lock(&rgt->lock);
+    read_lock(&rgt->lock);
 
     if ( rgt->gt_version == 0 )
         PIN_FAIL(unlock_out, GNTST_general_error,
@@ -696,7 +696,7 @@ __gnttab_map_grant_ref(
 
     cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
 
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
 
     /* pg may be set, with a refcount included, from __get_paged_frame */
     if ( !pg )
@@ -831,7 +831,7 @@ __gnttab_map_grant_ref(
         put_page(pg);
     }
 
-    spin_lock(&rgt->lock);
+    read_lock(&rgt->lock);
 
     act = &active_entry(rgt, op->ref);
 
@@ -851,7 +851,7 @@ __gnttab_map_grant_ref(
         gnttab_clear_flag(_GTF_reading, status);
 
  unlock_out:
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
     op->status = rc;
     put_maptrack_handle(lgt, handle);
     rcu_unlock_domain(rd);
@@ -891,28 +891,28 @@ __gnttab_unmap_common(
     ld = current->domain;
     lgt = ld->grant_table;
 
+    read_lock(&lgt->lock);
     op->frame = (unsigned long)(op->dev_bus_addr >> PAGE_SHIFT);
 
     if ( unlikely(op->handle >= lgt->maptrack_limit) )
     {
+        read_unlock(&lgt->lock);
         gdprintk(XENLOG_INFO, "Bad handle (%d).\n", op->handle);
         op->status = GNTST_bad_handle;
         return;
     }
 
+    read_unlock(&lgt->lock);
     op->map = &maptrack_entry(lgt, op->handle);
-    spin_lock(&lgt->lock);
 
     if ( unlikely(!op->map->flags) )
     {
-        spin_unlock(&lgt->lock);
         gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle);
         op->status = GNTST_bad_handle;
         return;
     }
 
     dom = op->map->domid;
-    spin_unlock(&lgt->lock);
 
     if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
     {
@@ -944,6 +944,7 @@ __gnttab_unmap_common(
     }
 
     op->rd = rd;
+    read_lock(&rgt->lock);
     act = &active_entry(rgt, op->map->ref);
 
     if ( op->frame == 0 )
@@ -1004,6 +1005,7 @@ __gnttab_unmap_common(
 
  unmap_out:
     double_gt_unlock(lgt, rgt);
+    read_unlock(&rgt->lock);
     op->status = rc;
     rcu_unlock_domain(rd);
 }
@@ -1033,8 +1035,8 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common 
*op)
 
     rcu_lock_domain(rd);
     rgt = rd->grant_table;
-    spin_lock(&rgt->lock);
 
+    read_lock(&rgt->lock);
     if ( rgt->gt_version == 0 )
         goto unmap_out;
 
@@ -1098,7 +1100,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common 
*op)
         gnttab_clear_flag(_GTF_reading, status);
 
  unmap_out:
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
     if ( put_handle )
     {
         op->map->flags = 0;
@@ -1284,10 +1286,13 @@ gnttab_unpopulate_status_frames(struct domain *d, 
struct grant_table *gt)
     gt->nr_status_frames = 0;
 }
 
+/* Grow the grant table. The caller must hold the grant table's
+ * write lock before calling this function.
+ */
 int
 gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
 {
-    /* d's grant table lock must be held by the caller */
+    /* d's grant table write lock must be held by the caller */
 
     struct grant_table *gt = d->grant_table;
     unsigned int i;
@@ -1392,7 +1397,7 @@ gnttab_setup_table(
     }
 
     gt = d->grant_table;
-    spin_lock(&gt->lock);
+    write_lock(&gt->lock);
 
     if ( gt->gt_version == 0 )
         gt->gt_version = 1;
@@ -1420,7 +1425,7 @@ gnttab_setup_table(
     }
 
  out3:
-    spin_unlock(&gt->lock);
+    write_unlock(&gt->lock);
  out2:
     rcu_unlock_domain(d);
  out1:
@@ -1462,13 +1467,13 @@ gnttab_query_size(
         goto query_out_unlock;
     }
 
-    spin_lock(&d->grant_table->lock);
+    read_lock(&d->grant_table->lock);
 
     op.nr_frames     = nr_grant_frames(d->grant_table);
     op.max_nr_frames = max_grant_frames;
     op.status        = GNTST_okay;
 
-    spin_unlock(&d->grant_table->lock);
+    read_unlock(&d->grant_table->lock);
 
  
  query_out_unlock:
@@ -1494,7 +1499,7 @@ gnttab_prepare_for_transfer(
     union grant_combo   scombo, prev_scombo, new_scombo;
     int                 retries = 0;
 
-    spin_lock(&rgt->lock);
+    read_lock(&rgt->lock);
 
     if ( rgt->gt_version == 0 )
     {
@@ -1545,11 +1550,11 @@ gnttab_prepare_for_transfer(
         scombo = prev_scombo;
     }
 
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
     return 1;
 
  fail:
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
     return 0;
 }
 
@@ -1741,7 +1746,7 @@ gnttab_transfer(
         TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
 
         /* Tell the guest about its new page frame. */
-        spin_lock(&e->grant_table->lock);
+        read_lock(&e->grant_table->lock);
 
         if ( e->grant_table->gt_version == 1 )
         {
@@ -1759,7 +1764,7 @@ gnttab_transfer(
         shared_entry_header(e->grant_table, gop.ref)->flags |=
             GTF_transfer_completed;
 
-        spin_unlock(&e->grant_table->lock);
+        read_unlock(&e->grant_table->lock);
 
         rcu_unlock_domain(e);
 
@@ -1797,7 +1802,7 @@ __release_grant_for_copy(
     released_read = 0;
     released_write = 0;
 
-    spin_lock(&rgt->lock);
+    read_lock(&rgt->lock);
 
     act = &active_entry(rgt, gref);
     sha = shared_entry_header(rgt, gref);
@@ -1838,7 +1843,7 @@ __release_grant_for_copy(
         released_read = 1;
     }
 
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
 
     if ( td != rd )
     {
@@ -1896,7 +1901,7 @@ __acquire_grant_for_copy(
 
     *page = NULL;
 
-    spin_lock(&rgt->lock);
+    read_lock(&rgt->lock);
 
     if ( rgt->gt_version == 0 )
         PIN_FAIL(unlock_out, GNTST_general_error,
@@ -1965,17 +1970,21 @@ __acquire_grant_for_copy(
                 PIN_FAIL(unlock_out_clear, GNTST_general_error,
                          "transitive grant referenced bad domain %d\n",
                          trans_domid);
-            spin_unlock(&rgt->lock);
+
+            /* __acquire_grant_for_copy() could take the read lock on
+             * the right table (if rd == td), so we have to drop the
+             * lock here and reacquire */
+            read_unlock(&rgt->lock);
 
             rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
                                           readonly, &grant_frame, page,
                                           &trans_page_off, &trans_length, 0);
 
-            spin_lock(&rgt->lock);
+            read_lock(&rgt->lock);
             if ( rc != GNTST_okay ) {
                 __fixup_status_for_copy_pin(act, status);
+                read_unlock(&rgt->lock);
                 rcu_unlock_domain(td);
-                spin_unlock(&rgt->lock);
                 return rc;
             }
 
@@ -1987,7 +1996,7 @@ __acquire_grant_for_copy(
             {
                 __fixup_status_for_copy_pin(act, status);
                 rcu_unlock_domain(td);
-                spin_unlock(&rgt->lock);
+                read_unlock(&rgt->lock);
                 put_page(*page);
                 return __acquire_grant_for_copy(rd, gref, ldom, readonly,
                                                 frame, page, page_off, length,
@@ -2055,7 +2064,7 @@ __acquire_grant_for_copy(
     *length = act->length;
     *frame = act->frame;
 
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
     return rc;
  
  unlock_out_clear:
@@ -2067,7 +2076,7 @@ __acquire_grant_for_copy(
         gnttab_clear_flag(_GTF_reading, status);
 
  unlock_out:
-    spin_unlock(&rgt->lock);
+    read_unlock(&rgt->lock);
     return rc;
 }
 
@@ -2241,7 +2250,7 @@ 
gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
     if ( gt->gt_version == op.version )
         goto out;
 
-    spin_lock(&gt->lock);
+    write_lock(&gt->lock);
     /* Make sure that the grant table isn't currently in use when we
        change the version number, except for the first 8 entries which
        are allowed to be in use (xenstore/xenconsole keeps them mapped).
@@ -2327,7 +2336,7 @@ 
gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
     gt->gt_version = op.version;
 
 out_unlock:
-    spin_unlock(&gt->lock);
+    write_unlock(&gt->lock);
 
 out:
     op.version = gt->gt_version;
@@ -2383,7 +2392,7 @@ 
gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
 
     op.status = GNTST_okay;
 
-    spin_lock(&gt->lock);
+    read_lock(&gt->lock);
 
     for ( i = 0; i < op.nr_frames; i++ )
     {
@@ -2392,7 +2401,7 @@ 
gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
             op.status = GNTST_bad_virt_addr;
     }
 
-    spin_unlock(&gt->lock);
+    read_unlock(&gt->lock);
 out2:
     rcu_unlock_domain(d);
 out1:
@@ -2441,7 +2450,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t 
ref_b)
     struct active_grant_entry *act;
     s16 rc = GNTST_okay;
 
-    spin_lock(&gt->lock);
+    read_lock(&gt->lock);
 
     /* Bounds check on the grant refs */
     if ( unlikely(ref_a >= nr_grant_entries(d->grant_table)))
@@ -2481,7 +2490,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t 
ref_b)
     }
 
 out:
-    spin_unlock(&gt->lock);
+    read_unlock(&gt->lock);
 
     rcu_unlock_domain(d);
 
@@ -2501,8 +2510,19 @@ 
gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
             return i;
         if ( unlikely(__copy_from_guest(&op, uop, 1)) )
             return -EFAULT;
-        op.status = __gnttab_swap_grant_ref(op.ref_a, op.ref_b);
-        if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
+        if ( unlikely(op.ref_a == op.ref_b) )
+        {
+            /* noop, so avoid acquiring the same active entry
+             * twice in __gnttab_swap_grant_ref(), which would
+             * case a deadlock.
+             */
+            op.status = GNTST_okay;
+        }
+        else
+        {
+            op.status = __gnttab_swap_grant_ref(op.ref_a, op.ref_b);
+        }
+        if ( unlikely(__copy_to_guest_offset(uop, i, &op, 1)) )
             return -EFAULT;
         guest_handle_add_offset(uop, 1);
     }
@@ -2552,12 +2572,12 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t 
*cflush,
 
     if ( d != owner )
     {
-        spin_lock(&owner->grant_table->lock);
+        read_lock(&owner->grant_table->lock);
 
         ret = grant_map_exists(d, owner->grant_table, mfn, ref_count);
         if ( ret != 0 )
         {
-            spin_unlock(&owner->grant_table->lock);
+            read_unlock(&owner->grant_table->lock);
             rcu_unlock_domain(d);
             put_page(page);
             return ret;
@@ -2577,7 +2597,7 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t 
*cflush,
         ret = 0;
 
     if ( d != owner )
-        spin_unlock(&owner->grant_table->lock);
+        read_unlock(&owner->grant_table->lock);
     unmap_domain_page(v);
     put_page(page);
 
@@ -2793,7 +2813,8 @@ grant_table_create(
         goto no_mem_0;
 
     /* Simple stuff. */
-    spin_lock_init(&t->lock);
+    rwlock_init(&t->lock);
+    spin_lock_init(&t->maptrack_lock);
     t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
 
     /* Active grant table. */
@@ -2900,7 +2921,7 @@ gnttab_release_mappings(
         }
 
         rgt = rd->grant_table;
-        spin_lock(&rgt->lock);
+        read_lock(&rgt->lock);
 
         act = &active_entry(rgt, ref);
         sha = shared_entry_header(rgt, ref);
@@ -2960,7 +2981,7 @@ gnttab_release_mappings(
         if ( act->pin == 0 )
             gnttab_clear_flag(_GTF_reading, status);
 
-        spin_unlock(&rgt->lock);
+        read_unlock(&rgt->lock);
 
         rcu_unlock_domain(rd);
 
@@ -2978,6 +2999,8 @@ grant_table_destroy(
 
     if ( t == NULL )
         return;
+
+    write_lock(&t->lock);
     
     for ( i = 0; i < nr_grant_frames(t); i++ )
         free_xenheap_page(t->shared_raw[i]);
@@ -2995,6 +3018,8 @@ grant_table_destroy(
         free_xenheap_page(t->status[i]);
     xfree(t->status);
 
+    write_unlock(&t->lock);
+
     xfree(t);
     d->grant_table = NULL;
 }
@@ -3008,7 +3033,7 @@ static void gnttab_usage_print(struct domain *rd)
     printk("      -------- active --------       -------- shared --------\n");
     printk("[ref] localdom mfn      pin          localdom gmfn     flags\n");
 
-    spin_lock(&gt->lock);
+    read_lock(&gt->lock);
 
     if ( gt->gt_version == 0 )
         goto out;
@@ -3057,7 +3082,7 @@ static void gnttab_usage_print(struct domain *rd)
     }
 
  out:
-    spin_unlock(&gt->lock);
+    read_unlock(&gt->lock);
 
     if ( first )
         printk("grant-table for remote domain:%5d ... "
diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h
index 32f5786..ca49b41 100644
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -82,8 +82,11 @@ struct grant_table {
     struct grant_mapping **maptrack;
     unsigned int          maptrack_head;
     unsigned int          maptrack_limit;
-    /* Lock protecting updates to active and shared grant tables. */
-    spinlock_t            lock;
+    /* Lock protecting the maptrack page list, head, and limit */
+    spinlock_t            maptrack_lock;
+    /* Lock protecting updates to grant table state (version, active
+       entry list, etc.) */
+    rwlock_t              lock;
     /* The defined versions are 1 and 2.  Set to 0 if we don't know
        what version to use yet. */
     unsigned              gt_version;
@@ -101,7 +104,7 @@ gnttab_release_mappings(
     struct domain *d);
 
 /* Increase the size of a domain's grant table.
- * Caller must hold d's grant table lock.
+ * Caller must hold d's grant table write lock.
  */
 int
 gnttab_grow_table(struct domain *d, unsigned int req_nr_frames);
-- 
1.7.9.5


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.