
[Xen-devel] [PATCH 6 of 9] Protect superpage splitting in implementation-dependent traversals


  • To: xen-devel@xxxxxxxxxxxxxxxxxxx
  • From: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>
  • Date: Thu, 27 Oct 2011 00:33:51 -0400
  • Cc: andres@xxxxxxxxxxxxxx, keir.xen@xxxxxxxxx, tim@xxxxxxx, olaf@xxxxxxxxx, adin@xxxxxxxxxxxxxx
  • Delivery-date: Thu, 27 Oct 2011 05:33:05 -0700
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>

 xen/arch/x86/mm/p2m-ept.c  |  15 +++++++-
 xen/arch/x86/mm/p2m-lock.h |  11 +++++-
 xen/arch/x86/mm/p2m-pt.c   |  82 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 84 insertions(+), 24 deletions(-)


In both pt and ept modes, the p2m trees can map 1GB superpages.
Because our locks work on a 2MB superpage basis, without proper
locking two simultaneous traversals of two different 2MB ranges
could race to split the same 1GB superpage.

Fix this with the existing alloc_lock in the fine-grained p2m lock
structure. The interface allows for 1GB-grained locking in a future
implementation -- for now we just default to a global lock in all
cases.

Signed-off-by: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>
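
Below is a minimal standalone sketch (not part of the patch) of the
check / lock / re-check discipline the patch applies around 1GB splits.
It uses pthread mutexes in place of Xen spinlocks, and the demo_* names
are hypothetical; only the pattern mirrors the patch: in the global-lock
configuration the 1G sub-lock compiles away to a no-op, while in the
fine-grained configuration it maps onto the existing alloc_lock.

/* Standalone model of the 1G-split locking discipline; not Xen code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    pthread_mutex_t alloc_lock;        /* stands in for the p2m lock's alloc_lock */
} demo_p2m_lock_t;

/* Global-lock build: the 1G sub-lock is a no-op because callers already
 * hold the global p2m lock.  Fine-grained build: serialize on alloc_lock. */
#ifdef DEMO_GLOBAL_LOCK
#define demo_lock_p2m_1G(l, gfn)    ((void)(l))
#define demo_unlock_p2m_1G(l, gfn)  ((void)(l))
#else
#define demo_lock_p2m_1G(l, gfn)    pthread_mutex_lock(&(l)->alloc_lock)
#define demo_unlock_p2m_1G(l, gfn)  pthread_mutex_unlock(&(l)->alloc_lock)
#endif

static bool entry_is_1g_superpage = true;  /* models the _PAGE_PSE flag */

/* Two traversals of different 2MB ranges may race here; checking the flag
 * again under the lock ensures only one of them performs the split. */
static void demo_split_1g(demo_p2m_lock_t *l, unsigned long gfn_2m)
{
    if ( !entry_is_1g_superpage )
        return;
    demo_lock_p2m_1G(l, gfn_2m);
    if ( entry_is_1g_superpage )           /* re-check under the lock */
    {
        printf("traversal of 2M range %lu splits the 1GB superpage\n", gfn_2m);
        entry_is_1g_superpage = false;
    }
    demo_unlock_p2m_1G(l, gfn_2m);
}

int main(void)
{
    demo_p2m_lock_t l = { .alloc_lock = PTHREAD_MUTEX_INITIALIZER };
    demo_split_1g(&l, 0);     /* first caller splits ...           */
    demo_split_1g(&l, 1);     /* ... second caller finds it's done */
    return 0;
}

Building with -DDEMO_GLOBAL_LOCK models the current default, where the
global p2m lock already serializes both callers and the 1G macros do
nothing; without it, the sketch models the fine-grained configuration.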

diff -r a23e1262b124 -r 0a97d62c2d41 xen/arch/x86/mm/p2m-ept.c
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -163,6 +163,7 @@ static int ept_set_middle_entry(struct p
 }
 
 /* free ept sub tree behind an entry */
+/* Lock on this superpage (if any) held on entry */
 static void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
 {
     /* End if the entry is a leaf entry. */
@@ -181,6 +182,7 @@ static void ept_free_entry(struct p2m_do
     p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
 }
 
+/* Lock on this superpage held on entry */
 static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
                                 int level, int target)
 {
@@ -315,6 +317,7 @@ ept_set_entry(struct p2m_domain *p2m, un
     int needs_sync = 1;
     struct domain *d = p2m->domain;
     ept_entry_t old_entry = { .epte = 0 };
+    p2m_lock_t *p2ml = p2m->lock;
 
     /*
      * the caller must make sure:
@@ -361,6 +364,8 @@ ept_set_entry(struct p2m_domain *p2m, un
      * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
      */
 
+    if ( target == 2 )
+        lock_p2m_1G(p2ml, index);
     if ( i == target )
     {
         /* We reached the target level. */
@@ -373,7 +378,7 @@ ept_set_entry(struct p2m_domain *p2m, un
         /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
          * the intermediate tables will be freed below after the ept flush
          *
-         * Read-then-write is OK because we hold the p2m lock. */
+         * Read-then-write is OK because we hold the 1G or 2M lock. */
         old_entry = *ept_entry;
 
         if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
@@ -412,6 +417,8 @@ ept_set_entry(struct p2m_domain *p2m, un
         if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
         {
             ept_free_entry(p2m, &split_ept_entry, i);
+            if ( target == 2 )
+                unlock_p2m_1G(p2ml, index);
             goto out;
         }
 
@@ -440,7 +447,7 @@ ept_set_entry(struct p2m_domain *p2m, un
         /* the caller should take care of the previous page */
         new_entry.mfn = mfn_x(mfn);
 
-        /* Safe to read-then-write because we hold the p2m lock */
+        /* Safe to read-then-write because we hold the 1G or 2M lock */
         if ( ept_entry->mfn == new_entry.mfn )
              need_modify_vtd_table = 0;
 
@@ -448,6 +455,8 @@ ept_set_entry(struct p2m_domain *p2m, un
 
         atomic_write_ept_entry(ept_entry, new_entry);
     }
+    if ( target == 2 )
+        unlock_p2m_1G(p2ml, index);
 
     /* Track the highest gfn for which we have ever had a valid mapping */
     if ( mfn_valid(mfn_x(mfn)) &&
@@ -642,6 +651,8 @@ static ept_entry_t ept_get_entry_content
     return content;
 }
 
+/* This is called before crashing a domain, so we're not particularly 
+ * concerned with locking */
 void ept_walk_table(struct domain *d, unsigned long gfn)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
diff -r a23e1262b124 -r 0a97d62c2d41 xen/arch/x86/mm/p2m-lock.h
--- a/xen/arch/x86/mm/p2m-lock.h
+++ b/xen/arch/x86/mm/p2m-lock.h
@@ -109,6 +109,11 @@ typedef struct __p2m_lock {
     mm_lock_t lock;
 } p2m_lock_t;
 
+/* We do not need sub-locking on 1G superpages because we have a 
+ * global lock */
+#define lock_p2m_1G(l, gfn)     ((void)l) 
+#define unlock_p2m_1G(l, gfn)   ((void)l)
+
 static inline int p2m_lock_init(struct p2m_domain *p2m)
 {
     p2m_lock_t *p2ml = xmalloc(p2m_lock_t);
@@ -271,7 +276,8 @@ typedef struct __p2m_lock
     /* To enforce ordering in mm-locks */
     int unlock_level;
     /* To protect on-demand allocation of locks 
-     * (yeah you heard that right) */
+     * (yeah you heard that right) 
+     * Also protects 1GB superpage splitting. */
     spinlock_t alloc_lock;
     /* Global lock */
     p2m_inner_lock_t global;
@@ -429,6 +435,9 @@ static inline void put_p2m_2m(struct p2m
     unlock_p2m_leaf(__get_2m_lock(p2m->lock, gfn_1g, gfn_2m));
 }
 
+#define lock_p2m_1G(l, gfn)     spin_lock(&l->alloc_lock)
+#define unlock_p2m_1G(l, gfn)   spin_unlock(&l->alloc_lock)
+
 /* Allocate 2M locks we may not have allocated yet for this 1G superpage */
 static inline int alloc_locks_2m(struct p2m_domain *p2m, unsigned long gfn_1g)
 {
diff -r a23e1262b124 -r 0a97d62c2d41 xen/arch/x86/mm/p2m-pt.c
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -159,6 +159,7 @@ p2m_next_level(struct p2m_domain *p2m, m
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
                u32 max, unsigned long type)
 {
+    p2m_lock_t *p2ml = p2m->lock;
     l1_pgentry_t *l1_entry;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
@@ -207,6 +208,7 @@ p2m_next_level(struct p2m_domain *p2m, m
     ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
 
     /* split 1GB pages into 2MB pages */
+    lock_p2m_1G(p2ml, *gfn_remainder >> shift);
     if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
     {
         unsigned long flags, pfn;
@@ -214,7 +216,10 @@ p2m_next_level(struct p2m_domain *p2m, m
 
         pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
         if ( pg == NULL )
+        {
+            unlock_p2m_1G(p2ml, *gfn_remainder >> shift);
             return 0;
+        }
 
         flags = l1e_get_flags(*p2m_entry);
         pfn = l1e_get_pfn(*p2m_entry);
@@ -233,9 +238,11 @@ p2m_next_level(struct p2m_domain *p2m, m
         p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
         p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
     }
-
+    unlock_p2m_1G(p2ml, *gfn_remainder >> shift);
 
     /* split single 2MB large page into 4KB page in P2M table */
+    /* This does not necessitate locking because 2MB regions are locked
+     * exclusively */
     if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
     {
         unsigned long flags, pfn;
@@ -297,6 +304,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
                                    IOMMUF_readable|IOMMUF_writable:
                                    0; 
     unsigned long old_mfn = 0;
+    p2m_lock_t *p2ml = p2m->lock;
 
     if ( tb_init_done )
     {
@@ -326,7 +334,10 @@ p2m_set_entry(struct p2m_domain *p2m, un
      */
     if ( page_order == PAGE_ORDER_1G )
     {
-        l1_pgentry_t old_entry = l1e_empty();
+        l1_pgentry_t old_entry;
+        lock_p2m_1G(p2ml, l3_table_offset(gfn));
+
+        old_entry = l1e_empty();
         p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
                                    L3_PAGETABLE_SHIFT - PAGE_SHIFT,
                                    L3_PAGETABLE_ENTRIES);
@@ -358,7 +369,9 @@ p2m_set_entry(struct p2m_domain *p2m, un
         /* Free old intermediate tables if necessary */
         if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
             p2m_free_entry(p2m, &old_entry, page_order);
+        unlock_p2m_1G(p2ml, l3_table_offset(gfn));
     }
+
     /*
      * When using PAE Xen, we only allow 33 bits of pseudo-physical
      * address in translated guests (i.e. 8 GBytes).  This restriction
@@ -515,6 +528,7 @@ static mfn_t p2m_gfn_to_mfn_current(stru
      * XXX Once we start explicitly registering MMIO regions in the p2m 
      * XXX we will return p2m_invalid for unmapped gfns */
 
+    p2m_lock_t *p2ml = p2m->lock;
     l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
     l2_pgentry_t l2e = l2e_empty();
     int ret;
@@ -543,6 +557,8 @@ pod_retry_l3:
             /* The read has succeeded, so we know that mapping exists */
             if ( q != p2m_query )
             {
+                /* We do not need to lock the 1G superpage here because PoD 
+                 * will do it by splitting */
                 if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) )
                     goto pod_retry_l3;
                 p2mt = p2m_invalid;
@@ -558,6 +574,7 @@ pod_retry_l3:
         goto pod_retry_l2;
     }
 
+    lock_p2m_1G(p2ml, l3_table_offset(addr));
     if ( l3e_get_flags(l3e) & _PAGE_PSE )
     {
         p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
@@ -571,8 +588,12 @@ pod_retry_l3:
             
         if ( page_order )
             *page_order = PAGE_ORDER_1G;
+        unlock_p2m_1G(p2ml, l3_table_offset(addr));
         goto out;
     }
+    unlock_p2m_1G(p2ml, l3_table_offset(addr));
+#else
+    (void)p2ml; /* gcc ... */
 #endif
     /*
      * Read & process L2
@@ -691,6 +712,7 @@ p2m_gfn_to_mfn(struct p2m_domain *p2m, u
     paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     l2_pgentry_t *l2e;
     l1_pgentry_t *l1e;
+    p2m_lock_t *p2ml = p2m->lock;
 
     ASSERT(paging_mode_translate(p2m->domain));
 
@@ -744,6 +766,8 @@ pod_retry_l3:
             {
                 if ( q != p2m_query )
                 {
+                    /* See previous comments on why there is no need to lock
+                     * 1GB superpage here */
                     if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) )
                         goto pod_retry_l3;
                 }
@@ -755,16 +779,23 @@ pod_retry_l3:
         }
         else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
         {
-            mfn = _mfn(l3e_get_pfn(*l3e) +
-                       l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
-                       l1_table_offset(addr));
-            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
-            unmap_domain_page(l3e);
+            lock_p2m_1G(p2ml, l3_table_offset(addr));
+            /* Retry to be sure */
+            if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+            {
+                mfn = _mfn(l3e_get_pfn(*l3e) +
+                           l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+                           l1_table_offset(addr));
+                *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+                unmap_domain_page(l3e);
 
-            ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-            if ( page_order )
-                *page_order = PAGE_ORDER_1G;
-            return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+                ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+                if ( page_order )
+                    *page_order = PAGE_ORDER_1G;
+                unlock_p2m_1G(p2ml, l3_table_offset(addr));
+                return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+            }
+            unlock_p2m_1G(p2ml, l3_table_offset(addr));
         }
 
         mfn = _mfn(l3e_get_pfn(*l3e));
@@ -852,6 +883,7 @@ static void p2m_change_type_global(struc
     l4_pgentry_t *l4e;
     unsigned long i4;
 #endif /* CONFIG_PAGING_LEVELS == 4 */
+    p2m_lock_t *p2ml = p2m->lock;
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
     BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
@@ -891,17 +923,25 @@ static void p2m_change_type_global(struc
             }
             if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
             {
-                flags = l3e_get_flags(l3e[i3]);
-                if ( p2m_flags_to_type(flags) != ot )
+                lock_p2m_1G(p2ml, i3);
+                if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
+                {
+                    flags = l3e_get_flags(l3e[i3]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                    {
+                        unlock_p2m_1G(p2ml, i3);
+                        continue;
+                    }
+                    mfn = l3e_get_pfn(l3e[i3]);
+                    gfn = get_gpfn_from_mfn(mfn);
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    p2m->write_p2m_entry(p2m, gfn,
+                                         (l1_pgentry_t *)&l3e[i3],
+                                         l3mfn, l1e_content, 3);
+                    unlock_p2m_1G(p2ml, i3);
                     continue;
-                mfn = l3e_get_pfn(l3e[i3]);
-                gfn = get_gpfn_from_mfn(mfn);
-                flags = p2m_type_to_flags(nt, _mfn(mfn));
-                l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                p2m->write_p2m_entry(p2m, gfn,
-                                     (l1_pgentry_t *)&l3e[i3],
-                                     l3mfn, l1e_content, 3);
-                continue;
+                }
             }
 
             l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
