WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH] linux/x86: synchronize vmalloc_sync_all() with mm_{,un}pin()

To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH] linux/x86: synchronize vmalloc_sync_all() with mm_{,un}pin()
From: "Jan Beulich" <JBeulich@xxxxxxxxxx>
Date: Tue, 21 Sep 2010 15:05:21 +0100
Delivery-date: Tue, 21 Sep 2010 07:06:15 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
As recently diagnosed by Citrix engineers, mm_{,un}pin() and
vmalloc_sync_all() aren't properly synchronized. While by now there is
a workaround in unstable and 4.0-testing, I still think this should be
fixed in the kernels as well: Add a backlink to the referencing struct
mm_struct to the pgd's struct page, and use this to lock the page
table updates in vmalloc_sync_all().

Due to the way pgd-s get allocated and put on the global list on i386,
we have to account for the backlink possibly not being set yet (in which
case they cannot be subject to (un)pinning).

Along the way, I found it necessary/desirable to also fix
- a potential NULL dereference in i386's pgd_alloc(),
- x86-64 adding not yet cleaned out pgd-s to the global list, and
- x86-64 removing pgd-s from the global list rather late.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>

--- a/arch/i386/mm/fault-xen.c
+++ b/arch/i386/mm/fault-xen.c
@@ -712,12 +712,25 @@ void vmalloc_sync_all(void)
                                return;
                        }
                        for (page = pgd_list; page; page =
-                                       (struct page *)page->index)
-                               if (!vmalloc_sync_one(page_address(page),
-                                                               address)) {
+                                       (struct page *)page->index) {
+                               spinlock_t *lock = page->mapping
+                                       ? &((struct mm_struct *)page->mapping)
+                                               ->page_table_lock
+                                       : NULL;
+                               pmd_t *pmd;
+
+                               if (lock)
+                                       spin_lock(lock);
+                               pmd = vmalloc_sync_one(page_address(page),
+                                                      address);
+                               if (lock)
+                                       spin_unlock(lock);
+
+                               if (!pmd) {
                                        BUG_ON(page != pgd_list);
                                        break;
                                }
+                       }
                        spin_unlock_irqrestore(&pgd_lock, flags);
                        if (!page)
                                set_bit(sync_index(address), insync);
--- a/arch/x86_64/mm/fault-xen.c
+++ b/arch/x86_64/mm/fault-xen.c
@@ -640,6 +640,9 @@ do_sigbus:
 DEFINE_SPINLOCK(pgd_lock);
 struct page *pgd_list;
 
+#define pgd_page_table(what, pg) \
+       spin_##what(&((struct mm_struct *)(pg)->mapping)->page_table_lock)
+
 void vmalloc_sync_all(void)
 {
        /* Note that races in the updates of insync and start aren't 
@@ -662,10 +665,13 @@ void vmalloc_sync_all(void)
                             page = (struct page *)page->index) {
                                pgd_t *pgd;
                                pgd = (pgd_t *)page_address(page) + 
pgd_index(address);
+
+                               pgd_page_table(lock, page);
                                if (pgd_none(*pgd))
                                        set_pgd(pgd, *pgd_ref);
                                else
                                        BUG_ON(pgd_page(*pgd) != 
pgd_page(*pgd_ref));
+                               pgd_page_table(unlock, page);
                        }
                        spin_unlock(&pgd_lock);
                        set_bit(pgd_index(address), insync);
--- a/arch/i386/mm/pgtable-xen.c
+++ b/arch/i386/mm/pgtable-xen.c
@@ -230,6 +230,7 @@ static inline void pgd_list_del(pgd_t *p
        *pprev = next;
        if (next)
                set_page_private(next, (unsigned long)pprev);
+       page->mapping = NULL;
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
@@ -271,9 +272,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
        pmd_t **pmd;
        unsigned long flags;
 
+       if (!pgd)
+               return NULL;
+
        pgd_test_and_unpin(pgd);
 
-       if (PTRS_PER_PMD == 1 || !pgd)
+       /* Store a back link for vmalloc_sync_all(). */
+       virt_to_page(pgd)->mapping = (void *)mm;
+
+       if (PTRS_PER_PMD == 1)
                return pgd;
 
        if (HAVE_SHARED_KERNEL_PMD) {
--- a/include/asm-x86_64/mach-xen/asm/pgalloc.h
+++ b/include/asm-x86_64/mach-xen/asm/pgalloc.h
@@ -95,10 +95,13 @@ static inline void pud_free(pud_t *pud)
        pte_free(virt_to_page(pud));
 }
 
-static inline void pgd_list_add(pgd_t *pgd)
+static inline void pgd_list_add(pgd_t *pgd, void *mm)
 {
        struct page *page = virt_to_page(pgd);
 
+       /* Store a back link for vmalloc_sync_all(). */
+       page->mapping = mm;
+
        spin_lock(&pgd_lock);
        page->index = (pgoff_t)pgd_list;
        if (pgd_list)
@@ -119,6 +122,8 @@ static inline void pgd_list_del(pgd_t *p
        if (next)
                next->private = (unsigned long)pprev;
        spin_unlock(&pgd_lock);
+
+       page->mapping = NULL;
 }
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
@@ -127,22 +132,22 @@ static inline pgd_t *pgd_alloc(struct mm
         * We allocate two contiguous pages for kernel and user.
         */
        unsigned boundary;
-       pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
+       pgd_t *pgd;
+
+       pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 1);
        if (!pgd)
                return NULL;
-       pgd_list_add(pgd);
+       pgd_list_add(pgd, mm);
        /*
         * Copy kernel pointers in from init.
         * Could keep a freelist or slab cache of those because the kernel
         * part never changes.
         */
        boundary = pgd_index(__PAGE_OFFSET);
-       memset(pgd, 0, boundary * sizeof(pgd_t));
        memcpy(pgd + boundary,
               init_level4_pgt + boundary,
               (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
 
-       memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
        /*
         * Set level3_user_pgt for vsyscall area
         */
@@ -155,6 +160,8 @@ static inline void pgd_free(pgd_t *pgd)
 {
        pte_t *ptep = virt_to_ptep(pgd);
 
+       pgd_list_del(pgd);
+
        if (!pte_write(*ptep)) {
                xen_pgd_unpin(__pa(pgd));
                BUG_ON(HYPERVISOR_update_va_mapping(
@@ -174,7 +181,6 @@ static inline void pgd_free(pgd_t *pgd)
                               0));
        }
 
-       pgd_list_del(pgd);
        free_pages((unsigned long)pgd, 1);
 }
 


Attachment: xen-x86-pin-vs-vmalloc_sync_all.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
[Prev in Thread] Current Thread [Next in Thread]
  • [Xen-devel] [PATCH] linux/x86: synchronize vmalloc_sync_all() with mm_{,un}pin(), Jan Beulich <=