Subject: synchronize vmalloc_sync_all() with mm_{,un}pin() As recently diagnosed by Citrix engineers, mm_{,un}pin() and vmalloc_sync_all() aren't properly synchronized. While by now there is a workaround in unstable and 4.0-testing, I still think this should be fixed in the kernels as well: Add a backlink to the referencing struct mm_struct to the pgd's struct page, and use this to lock the page table updates in vmalloc_sync_all(). Due to the way pgd-s get allocated and put on the global list on i386, we have to account for the backlink not to be set yet (in which case they cannot be subject to (un)pinning. Along the way, I found it necessary/desirable to also fix - a potential NULL dereference in i386's pgd_alloc(), - x86-64 adding not yet cleaned out pgd-s to the global list, and - x86-64 removing pgd-s from the global list rather late. Signed-off-by: Jan Beulich --- a/arch/i386/mm/fault-xen.c +++ b/arch/i386/mm/fault-xen.c @@ -712,12 +712,25 @@ void vmalloc_sync_all(void) return; } for (page = pgd_list; page; page = - (struct page *)page->index) - if (!vmalloc_sync_one(page_address(page), - address)) { + (struct page *)page->index) { + spinlock_t *lock = page->mapping + ? &((struct mm_struct *)page->mapping) + ->page_table_lock + : NULL; + pmd_t *pmd; + + if (lock) + spin_lock(lock); + pmd = vmalloc_sync_one(page_address(page), + address); + if (lock) + spin_unlock(lock); + + if (!pmd) { BUG_ON(page != pgd_list); break; } + } spin_unlock_irqrestore(&pgd_lock, flags); if (!page) set_bit(sync_index(address), insync); --- a/arch/x86_64/mm/fault-xen.c +++ b/arch/x86_64/mm/fault-xen.c @@ -640,6 +640,9 @@ do_sigbus: DEFINE_SPINLOCK(pgd_lock); struct page *pgd_list; +#define pgd_page_table(what, pg) \ + spin_##what(&((struct mm_struct *)(pg)->mapping)->page_table_lock) + void vmalloc_sync_all(void) { /* Note that races in the updates of insync and start aren't @@ -662,10 +665,13 @@ void vmalloc_sync_all(void) page = (struct page *)page->index) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); + + pgd_page_table(lock, page); if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + pgd_page_table(unlock, page); } spin_unlock(&pgd_lock); set_bit(pgd_index(address), insync); --- a/arch/i386/mm/pgtable-xen.c +++ b/arch/i386/mm/pgtable-xen.c @@ -230,6 +230,7 @@ static inline void pgd_list_del(pgd_t *p *pprev = next; if (next) set_page_private(next, (unsigned long)pprev); + page->mapping = NULL; } void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) @@ -271,9 +272,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t **pmd; unsigned long flags; + if (!pgd) + return NULL; + pgd_test_and_unpin(pgd); - if (PTRS_PER_PMD == 1 || !pgd) + /* Store a back link for vmalloc_sync_all(). */ + virt_to_page(pgd)->mapping = (void *)mm; + + if (PTRS_PER_PMD == 1) return pgd; if (HAVE_SHARED_KERNEL_PMD) { --- a/include/asm-x86_64/mach-xen/asm/pgalloc.h +++ b/include/asm-x86_64/mach-xen/asm/pgalloc.h @@ -95,10 +95,13 @@ static inline void pud_free(pud_t *pud) pte_free(virt_to_page(pud)); } -static inline void pgd_list_add(pgd_t *pgd) +static inline void pgd_list_add(pgd_t *pgd, void *mm) { struct page *page = virt_to_page(pgd); + /* Store a back link for vmalloc_sync_all(). */ + page->mapping = mm; + spin_lock(&pgd_lock); page->index = (pgoff_t)pgd_list; if (pgd_list) @@ -119,6 +122,8 @@ static inline void pgd_list_del(pgd_t *p if (next) next->private = (unsigned long)pprev; spin_unlock(&pgd_lock); + + page->mapping = NULL; } static inline pgd_t *pgd_alloc(struct mm_struct *mm) @@ -127,22 +132,22 @@ static inline pgd_t *pgd_alloc(struct mm * We allocate two contiguous pages for kernel and user. */ unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1); + pgd_t *pgd; + + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 1); if (!pgd) return NULL; - pgd_list_add(pgd); + pgd_list_add(pgd, mm); /* * Copy kernel pointers in from init. * Could keep a freelist or slab cache of those because the kernel * part never changes. */ boundary = pgd_index(__PAGE_OFFSET); - memset(pgd, 0, boundary * sizeof(pgd_t)); memcpy(pgd + boundary, init_level4_pgt + boundary, (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); - memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */ /* * Set level3_user_pgt for vsyscall area */ @@ -155,6 +160,8 @@ static inline void pgd_free(pgd_t *pgd) { pte_t *ptep = virt_to_ptep(pgd); + pgd_list_del(pgd); + if (!pte_write(*ptep)) { xen_pgd_unpin(__pa(pgd)); BUG_ON(HYPERVISOR_update_va_mapping( @@ -174,7 +181,6 @@ static inline void pgd_free(pgd_t *pgd) 0)); } - pgd_list_del(pgd); free_pages((unsigned long)pgd, 1); }