From: jbeulich@xxxxxxxxxx
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: obsolete
References: bnc#417417

Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount of
memory a domain can be handed initially. (With 8 bytes per 4k page,
the P->M table of a 512G domain would by itself fill the entire 1G
mapping.)

Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() intentionally
do not include UVMF_ALL. This is intended to be an optimal choice
between the overhead of a potential spurious page fault (as remote
CPUs may still have read-only translations in their TLBs) and the
overhead of cross-processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.
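For illustration only (not part of the patch): a minimal sketch of the
flag choice described above, using the UVMF_* constants from Xen's
public interface. The helper name is hypothetical; the hypercall and
flag names are the ones used by the hunks below.

    #include <xen/interface/xen.h>  /* UVMF_INVLPG, UVMF_LOCAL, UVMF_ALL */

    /*
     * Hypothetical helper: make the page at va writable.  Plain
     * UVMF_INVLPG implies UVMF_LOCAL, i.e. only this CPU's TLB entry
     * for va gets flushed.  A remote CPU still caching the read-only
     * translation takes at most one spurious page fault and retries;
     * UVMF_INVLPG | UVMF_ALL would avoid that fault at the cost of an
     * immediate cross-processor flush.  The local flush cannot be
     * omitted while the page fault handler is not yet installed.
     */
    static void make_va_writable(unsigned long va, pte_t *pte)
    {
        if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte),
                                         UVMF_INVLPG))
            BUG();
    }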
--- sle11-2009-02-05.orig/arch/x86/kernel/head64-xen.c	2009-01-16 10:31:56.000000000 +0100
+++ sle11-2009-02-05/arch/x86/kernel/head64-xen.c	2008-12-23 12:23:58.000000000 +0100
@@ -171,6 +171,14 @@ void __init x86_64_start_reservations(ch
 		      + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
 		      "Xen provided");
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		xen_start_info->mfn_list = ~0UL;
+	else if (xen_start_info->mfn_list < __START_KERNEL_map)
+		reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
+			      (xen_start_info->first_p2m_pfn
+			       + xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
+			      "INITP2M");
+
 	/*
 	 * At this point everything still needed from the boot loader
 	 * or BIOS or kernel text should be early reserved or marked not
--- sle11-2009-02-05.orig/arch/x86/kernel/head_64-xen.S	2008-12-15 11:34:16.000000000 +0100
+++ sle11-2009-02-05/arch/x86/kernel/head_64-xen.S	2008-12-22 12:57:33.000000000 +0100
@@ -18,6 +18,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <...>
 #include <...>
 #include <...>
 #include <...>
@@ -135,6 +136,7 @@ ENTRY(empty_zero_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- sle11-2009-02-05.orig/arch/x86/kernel/setup-xen.c	2008-12-23 09:42:42.000000000 +0100
+++ sle11-2009-02-05/arch/x86/kernel/setup-xen.c	2008-12-23 15:58:43.000000000 +0100
@@ -1022,7 +1022,7 @@ void __init setup_arch(char **cmdline_p)
 		difference = xen_start_info->nr_pages - max_pfn;
 
 		set_xen_guest_handle(reservation.extent_start,
-				     ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+				     phys_to_machine_mapping + max_pfn);
 		reservation.nr_extents = difference;
 		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 					   &reservation);
@@ -1039,14 +1039,86 @@ void __init setup_arch(char **cmdline_p)
 		phys_to_machine_mapping = alloc_bootmem_pages(
 			max_pfn * sizeof(unsigned long));
 		memcpy(phys_to_machine_mapping,
-		       (unsigned long *)xen_start_info->mfn_list,
+		       __va(__pa(xen_start_info->mfn_list)),
 		       p2m_pages * sizeof(unsigned long));
 		memset(phys_to_machine_mapping + p2m_pages, ~0,
 		       (max_pfn - p2m_pages) * sizeof(unsigned long));
-		free_bootmem(
-			__pa(xen_start_info->mfn_list),
-			PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
-					sizeof(unsigned long))));
+
+#ifdef CONFIG_X86_64
+		if (xen_start_info->mfn_list == VMEMMAP_START) {
+			/*
+			 * Since it is well isolated we can (and since it is
+			 * perhaps large we should) also free the page tables
+			 * mapping the initial P->M table.
+			 */
+			unsigned long va = VMEMMAP_START, pa;
+			pgd_t *pgd = pgd_offset_k(va);
+			pud_t *pud_page = pud_offset(pgd, 0);
+
+			BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+			xen_l4_entry_update(pgd, __pgd(0));
+			for (;;) {
+				pud_t *pud = pud_page + pud_index(va);
+
+				if (pud_none(*pud))
+					va += PUD_SIZE;
+				else if (pud_large(*pud)) {
+					pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+					make_pages_writable(__va(pa),
+						PUD_SIZE >> PAGE_SHIFT,
+						XENFEAT_writable_page_tables);
+					free_bootmem(pa, PUD_SIZE);
+					va += PUD_SIZE;
+				} else {
+					pmd_t *pmd = pmd_offset(pud, va);
+
+					if (pmd_large(*pmd)) {
+						pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+						make_pages_writable(__va(pa),
+							PMD_SIZE >> PAGE_SHIFT,
+							XENFEAT_writable_page_tables);
+						free_bootmem(pa, PMD_SIZE);
+					} else if (!pmd_none(*pmd)) {
+						pte_t *pte = pte_offset_kernel(pmd, va);
+
+						for (i = 0; i < PTRS_PER_PTE; ++i) {
+							if (pte_none(pte[i]))
+								break;
+							pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+							make_page_writable(__va(pa),
+								XENFEAT_writable_page_tables);
+							free_bootmem(pa, PAGE_SIZE);
+						}
+						ClearPagePinned(virt_to_page(pte));
+						make_page_writable(pte,
+							XENFEAT_writable_page_tables);
+						free_bootmem(__pa(pte), PAGE_SIZE);
+					}
+					va += PMD_SIZE;
+					if (pmd_index(va))
+						continue;
+					ClearPagePinned(virt_to_page(pmd));
+					make_page_writable(pmd,
+						XENFEAT_writable_page_tables);
+					free_bootmem(__pa((unsigned long)pmd
+							  & PAGE_MASK),
+						     PAGE_SIZE);
+				}
+				if (!pud_index(va))
+					break;
+			}
+			ClearPagePinned(virt_to_page(pud_page));
+			make_page_writable(pud_page,
+					   XENFEAT_writable_page_tables);
+			free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+				     PAGE_SIZE);
+		} else if (!WARN_ON(xen_start_info->mfn_list
+				    < __START_KERNEL_map))
+#endif
+			free_bootmem(__pa(xen_start_info->mfn_list),
+				     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+						     sizeof(unsigned long))));
 
 		/*
 		 * Initialise the list of the frames that specify the list of
--- sle11-2009-02-05.orig/arch/x86/mm/init_64-xen.c	2009-02-02 11:43:04.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/init_64-xen.c	2009-02-02 11:43:10.000000000 +0100
@@ -157,6 +157,17 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_cur;
 static unsigned long __meminitdata table_top;
 
+static __init unsigned long get_table_cur(void)
+{
+	BUG_ON(!table_cur);
+	if (xen_start_info->mfn_list < __START_KERNEL_map
+	    && table_cur == xen_start_info->first_p2m_pfn) {
+		table_cur += xen_start_info->nr_p2m_frames;
+		table_top += xen_start_info->nr_p2m_frames;
+	}
+	return table_cur++;
+}
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -168,8 +179,7 @@ static __ref void *spp_getpage(void)
 	if (after_bootmem)
 		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else if (table_cur < table_top) {
-		ptr = __va(table_cur << PAGE_SHIFT);
-		table_cur++;
+		ptr = __va(get_table_cur() << PAGE_SHIFT);
 		memset(ptr, 0, PAGE_SIZE);
 	} else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -334,8 +344,7 @@ static __ref void *alloc_low_page(unsign
 		return adr;
 	}
 
-	BUG_ON(!table_cur);
-	pfn = table_cur++;
+	pfn = get_table_cur();
 	if (pfn >= table_top)
 		panic("alloc_low_page: ran out of memory");
 
@@ -361,14 +370,29 @@ static inline int __meminit make_readonl
 	/* Make new page tables read-only on the first pass. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && !max_pfn_mapped
-	    && (paddr >= (table_start << PAGE_SHIFT))
-	    && (paddr < (table_top << PAGE_SHIFT)))
-		readonly = 1;
+	    && (paddr >= (table_start << PAGE_SHIFT))) {
+		unsigned long top = table_top;
+
+		/* Account for the range get_table_cur() skips. */
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && table_cur <= xen_start_info->first_p2m_pfn
+		    && top > xen_start_info->first_p2m_pfn)
+			top += xen_start_info->nr_p2m_frames;
+		if (paddr < (top << PAGE_SHIFT))
+			readonly = 1;
+	}
 	/* Make old page tables read-only. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
 	    && (paddr < (table_cur << PAGE_SHIFT)))
 		readonly = 1;
+	/* Make P->M table (and its page tables) read-only. */
+	if (!xen_feature(XENFEAT_writable_page_tables)
+	    && xen_start_info->mfn_list < __START_KERNEL_map
+	    && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+	    && paddr < (xen_start_info->first_p2m_pfn
+			+ xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+		readonly = 1;
 
 	/*
 	 * No need for writable mapping of kernel image. This also ensures that
@@ -616,6 +640,12 @@ void __init xen_init_pt(void)
 			__pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
 	memcpy(level2_kernel_pgt, page, PAGE_SIZE);
 
+	/* Copy the initial P->M table mappings if necessary. */
+	addr = pgd_index(xen_start_info->mfn_list);
+	if (addr < pgd_index(__START_KERNEL_map))
+		init_level4_pgt[addr] =
+			((pgd_t *)xen_start_info->pt_base)[addr];
+
 	/* Do an early initialization of the fixmap area. */
 	addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
 	level3_kernel_pgt[pud_index(addr)] =
@@ -676,22 +706,28 @@ static void __init find_early_table_spac
 static void __init xen_finish_init_mapping(void)
 {
 	unsigned long i, start, end;
+	struct mmuext_op mmuext;
 
 	/* Re-vector virtual addresses pointing into the initial
 	   mapping to the just-established permanent ones. */
 	xen_start_info = __va(__pa(xen_start_info));
 	xen_start_info->pt_base = (unsigned long)
 		__va(__pa(xen_start_info->pt_base));
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (!xen_feature(XENFEAT_auto_translated_physmap)
+	    && xen_start_info->mfn_list >= __START_KERNEL_map)
 		phys_to_machine_mapping =
 			__va(__pa(xen_start_info->mfn_list));
-		xen_start_info->mfn_list = (unsigned long)
-			phys_to_machine_mapping;
-	}
 	if (xen_start_info->mod_start)
 		xen_start_info->mod_start = (unsigned long)
 			__va(__pa(xen_start_info->mod_start));
 
+	/* Unpin the no longer used Xen provided page tables. */
+	mmuext.cmd = MMUEXT_UNPIN_TABLE;
+	mmuext.arg1.mfn = pfn_to_mfn(__pa(xen_start_info->pt_base)
+				     >> PAGE_SHIFT);
+	if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+		BUG();
+
 	/* Destroy the Xen-created mappings beyond the kernel image. */
 	start = PAGE_ALIGN((unsigned long)_end);
 	end   = __START_KERNEL_map + (table_start << PAGE_SHIFT);
@@ -948,9 +984,20 @@ unsigned long __init_refok init_memory_m
 
 	__flush_tlb_all();
 
-	if (!after_bootmem && table_top > table_start)
+	if (!after_bootmem && table_top > table_start) {
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && table_start <= xen_start_info->first_p2m_pfn
+		    && table_top > xen_start_info->first_p2m_pfn) {
+			reserve_early(table_start << PAGE_SHIFT,
+				      xen_start_info->first_p2m_pfn
+				      << PAGE_SHIFT,
+				      "PGTABLE");
+			table_start = xen_start_info->first_p2m_pfn
+				      + xen_start_info->nr_p2m_frames;
+		}
 		reserve_early(table_start << PAGE_SHIFT,
 			      table_top << PAGE_SHIFT, "PGTABLE");
+	}
 
 	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
 	       last_map_addr, end);
--- sle11-2009-02-05.orig/arch/x86/mm/pageattr-xen.c	2009-02-05 10:41:27.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/pageattr-xen.c	2009-02-05 10:42:43.000000000 +0100
@@ -1193,7 +1193,7 @@ static void __make_page_writable(unsigne
 	pte = lookup_address(va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K);
-	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
 		BUG();
 
 	if (in_secondary_range(va)) {
 		unsigned long pfn = pte_pfn(*pte);
--- sle11-2009-02-05.orig/arch/x86/mm/pgtable-xen.c	2008-12-15 11:34:16.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/pgtable-xen.c	2008-12-23 14:38:22.000000000 +0100
@@ -323,7 +323,7 @@ void __init xen_init_pgd_pin(void)
 		if (PTRS_PER_PUD > 1) /* not folded */
 			SetPagePinned(virt_to_page(pud));
 		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-			if (!pud_present(*pud))
+			if (!pud_present(*pud) || pud_large(*pud))
 				continue;
 			pmd = pmd_offset(pud, 0);
 			if (PTRS_PER_PMD > 1) /* not folded */
@@ -334,7 +334,7 @@ void __init xen_init_pgd_pin(void)
 			    && m >= pmd_index(HYPERVISOR_VIRT_START))
 				continue;
 #endif
-			if (!pmd_present(*pmd))
+			if (!pmd_present(*pmd) || pmd_large(*pmd))
 				continue;
 			SetPagePinned(pmd_page(*pmd));
 		}
--- sle11-2009-02-05.orig/arch/x86/mm/pgtable_32-xen.c	2008-12-15 11:28:15.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/pgtable_32-xen.c	2009-02-03 14:44:56.000000000 +0100
@@ -188,6 +188,6 @@ void make_lowmem_page_writable(void *va,
 	pte = lookup_address((unsigned long)va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
 	rc = HYPERVISOR_update_va_mapping(
-		(unsigned long)va, pte_mkwrite(*pte), 0);
+		(unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
 	BUG_ON(rc);
 }
--- sle11-2009-02-05.orig/include/xen/interface/elfnote.h	2008-11-25 12:35:56.000000000 +0100
+++ sle11-2009-02-05/include/xen/interface/elfnote.h	2008-12-22 12:53:19.000000000 +0100
@@ -162,9 +162,20 @@
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
 /*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be prepared that the page table pages used for
+ * this mapping may not be accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M 15
+
+/*
  * The number of the highest elfnote defined.
  */
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
 
 /*
  * System information exported through crash notes.
--- sle11-2009-02-05.orig/include/xen/interface/xen.h	2008-12-15 11:27:38.000000000 +0100
+++ sle11-2009-02-05/include/xen/interface/xen.h	2008-12-23 10:50:00.000000000 +0100
@@ -534,6 +534,7 @@ typedef struct shared_info shared_info_t
  *      a. relocated kernel image
  *      b. initial ram disk              [mod_start, mod_len]
  *      c. list of allocated page frames [mfn_list, nr_pages]
+ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *      d. start_info_t structure        [register ESI (x86)]
  *      e. bootstrap page tables         [pt_base, CR3 (x86)]
  *      f. bootstrap stack               [register ESP (x86)]
@@ -575,6 +576,9 @@ struct start_info {
     unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
     unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
     int8_t cmd_line[MAX_GUEST_CMDLINE];
+    /* The pfn range here covers both page table and p->m table frames.   */
+    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
+    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
 };
 typedef struct start_info start_info_t;