From: jbeulich@xxxxxxxxxx
Subject: eliminate scalability issues from direct mapping setup
Patch-mainline: obsolete
References: bnc#417417

Should be merged into the 2.6.27 merge patch once verified.

--- head-2009-02-02.orig/arch/x86/kernel/setup-xen.c	2008-12-23 09:42:29.000000000 +0100
+++ head-2009-02-02/arch/x86/kernel/setup-xen.c	2008-12-23 09:42:42.000000000 +0100
@@ -914,21 +914,6 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 	/* max_pfn_mapped is updated here */
-#ifdef CONFIG_X86_64_XEN
-	/*
-	 * Due to the way initial table space gets calculated on Xen, we have
-	 * to call init_memory_mapping() with the larger end address first.
-	 */
-	if (max_pfn > max_low_pfn)
-		max_pfn_mapped = init_memory_mapping(1UL<<32,
-						     max_pfn<<PAGE_SHIFT);
-	if (max_pfn > max_low_pfn)
-		/* can we preserve max_low_pfn ?*/
-		max_low_pfn = max_pfn;
-	else
-		max_pfn_mapped = max_low_pfn_mapped;
-#else
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
--- head-2009-02-02.orig/arch/x86/mm/init_64-xen.c
+++ head-2009-02-02/arch/x86/mm/init_64-xen.c
+	if (pfn >= table_top)
+		panic("alloc_low_page: ran out of memory");
+
+	adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
+	memset(adr, 0, PAGE_SIZE);
+	*phys = pfn * PAGE_SIZE;
+	return adr;
 }
 
-#define unmap_low_page(p) ((void)(p))
+static __ref void unmap_low_page(void *adr)
+{
+	if (after_bootmem)
+		return;
+
+	early_iounmap(adr, PAGE_SIZE);
+}
 
 static inline int __meminit make_readonly(unsigned long paddr)
 {
 	extern char __vsyscall_0;
 	int readonly = 0;
 
-	/* Make new page tables read-only. */
+	/* Make new page tables read-only on the first pass. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
+	    && !max_pfn_mapped
 	    && (paddr >= (table_start << PAGE_SHIFT))
 	    && (paddr < (table_top << PAGE_SHIFT)))
 		readonly = 1;
@@ -412,7 +426,7 @@ phys_pte_update(pmd_t *pmd, unsigned lon
 {
 	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-	BUG_ON(!after_bootmem);
+	BUG_ON(!max_pfn_mapped);
 
 	return phys_pte_init(pte, address, end);
 }
@@ -457,12 +471,14 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
 			continue;
 		}
 
-		pte = alloc_static_page(&pte_phys);
+		pte = alloc_low_page(&pte_phys);
 		last_map_addr = phys_pte_init(pte, address, end);
 		unmap_low_page(pte);
 
 		if (!after_bootmem) {
-			early_make_page_readonly(pte, XENFEAT_writable_page_tables);
+			if (max_pfn_mapped)
+				make_page_readonly(__va(pte_phys),
+						   XENFEAT_writable_page_tables);
 			*pmd = __pmd(pte_phys | _PAGE_TABLE);
 		} else {
 			spin_lock(&init_mm.page_table_lock);
@@ -481,7 +497,7 @@ phys_pmd_update(pud_t *pud, unsigned lon
 	pmd_t *pmd = pmd_offset(pud, 0);
 	unsigned long last_map_addr;
 
-	BUG_ON(!after_bootmem);
+	BUG_ON(!max_pfn_mapped);
 	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
 	__flush_tlb_all();
 	return last_map_addr;
@@ -520,12 +536,14 @@ phys_pud_init(pud_t *pud_page, unsigned
 			continue;
 		}
 
-		pmd = alloc_static_page(&pmd_phys);
+		pmd = alloc_low_page(&pmd_phys);
 		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
 		unmap_low_page(pmd);
 
 		if (!after_bootmem) {
-			early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
+			if (max_pfn_mapped)
+				make_page_readonly(__va(pmd_phys),
+						   XENFEAT_writable_page_tables);
 			if (page_size_mask & (1 << PG_LEVEL_NUM))
 				xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
 			else
@@ -548,13 +566,7 @@ phys_pud_update(pgd_t *pgd, unsigned lon
 {
 	pud_t *pud;
 
-	if (!after_bootmem) {
-		unsigned long addr = __pgd_val(*pgd), *page;
-
-		addr_to_page(addr, page);
-		pud = (pud_t *)page;
-	} else
-		pud = (pud_t *)pgd_page_vaddr(*pgd);
+	pud = (pud_t *)pgd_page_vaddr(*pgd);
 
 	return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
 }
@@ -628,73 +640,6 @@ void __init xen_init_pt(void)
 	xen_pgd_pin(init_level4_pgt);
 }
 
-static void __init extend_init_mapping(unsigned long tables_space)
-{
-	unsigned long va = __START_KERNEL_map;
-	unsigned long start = table_cur;
-	unsigned long phys, addr, *pte_page;
-	pmd_t *pmd;
-	pte_t *pte, new_pte;
-	unsigned long *page = (unsigned long *)init_level4_pgt;
-
-	addr = page[pgd_index(va)];
-	addr_to_page(addr, page);
-	addr = page[pud_index(va)];
-	addr_to_page(addr, page);
-
-	/* Kill mapping of low 1MB. */
-	while (va < (unsigned long)&_text) {
-		if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
-			BUG();
-		va += PAGE_SIZE;
-	}
-
-	/* Ensure init mappings cover kernel text/data and initial tables. */
-	while (va < (__START_KERNEL_map
-		     + (table_cur << PAGE_SHIFT)
-		     + tables_space)) {
-		if (!pmd_index(va) && !pte_index(va)) {
-			page = (unsigned long *)init_level4_pgt;
-			addr = page[pgd_index(va)];
-			addr_to_page(addr, page);
-			addr = page[pud_index(va)];
-			addr_to_page(addr, page);
-		}
-		pmd = (pmd_t *)&page[pmd_index(va)];
-		if (pmd_none(*pmd)) {
-			pte_page = alloc_static_page(&phys);
-			early_make_page_readonly(
-				pte_page, XENFEAT_writable_page_tables);
-			set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
-		} else {
-			addr = page[pmd_index(va)];
-			addr_to_page(addr, pte_page);
-		}
-		pte = (pte_t *)&pte_page[pte_index(va)];
-		if (pte_none(*pte)) {
-			new_pte = pfn_pte(
-				(va - __START_KERNEL_map) >> PAGE_SHIFT,
-				__pgprot(_KERNPG_TABLE));
-			xen_l1_entry_update(pte, new_pte);
-		}
-		va += PAGE_SIZE;
-	}
-
-	/* Finally, blow away any spurious initial mappings. */
-	while (1) {
-		pmd = (pmd_t *)&page[pmd_index(va)];
-		if (pmd_none(*pmd))
-			break;
-		if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
-			BUG();
-		va += PAGE_SIZE;
-	}
-
-	if (table_cur > start)
-		reserve_early(start << PAGE_SHIFT,
-			      table_cur << PAGE_SHIFT, "INITMAP");
-}
-
 static void __init find_early_table_space(unsigned long end, int use_pse,
 					  int use_gbpages)
 {
@@ -708,19 +653,27 @@ static void __init find_early_table_spac
 	ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
 
-	table_cur = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
-		    xen_start_info->nr_pt_frames;
-
-	extend_init_mapping(tables);
+	if (!table_top) {
+		table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+			      xen_start_info->nr_pt_frames;
+		table_cur = table_start;
+	} else {
+		/*
+		 * [table_start, table_top) gets passed to reserve_early(),
+		 * so we must not use table_cur here, despite continuing
+		 * to allocate from there. table_cur possibly being below
+		 * table_start is otoh not a problem.
+		 */
+		table_start = table_top;
+	}
 
-	table_start = table_cur;
-	table_top = table_start + (tables >> PAGE_SHIFT);
+	table_top = table_cur + (tables >> PAGE_SHIFT);
 
 	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
+		end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
 }
 
-static void __init xen_finish_init_mapping(bool reserve)
+static void __init xen_finish_init_mapping(void)
 {
 	unsigned long i, start, end;
 
@@ -739,18 +692,17 @@ static void __init xen_finish_init_mappi
 	xen_start_info->mod_start = (unsigned long)
 		__va(__pa(xen_start_info->mod_start));
 
-	/* Destroy the Xen-created mappings beyond the kernel image as
-	 * well as the temporary mappings created above. Prevents
-	 * overlap with modules area (if init mapping is very big).
-	 */
+	/* Destroy the Xen-created mappings beyond the kernel image. */
 	start = PAGE_ALIGN((unsigned long)_end);
-	end = __START_KERNEL_map + (table_top << PAGE_SHIFT);
+	end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
 	for (; start < end; start += PAGE_SIZE)
 		if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
 			BUG();
 
 	/* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
-	start = table_cur;
+	start = table_top;
+	WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
+	     table_start, table_cur, start);
 	table_top = ~0UL;
 
 	/* Switch to the real shared_info page, and clear the dummy page. */
@@ -768,11 +720,7 @@ static void __init xen_finish_init_mappi
 			     << PAGE_SHIFT,
 			     PAGE_KERNEL_RO);
 
-	/* Disable the 'table_cur' allocator. */
-	table_top = table_cur;
-	if (reserve && table_cur > start)
-		reserve_early(start << PAGE_SHIFT,
-			      table_cur << PAGE_SHIFT, "FIXMAP");
+	table_top = max(table_cur, start);
 }
 
 static void __init init_gbpages(void)
@@ -810,13 +758,15 @@ static unsigned long __meminit kernel_ph
 			continue;
 		}
 
-		pud = alloc_static_page(&pud_phys);
+		pud = alloc_low_page(&pud_phys);
 		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 					      page_size_mask);
 		unmap_low_page(pud);
 
 		if(!after_bootmem) {
-			early_make_page_readonly(pud, XENFEAT_writable_page_tables);
+			if (max_pfn_mapped)
+				make_page_readonly(__va(pud_phys),
+						   XENFEAT_writable_page_tables);
 			xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
 		} else {
 			spin_lock(&init_mm.page_table_lock);
@@ -864,7 +814,7 @@ unsigned long __init_refok init_memory_m
 	unsigned long last_map_addr = 0;
 	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
-	bool first = !table_start;
+
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
 	int use_pse, use_gbpages;
@@ -955,23 +905,50 @@ unsigned long __init_refok init_memory_m
 			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
 			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
 	BUG_ON(table_cur > table_top);
-	if (start < (table_start << PAGE_SHIFT)) {
-		WARN_ON(table_cur != table_top);
-		xen_finish_init_mapping(!first);
-	}
+	if (!start)
+		xen_finish_init_mapping();
+	else if (table_cur < table_top)
+		/* Disable the 'table_cur' allocator. */
+		table_top = table_cur;
 
 	__flush_tlb_all();
 
-	if (first && table_top > table_start)
+	if (!after_bootmem && table_top > table_start)
 		reserve_early(table_start << PAGE_SHIFT,
 			      table_top << PAGE_SHIFT, "PGTABLE");
--- head-2009-02-02.orig/include/asm-x86/mach-xen/asm/pgtable_64.h	2008-12-15 11:34:16.000000000 +0100
+++ head-2009-02-02/include/asm-x86/mach-xen/asm/pgtable_64.h	2009-02-02 10:23:00.000000000 +0100
@@ -165,7 +165,7 @@ static inline void xen_set_pgd(pgd_t *pg
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
-#define MAXMEM		 _AC(0x0000006fffffffff, UL)
+#define MAXMEM		 _AC(0x000004ffffffffff, UL)
 #define VMALLOC_START	 _AC(0xffffc20000000000, UL)
 #define VMALLOC_END	 _AC(0xffffe1ffffffffff, UL)
 #define VMEMMAP_START	 _AC(0xffffe20000000000, UL)