From: jbeulich@xxxxxxxxxx Subject: eliminate scalability issues from initrd handling Patch-mainline: n/a Size restrictions that native kernels wouldn't have resulted from the initrd getting mapped into the initial mapping. The kernel doesn't really need the initrd to be mapped, so use new infrastructure available in 4.1+ Xen to avoid the mapping and hence the restriction. --- head-2010-10-25.orig/arch/x86/include/mach-xen/asm/setup.h 2010-06-22 11:48:53.000000000 +0200 +++ head-2010-10-25/arch/x86/include/mach-xen/asm/setup.h 2010-11-02 17:43:24.000000000 +0100 @@ -3,6 +3,13 @@ void xen_start_kernel(void); void xen_arch_setup(void); +#ifdef CONFIG_X86_64 +void reserve_pfn_range(unsigned long pfn, unsigned long nr, char *); +void reserve_pgtable_low(void); +#endif + +extern unsigned long xen_initrd_start; + #endif #include_next --- head-2010-10-25.orig/arch/x86/kernel/head-xen.c 2010-10-20 14:13:56.000000000 +0200 +++ head-2010-10-25/arch/x86/kernel/head-xen.c 2010-11-02 16:26:25.000000000 +0100 @@ -74,6 +74,8 @@ extern void nmi(void); #define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) } #endif +unsigned long __initdata xen_initrd_start; + unsigned long *__read_mostly machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); --- head-2010-10-25.orig/arch/x86/kernel/head32-xen.c 2010-10-20 14:13:30.000000000 +0200 +++ head-2010-10-25/arch/x86/kernel/head32-xen.c 2010-11-02 16:28:20.000000000 +0100 @@ -83,6 +83,11 @@ void __init i386_start_kernel(void) break; } #else +#ifdef CONFIG_BLK_DEV_INITRD + BUG_ON(xen_start_info->flags & SIF_MOD_START_PFN); + if (xen_start_info->mod_start) + xen_initrd_start = __pa(xen_start_info->mod_start); +#endif { int max_cmdline; --- head-2010-10-25.orig/arch/x86/kernel/head64-xen.c 2010-06-22 16:36:23.000000000 +0200 +++ head-2010-10-25/arch/x86/kernel/head64-xen.c 2010-11-02 16:58:05.000000000 +0100 @@ -121,13 +121,23 @@ void __init x86_64_start_reservations(ch 
reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD if needed. */ + if (xen_start_info->flags & SIF_MOD_START_PFN) { + reserve_pfn_range(xen_start_info->mod_start, + PFN_UP(xen_start_info->mod_len), + "RAMDISK"); + xen_initrd_start = xen_start_info->mod_start << PAGE_SHIFT; + } else if (xen_start_info->mod_start) + xen_initrd_start = __pa(xen_start_info->mod_start); +#endif + if (xen_feature(XENFEAT_auto_translated_physmap)) xen_start_info->mfn_list = ~0UL; else if (xen_start_info->mfn_list < __START_KERNEL_map) - reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT, - (xen_start_info->first_p2m_pfn - + xen_start_info->nr_p2m_frames) << PAGE_SHIFT, - "INITP2M"); + reserve_pfn_range(xen_start_info->first_p2m_pfn, + xen_start_info->nr_p2m_frames, + "INITP2M"); /* * At this point everything still needed from the boot loader --- head-2010-10-25.orig/arch/x86/kernel/head_64-xen.S 2010-06-22 16:36:23.000000000 +0200 +++ head-2010-10-25/arch/x86/kernel/head_64-xen.S 2010-11-02 16:14:58.000000000 +0100 @@ -147,6 +147,7 @@ ENTRY(empty_zero_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") --- head-2010-10-25.orig/arch/x86/kernel/setup-xen.c 2010-11-08 16:08:51.000000000 +0100 +++ head-2010-10-25/arch/x86/kernel/setup-xen.c 2010-11-08 16:10:30.000000000 +0100 @@ -407,7 +407,7 @@ static void __init relocate_initrd(void) #else printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - 
__pa(xen_start_info->mod_start) + xen_start_info->mod_len, + xen_initrd_start + xen_start_info->mod_len, max_low_pfn_mapped << PAGE_SHIFT); initrd_start = 0; #endif @@ -426,7 +426,7 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ #else - unsigned long ramdisk_image = __pa(xen_start_info->mod_start); + unsigned long ramdisk_image = xen_initrd_start; unsigned long ramdisk_size = xen_start_info->mod_len; unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); unsigned long end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; --- head-2010-10-25.orig/arch/x86/mm/init-xen.c 2010-06-22 16:36:23.000000000 +0200 +++ head-2010-10-25/arch/x86/mm/init-xen.c 2010-11-02 17:05:20.000000000 +0100 @@ -341,16 +341,7 @@ unsigned long __init_refok init_memory_m if (!after_bootmem && e820_table_top > e820_table_start) { #ifdef CONFIG_X86_64 - if (xen_start_info->mfn_list < __START_KERNEL_map - && e820_table_start <= xen_start_info->first_p2m_pfn - && e820_table_top > xen_start_info->first_p2m_pfn) { - reserve_early(e820_table_start << PAGE_SHIFT, - xen_start_info->first_p2m_pfn - << PAGE_SHIFT, - "PGTABLE"); - e820_table_start = xen_start_info->first_p2m_pfn - + xen_start_info->nr_p2m_frames; - } + reserve_pgtable_low(); #endif reserve_early(e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT, "PGTABLE"); --- head-2010-10-25.orig/arch/x86/mm/init_64-xen.c 2010-08-18 14:39:04.000000000 +0200 +++ head-2010-10-25/arch/x86/mm/init_64-xen.c 2010-11-08 15:13:29.000000000 +0100 @@ -183,13 +183,72 @@ static int __init nonx32_setup(char *str } __setup("noexec32=", nonx32_setup); +static struct reserved_pfn_range { + unsigned long pfn, nr; +} reserved_pfn_ranges[3] __meminitdata; + +void __init reserve_pfn_range(unsigned long pfn, unsigned long nr, char *name) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) { + struct reserved_pfn_range *range = reserved_pfn_ranges + i; 
+ + if (!range->nr) { + range->pfn = pfn; + range->nr = nr; + break; + } + BUG_ON(range->pfn < pfn + nr && pfn < range->pfn + range->nr); + if (range->pfn > pfn) { + i = ARRAY_SIZE(reserved_pfn_ranges) - 1; + if (reserved_pfn_ranges[i].nr) + continue; + for (; reserved_pfn_ranges + i > range; --i) + reserved_pfn_ranges[i] + = reserved_pfn_ranges[i - 1]; + range->pfn = pfn; + range->nr = nr; + break; + } + } + BUG_ON(i >= ARRAY_SIZE(reserved_pfn_ranges)); + reserve_early(pfn << PAGE_SHIFT, (pfn + nr) << PAGE_SHIFT, name); +} + +void __init reserve_pgtable_low(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) { + struct reserved_pfn_range *range = reserved_pfn_ranges + i; + + if (!range->nr) + break; + if (e820_table_start <= range->pfn + && e820_table_top > range->pfn) { + reserve_early(e820_table_start << PAGE_SHIFT, + range->pfn << PAGE_SHIFT, + "PGTABLE"); + e820_table_start = range->pfn + range->nr; + } + } +} + static __init unsigned long get_table_end(void) { + unsigned int i; + BUG_ON(!e820_table_end); - if (xen_start_info->mfn_list < __START_KERNEL_map - && e820_table_end == xen_start_info->first_p2m_pfn) { - e820_table_end += xen_start_info->nr_p2m_frames; - e820_table_top += xen_start_info->nr_p2m_frames; + for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) { + struct reserved_pfn_range *range = reserved_pfn_ranges + i; + + if (!range->nr) + break; + if (e820_table_end == range->pfn) { + e820_table_end += range->nr; + e820_table_top += range->nr; + } } return e820_table_end++; } @@ -428,14 +487,25 @@ static inline int __meminit make_readonl && !max_pfn_mapped && (paddr >= (e820_table_start << PAGE_SHIFT))) { unsigned long top = e820_table_top; + unsigned int i; + + /* Account for the ranges get_table_end() skips. */ + for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) { + const struct reserved_pfn_range *range; - /* Account for the range get_table_end() skips. 
*/ - if (xen_start_info->mfn_list < __START_KERNEL_map - && e820_table_end <= xen_start_info->first_p2m_pfn - && top > xen_start_info->first_p2m_pfn) - top += xen_start_info->nr_p2m_frames; + range = reserved_pfn_ranges + i; + if (!range->nr) + continue; + if (e820_table_end <= range->pfn && top > range->pfn) { + if (paddr > (range->pfn << PAGE_SHIFT) + && paddr < ((range->pfn + range->nr) + << PAGE_SHIFT)) + break; + top += range->nr; + } + } if (paddr < (top << PAGE_SHIFT)) - readonly = 1; + readonly = (i >= ARRAY_SIZE(reserved_pfn_ranges)); } /* Make old page tables read-only. */ if (!xen_feature(XENFEAT_writable_page_tables) @@ -796,9 +866,6 @@ void __init xen_finish_init_mapping(void && xen_start_info->mfn_list >= __START_KERNEL_map) phys_to_machine_mapping = __va(__pa(xen_start_info->mfn_list)); - if (xen_start_info->mod_start) - xen_start_info->mod_start = (unsigned long) - __va(__pa(xen_start_info->mod_start)); /* Unpin the no longer used Xen provided page tables. */ mmuext.cmd = MMUEXT_UNPIN_TABLE; --- head-2010-10-25.orig/include/xen/interface/elfnote.h 2010-01-19 16:01:04.000000000 +0100 +++ head-2010-10-25/include/xen/interface/elfnote.h 2010-10-29 15:51:09.000000000 +0200 @@ -173,9 +173,15 @@ #define XEN_ELFNOTE_INIT_P2M 15 /* + * Whether or not the guest can deal with being passed an initrd not + * mapped through its initial page tables. + */ +#define XEN_ELFNOTE_MOD_START_PFN 16 + +/* * The number of the highest elfnote defined. */ -#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_MOD_START_PFN /* * System information exported through crash notes. --- head-2010-10-25.orig/include/xen/interface/xen.h 2010-08-31 09:56:12.000000000 +0200 +++ head-2010-10-25/include/xen/interface/xen.h 2010-10-29 15:47:16.000000000 +0200 @@ -604,7 +604,9 @@ struct start_info { unsigned long pt_base; /* VIRTUAL address of page directory. */ unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. 
*/ unsigned long mfn_list; /* VIRTUAL address of page-frame list. */ - unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ + unsigned long mod_start; /* VIRTUAL address of pre-loaded module */ + /* (PFN of pre-loaded module if */ + /* SIF_MOD_START_PFN set in flags). */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ int8_t cmd_line[MAX_GUEST_CMDLINE]; /* The pfn range here covers both page table and p->m table frames. */ @@ -623,6 +625,7 @@ typedef struct start_info start_info_t; #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ #define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ /*