kexec: x86_32 This is the x86_32 component of kexec for xen. The x86 component is a prerequisite for this patch. Signed-off-by: Horms Signed-off-by: Magnus Damm buildconfigs/linux-defconfig_xen_x86_32 | 4 linux-2.6-xen-sparse/arch/i386/Kconfig | 2 linux-2.6-xen-sparse/arch/i386/kernel/Makefile | 2 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c | 28 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h | 42 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h | 8 patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch | 457 ++++++++++ xen/arch/x86/x86_32/entry.S | 2 xen/arch/x86/x86_32/machine_kexec.c | 28 xen/include/asm-x86/x86_32/elf.h | 34 xen/include/asm-x86/x86_32/kexec.h | 68 + 11 files changed, 635 insertions(+), 40 deletions(-) --- x/buildconfigs/linux-defconfig_xen_x86_32 +++ x/buildconfigs/linux-defconfig_xen_x86_32 @@ -184,10 +184,11 @@ CONFIG_MTRR=y CONFIG_REGPARM=y CONFIG_SECCOMP=y CONFIG_HZ_100=y +CONFIG_KEXEC=y # CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=100 -# CONFIG_CRASH_DUMP is not set +CONFIG_CRASH_DUMP=y CONFIG_PHYSICAL_START=0x100000 CONFIG_HOTPLUG_CPU=y @@ -2774,6 +2775,7 @@ CONFIG_NTFS_FS=m # CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y +# CONFIG_PROC_VMCORE is not set CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_HUGETLB_PAGE is not set --- x/linux-2.6-xen-sparse/arch/i386/Kconfig +++ x/linux-2.6-xen-sparse/arch/i386/Kconfig @@ -726,7 +726,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL && !X86_XEN + depends on EXPERIMENTAL help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot --- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile +++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile @@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen obj-y += fixup.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o -n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) obj-y := $(call cherrypickxen, $(obj-y)) --- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c +++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c @@ -68,6 +68,10 @@ #include "setup_arch_pre.h" #include +#ifdef CONFIG_XEN +#include +#endif + /* Forward Declaration. */ void __init find_max_pfn(void); @@ -941,6 +945,7 @@ static void __init parse_cmdline_early ( * after a kernel panic. */ else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN unsigned long size, base; size = memparse(from+12, &from); if (*from == '@') { @@ -951,6 +956,10 @@ static void __init parse_cmdline_early ( crashk_res.start = base; crashk_res.end = base + size - 1; } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif } #endif #ifdef CONFIG_PROC_VMCORE @@ -1320,9 +1329,22 @@ void __init setup_bootmem_allocator(void } #endif #ifdef CONFIG_KEXEC +#ifndef CONFIG_XEN if (crashk_res.start != crashk_res.end) reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); +#else + { + xen_kexec_reserve_t reservation; + BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0, + &reservation)); + if (reservation.size) { + crashk_res.start = reservation.start; + crashk_res.end = reservation.start + + reservation.size - 1; + } + } +#endif #endif if (!xen_feature(XENFEAT_auto_translated_physmap)) @@ -1378,6 +1400,9 @@ legacy_init_iomem_resources(struct e820e res->end = res->start + e820[i].size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); +#ifdef 
CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif #ifndef CONFIG_XEN if (e820[i].type == E820_RAM) { /* @@ -1387,9 +1412,6 @@ legacy_init_iomem_resources(struct e820e */ request_resource(res, code_resource); request_resource(res, data_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif } #endif } --- /dev/null +++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h @@ -0,0 +1,42 @@ +/* + * include/asm-i386/kexec-xen.h + * + * Created By: Horms + */ + +#ifndef _I386_KEXEC_XEN_H +#define _I386_KEXEC_XEN_H + +#include +#include +#include + +static inline void crash_translate_regs(struct pt_regs *linux_regs, + struct cpu_user_regs *xen_regs) +{ + xen_regs->ebx = linux_regs->ebx; + xen_regs->ecx = linux_regs->ecx; + xen_regs->edx = linux_regs->edx; + xen_regs->esi = linux_regs->esi; + xen_regs->edi = linux_regs->edi; + xen_regs->ebp = linux_regs->ebp; + xen_regs->eax = linux_regs->eax; + xen_regs->esp = linux_regs->esp; + xen_regs->ss = linux_regs->xss; + xen_regs->cs = linux_regs->xcs; + xen_regs->ds = linux_regs->xds; + xen_regs->es = linux_regs->xes; + xen_regs->eflags = linux_regs->eflags; +} + +#endif /* _I386_KEXEC_XEN_H */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ --- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h +++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h @@ -359,5 +359,13 @@ HYPERVISOR_xenoprof_op( return _hypercall2(int, xenoprof_op, op, arg); } +static inline int +HYPERVISOR_kexec( + unsigned long op, unsigned int arg1, void * extra_args) +{ + return _hypercall3(int, kexec_op, op, arg1, extra_args); +} + + #endif /* __HYPERCALL_H__ */ --- x/xen/arch/x86/x86_32/entry.S +++ x/xen/arch/x86/x86_32/entry.S @@ -648,6 +648,7 @@ ENTRY(hypercall_table) .long do_xenoprof_op .long do_event_channel_op .long do_physdev_op + .long do_kexec .rept NR_hypercalls-((.-hypercall_table)/4) 
.long do_ni_hypercall .endr @@ -687,6 +688,7 @@ ENTRY(hypercall_args_table) .byte 2 /* do_xenoprof_op */ .byte 2 /* do_event_channel_op */ .byte 2 /* do_physdev_op */ + .byte 3 /* do_kexec */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr --- x/xen/arch/x86/x86_32/machine_kexec.c +++ x/xen/arch/x86/x86_32/machine_kexec.c @@ -1,19 +1,31 @@ -/* +/****************************************************************************** * arch/x86/x86_32/machine_kexec.c - * Handle transition of Linux booting another kernel - * - * Created By: Horms + * + * Created By: Horms * - * Should be losely based on arch/i386/kernel/machine_kexec.c + * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16 */ -#include /* for printk() used in stub */ +#include #include +#include +#include + +typedef asmlinkage void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address, + unsigned long page_table_a, + unsigned long has_pae); void machine_kexec(xen_kexec_image_t *image) { - printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: " - "not implemented\n"); + relocate_new_kernel_t rnk; + + rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE); + (*rnk)(image->indirection_page, image->reboot_code_buffer, + image->start_address, image->page_table_a[0], + (unsigned long)cpu_has_pae); } /* --- x/xen/include/asm-x86/x86_32/elf.h +++ x/xen/include/asm-x86/x86_32/elf.h @@ -3,19 +3,39 @@ * * Created By: Horms * - * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS - * from Linux 2.6.16 + * Based heavily on include/asm-i386/elf.h and + * include/asm-i386/system.h from Linux 2.6.16 */ #ifndef __X86_ELF_X86_32_H__ #define __X86_ELF_X86_32_H__ -#include /* for printk() used in stub */ +/* XXX: Xen doesn't have orig_eax. For kdump, on a dom0 crash, the values + * for the crashing CPU could be passed down from dom0, but is that + * necessary? 
+ * Also, I'm not sure why fs and gs are derived from the CPU + * rather than regs */ -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - printk("STUB: include/asm-x86/x86_32/kexec.h: ELF_CORE_COPY_REGS: " \ - "not implemented\n") - +#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ + unsigned i; \ + pr_reg[0] = regs->ebx; \ + pr_reg[1] = regs->ecx; \ + pr_reg[2] = regs->edx; \ + pr_reg[3] = regs->esi; \ + pr_reg[4] = regs->edi; \ + pr_reg[5] = regs->ebp; \ + pr_reg[6] = regs->eax; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9] = i; \ + asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i; \ + pr_reg[11] = 0; /* regs->orig_eax; */ \ + pr_reg[12] = regs->eip; \ + pr_reg[13] = regs->cs; \ + pr_reg[14] = regs->eflags; \ + pr_reg[15] = regs->esp; \ + pr_reg[16] = regs->ss; \ +} while(0); #endif /* __X86_ELF_X86_32_H__ */ --- x/xen/include/asm-x86/x86_32/kexec.h +++ x/xen/include/asm-x86/x86_32/kexec.h @@ -3,42 +3,72 @@ * * Created By: Horms * - * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16 - * + * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16 */ -#ifndef __X86_32_KEXEC_H__ -#define __X86_32_KEXEC_H__ - -#include /* for printk() used in stub */ -#include -#include +#ifndef __X86_KEXEC_X86_32_H__ +#define __X86_KEXEC_X86_32_H__ +/* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code + * fixes it. 
+ */ static void crash_fixup_ss_esp(struct cpu_user_regs *newregs, - struct cpu_user_regs *oldregs) + struct cpu_user_regs *oldregs) { - printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: " - "not implemented\n"); - return; - crash_fixup_ss_esp(newregs, oldregs); + memcpy(newregs, oldregs, sizeof(*newregs)); + newregs->esp = (unsigned long)&(oldregs->esp); + __asm__ __volatile__( + "xorl %%eax, %%eax\n\t" + "movw %%ss, %%ax\n\t" + :"=a"(newregs->ss)); } +/* + * This function is responsible for capturing register states if coming + * via panic otherwise just fix up the ss and esp if coming via kernel + * mode exception. + */ static void crash_setup_regs(struct cpu_user_regs *newregs, struct cpu_user_regs *oldregs) { - printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: " - "not implemented\n"); + if (oldregs) + crash_fixup_ss_esp(newregs, oldregs); + else { + __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags)); + + newregs->eip = (unsigned long)current_text_addr(); + } } +/* + * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h + * + * user_mode_vm(regs) determines whether a register set came from user mode. 
+ * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. This tricky test checks that with + * one comparison. Many places in the kernel can bypass this full check + * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + */ static inline int user_mode(struct cpu_user_regs *regs) { - printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: " - "not implemented\n"); - return -1; + return (regs->cs & 2) != 0; } -#endif /* __X86_32_KEXEC_H__ */ +#endif /* __X86_KEXEC_X86_32_H__ */ /* * Local variables: --- /dev/null +++ x/patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch @@ -0,0 +1,457 @@ +kexec: Avoid overwriting the current pgd (V2, i386) + +This patch upgrades the i386-specific kexec code to avoid overwriting the +current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used +to start a secondary kernel that dumps the memory of the previous kernel. + +The code introduces a new set of page tables called "page_table_a". These +tables are used to provide an executable identity mapping without overwriting +the current pgd. This updated version of the patch fixes a PAE bug and moves +the segment handling code into the relocate_kernel.S. + +Signed-off-by: Magnus Damm +--- + + The patch has been tested with regular kexec and CONFIG_CRASH_DUMP. + Both PAE and non-PAE configurations work well. + Applies on top of 2.6.16 and 2.6.17-rc4. 
+ + arch/i386/kernel/machine_kexec.c | 230 ++++++++++++++---------------------- + arch/i386/kernel/relocate_kernel.S | 92 ++++++++++++++ + include/asm-i386/kexec.h | 12 + + 3 files changed, 192 insertions(+), 142 deletions(-) + +--- x/arch/i386/kernel/machine_kexec.c ++++ x/arch/i386/kernel/machine_kexec.c +@@ -2,6 +2,10 @@ + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + * ++ * 2006-05-19 Magnus Damm : ++ * - rewrote identity map code to avoid overwriting current pgd ++ * - moved segment handling code into relocate_kernel.S ++ * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ +@@ -19,123 +23,73 @@ + #include + #include + +-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +- +-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +-#define L2_ATTR (_PAGE_PRESENT) +- +-#define LEVEL0_SIZE (1UL << 12UL) ++typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( ++ unsigned long indirection_page, ++ unsigned long reboot_code_buffer, ++ unsigned long start_address, ++ unsigned long page_table_a, ++ unsigned long has_pae) ATTRIB_NORET; + +-#ifndef CONFIG_X86_PAE +-#define LEVEL1_SIZE (1UL << 22UL) +-static u32 pgtable_level1[1024] PAGE_ALIGNED; ++const extern unsigned char relocate_new_kernel[]; ++extern void relocate_new_kernel_end(void); ++const extern unsigned int relocate_new_kernel_size; + +-static void identity_map_page(unsigned long address) ++static int allocate_page_table_a(struct kimage *image) + { +- unsigned long level1_index, level2_index; +- u32 *pgtable_level2; +- +- /* Find the current page table */ +- pgtable_level2 = __va(read_cr3()); ++ struct kimage_arch *arch = &image->arch_data; ++ struct page *page; ++ int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]); ++ ++ for (; k > 0; 
k--) { ++ page = kimage_alloc_control_pages(image, 0); ++ if (!page) ++ return -ENOMEM; ++ ++ clear_page(page_address(page)); ++ arch->page_table_a[k - 1] = page; ++ } + +- /* Find the indexes of the physical address to identity map */ +- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; +- level2_index = address / LEVEL1_SIZE; +- +- /* Identity map the page table entry */ +- pgtable_level1[level1_index] = address | L0_ATTR; +- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; +- +- /* Flush the tlb so the new mapping takes effect. +- * Global tlb entries are not flushed but that is not an issue. +- */ +- load_cr3(pgtable_level2); ++ return 0; + } + +-#else +-#define LEVEL1_SIZE (1UL << 21UL) +-#define LEVEL2_SIZE (1UL << 30UL) +-static u64 pgtable_level1[512] PAGE_ALIGNED; +-static u64 pgtable_level2[512] PAGE_ALIGNED; +- +-static void identity_map_page(unsigned long address) +-{ +- unsigned long level1_index, level2_index, level3_index; +- u64 *pgtable_level3; ++/* workaround for include/asm-i386/pgtable-3level.h */ + +- /* Find the current page table */ +- pgtable_level3 = __va(read_cr3()); +- +- /* Find the indexes of the physical address to identity map */ +- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; +- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; +- level3_index = address / LEVEL2_SIZE; +- +- /* Identity map the page table entry */ +- pgtable_level1[level1_index] = address | L0_ATTR; +- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; +- set_64bit(&pgtable_level3[level3_index], +- __pa(pgtable_level2) | L2_ATTR); +- +- /* Flush the tlb so the new mapping takes effect. +- * Global tlb entries are not flushed but that is not an issue. 
+- */ +- load_cr3(pgtable_level3); +-} ++#ifdef CONFIG_X86_PAE ++#undef pgd_present ++#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT) ++#define _PGD_ATTR _PAGE_PRESENT ++#else ++#define _PGD_ATTR _KERNPG_TABLE + #endif + +-static void set_idt(void *newidt, __u16 limit) +-{ +- struct Xgt_desc_struct curidt; +- +- /* ia32 supports unaliged loads & stores */ +- curidt.size = limit; +- curidt.address = (unsigned long)newidt; +- +- load_idt(&curidt); +-}; ++#define pa_page(page) __pa(page_address(page)) + +- +-static void set_gdt(void *newgdt, __u16 limit) ++static int create_mapping(struct page *root, struct page **pages, ++ unsigned long va, unsigned long pa) + { +- struct Xgt_desc_struct curgdt; +- +- /* ia32 supports unaligned loads & stores */ +- curgdt.size = limit; +- curgdt.address = (unsigned long)newgdt; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ int k = 0; + +- load_gdt(&curgdt); +-}; ++ pgd = (pgd_t *)page_address(root) + pgd_index(va); ++ if (!pgd_present(*pgd)) ++ set_pgd(pgd, __pgd(pa_page(pages[k++]) | _PGD_ATTR)); + +-static void load_segments(void) +-{ +-#define __STR(X) #X +-#define STR(X) __STR(X) ++ pud = pud_offset(pgd, va); ++ if (!pud_present(*pud)) ++ set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE)); + +- __asm__ __volatile__ ( +- "\tljmp $"STR(__KERNEL_CS)",$1f\n" +- "\t1:\n" +- "\tmovl $"STR(__KERNEL_DS)",%%eax\n" +- "\tmovl %%eax,%%ds\n" +- "\tmovl %%eax,%%es\n" +- "\tmovl %%eax,%%fs\n" +- "\tmovl %%eax,%%gs\n" +- "\tmovl %%eax,%%ss\n" +- ::: "eax", "memory"); +-#undef STR +-#undef __STR +-} ++ pmd = pmd_offset(pud, va); ++ if (!pmd_present(*pmd)) ++ set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE)); + +-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( +- unsigned long indirection_page, +- unsigned long reboot_code_buffer, +- unsigned long start_address, +- unsigned int has_pae) ATTRIB_NORET; ++ pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va); ++ set_pte(pte, __pte(pa | 
_PAGE_KERNEL_EXEC)); + +-const extern unsigned char relocate_new_kernel[]; +-extern void relocate_new_kernel_end(void); +-const extern unsigned int relocate_new_kernel_size; ++ return k; ++} + + /* + * A architecture hook called to validate the +@@ -147,11 +101,38 @@ const extern unsigned int relocate_new_k + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. +- * +- * Currently nothing. + */ + int machine_kexec_prepare(struct kimage *image) + { ++ void *control_page; ++ unsigned long pa; ++ int k; ++ ++ memset(&image->arch_data, 0, sizeof(image->arch_data)); ++ ++ k = allocate_page_table_a(image); ++ if (k) ++ return k; ++ ++ /* fill in control_page with assembly code */ ++ ++ control_page = page_address(image->control_code_page); ++ memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size); ++ ++ /* map the control_page at the virtual address of relocate_kernel.S */ ++ ++ pa = __pa(control_page); ++ ++ k = create_mapping(image->arch_data.page_table_a[0], ++ &image->arch_data.page_table_a[1], ++ (unsigned long)relocate_new_kernel, pa); ++ ++ /* identity map the control_page */ ++ ++ create_mapping(image->arch_data.page_table_a[0], ++ &image->arch_data.page_table_a[k + 1], ++ pa, pa); ++ + return 0; + } + +@@ -170,45 +151,16 @@ void machine_kexec_cleanup(struct kimage + NORET_TYPE void machine_kexec(struct kimage *image) + { + unsigned long page_list; +- unsigned long reboot_code_buffer; +- ++ unsigned long control_code; ++ unsigned long page_table_a; + relocate_new_kernel_t rnk; + +- /* Interrupts aren't acceptable while we reboot */ +- local_irq_disable(); +- +- /* Compute some offsets */ +- reboot_code_buffer = page_to_pfn(image->control_code_page) +- << PAGE_SHIFT; + page_list = image->head; +- +- /* Set up an identity mapping for the reboot_code_buffer */ +- identity_map_page(reboot_code_buffer); +- +- /* copy it out */ +- memcpy((void *)reboot_code_buffer, relocate_new_kernel, +- 
relocate_new_kernel_size); +- +- /* The segment registers are funny things, they are +- * automatically loaded from a table, in memory wherever you +- * set them to a specific selector, but this table is never +- * accessed again you set the segment to a different selector. +- * +- * The more common model is are caches where the behide +- * the scenes work is done, but is also dropped at arbitrary +- * times. +- * +- * I take advantage of this here by force loading the +- * segments, before I zap the gdt with an invalid value. +- */ +- load_segments(); +- /* The gdt & idt are now invalid. +- * If you want to load them you must set up your own idt & gdt. +- */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); ++ control_code = __pa(page_address(image->control_code_page)); ++ page_table_a = __pa(page_address(image->arch_data.page_table_a[0])); + + /* now call it */ +- rnk = (relocate_new_kernel_t) reboot_code_buffer; +- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); ++ rnk = (relocate_new_kernel_t) relocate_new_kernel; ++ (*rnk)(page_list, control_code, image->start, ++ page_table_a, (unsigned long)cpu_has_pae); + } +--- x/arch/i386/kernel/relocate_kernel.S ++++ x/arch/i386/kernel/relocate_kernel.S +@@ -2,12 +2,20 @@ + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + * ++ * 2006-05-19 Magnus Damm : ++ * - moved segment handling code from machine_kexec.c ++ * - gdt tables stolen from arch/i386/boot/setup.S ++ * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + #include ++#include + ++.text ++.align (1 << PAGE_SHIFT) ++ + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. 
+@@ -18,18 +26,68 @@ relocate_new_kernel: + movl 4(%esp), %ebx /* page_list */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ +- movl 16(%esp), %ecx /* cpu_has_pae */ ++ movl 16(%esp), %edi /* page_table_a */ ++ movl 20(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + ++ /* switch to page_table_a */ ++ movl %edi, %eax ++ movl %eax, %cr3 ++ ++ /* setup idt */ ++ ++ movl %ebp, %eax ++ addl $(idt_48 - relocate_new_kernel), %eax ++ lidtl (%eax) ++ ++ /* setup gdt */ ++ ++ movl %ebp, %eax ++ addl $(gdt - relocate_new_kernel), %eax ++ movl %ebp, %esi ++ addl $((gdt_48 - relocate_new_kernel) + 2), %esi ++ movl %eax, (%esi) ++ ++ movl %ebp, %eax ++ addl $(gdt_48 - relocate_new_kernel), %eax ++ lgdtl (%eax) ++ ++ /* setup data segment registers */ ++ ++ mov $(gdt_ds - gdt), %eax ++ mov %eax, %ds ++ mov %eax, %es ++ mov %eax, %fs ++ mov %eax, %gs ++ mov %eax, %ss ++ + /* set a new stack at the bottom of our page... 
*/ + lea 4096(%ebp), %esp + ++ /* load new code segment */ ++ ++ movl %ebp, %esi ++ xorl %eax, %eax ++ pushl %eax ++ pushl %esi ++ pushl %eax ++ ++ movl $(gdt_cs - gdt), %eax ++ pushl %eax ++ ++ movl %ebp, %eax ++ addl $(identity_mapped - relocate_new_kernel),%eax ++ pushl %eax ++ iretl ++ ++identity_mapped: ++ + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ +- ++ + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled +@@ -113,6 +171,36 @@ relocate_new_kernel: + xorl %edi, %edi + xorl %ebp, %ebp + ret ++ ++ .align 16 ++gdt: ++ .fill 1,8,0 ++ ++gdt_cs: ++ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) ++ .word 0 # base address = 0 ++ .word 0x9A00 # code read/exec ++ .word 0x00CF # granularity = 4096, 386 ++ # (+5th nibble of limit) ++gdt_ds: ++ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) ++ .word 0 # base address = 0 ++ .word 0x9200 # data read/write ++ .word 0x00CF # granularity = 4096, 386 ++ # (+5th nibble of limit) ++gdt_end: ++ .align 4 ++ ++ .word 0 # alignment byte ++idt_48: ++ .word 0 # idt limit = 0 ++ .word 0, 0 # idt base = 0L ++ ++ .word 0 # alignment byte ++gdt_48: ++ .word gdt_end - gdt - 1 # gdt limit ++ .word 0, 0 # gdt base (filled in later) ++ + relocate_new_kernel_end: + + .globl relocate_new_kernel_size +--- x/include/asm-i386/kexec.h ++++ x/include/asm-i386/kexec.h +@@ -29,7 +29,17 @@ + + #define MAX_NOTE_BYTES 1024 + +-struct kimage_arch {}; ++struct kimage_arch { ++ /* page_table_a[] holds enough pages to create a new page table ++ * that maps the control page twice.. ++ */ ++ ++#if defined(CONFIG_X86_PAE) ++ struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */ ++#else ++ struct page *page_table_a[3]; /* (2 * pte) + pgd */ ++#endif ++}; + + /* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code