kexec: x86_32 This is the x86_32 component of kexec for xen. The x86 component is a prerequisite for this patch. buildconfigs/linux-defconfig_xen_x86_32 | 1 linux-2.6-xen-sparse/arch/i386/Kconfig | 2 linux-2.6-xen-sparse/arch/i386/kernel/Makefile | 2 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c | 25 + linux-2.6-xen-sparse/drivers/xen/core/crash.c | 2 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h | 19 + linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h | 10 patches/linux-2.6.16.13/kexec-x86_32.patch | 12 xen/arch/x86/machine_kexec.c | 6 xen/arch/x86/x86_32/entry.S | 2 xen/arch/x86/x86_32/machine_kexec.c | 143 +++++++++- xen/include/asm-x86/x86_32/elf.h | 34 +- xen/include/asm-x86/x86_32/kexec.h | 68 +++- xen/include/xen/kexec.h | 1 14 files changed, 281 insertions(+), 46 deletions(-) --- x/buildconfigs/linux-defconfig_xen_x86_32 +++ x/buildconfigs/linux-defconfig_xen_x86_32 @@ -184,6 +184,7 @@ CONFIG_MTRR=y CONFIG_REGPARM=y CONFIG_SECCOMP=y CONFIG_HZ_100=y +CONFIG_KEXEC=y # CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=100 --- x/linux-2.6-xen-sparse/arch/i386/Kconfig +++ x/linux-2.6-xen-sparse/arch/i386/Kconfig @@ -726,7 +726,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL && !X86_XEN + depends on EXPERIMENTAL help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot --- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile +++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile @@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen obj-y += fixup.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o -n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) obj-y := $(call cherrypickxen, $(obj-y)) --- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c +++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c @@ -68,6 +68,10 @@ #include "setup_arch_pre.h" #include +#ifdef CONFIG_XEN +#include +#endif + /* Forward Declaration. */ void __init find_max_pfn(void); @@ -932,6 +936,7 @@ static void __init parse_cmdline_early ( * after a kernel panic. */ else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN unsigned long size, base; size = memparse(from+12, &from); if (*from == '@') { @@ -942,6 +947,10 @@ static void __init parse_cmdline_early ( crashk_res.start = base; crashk_res.end = base + size - 1; } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif } #endif #ifdef CONFIG_PROC_VMCORE @@ -1318,9 +1327,22 @@ void __init setup_bootmem_allocator(void } #endif #ifdef CONFIG_KEXEC +#ifndef CONFIG_XEN if (crashk_res.start != crashk_res.end) reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); +#else + { + struct kexec_arg xen_kexec_arg; + BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0, + &xen_kexec_arg)); + if (xen_kexec_arg.u.reserve.size) { + crashk_res.start = xen_kexec_arg.u.reserve.start; + crashk_res.end = xen_kexec_arg.u.reserve.start + + xen_kexec_arg.u.reserve.size - 1; + } + } +#endif #endif if (!xen_feature(XENFEAT_auto_translated_physmap)) @@ -1395,6 +1417,9 @@ legacy_init_iomem_resources(struct resou res->end = map[i].end - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 
request_resource(&iomem_resource, res); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } free_bootmem(__pa(map), PAGE_SIZE); --- x/linux-2.6-xen-sparse/drivers/xen/core/crash.c +++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c @@ -1,5 +1,5 @@ /* - * Architecture independent functions for kexec based crash dumps in xen. + * Architecture specific (i386-xen) functions for kexec based crash dumps. * * Created by: Horms * --- x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h +++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h @@ -7,11 +7,26 @@ #ifndef _I386_KEXEC_XEN_H #define _I386_KEXEC_XEN_H +#include +#include +#include + static inline void crash_translate_regs(struct pt_regs *linux_regs, struct cpu_user_regs *xen_regs) { - printk("STUB: include/asm-i386/kexec-xen.h: crash_translate_regs: " - "not implemented\n"); + xen_regs->ebx = linux_regs->ebx; + xen_regs->ecx = linux_regs->ecx; + xen_regs->edx = linux_regs->edx; + xen_regs->esi = linux_regs->esi; + xen_regs->edi = linux_regs->edi; + xen_regs->ebp = linux_regs->ebp; + xen_regs->eax = linux_regs->eax; + xen_regs->esp = linux_regs->esp; + xen_regs->ss = linux_regs->xss; + xen_regs->cs = linux_regs->xcs; + xen_regs->ds = linux_regs->xds; + xen_regs->es = linux_regs->xes; + xen_regs->eflags = linux_regs->eflags; } #endif /* _I386_KEXEC_XEN_H */ --- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h +++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h @@ -39,6 +39,8 @@ # error "please don't include this file directly" #endif +#include + #define __STR(x) #x #define STR(x) __STR(x) @@ -359,6 +361,14 @@ HYPERVISOR_xenoprof_op( return _hypercall2(int, xenoprof_op, op, arg); } +static inline int +HYPERVISOR_kexec( + unsigned long op, int type, kexec_arg_t * arg) +{ + return _hypercall3(int, kexec_op, op, type, arg); +} + + #endif /* __HYPERCALL_H__ */ --- x/xen/arch/x86/machine_kexec.c +++ x/xen/arch/x86/machine_kexec.c @@ -25,12 +25,6 @@ void 
machine_kexec_cleanup(int type, str { } -void machine_kexec_reserved(struct kexec_arg *arg) -{ - arg->u.reserve.size = opt_kdump_megabytes << 20; - arg->u.reserve.start = opt_kdump_megabytes_base << 20; -} - static void __machine_shutdown(void *data) { struct kexec_arg *arg = (struct kexec_arg *)data; --- x/xen/arch/x86/x86_32/entry.S +++ x/xen/arch/x86/x86_32/entry.S @@ -648,6 +648,7 @@ ENTRY(hypercall_table) .long do_xenoprof_op .long do_event_channel_op .long do_physdev_op + .long do_kexec .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr @@ -687,6 +688,7 @@ ENTRY(hypercall_args_table) .byte 2 /* do_xenoprof_op */ .byte 2 /* do_event_channel_op */ .byte 2 /* do_physdev_op */ + .byte 3 /* do_kexec */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr --- x/xen/arch/x86/x86_32/machine_kexec.c +++ x/xen/arch/x86/x86_32/machine_kexec.c @@ -1,19 +1,146 @@ -/* +/****************************************************************************** * arch/x86/x86_32/machine_kexec.c - * Handle transition of Linux booting another kernel - * - * Created By: Horms + * + * Created By: Horms * - * Should be losely based on arch/i386/kernel/machine_kexec.c + * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16 */ -#include /* for printk() used in stub */ +#include +#include +#include +#include +#include +#include #include +typedef asmlinkage void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address, + unsigned int has_pae); + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L2_ATTR (_PAGE_PRESENT) + +#ifndef CONFIG_X86_PAE + +static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long mfn; + u32 
*pgtable_level2; + + /* Find the current page table */ + mfn = read_cr3() >> PAGE_SHIFT; + pgtable_level2 = map_domain_page(mfn); + + /* Identity map the page table entry */ + pgtable_level1[l1_table_offset(address)] = address | L0_ATTR; + pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR; + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. + */ + write_cr3(mfn << PAGE_SHIFT); + + unmap_domain_page(pgtable_level2); +} + +#else +static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED; +static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + int mfn; + intpte_t *pgtable_level3; + + /* Find the current page table */ + mfn = read_cr3() >> PAGE_SHIFT; + pgtable_level3 = map_domain_page(mfn); + + /* Identity map the page table entry */ + pgtable_level1[l1_table_offset(address)] = address | L0_ATTR; + pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR; + set_64bit(&pgtable_level3[l3_table_offset(address)], + __pa(pgtable_level2) | L2_ATTR); + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. 
+ */ + load_cr3(mfn << PAGE_SHIFT); + + unmap_domain_page(pgtable_level3); +} +#endif + +static void kexec_load_segments(void) +{ +#define __SSTR(X) #X +#define SSTR(X) __SSTR(X) + __asm__ __volatile__ ( + "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n" + "\t1:\n" + "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n" + "\tmovl %%eax,%%ds\n" + "\tmovl %%eax,%%es\n" + "\tmovl %%eax,%%fs\n" + "\tmovl %%eax,%%gs\n" + "\tmovl %%eax,%%ss\n" + ::: "eax", "memory"); +#undef SSTR +#undef __SSTR +} + +#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) +static void kexec_set_idt(void *newidt, __u16 limit) +{ + struct Xgt_desc_struct curidt; + + /* ia32 supports unaligned loads & stores */ + curidt.size = limit; + curidt.address = (unsigned long)newidt; + + kexec_load_idt(&curidt); + +}; + +#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) +static void kexec_set_gdt(void *newgdt, __u16 limit) +{ + struct Xgt_desc_struct curgdt; + + /* ia32 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + kexec_load_gdt(&curgdt); +}; + void machine_kexec(struct kexec_arg *arg) { - printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: " - "not implemented\n"); + relocate_new_kernel_t rnk; + + local_irq_disable(); + + identity_map_page(arg->u.image.reboot_code_buffer); + + copy_from_user((void *)arg->u.image.reboot_code_buffer, + arg->u.image.relocate_new_kernel, + arg->u.image.relocate_new_kernel_size); + + kexec_load_segments(); + kexec_set_gdt(__va(0),0); + kexec_set_idt(__va(0),0); + + rnk = (relocate_new_kernel_t) arg->u.image.reboot_code_buffer; + (*rnk)(arg->u.image.indirection_page, arg->u.image.reboot_code_buffer, + arg->u.image.start_address, cpu_has_pae); } /* --- x/xen/include/asm-x86/x86_32/elf.h +++ x/xen/include/asm-x86/x86_32/elf.h @@ -3,19 +3,39 @@ * * Created By: Horms * - * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS - * from Linux 2.6.16 + * Based heavily on 
include/asm-i386/elf.h and + * include/asm-i386/system.h from Linux 2.6.16 */ #ifndef __X86_ELF_X86_32_H__ #define __X86_ELF_X86_32_H__ -#include /* for printk() used in stub */ +/* XXX: Xen doesn't have orig_eax. For kdump, on a dom0 crash, the values + * for the crashing CPU could be passed down from dom0, but is that + * necessary? + * Also, I'm not sure why fs and gs are derived from the CPU + * rather than regs */ -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - printk("STUB: include/asm-x86/x86_32/kexec.h: ELF_CORE_COPY_REGS: " \ - "not implemented\n") - +#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ + unsigned i; \ + pr_reg[0] = regs->ebx; \ + pr_reg[1] = regs->ecx; \ + pr_reg[2] = regs->edx; \ + pr_reg[3] = regs->esi; \ + pr_reg[4] = regs->edi; \ + pr_reg[5] = regs->ebp; \ + pr_reg[6] = regs->eax; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9] = i; \ + asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i; \ + pr_reg[11] = 0; /* regs->orig_eax; */ \ + pr_reg[12] = regs->eip; \ + pr_reg[13] = regs->cs; \ + pr_reg[14] = regs->eflags; \ + pr_reg[15] = regs->esp; \ + pr_reg[16] = regs->ss; \ +} while(0); #endif /* __X86_ELF_X86_32_H__ */ --- x/xen/include/asm-x86/x86_32/kexec.h +++ x/xen/include/asm-x86/x86_32/kexec.h @@ -3,42 +3,72 @@ * * Created By: Horms * - * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16 - * + * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16 */ -#ifndef __X86_32_KEXEC_H__ -#define __X86_32_KEXEC_H__ - -#include /* for printk() used in stub */ -#include -#include +#ifndef __X86_KEXEC_X86_32_H__ +#define __X86_KEXEC_X86_32_H__ +/* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code + * fixes it. 
+ */ static void crash_fixup_ss_esp(struct cpu_user_regs *newregs, - struct cpu_user_regs *oldregs) + struct cpu_user_regs *oldregs) { - printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: " - "not implemented\n"); - return; - crash_fixup_ss_esp(newregs, oldregs); + memcpy(newregs, oldregs, sizeof(*newregs)); + newregs->esp = (unsigned long)&(oldregs->esp); + __asm__ __volatile__( + "xorl %%eax, %%eax\n\t" + "movw %%ss, %%ax\n\t" + :"=a"(newregs->ss)); } +/* + * This function is responsible for capturing register states if coming + * via panic otherwise just fix up the ss and esp if coming via kernel + * mode exception. + */ static void crash_setup_regs(struct cpu_user_regs *newregs, struct cpu_user_regs *oldregs) { - printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: " - "not implemented\n"); + if (oldregs) + crash_fixup_ss_esp(newregs, oldregs); + else { + __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags)); + + newregs->eip = (unsigned long)current_text_addr(); + } } +/* + * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h + * + * user_mode_vm(regs) determines whether a register set came from user mode. 
+ * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. This tricky test checks that with + * one comparison. Many places in the kernel can bypass this full check + * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + */ static inline int user_mode(struct cpu_user_regs *regs) { - printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: " - "not implemented\n"); - return -1; + return (regs->cs & 2) != 0; } -#endif /* __X86_32_KEXEC_H__ */ +#endif /* __X86_KEXEC_X86_32_H__ */ /* * Local variables: --- x/xen/include/xen/kexec.h +++ x/xen/include/xen/kexec.h @@ -14,7 +14,6 @@ DECLARE_PER_CPU (note_buf_t, crash_notes int machine_kexec_prepare(int type, struct kexec_arg *arg); void machine_kexec_cleanup(int type, struct kexec_arg *arg); -void machine_kexec_reserved(struct kexec_arg *arg); void machine_kexec(struct kexec_arg *arg); void machine_shutdown(struct kexec_arg *arg); void machine_crash_shutdown(struct cpu_user_regs *regs); --- /dev/null 2006-05-09 15:32:30.399072192 +0900 +++ x/patches/linux-2.6.16.13/kexec-x86_32.patch 2006-05-17 18:37:45.000000000 +0900 @@ -0,0 +1,12 @@ +--- x/arch/i386/kernel/crash.c ++++ x/arch/i386/kernel/crash.c +@@ -175,9 +175,5 @@ void machine_crash_shutdown(struct pt_re + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = smp_processor_id(); + nmi_shootdown_cpus(); +- lapic_shutdown(); +-#if defined(CONFIG_X86_IO_APIC) +- disable_IO_APIC(); +-#endif + crash_save_self(regs); + }