kexec: x86 This is the x86 component of kexec for xen. The generic component is a prerequisite for this patch. The x86_64 or x86_32 (i386) patch is also needed in order to use this code, however the code should compile as is. Signed-Off-By: Horms Signed-Off-By: Magnus Damm xen/arch/x86/crash.c | 173 ++++++++++++++++++++++++++++++++++- xen/arch/x86/machine_kexec.c | 145 +++++++++++++++++++++++++++-- xen/arch/x86/setup.c | 75 +++++++++++++-- xen/arch/x86/x86_32/Makefile | 1 xen/arch/x86/x86_32/machine_kexec.c | 26 +++++ xen/arch/x86/x86_64/Makefile | 1 xen/arch/x86/x86_64/machine_kexec.c | 27 +++++ xen/include/asm-x86/elf.h | 27 +++++ xen/include/asm-x86/fixmap.h | 1 xen/include/asm-x86/hypercall.h | 5 + xen/include/asm-x86/kexec.h | 13 +- xen/include/asm-x86/x86_32/elf.h | 28 +++++ xen/include/asm-x86/x86_32/kexec.h | 48 +++++++++ xen/include/asm-x86/x86_64/elf.h | 28 +++++ xen/include/asm-x86/x86_64/kexec.h | 33 ++++++ xen/include/public/kexec.h | 3 xen/include/xen/elfcore.h | 3 17 files changed, 611 insertions(+), 26 deletions(-) --- x/xen/arch/x86/crash.c +++ x/xen/arch/x86/crash.c @@ -3,16 +3,183 @@ * * Created By: Horms * - * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16 + * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16 */ -#include /* for printk() used in stub */ +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +static int crashing_cpu; + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + Elf_Note note; + + note.namesz = strlen(name) + 1; + note.descsz = data_len; + note.type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) +3)/4; + memcpy(buf, name, note.namesz); + buf += (note.namesz + 3)/4; + memcpy(buf, data, note.descsz); + buf += (note.descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + Elf_Note note; + + note.namesz = 0; +
note.descsz = 0; + note.type = 0; + memcpy(buf, &note, sizeof(note)); +} + +static void crash_save_this_cpu(struct cpu_user_regs *regs, int cpu) +{ + ELF_Prstatus prstatus; + uint32_t *buf; + + printk("crash_save_this_cpu: %d\n", cpu); + + if ((cpu < 0) || (cpu >= NR_CPUS)) + return; + + /* Using ELF notes here is opportunistic. + * A well defined structure format with tags is needed + * ELF notes happen to provide this and there is infrastructure + * in the Linux kernel to support them. In order to make + * crash dumps produced by xen the same, the same + * technique is used here. + */ + + /* It should be safe to use per_cpu() here instead of per_cpu_ptr() + * (which does not exist in xen) as kexecing_lock must be held in + * order to get anywhere near here */ + buf = (uint32_t *)per_cpu(crash_notes, cpu); + if (!buf) /* XXX: Can this ever occur? */ + return; + memset(&prstatus, 0, sizeof(prstatus)); + /* XXX: Xen does not have processes. For the crashing CPU on a dom0 + * crash this could be passed down from dom0, but is this + * necessary? + * prstatus.pr_pid = current->pid; */ + ELF_CORE_COPY_REGS(prstatus.pr_reg, regs); + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + final_note(buf); +} + +static void crash_save_self(struct cpu_user_regs *regs) +{ + crash_save_this_cpu(regs, smp_processor_id()); +} + +#ifdef CONFIG_SMP +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu) +{ + struct cpu_user_regs fixed_regs; + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter.
+ */ + if (cpu == crashing_cpu) + return 1; + local_irq_disable(); + +#ifdef CONFIG_X86_32 + if (!user_mode(regs)) { + crash_fixup_ss_esp(&fixed_regs, regs); + regs = &fixed_regs; + } +#endif + crash_save_this_cpu(regs, cpu); + disable_local_APIC(); + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + __asm__ __volatile__ ( "hlt" ); + for(;;); + + return 1; + + /* Need to use this somewhere as Xen builds with -Werror */ + crash_setup_regs(&fixed_regs, regs); +} + +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +static void smp_send_nmi_allbutself(void) +{ + cpumask_t allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + send_IPI_mask(allbutself, APIC_DM_NMI); +} + +static void nmi_shootdown_cpus(void) +{ + unsigned long msecs; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? 
*/ + set_nmi_callback(crash_nmi_callback); + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ + disable_local_APIC(); +} +#else +static void nmi_shootdown_cpus(void) +{ + /* There are no cpus to shootdown */ +} +#endif + void machine_crash_shutdown(struct cpu_user_regs *regs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + printk("machine_crash_shutdown: %d\n", smp_processor_id()); + local_irq_disable(); + + crashing_cpu = smp_processor_id(); + nmi_shootdown_cpus(); +#ifdef CONFIG_X86_IO_APIC + disable_IO_APIC(); +#endif + crash_save_self(regs); } /* --- x/xen/arch/x86/machine_kexec.c +++ x/xen/arch/x86/machine_kexec.c @@ -5,34 +5,163 @@ * */ -#include /* for printk() used in stubs */ +#include +#include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include + +#define create_level_mapping(lvl, next, pages, nopages, k, va) \ +{ \ + lvl##_pgentry_t *table; \ + void *old = next; \ + \ + table = (lvl##_pgentry_t *)next + lvl##_table_offset(va); \ + if (!(lvl##e_get_flags(*table) & _PAGE_PRESENT)) { \ + if (k >= nopages || pages[k] == 0) \ + return -1; \ + *table = lvl##e_from_pfn(pages[k++]>>PAGE_SHIFT, __PAGE_HYPERVISOR); \ + } \ + next = map_domain_page(lvl##e_get_pfn(*table)); \ + unmap_domain_page(old); \ +} + +#define create_level_1_mapping(next, nopages, va, pa) \ +{ \ + l1_pgentry_t *table; \ + \ + table = (l1_pgentry_t *)next + l1_table_offset(va); \ + if (!(l1e_get_flags(*table) & _PAGE_PRESENT)) { \ + *table = l1e_from_pfn(pa >> PAGE_SHIFT, __PAGE_HYPERVISOR); \ + } \ + unmap_domain_page(next); \ +} + +static int create_mapping(unsigned long root, + unsigned long *pages, int nopages, + unsigned long va, unsigned 
long pa) +{ + void *next = map_domain_page(root >> PAGE_SHIFT); + int k = 0; + +#if CONFIG_PAGING_LEVELS >= 4 + create_level_mapping(l4, next, pages, nopages, k, va); +#endif +#if CONFIG_PAGING_LEVELS >= 3 + create_level_mapping(l3, next, pages, nopages, k, va); +#endif + create_level_mapping(l2, next, pages, nopages, k, va); + + create_level_1_mapping(next, nopages, va, pa); + + return k; +} + +static int setup_page_table_a(xen_kexec_image_t *image) +{ + void *page; + int k, n = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]); + + /* clear page_table_a pages */ + + for (k = 0; k < n; k++) { + if (!image->page_table_a[k]) + break; + + page = map_domain_page(image->page_table_a[k] >> PAGE_SHIFT); + clear_page(page); + unmap_domain_page(page); + } + + /* check that the first page (root page) is actually non-zero */ + + if (k == 0) + return -1; + + /* setup fixmap to point to our control page */ + + set_fixmap(FIX_KEXEC_PAGE, image->reboot_code_buffer); + + /* fill in page_table_a: create mapping at fixmap address */ + + k = create_mapping(image->page_table_a[0], + &image->page_table_a[1], + n - 1, fix_to_virt(FIX_KEXEC_PAGE), + image->reboot_code_buffer); + if (k < 0) + return -1; + + /* fill in page_table_a: create identity mapping */ + + k = create_mapping(image->page_table_a[0], + &image->page_table_a[1 + k], + n - (1 + k), image->reboot_code_buffer, + image->reboot_code_buffer); + if (k < 0) + return -1; + + return 0; +} int machine_kexec_load(int type, xen_kexec_image_t *image) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); - return -1; + return setup_page_table_a(image); } void machine_kexec_unload(int type, xen_kexec_image_t *image) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); } void machine_kexec_reserved(xen_kexec_reserve_t *reservation) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + reservation->size = opt_kdump_megabytes << 20; + reservation->start = 
opt_kdump_megabytes_base << 20; } -void machine_kexec(xen_kexec_image_t *image) +static void __machine_shutdown(void *data) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + xen_kexec_image_t *image = (xen_kexec_image_t *)data; + + watchdog_disable(); + console_start_sync(); + + smp_send_stop(); + +#ifdef CONFIG_X86_IO_APIC + disable_IO_APIC(); +#endif + + machine_kexec(image); } void machine_shutdown(xen_kexec_image_t *image) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + int reboot_cpu_id; + cpumask_t reboot_cpu; + + reboot_cpu_id = 0; + + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) + reboot_cpu_id = smp_processor_id(); + + if (reboot_cpu_id != smp_processor_id()) { + cpus_clear(reboot_cpu); + cpu_set(reboot_cpu_id, reboot_cpu); + on_selected_cpus(reboot_cpu, __machine_shutdown, image, 1, 0); + for (;;) + ; /* nothing */ + } + else + __machine_shutdown(image); + BUG(); } /* --- x/xen/arch/x86/setup.c +++ x/xen/arch/x86/setup.c @@ -39,6 +39,11 @@ static unsigned int opt_xenheap_megabyte integer_param("xenheap_megabytes", opt_xenheap_megabytes); #endif +unsigned int opt_kdump_megabytes = 0; +integer_param("kdump_megabytes", opt_kdump_megabytes); +unsigned int opt_kdump_megabytes_base = 0; +integer_param("kdump_megabytes_base", opt_kdump_megabytes_base); + /* opt_nosmp: If true, secondary processors are ignored. 
*/ static int opt_nosmp = 0; boolean_param("nosmp", opt_nosmp); @@ -220,6 +225,20 @@ static void __init init_idle_domain(void setup_idle_pagetable(); } +void __init move_memory(unsigned long dst, + unsigned long src_start, unsigned long src_end) +{ +#if defined(CONFIG_X86_32) + memmove((void *)dst, /* use low mapping */ + (void *)src_start, /* use low mapping */ + src_end - src_start); +#elif defined(CONFIG_X86_64) + memmove(__va(dst), + __va(src_start), + src_end - src_start); +#endif +} + void __init __start_xen(multiboot_info_t *mbi) { char __cmdline[] = "", *cmdline = __cmdline; @@ -353,15 +372,8 @@ void __init __start_xen(multiboot_info_t initial_images_start = xenheap_phys_end; initial_images_end = initial_images_start + modules_length; -#if defined(CONFIG_X86_32) - memmove((void *)initial_images_start, /* use low mapping */ - (void *)mod[0].mod_start, /* use low mapping */ - mod[mbi->mods_count-1].mod_end - mod[0].mod_start); -#elif defined(CONFIG_X86_64) - memmove(__va(initial_images_start), - __va(mod[0].mod_start), - mod[mbi->mods_count-1].mod_end - mod[0].mod_start); -#endif + move_memory(initial_images_start, + mod[0].mod_start, mod[mbi->mods_count-1].mod_end); /* Initialise boot-time allocator with all RAM situated after modules. 
*/ xenheap_phys_start = init_boot_allocator(__pa(&_end)); @@ -409,6 +421,51 @@ void __init __start_xen(multiboot_info_t #endif } + if (opt_kdump_megabytes) { + unsigned long kdump_start, kdump_size, k; + + /* mark images pages as free for now */ + + init_boot_pages(initial_images_start, initial_images_end); + + kdump_start = opt_kdump_megabytes_base << 20; + kdump_size = opt_kdump_megabytes << 20; + + printk("Kdump: %luMB (%lukB) at 0x%lx\n", + kdump_size >> 20, + kdump_size >> 10, + kdump_start); + + if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK)) + panic("Kdump parameters not page aligned\n"); + + kdump_start >>= PAGE_SHIFT; + kdump_size >>= PAGE_SHIFT; + + /* allocate pages for Kdump memory area */ + + k = alloc_boot_pages_at(kdump_size, kdump_start); + + if (k != kdump_start) + panic("Unable to reserve Kdump memory\n"); + + /* allocate pages for relocated initial images */ + + k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0; + k += (initial_images_end - initial_images_start) >> PAGE_SHIFT; + + k = alloc_boot_pages(k, 1); + + if (!k) + panic("Unable to allocate initial images memory\n"); + + move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end); + + initial_images_end -= initial_images_start; + initial_images_start = k << PAGE_SHIFT; + initial_images_end += initial_images_start; + } + memguard_init(); percpu_guard_areas(); --- x/xen/arch/x86/x86_32/Makefile +++ x/xen/arch/x86/x86_32/Makefile @@ -3,5 +3,6 @@ obj-y += entry.o obj-y += mm.o obj-y += seg_fixup.o obj-y += traps.o +obj-y += machine_kexec.o obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o --- /dev/null +++ x/xen/arch/x86/x86_32/machine_kexec.c @@ -0,0 +1,26 @@ +/* + * arch/x86/x86_32/machine_kexec.c + * Handle transition of Linux booting another kernel + * + * Created By: Horms + * + * Should be losely based on arch/i386/kernel/machine_kexec.c + */ + +#include /* for printk() used in stub */ +#include + +void machine_kexec(xen_kexec_image_t 
*image) +{ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- x/xen/arch/x86/x86_64/Makefile +++ x/xen/arch/x86/x86_64/Makefile @@ -1,3 +1,4 @@ obj-y += entry.o obj-y += mm.o obj-y += traps.o +obj-y += machine_kexec.o --- /dev/null +++ x/xen/arch/x86/x86_64/machine_kexec.c @@ -0,0 +1,27 @@ +/****************************************************************************** + * arch/x86/x86_64/machine_kexec.c + * Handle transition of Linux booting another kernel + * + * Created By: Horms + * + * Should be loosely based on arch/x86_64/kernel/machine_kexec.c + */ + +#include /* for printk() used in stub */ +#include +#include + +void machine_kexec(xen_kexec_image_t *image) +{ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- /dev/null +++ x/xen/include/asm-x86/elf.h @@ -0,0 +1,27 @@ +/****************************************************************************** + * include/asm-x86/elf.h + * + * Created By: Horms + * + */ + +#ifndef __X86_ELF_H__ +#define __X86_ELF_H__ + +#ifdef __x86_64__ +#include +#else +#include +#endif + +#endif /* __X86_ELF_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- x/xen/include/asm-x86/fixmap.h +++ x/xen/include/asm-x86/fixmap.h @@ -36,6 +36,7 @@ enum fixed_addresses { FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, FIX_HPET_BASE, FIX_CYCLONE_TIMER, + FIX_KEXEC_PAGE, __end_of_fixed_addresses }; --- x/xen/include/asm-x86/hypercall.h +++ x/xen/include/asm-x86/hypercall.h @@ -6,6 +6,7 @@ #define __ASM_X86_HYPERCALL_H__ #include +#include extern long do_event_channel_op_compat( @@ -87,6 +88,10 @@ extern long
arch_do_vcpu_op( int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg); +extern int +do_kexec( + unsigned long op, unsigned arg1, XEN_GUEST_HANDLE(void) uarg); + #ifdef __x86_64__ extern long --- x/xen/include/asm-x86/kexec.h +++ x/xen/include/asm-x86/kexec.h @@ -8,15 +8,16 @@ #ifndef __X86_KEXEC_H__ #define __X86_KEXEC_H__ -#include /* for printk() used in stub */ +#include #include +#include #include -static void crash_setup_regs(struct cpu_user_regs *newregs, - struct cpu_user_regs *oldregs) -{ - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); -} +#ifdef __x86_64__ +#include +#else +#include +#endif #endif /* __X86_KEXEC_H__ */ --- /dev/null +++ x/xen/include/asm-x86/x86_32/elf.h @@ -0,0 +1,28 @@ +/****************************************************************************** + * include/asm-x86/x86_32/elf.h + * + * Created By: Horms + * + * Should be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS + * from Linux 2.6.16 + */ + +#ifndef __X86_ELF_X86_32_H__ +#define __X86_ELF_X86_32_H__ + +#include /* for printk() used in stub */ + +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + +#endif /* __X86_ELF_X86_32_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- /dev/null +++ x/xen/include/asm-x86/x86_32/kexec.h @@ -0,0 +1,48 @@ +/****************************************************************************** + * include/asm-x86/x86_32/kexec.h + * + * Created By: Horms + * + * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16 + * + */ + +#ifndef __X86_32_KEXEC_H__ +#define __X86_32_KEXEC_H__ + +#include /* for printk() used in stub */ +#include +#include + +static void crash_fixup_ss_esp(struct cpu_user_regs *newregs, + struct cpu_user_regs *oldregs) +{ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + return; + crash_fixup_ss_esp(newregs,
oldregs); +} + +static void crash_setup_regs(struct cpu_user_regs *newregs, + struct cpu_user_regs *oldregs) +{ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); +} + +static inline int user_mode(struct cpu_user_regs *regs) +{ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + return -1; +} + + +#endif /* __X86_32_KEXEC_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- /dev/null +++ x/xen/include/asm-x86/x86_64/elf.h @@ -0,0 +1,28 @@ +/****************************************************************************** + * include/asm-x86/x86_64/elf.h + * + * Created By: Horms + * + * Should be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS + * from Linux 2.6.16 + */ + +#ifndef __X86_ELF_X86_64_H__ +#define __X86_ELF_X86_64_H__ + +#include /* for printk() used in stub */ + +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + +#endif /* __X86_ELF_X86_64_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- /dev/null +++ x/xen/include/asm-x86/x86_64/kexec.h @@ -0,0 +1,33 @@ +/****************************************************************************** + * include/asm-x86/x86_64/kexec.h + * + * Created By: Horms + * + * Should be based heavily on include/asm-x86_64/kexec.h from Linux 2.6.16 + * + */ + +#ifndef __X86_64_KEXEC_H__ +#define __X86_64_KEXEC_H__ + +#include /* for printk() used in stub */ +#include +#include + +static void crash_setup_regs(struct cpu_user_regs *newregs, + struct cpu_user_regs *oldregs) +{ + printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); +} + +#endif /* __X86_64_KEXEC_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ ---
x/xen/include/public/kexec.h +++ x/xen/include/public/kexec.h @@ -43,6 +43,9 @@ */ #define KEXEC_CMD_kexec_load 1 typedef struct xen_kexec_image { +#if defined(__i386__) || defined(__x86_64__) + unsigned long page_table_a[7]; +#endif unsigned long indirection_page; unsigned long reboot_code_buffer; unsigned long start_address; --- x/xen/include/xen/elfcore.h +++ x/xen/include/xen/elfcore.h @@ -16,6 +16,9 @@ #include #define NT_PRSTATUS 1 +#define NT_XEN_DOM0_CR3 0x10000001 /* XXX: Hopefully this is unused, + feel free to change to a + better/different value */ typedef struct {