[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 03/04] Kexec / Kdump: x86_32 specific code



[PATCH 03/04] Kexec / Kdump: x86_32 specific code

This patch contains the x86_32 implementation of Kexec / Kdump for Xen.

Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
---

 Applies on top of xen-unstable-11760.

 buildconfigs/linux-defconfig_xen_x86_32                        |    2
 linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   25
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |    8
 patches/linux-2.6.16.29/series                                 |    3
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h              |   57 +
 patches/linux-2.6.16.29/git-35..cc9.patch                      |  401 +++++++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexe..code-i386.patch |  169 ++++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch  |   54 +
 xen/arch/x86/crash.c                                           |   47 +
 xen/arch/x86/x86_32/entry.S                                    |    2
 xen/arch/x86/x86_32/machine_kexec.c                            |   25
 xen/include/asm-x86/x86_32/elf.h                               |   32
 xen/include/asm-x86/x86_32/kexec.h                             |   65 +
 15 files changed, 863 insertions(+), 31 deletions(-)

--- 0002/buildconfigs/linux-defconfig_xen_x86_32
+++ work/buildconfigs/linux-defconfig_xen_x86_32        2006-10-16 12:23:54.000000000 +0900
@@ -183,6 +183,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
@@ -1036,6 +1037,7 @@ CONFIG_DNOTIFY=y
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- 0001/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ work/linux-2.6-xen-sparse/arch/i386/Kconfig 2006-10-16 12:23:54.000000000 +0900
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
-       depends on EXPERIMENTAL && !X86_XEN
+       depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/Makefile 2006-10-16 12:23:54.000000000 +0900
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c      2006-10-16 12:40:53.000000000 +0900
@@ -69,6 +69,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -943,6 +947,7 @@ static void __init parse_cmdline_early (
                 * after a kernel panic.
                 */
                else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
                        unsigned long size, base;
                        size = memparse(from+12, &from);
                        if (*from == '@') {
@@ -953,6 +958,10 @@ static void __init parse_cmdline_early (
                                crashk_res.start = base;
                                crashk_res.end   = base + size - 1;
                        }
+#else
+                       printk("Ignoring crashkernel command line, "
+                              "parameter will be supplied by xen\n");
+#endif
                }
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1322,9 +1331,22 @@ void __init setup_bootmem_allocator(void
        }
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
        if (crashk_res.start != crashk_res.end)
                reserve_bootmem(crashk_res.start,
                        crashk_res.end - crashk_res.start + 1);
+#else
+       {
+               xen_kexec_reserve_t reservation;
+               BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+                                       &reservation));
+               if (reservation.size) {
+                       crashk_res.start = reservation.start;
+                       crashk_res.end = reservation.start + 
+                               reservation.size - 1;
+               }
+       }
+#endif
 #endif
 
        if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1389,7 +1411,8 @@ legacy_init_iomem_resources(struct e820e
                        request_resource(res, data_resource);
 #endif
 #ifdef CONFIG_KEXEC
-                       request_resource(res, &crashk_res);
+                       if (crashk_res.start != crashk_res.end)
+                            request_resource(res, &crashk_res);
 #endif
                }
        }
--- /dev/null
+++ work/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h      2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,57 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+                                       struct cpu_user_regs *xen_regs)
+{
+       xen_regs->ebx    = linux_regs->ebx;
+       xen_regs->ecx    = linux_regs->ecx;
+       xen_regs->edx    = linux_regs->edx;
+       xen_regs->esi    = linux_regs->esi;
+       xen_regs->edi    = linux_regs->edi;
+       xen_regs->ebp    = linux_regs->ebp;
+       xen_regs->eax    = linux_regs->eax;
+       xen_regs->esp    = linux_regs->esp;
+       xen_regs->ss     = linux_regs->xss;
+       xen_regs->cs     = linux_regs->xcs;
+       xen_regs->ds     = linux_regs->xds;
+       xen_regs->es     = linux_regs->xes;
+       xen_regs->eflags = linux_regs->eflags;
+}
+
+/* Kexec needs to know about the actual physical address.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical address. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- 0001/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ work/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h 2006-10-16 12:23:54.000000000 +0900
@@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op(
        return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+       unsigned long op, unsigned int arg1, void * extra_args)
+{
+       return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- /dev/null
+++ work/patches/linux-2.6.16.29/git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch     2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,401 @@
+From: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200)
+Subject: [PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3566561bfadffcb5dbc85d576be80c0dbf2cccc9
+
+[PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+
+kexec: Avoid overwriting the current pgd (V4, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables. These tables are used to provide
+an executable identity mapping without overwriting the current pgd.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Signed-off-by: Andi Kleen <ak@xxxxxxx>
+---
+
+--- a/arch/i386/kernel/machine_kexec.c
++++ b/arch/i386/kernel/machine_kexec.c
+@@ -21,70 +21,13 @@
+ #include <asm/system.h>
+ 
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
+-
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-      unsigned long level1_index, level2_index;
+-      u32 *pgtable_level2;
+-
+-      /* Find the current page table */
+-      pgtable_level2 = __va(read_cr3());
+-
+-      /* Find the indexes of the physical address to identity map */
+-      level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-      level2_index = address / LEVEL1_SIZE;
+-
+-      /* Identity map the page table entry */
+-      pgtable_level1[level1_index] = address | L0_ATTR;
+-      pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+-      /* Flush the tlb so the new mapping takes effect.
+-       * Global tlb entries are not flushed but that is not an issue.
+-       */
+-      load_cr3(pgtable_level2);
+-}
+-
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-      unsigned long level1_index, level2_index, level3_index;
+-      u64 *pgtable_level3;
+-
+-      /* Find the current page table */
+-      pgtable_level3 = __va(read_cr3());
+-
+-      /* Find the indexes of the physical address to identity map */
+-      level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-      level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-      level3_index = address / LEVEL2_SIZE;
+-
+-      /* Identity map the page table entry */
+-      pgtable_level1[level1_index] = address | L0_ATTR;
+-      pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-      set_64bit(&pgtable_level3[level3_index],
+-                                             __pa(pgtable_level2) | L2_ATTR);
+-
+-      /* Flush the tlb so the new mapping takes effect.
+-       * Global tlb entries are not flushed but that is not an issue.
+-       */
+-      load_cr3(pgtable_level3);
+-}
++static u32 kexec_pgd[1024] PAGE_ALIGNED;
++#ifdef CONFIG_X86_PAE
++static u32 kexec_pmd0[1024] PAGE_ALIGNED;
++static u32 kexec_pmd1[1024] PAGE_ALIGNED;
+ #endif
++static u32 kexec_pte0[1024] PAGE_ALIGNED;
++static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
+ static void set_idt(void *newidt, __u16 limit)
+ {
+@@ -128,16 +71,6 @@ static void load_segments(void)
+ #undef __STR
+ }
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-                                      unsigned long indirection_page,
+-                                      unsigned long reboot_code_buffer,
+-                                      unsigned long start_address,
+-                                      unsigned int has_pae) ATTRIB_NORET;
+-
+-extern const unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-extern const unsigned int relocate_new_kernel_size;
+-
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
+@@ -170,25 +103,29 @@ void machine_kexec_cleanup(struct kimage
+  */
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+-      unsigned long page_list;
+-      unsigned long reboot_code_buffer;
+-
+-      relocate_new_kernel_t rnk;
++      unsigned long page_list[PAGES_NR];
++      void *control_page;
+ 
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+ 
+-      /* Compute some offsets */
+-      reboot_code_buffer = page_to_pfn(image->control_code_page)
+-                                                              << PAGE_SHIFT;
+-      page_list = image->head;
+-
+-      /* Set up an identity mapping for the reboot_code_buffer */
+-      identity_map_page(reboot_code_buffer);
+-
+-      /* copy it out */
+-      memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-                                              relocate_new_kernel_size);
++      control_page = page_address(image->control_code_page);
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      page_list[PA_CONTROL_PAGE] = __pa(control_page);
++      page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
++      page_list[PA_PGD] = __pa(kexec_pgd);
++      page_list[VA_PGD] = (unsigned long)kexec_pgd;
++#ifdef CONFIG_X86_PAE
++      page_list[PA_PMD_0] = __pa(kexec_pmd0);
++      page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
++      page_list[PA_PMD_1] = __pa(kexec_pmd1);
++      page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
++#endif
++      page_list[PA_PTE_0] = __pa(kexec_pte0);
++      page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
++      page_list[PA_PTE_1] = __pa(kexec_pte1);
++      page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
+       /* The segment registers are funny things, they have both a
+        * visible and an invisible part.  Whenever the visible part is
+@@ -207,8 +144,8 @@ NORET_TYPE void machine_kexec(struct kim
+       set_idt(phys_to_virt(0),0);
+ 
+       /* now call it */
+-      rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-      (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++      relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
++                      image->start, cpu_has_pae);
+ }
+ 
+ /* crashkernel=size@addr specifies the location to reserve for
+--- a/arch/i386/kernel/relocate_kernel.S
++++ b/arch/i386/kernel/relocate_kernel.S
+@@ -7,16 +7,138 @@
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++#include <asm/kexec.h>
++
++/*
++ * Must be relocatable PIC code callable as a C function
++ */
++
++#define PTR(x) (x << 2)
++#define PAGE_ALIGNED (1 << PAGE_SHIFT)
++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
++#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
++
++      .text
++      .align PAGE_ALIGNED
++      .globl relocate_kernel
++relocate_kernel:
++      movl    8(%esp), %ebp /* list of pages */
++
++#ifdef CONFIG_X86_PAE
++      /* map the control page at its virtual address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xc0000000, %eax
++      shrl    $27, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PMD_0)(%ebp), %edx
++      orl     $PAE_PGD_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PMD_0)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x3fe00000, %eax
++      shrl    $18, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_0)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_0)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x001ff000, %eax
++      shrl    $9, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      /* identity map the control page at its physical address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xc0000000, %eax
++      shrl    $27, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PMD_1)(%ebp), %edx
++      orl     $PAE_PGD_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PMD_1)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x3fe00000, %eax
++      shrl    $18, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_1)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_1)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x001ff000, %eax
++      shrl    $9, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++#else
++      /* map the control page at its virtual address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xffc00000, %eax
++      shrl    $20, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_0)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_0)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x003ff000, %eax
++      shrl    $10, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      /* identity map the control page at its physical address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xffc00000, %eax
++      shrl    $20, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_1)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_1)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x003ff000, %eax
++      shrl    $10, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++#endif
+ 
+-      /*
+-       * Must be relocatable PIC code callable as a C function, that once
+-       * it starts can not use the previous processes stack.
+-       */
+-      .globl relocate_new_kernel
+ relocate_new_kernel:
+       /* read the arguments and say goodbye to the stack */
+       movl  4(%esp), %ebx /* page_list */
+-      movl  8(%esp), %ebp /* reboot_code_buffer */
++      movl  8(%esp), %ebp /* list of pages */
+       movl  12(%esp), %edx /* start address */
+       movl  16(%esp), %ecx /* cpu_has_pae */
+ 
+@@ -24,11 +146,26 @@ relocate_new_kernel:
+       pushl $0
+       popfl
+ 
+-      /* set a new stack at the bottom of our page... */
+-      lea   4096(%ebp), %esp
++      /* get physical address of control page now */
++      /* this is impossible after page table switch */
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edi
++
++      /* switch to new set of page tables */
++      movl    PTR(PA_PGD)(%ebp), %eax
++      movl    %eax, %cr3
++
++      /* setup a new stack at the end of the physical control page */
++      lea     4096(%edi), %esp
+ 
+-      /* store the parameters back on the stack */
+-      pushl   %edx /* store the start address */
++      /* jump to identity mapped page */
++      movl    %edi, %eax
++      addl    $(identity_mapped - relocate_kernel), %eax
++      pushl   %eax
++      ret
++
++identity_mapped:
++      /* store the start address on the stack */
++      pushl   %edx
+ 
+       /* Set cr0 to a known state:
+        * 31 0 == Paging disabled
+@@ -113,8 +250,3 @@ relocate_new_kernel:
+       xorl    %edi, %edi
+       xorl    %ebp, %ebp
+       ret
+-relocate_new_kernel_end:
+-
+-      .globl relocate_new_kernel_size
+-relocate_new_kernel_size:
+-      .long relocate_new_kernel_end - relocate_new_kernel
+--- a/include/asm-i386/kexec.h
++++ b/include/asm-i386/kexec.h
+@@ -1,6 +1,26 @@
+ #ifndef _I386_KEXEC_H
+ #define _I386_KEXEC_H
+ 
++#define PA_CONTROL_PAGE  0
++#define VA_CONTROL_PAGE  1
++#define PA_PGD           2
++#define VA_PGD           3
++#define PA_PTE_0         4
++#define VA_PTE_0         5
++#define PA_PTE_1         6
++#define VA_PTE_1         7
++#ifdef CONFIG_X86_PAE
++#define PA_PMD_0         8
++#define VA_PMD_0         9
++#define PA_PMD_1         10
++#define VA_PMD_1         11
++#define PAGES_NR         12
++#else
++#define PAGES_NR         8
++#endif
++
++#ifndef __ASSEMBLY__
++
+ #include <asm/fixmap.h>
+ #include <asm/ptrace.h>
+ #include <asm/string.h>
+@@ -72,5 +92,12 @@ static inline void crash_setup_regs(stru
+                newregs->eip = (unsigned long)current_text_addr();
+        }
+ }
++asmlinkage NORET_TYPE void
++relocate_kernel(unsigned long indirection_page,
++              unsigned long control_page,
++              unsigned long start_address,
++              unsigned int has_pae) ATTRIB_NORET;
++
++#endif /* __ASSEMBLY__ */
+ 
+ #endif /* _I386_KEXEC_H */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-i386.patch    2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,169 @@
+kexec: Move asm segment handling code to the assembly file (i386)
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S. The main reason behind this move is to avoid code 
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+---
+
+ Applies to 2.6.19-rc1.
+
+ machine_kexec.c   |   59 -----------------------------------------------------
+ relocate_kernel.S |   58 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 2 files changed, 53 insertions(+), 64 deletions(-)
+
+--- 0002/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c      2006-10-05 15:49:08.000000000 +0900
+@@ -29,48 +29,6 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-      struct Xgt_desc_struct curidt;
+-
+-      /* ia32 supports unaliged loads & stores */
+-      curidt.size    = limit;
+-      curidt.address = (unsigned long)newidt;
+-
+-      load_idt(&curidt);
+-};
+-
+-
+-static void set_gdt(void *newgdt, __u16 limit)
+-{
+-      struct Xgt_desc_struct curgdt;
+-
+-      /* ia32 supports unaligned loads & stores */
+-      curgdt.size    = limit;
+-      curgdt.address = (unsigned long)newgdt;
+-
+-      load_gdt(&curgdt);
+-};
+-
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
+-
+-      __asm__ __volatile__ (
+-              "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-              "\t1:\n"
+-              "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-              "\tmovl %%eax,%%ds\n"
+-              "\tmovl %%eax,%%es\n"
+-              "\tmovl %%eax,%%fs\n"
+-              "\tmovl %%eax,%%gs\n"
+-              "\tmovl %%eax,%%ss\n"
+-              ::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
+-
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
+@@ -127,23 +85,6 @@ NORET_TYPE void machine_kexec(struct kim
+       page_list[PA_PTE_1] = __pa(kexec_pte1);
+       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
+-      /* The segment registers are funny things, they have both a
+-       * visible and an invisible part.  Whenever the visible part is
+-       * set to a specific selector, the invisible part is loaded
+-       * with from a table in memory.  At no other time is the
+-       * descriptor table in memory accessed.
+-       *
+-       * I take advantage of this here by force loading the
+-       * segments, before I zap the gdt with an invalid value.
+-       */
+-      load_segments();
+-      /* The gdt & idt are now invalid.
+-       * If you want to load them you must set up your own idt & gdt.
+-       */
+-      set_gdt(phys_to_virt(0),0);
+-      set_idt(phys_to_virt(0),0);
+-
+-      /* now call it */
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start, cpu_has_pae);
+ }
+--- 0002/arch/i386/kernel/relocate_kernel.S
++++ work/arch/i386/kernel/relocate_kernel.S    2006-10-05 16:03:21.000000000 +0900
+@@ -154,14 +154,45 @@ relocate_new_kernel:
+       movl    PTR(PA_PGD)(%ebp), %eax
+       movl    %eax, %cr3
+ 
++      /* setup idt */
++      movl    %edi, %eax
++      addl    $(idt_48 - relocate_kernel), %eax
++      lidtl   (%eax)
++
++      /* setup gdt */
++      movl    %edi, %eax
++      addl    $(gdt - relocate_kernel), %eax
++      movl    %edi, %esi
++      addl    $((gdt_48 - relocate_kernel) + 2), %esi
++      movl    %eax, (%esi)
++      
++      movl    %edi, %eax
++      addl    $(gdt_48 - relocate_kernel), %eax
++      lgdtl   (%eax)
++
++      /* setup data segment registers */
++      mov     $(gdt_ds - gdt), %eax
++      mov     %eax, %ds
++      mov     %eax, %es
++      mov     %eax, %fs
++      mov     %eax, %gs
++      mov     %eax, %ss
++      
+       /* setup a new stack at the end of the physical control page */
+       lea     4096(%edi), %esp
+ 
+-      /* jump to identity mapped page */
+-      movl    %edi, %eax
+-      addl    $(identity_mapped - relocate_kernel), %eax
+-      pushl   %eax
+-      ret
++      /* load new code segment and jump to identity mapped page */
++      movl    %edi, %esi
++      xorl    %eax, %eax
++      pushl   %eax
++      pushl   %esi
++      pushl   %eax
++      movl    $(gdt_cs - gdt), %eax
++      pushl   %eax    
++      movl    %edi, %eax
++      addl    $(identity_mapped - relocate_kernel),%eax
++      pushl   %eax
++      iretl
+ 
+ identity_mapped:
+       /* store the start address on the stack */
+@@ -250,3 +281,20 @@ identity_mapped:
+       xorl    %edi, %edi
+       xorl    %ebp, %ebp
+       ret
++
++      .align  16
++gdt:
++      .quad   0x0000000000000000      /* NULL descriptor */
++gdt_cs:       
++      .quad   0x00cf9a000000ffff      /* kernel 4GB code at 0x00000000 */
++gdt_ds:
++      .quad   0x00cf92000000ffff      /* kernel 4GB data at 0x00000000 */
++gdt_end:
++      
++gdt_48:
++      .word   gdt_end - gdt - 1       /* limit */
++      .long   0                       /* base - filled in by code above */
++
++idt_48:
++      .word   0                       /* limit */
++      .long   0                       /* base */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch  2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,54 @@
+--- 0004/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c      2006-10-11 18:34:06.000000000 +0900
+@@ -20,6 +20,10 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+ static u32 kexec_pgd[1024] PAGE_ALIGNED;
+ #ifdef CONFIG_X86_PAE
+@@ -29,6 +33,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
++#ifdef CONFIG_XEN
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++      void *control_page;
++
++      memset(xki->page_list, 0, sizeof(xki->page_list));
++
++      control_page = page_address(image->control_code_page);
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++      xki->page_list[PA_PGD] = __ma(kexec_pgd);
++#ifdef CONFIG_X86_PAE
++      xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
++      xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
++#endif
++      xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
++      xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
++
++}
++
++#endif /* CONFIG_XEN */
++
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
--- 0004/patches/linux-2.6.16.29/series
+++ work/patches/linux-2.6.16.29/series 2006-10-16 12:23:54.000000000 +0900
@@ -1,6 +1,9 @@
 kexec-generic.patch
 git-2efe55a9cec8418f0e0cde3dc3787a42fddc4411.patch
 git-2a8a3d5b65e86ec1dfef7d268c64a909eab94af7.patch
+git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch
+linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
+linux-2.6.19-rc1-kexec-xen-i386.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- 0004/xen/arch/x86/crash.c
+++ work/xen/arch/x86/crash.c   2006-10-16 12:23:54.000000000 +0900
@@ -21,6 +21,7 @@
 #include <xen/delay.h>
 #include <xen/perfc.h>
 #include <xen/kexec.h>
+#include <xen/sched.h>
 #include <public/xen.h>
 #include <asm/hvm/hvm.h>
 
@@ -171,6 +172,51 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
 
+/* The cr3 for dom0 on each of its vcpus
+ * It is added as ELF_Prstatus prstatus.pr_reg[ELF_NGREG-1], where
+ * prstatus is the data of the elf note, and ELF_NGREG was extended
+ * by one to allow extra space.
+ * This code runs after all cpus except the crashing one have
+ * been shutdown so as to avoid having to hold domlist_lock,
+ * as locking after a crash is playing with fire */
+void find_dom0_cr3(void)
+{
+       struct domain *d;
+       struct vcpu   *v;
+       uint32_t *buf;
+       uint32_t cr3;
+       Elf_Note note;
+
+       /* Don't need to grab domlist_lock as we are the only thing running */
+
+       /* No need to traverse domain_list, as dom0 is always first */
+       d = domain_list;
+       BUG_ON(d->domain_id);
+
+       for_each_vcpu ( d, v ) {
+               if ( test_bit(_VCPUF_down, &v->vcpu_flags) )
+                       continue;
+               buf = (uint32_t *)per_cpu(crash_notes, v->processor);
+               if (!buf) /* XXX: Can this ever occur? */
+                       continue;
+
+               memcpy(&note, buf, sizeof(Elf_Note));
+               buf += (sizeof(Elf_Note) +3)/4 + (note.namesz + 3)/4 +
+                       (note.descsz + 3)/4;
+
+               /* XXX: This probably doesn't take into account shadow mode,
+                * but that might not be a problem */
+               cr3 = pagetable_get_pfn(v->arch.guest_table);
+
+               buf = append_elf_note(buf, "Xen Domanin-0 CR3",
+                       NT_XEN_DOM0_CR3, &cr3, 4);
+               final_note(buf);
+
+               printk("domain:%i vcpu:%u processor:%u cr3:%08x\n", 
+                      d->domain_id, v->vcpu_id, v->processor, cr3);
+       }
+}
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
        printk("machine_crash_shutdown: %d\n", smp_processor_id());
@@ -185,6 +231,7 @@ void machine_crash_shutdown(struct cpu_u
     hvm_disable();
 
        crash_save_self(regs);
+       find_dom0_cr3();
 }
 
 /*
--- 0001/xen/arch/x86/x86_32/entry.S
+++ work/xen/arch/x86/x86_32/entry.S    2006-10-16 12:23:54.000000000 +0900
@@ -672,6 +672,7 @@ ENTRY(hypercall_table)
         .long do_hvm_op
         .long do_sysctl             /* 35 */
         .long do_domctl
+        .long do_kexec_op
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -714,6 +715,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_hvm_op            */
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
+        .byte 1 /* do_kexec_op          */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- 0004/xen/arch/x86/x86_32/machine_kexec.c
+++ work/xen/arch/x86/x86_32/machine_kexec.c    2006-10-16 12:23:55.000000000 
+0900
@@ -1,18 +1,29 @@
-/*
+/******************************************************************************
  * arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@xxxxxxxxxxxx>
+ * 
+ * Created By: Horms
  *
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+               unsigned long indirection_page, /* machine_kexec() passes image->indirection_page */
+               unsigned long control_page,     /* machine_kexec() passes image->page_list */
+               unsigned long start_address,    /* machine_kexec() passes image->start_address */
+               unsigned int has_pae);          /* machine_kexec() passes cpu_has_pae */
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk; /* routine that is handed the image description */
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_BASE_0); /* call target is the FIX_KEXEC_BASE_0 fixmap address */
+    (*rnk)(image->indirection_page, (unsigned long)image->page_list, 
+           image->start_address, (unsigned long)cpu_has_pae); /* NOTE(review): has_pae is declared unsigned int, the cast is to unsigned long */
 }
 
 /*
--- 0004/xen/include/asm-x86/x86_32/elf.h
+++ work/xen/include/asm-x86/x86_32/elf.h       2006-10-16 12:23:55.000000000 
+0900
@@ -3,17 +3,39 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and 
+ * include/asm-i386/system.h from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_32_H__
 #define __X86_ELF_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do { /* fill pr_reg[0..16] */  \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* Xen has no regs->orig_eax */                      \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0); /* NOTE(review): the ';' is part of the expansion, so a use followed by its own ';' yields an extra empty statement */
 
 #endif /* __X86_ELF_X86_32_H__ */
 
--- 0004/xen/include/asm-x86/x86_32/kexec.h
+++ work/xen/include/asm-x86/x86_32/kexec.h     2006-10-16 12:23:55.000000000 
+0900
@@ -3,39 +3,72 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
  */
 
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
+/* The CPU does not save ss and esp on the stack if it was already
+ * running in kernel mode when the NMI occurred.  Copy oldregs into
+ * newregs and fill in those two fields by hand.
+ */
 static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-                   struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return;
-    crash_fixup_ss_esp(newregs, oldregs);
+    memcpy(newregs, oldregs, sizeof(*newregs)); /* start from a full copy of oldregs */
+    newregs->esp = (unsigned long)&(oldregs->esp); /* the address of the esp slot inside oldregs, not its contents */
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss)); /* current %ss, read through a zeroed %eax */
 }
 
+/*
+ * With oldregs (kernel mode exception): copy it and fix up ss and esp.
+ * Without oldregs (coming via panic): capture the current register
+ * state of this CPU into newregs instead.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
                             struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx)); /* NOTE(review): one asm per register, so each value is whatever the register holds at that statement */
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss)); /* NOTE(review): unlike crash_fixup_ss_esp, %eax is not cleared before movw here and below */
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags)); /* eflags is read by pushing it and popping into the field */
+
+        newregs->eip = (unsigned long)current_text_addr(); /* eip comes from current_text_addr(), not from a saved frame */
+    }
 }
 
+/*
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode(regs) reports whether a register set came from user mode by
+ * testing bit 1 of the saved CS selector, i.e. a privilege level of 2 or 3.
+ * Unlike Linux's user_mode_vm(regs), it does not look at eflags, so V8086
+ * mode is not detected here.  Use it only where V8086 mode has already
+ * been ruled out or does not matter.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.