
[Xen-devel] [PATCH RFC 41/44] x86/smp: Switch to using the percpu stacks



This is very easy for the APs: __high_start() is modified to switch stacks
before entering C.  The BSP, however, is more complicated, and needs to stay
on cpu0_stack[] until setup is complete.
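
As a rough illustration of where the two entry paths leave %rsp, here is a
stand-alone sketch.  The constants are placeholders standing in for Xen's
real PERCPU_STACK_MAPPING, STACK_SIZE and CPUINFO_sizeof, and the cpu0_stack
address is invented; only the arithmetic mirrors the patch.

    /* Sketch only: placeholder constants, not Xen's real layout. */
    #include <stdio.h>

    #define PERCPU_STACK_MAPPING  0xffff830000000000UL  /* assumed value */
    #define STACK_SIZE            (8UL * 4096)          /* assumed value */
    #define CPUINFO_sizeof        192UL                 /* assumed value */

    int main(void)
    {
        /* APs: fixed virtual address at the top of the percpu stack
         * mapping, leaving room for the cpu_info block at the stack top. */
        unsigned long ap_rsp = PERCPU_STACK_MAPPING + STACK_SIZE - CPUINFO_sizeof;

        /* BSP: the same offset, but within cpu0_stack[] (address invented). */
        unsigned long cpu0_stack = 0xffff82d080800000UL; /* assumed value */
        unsigned long bsp_rsp = cpu0_stack + STACK_SIZE - CPUINFO_sizeof;

        printf("AP  initial %%rsp: %#lx\n", ap_rsp);
        printf("BSP initial %%rsp: %#lx\n", bsp_rsp);
        return 0;
    }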

The end of __start_xen() is modified to copy the top-of-stack data to the
percpu stack immediately before jumping there.  The VMCS host and SYSENTER
stack values are adjusted to suit, and become construction-time constants.
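
For reference, the two host stack fields reduce to compile-time constants of
the following shape.  This is a sketch only: the cpu_user_regs/cpu_info
layout below is cut down and the constants are placeholders, but the
offsetof() arithmetic is the same shape as the vmcs.c hunk.

    /* Sketch only: cut-down structures and placeholder constants. */
    #include <stddef.h>
    #include <stdio.h>

    #define PERCPU_STACK_MAPPING  0xffff830000000000UL  /* assumed value */
    #define STACK_SIZE            (8UL * 4096)          /* assumed value */

    struct cpu_user_regs {
        unsigned long gprs[15];      /* stand-in for the real GPR block */
        unsigned long error_code;
        unsigned long rip, cs, rflags, rsp, ss;
        unsigned long es;            /* slot returned by get_stack_bottom() */
    };

    struct cpu_info {
        struct cpu_user_regs guest_cpu_user_regs;
        unsigned long processor_id, per_cpu_offset, cr4;
    };

    int main(void)
    {
        unsigned long top = PERCPU_STACK_MAPPING + STACK_SIZE
                            - sizeof(struct cpu_info);

        /* VMExit doesn't push an exception frame: SS/RSP/RFLAGS/CS/RIP are
         * saved in the VMCS, so HOST_RSP skips straight to error_code. */
        unsigned long host_rsp = top
            + offsetof(struct cpu_info, guest_cpu_user_regs.error_code);

        /* SYSENTER pushes nothing, so its stack pointer is the 'es' slot,
         * matching the get_stack_bottom() value used before this change. */
        unsigned long sysenter_esp = top
            + offsetof(struct cpu_info, guest_cpu_user_regs.es);

        printf("HOST_RSP          = %#lx\n", host_rsp);
        printf("HOST_SYSENTER_ESP = %#lx\n", sysenter_esp);
        return 0;
    }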

stack_start and the stack_base[] array are removed completely, along with
the memguard_guard_stack() infrastructure.  The STACK_ORDER xenheap
allocations are no longer needed, and the stacks for higher-numbered CPUs on
large machines are finally NUMA-local.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
 xen/arch/x86/boot/x86_64.S  | 15 ++++++++-------
 xen/arch/x86/efi/efi-boot.h |  8 ++++----
 xen/arch/x86/hvm/vmx/vmcs.c | 21 ++++++++++-----------
 xen/arch/x86/mm.c           | 15 ---------------
 xen/arch/x86/setup.c        | 29 +++++++++++++++++++----------
 xen/arch/x86/smpboot.c      | 18 ------------------
 xen/arch/x86/tboot.c        | 29 +----------------------------
 xen/arch/x86/traps.c        | 10 ++--------
 xen/include/asm-arm/mm.h    |  1 -
 xen/include/asm-x86/mm.h    |  3 ---
 xen/include/xen/smp.h       |  2 --
 11 files changed, 44 insertions(+), 107 deletions(-)

diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
index b1f0457..ed4c805 100644
--- a/xen/arch/x86/boot/x86_64.S
+++ b/xen/arch/x86/boot/x86_64.S
@@ -15,21 +15,25 @@ ENTRY(__high_start)
         mov     $XEN_MINIMAL_CR4,%rcx
         mov     %rcx,%cr4
 
-        /* Set up %cr3 (differs between BSP and APs). */
+        /* Set up %cr3 and %rsp (differs between BSP and APs). */
         test    %ebx, %ebx
         jz      .Lbsp_setup
 
         /* APs switch onto percpu_idle_pt[], as provided by do_boot_cpu(). */
         mov     ap_cr3(%rip), %rax
         mov     %rax, %cr3
+
+        /* APs move straight onto the PERCPU stack. */
+        movabs  $STACK_SIZE - CPUINFO_sizeof + PERCPU_STACK_MAPPING, %rsp
+
         jmp     .Ldone
 
 .Lbsp_setup:
         /* The BSP stays on the idle_pg_table[] during early boot. */
-.Ldone:
 
-        mov     stack_start(%rip),%rsp
-        or      $(STACK_SIZE-CPUINFO_sizeof),%rsp
+        /* The BSP starts on cpu0_stack. */
+        lea     STACK_SIZE - CPUINFO_sizeof + cpu0_stack(%rip), %rsp
+.Ldone:
 
         /* Reset EFLAGS (subsumes CLI and CLD). */
         pushq   $0
@@ -61,9 +65,6 @@ GLOBAL(gdt_descr)
         .word   LAST_RESERVED_GDT_BYTE
         .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
-GLOBAL(stack_start)
-        .quad   cpu0_stack
-
         .section .data.page_aligned, "aw", @progbits
         .align PAGE_SIZE, 0
 GLOBAL(boot_cpu_gdt_table)
diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h
index d30f688..8af661b 100644
--- a/xen/arch/x86/efi/efi-boot.h
+++ b/xen/arch/x86/efi/efi-boot.h
@@ -251,15 +251,15 @@ static void __init noreturn efi_arch_post_exit_boot(void)
 #endif
                    "movabs $__start_xen, %[rip]\n\t"
                    "lgdt   gdt_descr(%%rip)\n\t"
-                   "mov    stack_start(%%rip), %%rsp\n\t"
+                   "lea    %c[stkoff] + cpu0_stack(%%rip), %%rsp\n\t"
                    "mov    %[ds], %%ss\n\t"
                    "mov    %[ds], %%ds\n\t"
                    "mov    %[ds], %%es\n\t"
                    "mov    %[ds], %%fs\n\t"
                    "mov    %[ds], %%gs\n\t"
-                   "movl   %[cs], 8(%%rsp)\n\t"
-                   "mov    %[rip], (%%rsp)\n\t"
-                   "lretq  %[stkoff]-16"
+                   "push   %[cs]\n\t"
+                   "push   %[rip]\n\t"
+                   "lretq"
                    : [rip] "=&r" (efer/* any dead 64-bit variable */),
                      [cr4] "+&r" (cr4)
                    : [cr3] "r" (idle_pg_table),
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 795210f..483f72d 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -804,15 +804,6 @@ static void vmx_set_host_env(struct vcpu *v)
 
     __vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(init_tss, cpu));
 
-    __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
-
-    /*
-     * Skip end of cpu_user_regs when entering the hypervisor because the
-     * CPU does not save context onto the stack. SS,RSP,CS,RIP,RFLAGS,etc
-     * all get saved into the VMCS instead.
-     */
-    __vmwrite(HOST_RSP,
-              (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
 }
 
 void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
@@ -1148,13 +1139,21 @@ static int construct_vmcs(struct vcpu *v)
     __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
     __vmwrite(HOST_CR4, mmu_cr4_features);
 
-    /* Host CS:RIP. */
+    /* Host code/stack. */
     __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
     __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);
+    __vmwrite(HOST_RSP, /* VMExit doesn't push an exception frame. */
+              (PERCPU_STACK_MAPPING + STACK_SIZE -
+               sizeof(struct cpu_info) +
+               offsetof(struct cpu_info, guest_cpu_user_regs.error_code)));
 
-    /* Host SYSENTER CS:RIP. */
+    /* Host SYSENTER code/stack. */
     __vmwrite(HOST_SYSENTER_CS, __HYPERVISOR_CS);
     __vmwrite(HOST_SYSENTER_EIP, (unsigned long)sysenter_entry);
+    __vmwrite(HOST_SYSENTER_ESP,
+              (PERCPU_STACK_MAPPING + STACK_SIZE -
+               sizeof(struct cpu_info) +
+               offsetof(struct cpu_info, guest_cpu_user_regs.es)));
 
     /* MSR intercepts. */
     __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 933bd67..cb54921 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5281,21 +5281,6 @@ void memguard_unguard_range(void *p, unsigned long l)
 
 #endif
 
-void memguard_guard_stack(void *p)
-{
-    BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
-    p = (void *)((unsigned long)p + STACK_SIZE -
-                 PRIMARY_STACK_SIZE - PAGE_SIZE);
-    memguard_guard_range(p, PAGE_SIZE);
-}
-
-void memguard_unguard_stack(void *p)
-{
-    p = (void *)((unsigned long)p + STACK_SIZE -
-                 PRIMARY_STACK_SIZE - PAGE_SIZE);
-    memguard_unguard_range(p, PAGE_SIZE);
-}
-
 void arch_dump_shared_mem_info(void)
 {
     printk("Shared frames %u -- Saved frames %u\n",
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index d624b95..c0f7289 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -651,8 +651,6 @@ static void noinline init_done(void)
 /* Reinitalise all state referring to the old virtual address of the stack. */
 static void __init noreturn reinit_bsp_stack(void)
 {
-    unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1));
-
     /* Sanity check that IST settings weren't set up before this point. */
     ASSERT(MASK_EXTR(idt_tables[0][TRAP_nmi].a, 7UL << 32) == 0);
 
@@ -664,9 +662,6 @@ static void __init noreturn reinit_bsp_stack(void)
     /* Update SYSCALL trampolines */
     percpu_traps_init();
 
-    stack_base[0] = stack;
-    memguard_guard_stack(stack);
-
     reset_stack_and_jump(init_done);
 }
 
@@ -1744,11 +1739,25 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 
     setup_io_bitmap(dom0);
 
-    /* Jump to the 1:1 virtual mappings of cpu0_stack. */
-    asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
-                  [stk] "g" (__va(__pa(get_stack_bottom()))),
-                  [fn] "i" (reinit_bsp_stack) : "memory");
-    unreachable();
+    /*
+     * Switch from cpu0_stack to the percpu stack, copying the non-GPR
+     * cpu_info data into place beforehand.
+     */
+    {
+        const struct cpu_info *src = get_cpu_info();
+        struct cpu_info *dst = _p(PERCPU_STACK_MAPPING + STACK_SIZE -
+                                  sizeof(*dst));
+
+        dst->processor_id   = src->processor_id;
+        dst->current_vcpu   = src->current_vcpu;
+        dst->per_cpu_offset = src->per_cpu_offset;
+        dst->cr4            = src->cr4;
+
+        asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
+                      [stk] "g" (&dst->guest_cpu_user_regs.es),
+                      [fn] "i" (reinit_bsp_stack) : "memory");
+        unreachable();
+    }
 }
 
 void arch_get_xen_caps(xen_capabilities_info_t *info)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index f785d5f..77ee883 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -91,8 +91,6 @@ static enum cpu_state {
 } cpu_state;
 #define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
 
-void *stack_base[NR_CPUS];
-
 void initialize_cpu_data(unsigned int cpu)
 {
     cpu_data[cpu] = boot_cpu_data;
@@ -386,7 +384,6 @@ void start_secondary(void *unused)
 
 /* Used to pass percpu_idle_pt to the booting AP. */
 paddr_t ap_cr3;
-extern void *stack_start;
 
 static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 {
@@ -529,7 +526,6 @@ static int do_boot_cpu(int apicid, int cpu)
                cpu, apicid, start_eip);
 
     ap_cr3 = per_cpu(percpu_idle_pt, cpu);
-    stack_start = stack_base[cpu];
 
     /* This grunge runs the startup process for the targeted processor. */
 
@@ -1002,13 +998,6 @@ static void cpu_smpboot_free(unsigned int cpu)
     free_xenheap_page(idt_tables[cpu]);
     idt_tables[cpu] = NULL;
 
-    if ( stack_base[cpu] != NULL )
-    {
-        memguard_unguard_stack(stack_base[cpu]);
-        free_xenheap_pages(stack_base[cpu], STACK_ORDER);
-        stack_base[cpu] = NULL;
-    }
-
     if ( per_cpu(percpu_idle_pt, cpu) )
     {
         free_domheap_page(maddr_to_page(per_cpu(percpu_idle_pt, cpu)));
@@ -1030,11 +1019,6 @@ static int cpu_smpboot_alloc(unsigned int cpu)
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
 
-    stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
-    if ( stack_base[cpu] == NULL )
-        goto out;
-    memguard_guard_stack(stack_base[cpu]);
-
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
     per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
     if ( gdt == NULL )
@@ -1148,8 +1132,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
     boot_cpu_physical_apicid = get_apic_id();
     x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
 
-    stack_base[0] = stack_start;
-
     set_nr_sockets();
 
     socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c
index 59d7c47..c283b91 100644
--- a/xen/arch/x86/tboot.c
+++ b/xen/arch/x86/tboot.c
@@ -243,29 +243,6 @@ static void tboot_gen_domain_integrity(const uint8_t key[TB_KEY_SIZE],
     memset(&ctx, 0, sizeof(ctx));
 }
 
-/*
- * For stack overflow detection in debug build, a guard page is set up.
- * This fn is used to detect whether a page is in the guarded pages for
- * the above reason.
- */
-static int mfn_in_guarded_stack(unsigned long mfn)
-{
-    void *p;
-    int i;
-
-    for ( i = 0; i < nr_cpu_ids; i++ )
-    {
-        if ( !stack_base[i] )
-            continue;
-        p = (void *)((unsigned long)stack_base[i] + STACK_SIZE -
-                     PRIMARY_STACK_SIZE - PAGE_SIZE);
-        if ( mfn == virt_to_mfn(p) )
-            return -1;
-    }
-
-    return 0;
-}
-
 static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE],
                                         vmac_t *mac)
 {
@@ -290,12 +267,8 @@ static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE],
 
         if ( is_page_in_use(page) && is_xen_heap_page(page) )
         {
-            void *pg;
-
-            if ( mfn_in_guarded_stack(mfn) )
-            continue; /* skip guard stack, see memguard_guard_stack() in mm.c */
+            void *pg = mfn_to_virt(mfn);
 
-            pg = mfn_to_virt(mfn);
             vmac_update((uint8_t *)pg, PAGE_SIZE, &ctx);
         }
     }
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index eeabb4a..493f8f3 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -356,9 +356,6 @@ unsigned long get_stack_trace_bottom(unsigned long sp)
         return ROUNDUP(sp, PAGE_SIZE) -
             offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
 
-#ifndef MEMORY_GUARD
-    case 3 ... 5:
-#endif
     case 6 ... 7:
         return ROUNDUP(sp, STACK_SIZE) -
             sizeof(struct cpu_info) - sizeof(unsigned long);
@@ -375,9 +372,6 @@ unsigned long get_stack_dump_bottom(unsigned long sp)
     case 0 ... 2:
         return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
 
-#ifndef MEMORY_GUARD
-    case 3 ... 5:
-#endif
     case 6 ... 7:
         return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
 
@@ -518,9 +512,9 @@ void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs)
     unsigned long esp_top, esp_bottom;
 #endif
 
-    if ( _p(curr_stack_base) != stack_base[cpu] )
+    if ( curr_stack_base != PERCPU_STACK_MAPPING )
         printk("Current stack base %p differs from expected %p\n",
-               _p(curr_stack_base), stack_base[cpu]);
+               _p(curr_stack_base), _p(PERCPU_STACK_MAPPING));
 
 #ifdef MEMORY_GUARD
     esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index 4d5563b..86b8fcb 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -362,7 +362,6 @@ unsigned long domain_get_maximum_gpfn(struct domain *d);
 
 extern struct domain *dom_xen, *dom_io, *dom_cow;
 
-#define memguard_guard_stack(_p)       ((void)0)
 #define memguard_guard_range(_p,_l)    ((void)0)
 #define memguard_unguard_range(_p,_l)  ((void)0)
 
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 22c2809..2c1ed1d 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -521,9 +521,6 @@ void memguard_unguard_range(void *p, unsigned long l);
 #define memguard_unguard_range(_p,_l)  ((void)0)
 #endif
 
-void memguard_guard_stack(void *p);
-void memguard_unguard_stack(void *p);
-
 struct mmio_ro_emulate_ctxt {
         unsigned long cr2;
         unsigned int seg, bdf;
diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h
index c55f57f..d30f369 100644
--- a/xen/include/xen/smp.h
+++ b/xen/include/xen/smp.h
@@ -69,8 +69,6 @@ void smp_send_call_function_mask(const cpumask_t *mask);
 
 int alloc_cpu_id(void);
 
-extern void *stack_base[NR_CPUS];
-
 void initialize_cpu_data(unsigned int cpu);
 
 #endif /* __XEN_SMP_H__ */
-- 
2.1.4

