[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 1 of 4 RFC] x86/kexec: Change NMI and MCE handling on kexec path



At 18:16 +0000 on 04 Dec (1354644968), Andrew Cooper wrote:
> do_nmi_crash() will:
> 
>  * Save the crash notes and shut the pcpu down.
>     There is now an extra per-cpu variable to prevent us from executing
>     this multiple times.  In the case where we reenter midway through,
>     attempt the whole operation again in preference to not completing
>     it in the first place.
> 
>  * Set up another NMI at the LAPIC.
>     Even when the LAPIC has been disabled, the ID and command registers
>     are still usable.  As a result, we can deliberately queue up a new
>     NMI to re-interrupt us later if NMIs get unlatched.

Can you please include this reasoning in the code itself, as well as in
the commit message?

> machine_kexec() will:
> 
>   * Swap the MCE handlers to be a nop.
>      We cannot prevent MCEs from being delivered when we pass off to the
>      crash kernel, and the less Xen context is being touched the better.
> 
>   * Explicitly enable NMIs.

Would it be cleaner to have this path explicitly set the IDT entry to
invoke the noop handler?  Or do we know this is alwys the case when we
reach this point?

I'm just wondering about the case where we get here with an NMI pending.
Presumably if we have some other source of NMIs active while we kexec,
the post-exec kernel will crash if it overwrites our handlers &c before
setting up its own IDT.  But if kexec()ing with NMIs _disabled_ also
fails than we haven't much choice. :|

Otherwise, this looks good to me.

Tim.

> Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> 
> diff -r 1c69c938f641 -r b15d3ae525af xen/arch/x86/crash.c
> --- a/xen/arch/x86/crash.c
> +++ b/xen/arch/x86/crash.c
> @@ -32,41 +32,97 @@
>  
>  static atomic_t waiting_for_crash_ipi;
>  static unsigned int crashing_cpu;
> +static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
>  
> -static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
> +/* This becomes the NMI handler for non-crashing CPUs. */
> +void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
>  {
> -    /* Don't do anything if this handler is invoked on crashing cpu.
> -     * Otherwise, system will completely hang. Crashing cpu can get
> -     * an NMI if system was initially booted with nmi_watchdog parameter.
> +    /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
> +    ASSERT( crashing_cpu != smp_processor_id() );
> +
> +    /* Save crash information and shut down CPU.  Attempt only once. */
> +    if ( ! this_cpu(crash_save_done) )
> +    {
> +        kexec_crash_save_cpu();
> +        __stop_this_cpu();
> +
> +        this_cpu(crash_save_done) = 1;
> +    }
> +
> +    /* Poor mans self_nmi().  __stop_this_cpu() has reverted the LAPIC
> +     * back to its boot state, so we are unable to rely on the regular
> +     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
> +     * (The likely senario is that we have reverted from x2apic mode to
> +     * xapic, at which point #GPFs will occur if we use the apic_*
> +     * functions)
> +     *
> +     * The ICR and APIC ID of the LAPIC are still valid even during
> +     * software disable (Intel SDM Vol 3, 10.4.7.2), which allows us to
> +     * queue up another NMI, to force us back into this loop if we exit.
>       */
> -    if ( cpu == crashing_cpu )
> -        return 1;
> -    local_irq_disable();
> +    switch ( current_local_apic_mode() )
> +    {
> +        u32 apic_id;
>  
> -    kexec_crash_save_cpu();
> +    case APIC_MODE_X2APIC:
> +        apic_id = apic_rdmsr(APIC_ID);
>  
> -    __stop_this_cpu();
> +        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL | 
> ((u64)apic_id << 32));
> +        break;
>  
> -    atomic_dec(&waiting_for_crash_ipi);
> +    case APIC_MODE_XAPIC:
> +        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
> +
> +        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
> +            cpu_relax();
> +
> +        apic_mem_write(APIC_ICR2, apic_id << 24);
> +        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
> +        break;
> +
> +    default:
> +        break;
> +    }
>  
>      for ( ; ; )
>          halt();
> -
> -    return 1;
>  }
>  
>  static void nmi_shootdown_cpus(void)
>  {
>      unsigned long msecs;
> +    int i;
> +    int cpu = smp_processor_id();
>  
>      local_irq_disable();
>  
> -    crashing_cpu = smp_processor_id();
> +    crashing_cpu = cpu;
>      local_irq_count(crashing_cpu) = 0;
>  
>      atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> -    /* Would it be better to replace the trap vector here? */
> -    set_nmi_callback(crash_nmi_callback);
> +
> +    /* Change NMI trap handlers.  Non-crashing pcpus get crash_nmi which
> +     * invokes do_nmi_crash (above), which cause them to write state and
> +     * fall into a loop.  The crashing pcpu gets the nop handler to
> +     * cause it to return to this function ASAP.
> +     *
> +     * Futhermore, disable stack tables for NMIs and MCEs to prevent
> +     * race conditions resulting in corrupt stack frames.  As Xen is
> +     * about to die, we no longer care about the security-related race
> +     * condition with 'sysret' which ISTs are designed to solve.
> +     */
> +    for ( i = 0; i < nr_cpu_ids; ++i )
> +        if ( idt_tables[i] )
> +        {
> +            idt_tables[i][TRAP_nmi].a           &= ~(7UL << 32);
> +            idt_tables[i][TRAP_machine_check].a &= ~(7UL << 32);
> +
> +            if ( i == cpu )
> +                _set_gate(&idt_tables[i][TRAP_nmi], 14, 0, &trap_nop);
> +            else
> +                _set_gate(&idt_tables[i][TRAP_nmi], 14, 0, &nmi_crash);
> +        }
> +
>      /* Ensure the new callback function is set before sending out the NMI. */
>      wmb();
>  
> diff -r 1c69c938f641 -r b15d3ae525af xen/arch/x86/machine_kexec.c
> --- a/xen/arch/x86/machine_kexec.c
> +++ b/xen/arch/x86/machine_kexec.c
> @@ -87,6 +87,17 @@ void machine_kexec(xen_kexec_image_t *im
>       */
>      local_irq_disable();
>  
> +    /* Now regular interrupts are disabled, we need to reduce the impact
> +     * of interrupts not disabled by 'cli'.  NMIs have already been
> +     * dealt with by machine_crash_shutdown().
> +     *
> +     * Set all pcpu MCE handler to be a noop. */
> +    set_intr_gate(TRAP_machine_check, &trap_nop);
> +
> +    /* Explicitly enable NMIs on this CPU.  Some crashdump kernels do
> +     * not like running with NMIs disabled. */
> +    enable_nmis();
> +
>      /*
>       * compat_machine_kexec() returns to idle pagetables, which requires us
>       * to be running on a static GDT mapping (idle pagetables have no GDT
> diff -r 1c69c938f641 -r b15d3ae525af xen/arch/x86/x86_64/entry.S
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -635,11 +635,42 @@ ENTRY(nmi)
>          movl  $TRAP_nmi,4(%rsp)
>          jmp   handle_ist_exception
>  
> +ENTRY(nmi_crash)
> +        cli
> +        pushq $0
> +        movl $TRAP_nmi,4(%rsp)
> +        SAVE_ALL
> +        movq %rsp,%rdi
> +        callq do_nmi_crash /* Does not return */
> +        ud2
> +
>  ENTRY(machine_check)
>          pushq $0
>          movl  $TRAP_machine_check,4(%rsp)
>          jmp   handle_ist_exception
>  
> +/* No op trap handler.  Required for kexec path. */
> +ENTRY(trap_nop)
> +        iretq
> +
> +/* Enable NMIs.  No special register assumptions, and all preserved. */
> +ENTRY(enable_nmis)
> +        push %rax
> +        movq %rsp, %rax /* Grab RSP before pushing */
> +
> +        /* Set up stack frame */
> +        pushq $0               /* SS */
> +        pushq %rax             /* RSP */
> +        pushfq                 /* RFLAGS */
> +        pushq $__HYPERVISOR_CS /* CS */
> +        leaq  1f(%rip),%rax
> +        pushq %rax             /* RIP */
> +
> +        iretq /* Disable the hardware NMI latch */
> +1:
> +        popq %rax
> +        retq
> +
>  .section .rodata, "a", @progbits
>  
>  ENTRY(exception_table)
> diff -r 1c69c938f641 -r b15d3ae525af xen/include/asm-x86/processor.h
> --- a/xen/include/asm-x86/processor.h
> +++ b/xen/include/asm-x86/processor.h
> @@ -517,6 +517,7 @@ void do_ ## _name(struct cpu_user_regs *
>  DECLARE_TRAP_HANDLER(divide_error);
>  DECLARE_TRAP_HANDLER(debug);
>  DECLARE_TRAP_HANDLER(nmi);
> +DECLARE_TRAP_HANDLER(nmi_crash);
>  DECLARE_TRAP_HANDLER(int3);
>  DECLARE_TRAP_HANDLER(overflow);
>  DECLARE_TRAP_HANDLER(bounds);
> @@ -535,6 +536,9 @@ DECLARE_TRAP_HANDLER(alignment_check);
>  DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
>  #undef DECLARE_TRAP_HANDLER
>  
> +void trap_nop(void);
> +void enable_nmis(void);
> +
>  void syscall_enter(void);
>  void sysenter_entry(void);
>  void sysenter_eflags_saved(void);
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.