From 1685dc2a2f811eb709cb62bffd8e58020ff79419 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 6 Jan 2022 17:26:48 -0800 Subject: [PATCH 3/4] xen: deal with vCPUs that do not yield when idle Our RCU implementation needs that a CPU goes through Xen, from time to time, e.g., for a context switch, to properly mark the end of grace period. This usually happen often enough, and CPUs that go idle and stay like that for a while are handled specially (so that they are recorded as quiescent and "leave" the grace period before starting idling). In principle, even a CPU that starts executing guest code may/should be marked as quiescent (it certainly can't be in the middle of a read side RCU critical section if it's leaving Xen and entering the guest!). This isn't done and in general does not cause problems. However, if the NULL scheduler is used and the guest is configured to not go back in Xen when its vCPUs become idle (e.g., with "vwfi=native" on ARM) grace periods may extend for very long time and RCU callback delayed to a point that, for instance, a domain is not properly destroyed. To fix that, we must start marking a CPU as quiescent as soon as it enter the guest (and, vice versa, register it back to the current grace period when it enters Xen). In order to do that, some changes to the API of rcu_idle_enter/exit were necessary (and the functions were renamed too). Note that, exactly like in the case where the CPU goes idle, we need the arm the callback timer when we enter guest context. In fact, if a CPU enter a guest with an RCU callback queued and then stays in that context for long enough, we still risk to not execute the callback itself for long enough to have problems. XXX ARM only for now. Signed-off-by: Dario Faggioli --- xen/arch/arm/domain.c | 4 ++-- xen/arch/arm/traps.c | 3 +++ xen/arch/x86/acpi/cpu_idle.c | 8 +++---- xen/arch/x86/cpu/mwait-idle.c | 8 +++---- xen/common/rcupdate.c | 41 +++++++++++++++++++++++------------ xen/include/xen/rcupdate.h | 4 ++-- 6 files changed, 42 insertions(+), 26 deletions(-) diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c index 92a6c509e5..e47168e80d 100644 --- a/xen/arch/arm/domain.c +++ b/xen/arch/arm/domain.c @@ -48,7 +48,7 @@ static void do_idle(void) { unsigned int cpu = smp_processor_id(); - rcu_idle_enter(cpu); + rcu_quiet_enter(); /* rcu_idle_enter() can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); @@ -60,7 +60,7 @@ static void do_idle(void) } local_irq_enable(); - rcu_idle_exit(cpu); + rcu_quiet_exit(); } void idle_loop(void) diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c index e2842ba4db..6bceb5b536 100644 --- a/xen/arch/arm/traps.c +++ b/xen/arch/arm/traps.c @@ -2047,6 +2047,8 @@ void enter_hypervisor_from_guest(void) { struct vcpu *v = current; + rcu_quiet_exit(); + /* * If we pended a virtual abort, preserve it until it gets cleared. * See ARM ARM DDI 0487A.j D1.14.3 (Virtual Interrupts) for details, @@ -2337,6 +2339,7 @@ static bool check_for_vcpu_work(void) */ void leave_hypervisor_to_guest(void) { + rcu_quiet_enter(); local_irq_disable(); /* diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c index d788c8bffc..e3a5e67c68 100644 --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -716,7 +716,7 @@ static void acpi_processor_idle(void) cpufreq_dbs_timer_suspend(); - rcu_idle_enter(cpu); + rcu_quiet_enter(); /* rcu_idle_enter() can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); @@ -729,7 +729,7 @@ static void acpi_processor_idle(void) if ( !cpu_is_haltable(cpu) ) { local_irq_enable(); - rcu_idle_exit(cpu); + rcu_quiet_exit(); cpufreq_dbs_timer_resume(); return; } @@ -854,7 +854,7 @@ static void acpi_processor_idle(void) /* Now in C0 */ power->last_state = &power->states[0]; local_irq_enable(); - rcu_idle_exit(cpu); + rcu_quiet_exit(); cpufreq_dbs_timer_resume(); return; } @@ -862,7 +862,7 @@ static void acpi_processor_idle(void) /* Now in C0 */ power->last_state = &power->states[0]; - rcu_idle_exit(cpu); + rcu_quiet_exit(); cpufreq_dbs_timer_resume(); if ( cpuidle_current_governor->reflect ) diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c index d1739f6fc3..739f20e1fe 100644 --- a/xen/arch/x86/cpu/mwait-idle.c +++ b/xen/arch/x86/cpu/mwait-idle.c @@ -778,8 +778,8 @@ static void mwait_idle(void) cpufreq_dbs_timer_suspend(); - rcu_idle_enter(cpu); - /* rcu_idle_enter() can raise TIMER_SOFTIRQ. Process it now. */ + rcu_quiet_enter(); + /* rcu_quiet_enter() can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); /* Interrupts must be disabled for C2 and higher transitions. */ @@ -787,7 +787,7 @@ static void mwait_idle(void) if (!cpu_is_haltable(cpu)) { local_irq_enable(); - rcu_idle_exit(cpu); + rcu_quiet_exit(); cpufreq_dbs_timer_resume(); return; } @@ -829,7 +829,7 @@ static void mwait_idle(void) if (!(lapic_timer_reliable_states & (1 << cx->type))) lapic_timer_on(); - rcu_idle_exit(cpu); + rcu_quiet_exit(); cpufreq_dbs_timer_resume(); if ( cpuidle_current_governor->reflect ) diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c index 96e8ca01c0..cfcc8bbc97 100644 --- a/xen/common/rcupdate.c +++ b/xen/common/rcupdate.c @@ -87,7 +87,7 @@ struct rcu_data { int cpu; long last_rs_qlen; /* qlen during the last resched */ - /* 3) idle CPUs handling */ + /* 3) idle (or in guest mode) CPUs handling */ struct timer cb_timer; bool cb_timer_active; @@ -115,6 +115,12 @@ boolean_param("rcu_force_quiesc", rcu_always_quiesc); * 3) it is stopped immediately, if the CPU wakes up from idle and * resumes 'normal' execution. * + * Note also that the same happens if a CPU starts executing a guest that + * (almost) never comes back into the hypervisor. This may be the case if + * the guest uses "idle=poll" / "vwfi=native". Therefore, we need to handle + * guest entry events in the same way as the CPU going idle, i.e., consider + * it quiesced and arm the timer. + * * About how far in the future the timer should be programmed each time, * it's hard to tell (guess!!). Since this mimics Linux's periodic timer * tick, take values used there as an indication. In Linux 2.6.21, tick @@ -362,9 +368,10 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * Make sure the increment of rcp->cur is visible so, even if a * CPU that is about to go idle, is captured inside rcp->cpumask, * rcu_pending() will return false, which then means cpu_quiet() - * will be invoked, before the CPU would actually enter idle. + * will be invoked, before the CPU would actually enter idle (or + * enter a guest). * - * This barrier is paired with the one in rcu_idle_enter(). + * This barrier is paired with the one in rcu_quit_enter(). */ smp_mb(); cpumask_andnot(&rcp->cpumask, &cpu_online_map, &rcp->ignore_cpumask); @@ -534,14 +541,15 @@ int rcu_needs_cpu(int cpu) * periodically poke rcu_pedning(), so that it will invoke the callback * not too late after the end of the grace period. */ -static void cb_timer_start(void) +static void cb_timer_start(unsigned int cpu) { - struct rcu_data *rdp = &this_cpu(rcu_data); + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); /* * Note that we don't check rcu_pending() here. In fact, we don't want * the timer armed on CPUs that are in the process of quiescing while - * going idle, unless they really are the ones with a queued callback. + * going idle or entering guest mode, unless they really have queued + * callbacks. */ if (likely(!rdp->curlist)) return; @@ -550,9 +558,9 @@ static void cb_timer_start(void) rdp->cb_timer_active = true; } -static void cb_timer_stop(void) +static void cb_timer_stop(unsigned int cpu) { - struct rcu_data *rdp = &this_cpu(rcu_data); + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); if (likely(!rdp->cb_timer_active)) return; @@ -716,11 +724,14 @@ void __init rcu_init(void) } /* - * The CPU is becoming idle, so no more read side critical - * sections, and one more step toward grace period. + * The CPU is becoming about to either idle or enter the guest. In any of + * these cases, it can't have any outstanding read side critical sections + * so this is one step toward the end of the grace period. */ -void rcu_idle_enter(unsigned int cpu) +void rcu_quiet_enter(void) { + unsigned int cpu = smp_processor_id(); + ASSERT(!cpumask_test_cpu(cpu, &rcu_ctrlblk.ignore_cpumask)); cpumask_set_cpu(cpu, &rcu_ctrlblk.ignore_cpumask); /* @@ -733,12 +744,14 @@ void rcu_idle_enter(unsigned int cpu) */ smp_mb(); - cb_timer_start(); + cb_timer_start(cpu); } -void rcu_idle_exit(unsigned int cpu) +void rcu_quiet_exit(void) { - cb_timer_stop(); + unsigned int cpu = smp_processor_id(); + + cb_timer_stop(cpu); ASSERT(cpumask_test_cpu(cpu, &rcu_ctrlblk.ignore_cpumask)); cpumask_clear_cpu(cpu, &rcu_ctrlblk.ignore_cpumask); } diff --git a/xen/include/xen/rcupdate.h b/xen/include/xen/rcupdate.h index d279e39022..7dd0b2d74b 100644 --- a/xen/include/xen/rcupdate.h +++ b/xen/include/xen/rcupdate.h @@ -179,7 +179,7 @@ void call_rcu(struct rcu_head *head, void rcu_barrier(void); -void rcu_idle_enter(unsigned int cpu); -void rcu_idle_exit(unsigned int cpu); +void rcu_quiet_enter(void); +void rcu_quiet_exit(void); #endif /* __XEN_RCUPDATE_H */ -- 2.25.1