|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [PATCH v2 for-4.21 2/9] x86/HPET: use single, global, low-priority vector for broadcast IRQ
On Mon, Oct 20, 2025 at 01:18:34PM +0200, Jan Beulich wrote:
> Using dynamically allocated / maintained vectors has several downsides:
> - possible nesting of IRQs due to the effects of IRQ migration,
> - reduction of vectors available for devices,
> - IRQs not moving as intended if there's shortage of vectors,
> - higher runtime overhead.
>
> As the vector also doesn't need to be of any priority (first and foremost
> it really shouldn't be of higher or same priority as the timer IRQ, as
> that raises TIMER_SOFTIRQ anyway), avoid any "ordinary" vectors altogether
> and use a vector from the 0x10...0x1f exception vector space. Exception vs
> interrupt can easily be distinguished by checking for the presence of an
> error code.
>
> With a fixed vector, less updating is now necessary in
> set_channel_irq_affinity(); in particular channels don't need transiently
> masking anymore, as the necessary update is now atomic. To fully leverage
> this, however, we want to stop using hpet_msi_set_affinity() there. With
> the transient masking dropped, we're no longer at risk of missing events.
>
> In principle a change to setup_vector_irq() would be necessary, but only
> if we used low-prio vectors as direct-APIC ones. Since the change would be
> at best benign here, it is being omitted.
>
> Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> Release-Acked-by: Oleksii Kurochko <oleksii.kurochko@xxxxxxxxx>
> ---
> This is an alternative proposal to
> https://lists.xen.org/archives/html/xen-devel/2014-03/msg00399.html.
>
> Should we keep hpet_msi_set_affinity() at all? We'd better not have the
> generic IRQ subsystem play with our IRQs' affinities ... (If so, this
> likely would want to be a separate patch, though.)
I think that needs to become a no-op, with possibly an ASSERT? Is it
possible for dom0 to try to balance this IRQ? I would think not.
> The hpet_enable_channel() call could in principle be made (effectively)
> conditional, at the price of introducing a check in hpet_enable_channel().
> However, as much as eliminating the masking didn't help with the many
> excess (early) IRQs I'm observing on Intel hardware, doing so doesn't help
> either.
Let's go for the current approach.
> The Fixes: tag indicates where the problem got significantly worse; in
> principle it was there already before (crashing at perhaps 6 or 7 levels
> of nested IRQs).
> ---
> v2: Re-work set_channel_irq_affinity() intensively. Re-base over the
> dropping of another patch. Drop setup_vector_irq() change.
>
> --- a/xen/arch/x86/hpet.c
> +++ b/xen/arch/x86/hpet.c
> @@ -9,17 +9,19 @@
> #include <xen/timer.h>
> #include <xen/smp.h>
> #include <xen/softirq.h>
> +#include <xen/cpuidle.h>
> #include <xen/irq.h>
> #include <xen/numa.h>
> #include <xen/param.h>
> #include <xen/sched.h>
>
> #include <asm/apic.h>
> -#include <asm/fixmap.h>
> #include <asm/div64.h>
> +#include <asm/fixmap.h>
> +#include <asm/genapic.h>
> #include <asm/hpet.h>
> +#include <asm/irq-vectors.h>
> #include <asm/msi.h>
> -#include <xen/cpuidle.h>
>
> #define MAX_DELTA_NS MILLISECS(10*1000)
> #define MIN_DELTA_NS MICROSECS(20)
> @@ -251,10 +253,9 @@ static void cf_check hpet_interrupt_hand
> ch->event_handler(ch);
> }
>
> -static void cf_check hpet_msi_unmask(struct irq_desc *desc)
> +static void hpet_enable_channel(struct hpet_event_channel *ch)
> {
> u32 cfg;
> - struct hpet_event_channel *ch = desc->action->dev_id;
>
> cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
> cfg |= HPET_TN_ENABLE;
> @@ -262,6 +263,11 @@ static void cf_check hpet_msi_unmask(str
> ch->msi.msi_attrib.host_masked = 0;
> }
>
> +static void cf_check hpet_msi_unmask(struct irq_desc *desc)
> +{
> + hpet_enable_channel(desc->action->dev_id);
> +}
> +
> static void hpet_disable_channel(struct hpet_event_channel *ch)
> {
> u32 cfg;
> @@ -307,15 +313,13 @@ static void cf_check hpet_msi_set_affini
> struct hpet_event_channel *ch = desc->action->dev_id;
> struct msi_msg msg = ch->msi.msg;
>
> - msg.dest32 = set_desc_affinity(desc, mask);
> - if ( msg.dest32 == BAD_APICID )
> - return;
> + /* This really is only for dump_irqs(). */
> + cpumask_copy(desc->arch.cpu_mask, mask);
>
> - msg.data &= ~MSI_DATA_VECTOR_MASK;
> - msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
> + msg.dest32 = cpu_mask_to_apicid(mask);
> msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
> msg.address_lo |= MSI_ADDR_DEST_ID(msg.dest32);
> - if ( msg.data != ch->msi.msg.data || msg.dest32 != ch->msi.msg.dest32 )
> + if ( msg.dest32 != ch->msi.msg.dest32 )
> hpet_msi_write(ch, &msg);
> }
>
> @@ -328,7 +332,7 @@ static hw_irq_controller hpet_msi_type =
> .shutdown = hpet_msi_shutdown,
> .enable = hpet_msi_unmask,
> .disable = hpet_msi_mask,
> - .ack = ack_nonmaskable_msi_irq,
> + .ack = irq_actor_none,
> .end = end_nonmaskable_irq,
> .set_affinity = hpet_msi_set_affinity,
> };
> @@ -347,6 +351,12 @@ static int __init hpet_setup_msi_irq(str
> u32 cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
> irq_desc_t *desc = irq_to_desc(ch->msi.irq);
>
> + clear_irq_vector(ch->msi.irq);
> + ret = bind_irq_vector(ch->msi.irq, HPET_BROADCAST_VECTOR,
> &cpu_online_map);
> + if ( ret )
> + return ret;
> + cpumask_setall(desc->affinity);
> +
> if ( iommu_intremap != iommu_intremap_off )
> {
> ch->msi.hpet_id = hpet_blockid;
> @@ -476,19 +486,50 @@ static struct hpet_event_channel *hpet_g
> static void set_channel_irq_affinity(struct hpet_event_channel *ch)
> {
> struct irq_desc *desc = irq_to_desc(ch->msi.irq);
> + struct msi_msg msg = ch->msi.msg;
>
> ASSERT(!local_irq_is_enabled());
> spin_lock(&desc->lock);
> - hpet_msi_mask(desc);
> - hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
> - hpet_msi_unmask(desc);
> +
> + per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
> +
> + /*
> + * Open-coding a reduced form of hpet_msi_set_affinity() here. With the
> + * actual update below (either of the IRTE or of [just] message address;
> + * with interrupt remapping message address/data don't change) now being
> + * atomic, we can avoid masking the IRQ around the update. As a result
> + * we're no longer at risk of missing IRQs (provided
> hpet_broadcast_enter()
> + * keeps setting the new deadline only afterwards).
> + */
> + cpumask_copy(desc->arch.cpu_mask, cpumask_of(ch->cpu));
> +
> spin_unlock(&desc->lock);
>
> - spin_unlock(&ch->lock);
> + msg.dest32 = cpu_physical_id(ch->cpu);
> + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
> + msg.address_lo |= MSI_ADDR_DEST_ID(msg.dest32);
> + if ( msg.dest32 != ch->msi.msg.dest32 )
> + {
> + ch->msi.msg = msg;
> +
> + if ( iommu_intremap != iommu_intremap_off )
> + {
> + int rc = iommu_update_ire_from_msi(&ch->msi, &msg);
>
> - /* We may have missed an interrupt due to the temporary masking. */
> - if ( ch->event_handler && ch->next_event < NOW() )
> - ch->event_handler(ch);
> + ASSERT(rc <= 0);
> + if ( rc > 0 )
> + {
> + ASSERT(msg.data == hpet_read32(HPET_Tn_ROUTE(ch->idx)));
> + ASSERT(msg.address_lo ==
> + hpet_read32(HPET_Tn_ROUTE(ch->idx) + 4));
> + }
The sequence of asserts seems wrong here: the asserts inside the
"rc > 0" check will never trigger, because there's an ASSERT(rc <= 0)
ahead of them?
> + }
> + else
> + hpet_write32(msg.address_lo, HPET_Tn_ROUTE(ch->idx) + 4);
If you avoid the HPET register update here you possibly need to make
sure that both fields are unconditionally written on the first call
after resume from suspension. hpet_resume() needs to somehow taint
the channels to signal that a re-write of the address and data fields
is mandatory regardless of what iommu_update_ire_from_msi() has
returned.
Thanks, Roger.
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |