Re: [Xen-devel] [PATCH v3 07/26] ARM: GICv3 ITS: introduce host LPI array



On Fri, 31 Mar 2017, Andre Przywara wrote:
> The number of LPIs on a host can potentially be huge (millions),
> although in practice it will mostly be a reasonable number. So
> prematurely allocating an array of struct irq_desc's for each LPI is
> not an option.
> However, Xen itself does not care about LPIs, as every LPI will be
> injected into a guest (Dom0 for now).
> Create a dense data structure (8 bytes) for each LPI, which holds just
> enough information to determine the virtual IRQ number and the VCPU
> into which the LPI needs to be injected.
> Also, to avoid artificially limiting the number of LPIs, we create a
> 2-level table for holding those structures.
> This patch introduces functions to initialize these tables and to
> create, look up and destroy entries for a given LPI.
> By using the naturally atomic access guarantee that the native
> uint64_t data type gives us, we allocate and access LPI information
> in a way that does not require a lock.
> 
> Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>

See alpine.DEB.2.10.1703221552490.8001@sstabellini-ThinkPad-X260.

I'll stop here for now; I think there are enough comments already for
another version.
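
For anyone following the design, the lockless scheme the commit message
describes boils down to snapshotting the 8-byte union in one atomic load
and then working on the local copy. A minimal sketch (not part of the
patch; the helper name is made up, and it assumes the read_u64_atomic
counterpart to the write_u64_atomic used below):

    #include <xen/sched.h>      /* DOMID_INVALID */
    #include <asm/atomic.h>
    #include <asm/gic.h>        /* INVALID_LPI */

    union host_lpi {
        uint64_t data;
        struct {
            uint32_t virt_lpi;
            uint16_t dom_id;
            uint16_t vcpu_id;
        };
    };

    static uint32_t example_get_virt_lpi(union host_lpi *entry)
    {
        union host_lpi hlpi;

        /* One atomic 64-bit load captures a consistent view of all fields. */
        hlpi.data = read_u64_atomic(&entry->data);

        /* Unallocated entries carry DOMID_INVALID; nothing to inject. */
        if ( hlpi.dom_id == DOMID_INVALID )
            return INVALID_LPI;

        return hlpi.virt_lpi;
    }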

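The two-level table indexing is likewise easy to check with concrete
numbers. A sketch assuming 4K pages, where each second-level page holds
PAGE_SIZE / sizeof(union host_lpi) = 512 entries, and using host LPI
9426 as a made-up example:

    /* plpi has already had LPI_OFFSET (8192) subtracted: 9426 -> 1234. */
    unsigned int chunk = plpi / HOST_LPIS_PER_PAGE;   /* 1234 / 512 = 2   */
    unsigned int index = plpi % HOST_LPIS_PER_PAGE;   /* 1234 % 512 = 210 */
    union host_lpi *hlpi = &lpi_data.host_lpis[chunk][index];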

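Finally, the ITT sizing in gicv3_its_map_guest_device() below may be
easier to follow with a worked example (the 40-event device is
hypothetical):

    unsigned int nr_events = 40;    /* hypothetical device */

    /* Host LPIs are handed out in LPI_BLOCK (32) chunks: 2 blocks here. */
    unsigned int nr_blocks = DIV_ROUND_UP(nr_events, LPI_BLOCK);

    /*
     * MAPD's size field encodes the number of EventID bits minus one:
     * ROUNDUP(40, 32) = 64, fls(64 - 1) = 6, so 6 - 1 = 5 is sent,
     * giving a 64-entry ITT.
     */
    unsigned int size = fls(ROUNDUP(nr_events, LPI_BLOCK) - 1) - 1;
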
> ---
>  xen/arch/arm/gic-v3-its.c        |  89 +++++++++++++++++-
>  xen/arch/arm/gic-v3-lpi.c        | 196 +++++++++++++++++++++++++++++++++++++++
>  xen/include/asm-arm/gic.h        |   2 +
>  xen/include/asm-arm/gic_v3_its.h |   5 +
>  xen/include/asm-arm/irq.h        |   5 +
>  5 files changed, 295 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/gic-v3-its.c b/xen/arch/arm/gic-v3-its.c
> index 295f7dc..fa284e7 100644
> --- a/xen/arch/arm/gic-v3-its.c
> +++ b/xen/arch/arm/gic-v3-its.c
> @@ -151,6 +151,20 @@ static int its_send_cmd_sync(struct host_its *its, unsigned int cpu)
>      return its_send_command(its, cmd);
>  }
>  
> +static int its_send_cmd_mapti(struct host_its *its,
> +                              uint32_t deviceid, uint32_t eventid,
> +                              uint32_t pintid, uint16_t icid)
> +{
> +    uint64_t cmd[4];
> +
> +    cmd[0] = GITS_CMD_MAPTI | ((uint64_t)deviceid << 32);
> +    cmd[1] = eventid | ((uint64_t)pintid << 32);
> +    cmd[2] = icid;
> +    cmd[3] = 0x00;
> +
> +    return its_send_command(its, cmd);
> +}
> +
>  static int its_send_cmd_mapc(struct host_its *its, uint32_t collection_id,
>                               unsigned int cpu)
>  {
> @@ -185,6 +199,19 @@ static int its_send_cmd_mapd(struct host_its *its, uint32_t deviceid,
>      return its_send_command(its, cmd);
>  }
>  
> +static int its_send_cmd_inv(struct host_its *its,
> +                            uint32_t deviceid, uint32_t eventid)
> +{
> +    uint64_t cmd[4];
> +
> +    cmd[0] = GITS_CMD_INV | ((uint64_t)deviceid << 32);
> +    cmd[1] = eventid;
> +    cmd[2] = 0x00;
> +    cmd[3] = 0x00;
> +
> +    return its_send_command(its, cmd);
> +}
> +
>  /* Set up the (1:1) collection mapping for the given host CPU. */
>  int gicv3_its_setup_collection(unsigned int cpu)
>  {
> @@ -469,7 +496,7 @@ int gicv3_its_init(void)
>  
>  static int remove_mapped_guest_device(struct its_devices *dev)
>  {
> -    int ret;
> +    int ret, i;
>  
>      if ( dev->hw_its )
>      {
> @@ -479,12 +506,16 @@ static int remove_mapped_guest_device(struct its_devices *dev)
>              return ret;
>      }
>  
> +    for ( i = 0; i < DIV_ROUND_UP(dev->eventids, LPI_BLOCK); i++ )
> +        gicv3_free_host_lpi_block(dev->host_lpi_blocks[i]);
> +
>      ret = gicv3_its_wait_commands(dev->hw_its);
>      if ( ret )
>          return ret;
>  
>      xfree(dev->itt_addr);
>      xfree(dev->pend_irqs);
> +    xfree(dev->host_lpi_blocks);
>      xfree(dev);
>  
>      return 0;
> @@ -522,6 +553,37 @@ static int compare_its_guest_devices(struct its_devices *dev,
>  }
>  
>  /*
> + * On the host ITS @its, map @nr_events consecutive LPIs.
> + * The mapping connects a device @devid and event @eventid pair to LPI @lpi,
> + * increasing both @eventid and @lpi to cover the number of requested LPIs.
> + */
> +static int gicv3_its_map_host_events(struct host_its *its,
> +                                     uint32_t devid, uint32_t eventid,
> +                                     uint32_t lpi, uint32_t nr_events)
> +{
> +    uint32_t i;
> +    int ret;
> +
> +    for ( i = 0; i < nr_events; i++ )
> +    {
> +        /* For now we map every host LPI to host CPU 0 */
> +        ret = its_send_cmd_mapti(its, devid, eventid + i, lpi + i, 0);
> +        if ( ret )
> +            return ret;
> +
> +        ret = its_send_cmd_inv(its, devid, eventid + i);
> +        if ( ret )
> +            return ret;
> +    }
> +
> +    ret = its_send_cmd_sync(its, 0);
> +    if ( ret )
> +        return ret;
> +
> +    return gicv3_its_wait_commands(its);
> +}
> +
> +/*
>   * Map a hardware device, identified by a certain host ITS and its device ID
>   * to domain d, a guest ITS (identified by its doorbell address) and device ID.
>   * Also provide the number of events (MSIs) needed for that device.
> @@ -537,7 +599,7 @@ int gicv3_its_map_guest_device(struct domain *d,
>      struct host_its *hw_its;
>      struct its_devices *dev = NULL;
>      struct rb_node **new = &d->arch.vgic.its_devices.rb_node, *parent = NULL;
> -    int ret = -ENOENT;
> +    int ret = -ENOENT, i;
>  
>      hw_its = gicv3_its_find_by_doorbell(host_doorbell);
>      if ( !hw_its )
> @@ -595,6 +657,11 @@ int gicv3_its_map_guest_device(struct domain *d,
>      if ( !dev->pend_irqs )
>          goto out_unlock;
>  
> +    dev->host_lpi_blocks = xzalloc_array(uint32_t,
> +                                         DIV_ROUND_UP(nr_events, LPI_BLOCK));
> +    if ( !dev->host_lpi_blocks )
> +        goto out_unlock;
> +
>      ret = its_send_cmd_mapd(hw_its, host_devid,
>                              fls(ROUNDUP(nr_events, LPI_BLOCK) - 1) - 1,
>                              virt_to_maddr(itt_addr), true);
> @@ -613,10 +680,28 @@ int gicv3_its_map_guest_device(struct domain *d,
>  
>      spin_unlock(&d->arch.vgic.its_devices_lock);
>  
> +    /*
> +     * Map all host LPIs for this device up front. We can't afford to queue
> +     * any host ITS commands later on during the guest's runtime.
> +     */
> +    for ( i = 0; i < DIV_ROUND_UP(nr_events, LPI_BLOCK); i++ )
> +    {
> +        ret = gicv3_allocate_host_lpi_block(d, &dev->host_lpi_blocks[i]);
> +        if ( ret < 0 )
> +            goto out;
> +
> +        ret = gicv3_its_map_host_events(hw_its, host_devid, i * LPI_BLOCK,
> +                                        dev->host_lpi_blocks[i], LPI_BLOCK);
> +        if ( ret < 0 )
> +            goto out;
> +    }
> +
>      return 0;
>  
>  out_unlock:
>      spin_unlock(&d->arch.vgic.its_devices_lock);
> +
> +out:
>      if ( dev )
>      {
>          xfree(dev->pend_irqs);
> diff --git a/xen/arch/arm/gic-v3-lpi.c b/xen/arch/arm/gic-v3-lpi.c
> index d85d63d..d642cc5 100644
> --- a/xen/arch/arm/gic-v3-lpi.c
> +++ b/xen/arch/arm/gic-v3-lpi.c
> @@ -20,25 +20,55 @@
>  
>  #include <xen/lib.h>
>  #include <xen/mm.h>
> +#include <xen/sched.h>
>  #include <xen/sizes.h>
> +#include <asm/atomic.h>
> +#include <asm/domain.h>
>  #include <asm/gic.h>
>  #include <asm/gic_v3_defs.h>
>  #include <asm/gic_v3_its.h>
>  #include <asm/io.h>
>  #include <asm/page.h>
>  
> +/*
> + * There could be a lot of LPIs on the host side, and they always go to
> + * a guest. So having a struct irq_desc for each of them would be wasteful
> + * and useless.
> + * Instead just store enough information to find the right VCPU to inject
> + * those LPIs into, which just requires the virtual LPI number.
> + * To avoid a global lock on this data structure, this is using a lockless
> + * approach relying on the architectural atomicity of native data types:
> + * We read or write the "data" view of this union atomically, then can
> + * access the broken-down fields in our local copy.
> + */
> +union host_lpi {
> +    uint64_t data;
> +    struct {
> +        uint32_t virt_lpi;
> +        uint16_t dom_id;
> +        uint16_t vcpu_id;
> +    };
> +};
> +
>  #define LPI_PROPTABLE_NEEDS_FLUSHING    (1U << 0)
>  /* Global state */
>  static struct {
>      /* The global LPI property table, shared by all redistributors. */
>      uint8_t *lpi_property;
>      /*
> +     * A two-level table to look up, for each LPI firing on the host,
> +     * the VCPU and virtual LPI number to inject.
> +     */
> +    union host_lpi **host_lpis;
> +    /*
>       * Number of physical LPIs the host supports. This is a property of
>       * the GIC hardware. We depart from the habit of naming these things
>       * "physical" in Xen, as the GICv3/4 spec uses the term "physical LPI"
>       * in a different context to differentiate them from "virtual LPIs".
>       */
>      unsigned long int nr_host_lpis;
> +    /* Protects allocation and deallocation of host LPIs, but not the access */
> +    spinlock_t host_lpis_lock;
>      unsigned int flags;
>  } lpi_data;
>  
> @@ -51,6 +81,19 @@ struct lpi_redist_data {
>  static DEFINE_PER_CPU(struct lpi_redist_data, lpi_redist);
>  
>  #define MAX_PHYS_LPIS   (lpi_data.nr_host_lpis - LPI_OFFSET)
> +#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))
> +
> +static union host_lpi *gic_get_host_lpi(uint32_t plpi)
> +{
> +    if ( !is_lpi(plpi) || plpi >= MAX_PHYS_LPIS + LPI_OFFSET )
> +        return NULL;
> +
> +    plpi -= LPI_OFFSET;
> +    if ( !lpi_data.host_lpis[plpi / HOST_LPIS_PER_PAGE] )
> +        return NULL;
> +
> +    return &lpi_data.host_lpis[plpi / HOST_LPIS_PER_PAGE][plpi % HOST_LPIS_PER_PAGE];
> +}
>  
>  /* Stores this redistributor's physical address and ID in a per-CPU variable */
>  void gicv3_set_redist_address(paddr_t address, unsigned int redist_id)
> @@ -212,15 +255,168 @@ int gicv3_lpi_init_rdist(void __iomem * rdist_base)
>  static unsigned int max_lpi_bits = 20;
>  integer_param("max_lpi_bits", max_lpi_bits);
>  
> +/*
> + * Allocate the 2nd level array for host LPIs. This one holds pointers
> + * to the page with the actual "union host_lpi" entries. Our LPI limit
> + * avoids excessive memory usage.
> + */
>  int gicv3_lpi_init_host_lpis(unsigned int hw_lpi_bits)
>  {
> +    int nr_lpi_ptrs;
> +
> +    /* We rely on the data structure being atomically accessible. */
> +    BUILD_BUG_ON(sizeof(union host_lpi) > sizeof(unsigned long));
> +
>      lpi_data.nr_host_lpis = BIT_ULL(min(hw_lpi_bits, max_lpi_bits));
>  
> +    spin_lock_init(&lpi_data.host_lpis_lock);
> +
> +    nr_lpi_ptrs = MAX_PHYS_LPIS / (PAGE_SIZE / sizeof(union host_lpi));
> +    lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs);
> +    if ( !lpi_data.host_lpis )
> +        return -ENOMEM;
> +
>      printk("GICv3: using at most %lu LPIs on the host.\n", MAX_PHYS_LPIS);
>  
>      return 0;
>  }
>  
> +static int find_unused_host_lpi(uint32_t start, uint32_t *index)
> +{
> +    unsigned int chunk;
> +    uint32_t i = *index;
> +
> +    ASSERT(spin_is_locked(&lpi_data.host_lpis_lock));
> +
> +    for ( chunk = start; chunk < MAX_PHYS_LPIS / HOST_LPIS_PER_PAGE; chunk++ )
> +    {
> +        /* If we hit an unallocated chunk, use entry 0 in that one. */
> +        if ( !lpi_data.host_lpis[chunk] )
> +        {
> +            *index = 0;
> +            return chunk;
> +        }
> +
> +        /* Find an unallocated entry in this chunk. */
> +        for ( ; i < HOST_LPIS_PER_PAGE; i += LPI_BLOCK )
> +        {
> +            if ( lpi_data.host_lpis[chunk][i].dom_id == DOMID_INVALID )
> +            {
> +                *index = i;
> +                return chunk;
> +            }
> +        }
> +        i = 0;
> +    }
> +
> +    return -1;
> +}
> +
> +/*
> + * Allocate a block of 32 LPIs on the given host ITS for device "devid",
> + * starting with "eventid". Put them into the respective ITT by issuing a
> + * MAPTI command for each of them.
> + */
> +int gicv3_allocate_host_lpi_block(struct domain *d, uint32_t *first_lpi)
> +{
> +    static uint32_t next_lpi = 0;
> +    uint32_t lpi, lpi_idx = next_lpi % HOST_LPIS_PER_PAGE;
> +    int chunk;
> +    int i;
> +
> +    spin_lock(&lpi_data.host_lpis_lock);
> +    chunk = find_unused_host_lpi(next_lpi / HOST_LPIS_PER_PAGE, &lpi_idx);
> +
> +    if ( chunk == -1 )           /* rescan for a hole from the beginning */
> +    {
> +        lpi_idx = 0;
> +        chunk = find_unused_host_lpi(0, &lpi_idx);
> +        if ( chunk == -1 )
> +        {
> +            spin_unlock(&lpi_data.host_lpis_lock);
> +            return -ENOSPC;
> +        }
> +    }
> +
> +    /* If we hit an unallocated chunk, we initialize it and use entry 0. */
> +    if ( !lpi_data.host_lpis[chunk] )
> +    {
> +        union host_lpi *new_chunk;
> +
> +        /* TODO: NUMA locality for quicker IRQ path? */
> +        new_chunk = xmalloc_bytes(PAGE_SIZE);
> +        if ( !new_chunk )
> +        {
> +            spin_unlock(&lpi_data.host_lpis_lock);
> +            return -ENOMEM;
> +        }
> +
> +        for ( i = 0; i < HOST_LPIS_PER_PAGE; i += LPI_BLOCK )
> +            new_chunk[i].dom_id = DOMID_INVALID;
> +
> +        lpi_data.host_lpis[chunk] = new_chunk;
> +        lpi_idx = 0;
> +    }
> +
> +    lpi = chunk * HOST_LPIS_PER_PAGE + lpi_idx;
> +
> +    for ( i = 0; i < LPI_BLOCK; i++ )
> +    {
> +        union host_lpi hlpi;
> +
> +        /*
> +         * Mark this host LPI as belonging to the domain, but don't assign
> +         * any virtual LPI or a VCPU yet.
> +         */
> +        hlpi.virt_lpi = INVALID_LPI;
> +        hlpi.dom_id = d->domain_id;
> +        hlpi.vcpu_id = ~0;
> +        write_u64_atomic(&lpi_data.host_lpis[chunk][lpi_idx + i].data,
> +                         hlpi.data);
> +
> +        /*
> +         * Enable this host LPI, so we don't have to do this during the
> +         * guest's runtime.
> +         */
> +        lpi_data.lpi_property[lpi + i] |= LPI_PROP_ENABLED;
> +    }
> +
> +    /*
> +     * We have allocated and initialized the host LPI entries, so it's safe
> +     * to drop the lock now. Access to the structures can be done concurrently
> +     * as it involves only an atomic uint64_t access.
> +     */
> +    spin_unlock(&lpi_data.host_lpis_lock);
> +
> +    if ( lpi_data.flags & LPI_PROPTABLE_NEEDS_FLUSHING )
> +        clean_and_invalidate_dcache_va_range(&lpi_data.lpi_property[lpi],
> +                                             LPI_BLOCK);
> +
> +    next_lpi = lpi + LPI_BLOCK;
> +    *first_lpi = lpi + LPI_OFFSET;
> +
> +    return 0;
> +}
> +
> +void gicv3_free_host_lpi_block(uint32_t first_lpi)
> +{
> +    union host_lpi *hlpi, empty_lpi = { .dom_id = DOMID_INVALID };
> +    int i;
> +
> +    hlpi = gic_get_host_lpi(first_lpi);
> +    if ( !hlpi )
> +        return;         /* Nothing to free here. */
> +
> +    spin_lock(&lpi_data.host_lpis_lock);
> +
> +    for ( i = 0; i < LPI_BLOCK; i++ )
> +        write_u64_atomic(&hlpi[i].data, empty_lpi.data);
> +
> +    spin_unlock(&lpi_data.host_lpis_lock);
> +
> +    return;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/include/asm-arm/gic.h b/xen/include/asm-arm/gic.h
> index 836a103..d04bd04 100644
> --- a/xen/include/asm-arm/gic.h
> +++ b/xen/include/asm-arm/gic.h
> @@ -220,6 +220,8 @@ enum gic_version {
>      GIC_V3,
>  };
>  
> +#define INVALID_LPI     0
> +
>  extern enum gic_version gic_hw_version(void);
>  
>  /* Program the IRQ type into the GIC */
> diff --git a/xen/include/asm-arm/gic_v3_its.h b/xen/include/asm-arm/gic_v3_its.h
> index 4ade5f6..7b47596 100644
> --- a/xen/include/asm-arm/gic_v3_its.h
> +++ b/xen/include/asm-arm/gic_v3_its.h
> @@ -106,6 +106,9 @@
>  #define HOST_ITS_FLUSH_CMD_QUEUE        (1U << 0)
>  #define HOST_ITS_USES_PTA               (1U << 1)
>  
> +/* We allocate LPIs on the hosts in chunks of 32 to reduce handling overhead. */
> +#define LPI_BLOCK                       32
> +
>  /* data structure for each hardware ITS */
>  struct host_its {
>      struct list_head entry;
> @@ -153,6 +156,8 @@ int gicv3_its_map_guest_device(struct domain *d,
>                                 paddr_t guest_doorbell, uint32_t guest_devid,
>                                 uint32_t nr_events, bool valid);
>  void gicv3_its_unmap_all_devices(struct domain *d);
> +int gicv3_allocate_host_lpi_block(struct domain *d, uint32_t *first_lpi);
> +void gicv3_free_host_lpi_block(uint32_t first_lpi);
>  
>  #else
>  
> diff --git a/xen/include/asm-arm/irq.h b/xen/include/asm-arm/irq.h
> index 13528c0..d16affc 100644
> --- a/xen/include/asm-arm/irq.h
> +++ b/xen/include/asm-arm/irq.h
> @@ -42,6 +42,11 @@ struct irq_desc *__irq_to_desc(int irq);
>  
>  void do_IRQ(struct cpu_user_regs *regs, unsigned int irq, int is_fiq);
>  
> +static inline bool is_lpi(unsigned int irq)
> +{
> +    return irq >= LPI_OFFSET;
> +}
> +
>  #define domain_pirq_to_irq(d, pirq) (pirq)
>  
>  bool_t is_assignable_irq(unsigned int irq);
> -- 
> 2.9.0
> 
