
Re: [Xen-devel] [PATCH v5 09/30] ARM: GICv3 ITS: introduce host LPI array



On Thu, 6 Apr 2017, Andre Przywara wrote:
> The number of LPIs on a host can potentially be huge (millions),
> although in practice it will mostly be reasonable. So prematurely allocating
> an array of struct irq_desc's for each LPI is not an option.
> However Xen itself does not care about LPIs, as every LPI will be injected
> into a guest (Dom0 for now).
> Create a dense data structure (8 bytes) for each LPI which holds just
> enough information to determine the virtual IRQ number and the VCPU into
> which the LPI needs to be injected.
> Also, to avoid artificially limiting the number of LPIs, we create a 2-level
> table for holding those structures.
> This patch introduces functions to initialize these tables and to
> create, lookup and destroy entries for a given LPI.
> By using the natural atomicity of accesses to the native uint64_t data
> type, we allocate and access LPI information in a way that does not
> require a lock.
> 
> Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>
> ---
>  xen/arch/arm/gic-v3-its.c        |  60 +++++++++++
>  xen/arch/arm/gic-v3-lpi.c        | 227 +++++++++++++++++++++++++++++++++++++++
>  xen/include/asm-arm/gic_v3_its.h |   6 ++
>  xen/include/asm-arm/irq.h        |   8 ++
>  4 files changed, 301 insertions(+)
> 
> diff --git a/xen/arch/arm/gic-v3-its.c b/xen/arch/arm/gic-v3-its.c
> index 1ecd63b..eb47c9d 100644
> --- a/xen/arch/arm/gic-v3-its.c
> +++ b/xen/arch/arm/gic-v3-its.c
> @@ -157,6 +157,20 @@ static int its_send_cmd_sync(struct host_its *its, unsigned int cpu)
>      return its_send_command(its, cmd);
>  }
>  
> +static int its_send_cmd_mapti(struct host_its *its,
> +                              uint32_t deviceid, uint32_t eventid,
> +                              uint32_t pintid, uint16_t icid)
> +{
> +    uint64_t cmd[4];
> +
> +    cmd[0] = GITS_CMD_MAPTI | ((uint64_t)deviceid << 32);
> +    cmd[1] = eventid | ((uint64_t)pintid << 32);
> +    cmd[2] = icid;
> +    cmd[3] = 0x00;
> +
> +    return its_send_command(its, cmd);
> +}
> +
>  static int its_send_cmd_mapc(struct host_its *its, uint32_t collection_id,
>                               unsigned int cpu)
>  {
> @@ -171,6 +185,19 @@ static int its_send_cmd_mapc(struct host_its *its, uint32_t collection_id,
>      return its_send_command(its, cmd);
>  }
>  
> +static int its_send_cmd_inv(struct host_its *its,
> +                            uint32_t deviceid, uint32_t eventid)
> +{
> +    uint64_t cmd[4];
> +
> +    cmd[0] = GITS_CMD_INV | ((uint64_t)deviceid << 32);
> +    cmd[1] = eventid;
> +    cmd[2] = 0x00;
> +    cmd[3] = 0x00;
> +
> +    return its_send_command(its, cmd);
> +}
> +
>  /* Set up the (1:1) collection mapping for the given host CPU. */
>  int gicv3_its_setup_collection(unsigned int cpu)
>  {
> @@ -450,6 +477,39 @@ int gicv3_its_init(void)
>      return 0;
>  }
>  
> +/*
> + * On the host ITS @its, map @nr_events consecutive LPIs.
> + * The mapping connects a device @devid and event @eventid pair to LPI @lpi,
> + * increasing both @eventid and @lpi to cover the number of requested LPIs.
> + */
> +static int gicv3_its_map_host_events(struct host_its *its,
> +                                     uint32_t devid, uint32_t eventid,
> +                                     uint32_t lpi, uint32_t nr_events)
> +{
> +    uint32_t i;
> +    int ret;
> +
> +    for ( i = 0; i < nr_events; i++ )
> +    {
> +        /* For now we map every host LPI to host CPU 0 */
> +        ret = its_send_cmd_mapti(its, devid, eventid + i, lpi + i, 0);
> +        if ( ret )
> +            return ret;
> +
> +        ret = its_send_cmd_inv(its, devid, eventid + i);
> +        if ( ret )
> +            return ret;
> +    }
> +
> +    /* TODO: Consider using INVALL here. Didn't work on the model, though. */
> +
> +    ret = its_send_cmd_sync(its, 0);
> +    if ( ret )
> +        return ret;
> +
> +    return gicv3_its_wait_commands(its);
> +}
> +
>  /* Scan the DT for any ITS nodes and create a list of host ITSes out of it. */
>  void gicv3_its_dt_init(const struct dt_device_node *node)
>  {
> diff --git a/xen/arch/arm/gic-v3-lpi.c b/xen/arch/arm/gic-v3-lpi.c
> index 9d3df7f..0785701 100644
> --- a/xen/arch/arm/gic-v3-lpi.c
> +++ b/xen/arch/arm/gic-v3-lpi.c
> @@ -20,14 +20,37 @@
>  
>  #include <xen/lib.h>
>  #include <xen/mm.h>
> +#include <xen/sched.h>
>  #include <xen/sizes.h>
>  #include <xen/warning.h>
> +#include <asm/atomic.h>
> +#include <asm/domain.h>
>  #include <asm/gic.h>
>  #include <asm/gic_v3_defs.h>
>  #include <asm/gic_v3_its.h>
>  #include <asm/io.h>
>  #include <asm/page.h>
>  
> +/*
> + * There could be a lot of LPIs on the host side, and they always go to
> + * a guest. So having a struct irq_desc for each of them would be wasteful
> + * and useless.
> + * Instead just store enough information to find the right VCPU to inject
> + * those LPIs into, along with the virtual LPI number to use.
> + * To avoid a global lock on this data structure, we use a lockless
> + * approach relying on the architectural atomicity of native data types:
> + * We read or write the "data" view of this union atomically, then can
> + * access the broken-down fields in our local copy.
> + */
> +union host_lpi {
> +    uint64_t data;
> +    struct {
> +        uint32_t virt_lpi;
> +        uint16_t dom_id;
> +        uint16_t vcpu_id;
> +    };
> +};
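
For readers following along: the intended lockless pattern is to snapshot
the whole 8-byte union in one access and then pick the fields out of the
local copy. A minimal sketch (assuming read_u64_atomic is available as the
counterpart of the write_u64_atomic used below, and hlpi_ptr is what
gic_get_host_lpi returns):

    union host_lpi hlpi;

    /* One atomic 64-bit read gives a consistent snapshot, no lock held. */
    hlpi.data = read_u64_atomic(&hlpi_ptr->data);
    /* hlpi.virt_lpi, hlpi.dom_id and hlpi.vcpu_id can now be used freely. */
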
> +
>  #define LPI_PROPTABLE_NEEDS_FLUSHING    (1U << 0)
>  
>  /* Global state */
> @@ -35,12 +58,23 @@ static struct {
>      /* The global LPI property table, shared by all redistributors. */
>      uint8_t *lpi_property;
>      /*
> +     * A two-level table to look up LPIs firing on the host, yielding the
> +     * VCPU and virtual LPI number to inject.
> +     */
> +    union host_lpi **host_lpis;
> +    /*
>       * Number of physical LPIs the host supports. This is a property of
>       * the GIC hardware. We depart from the habit of naming these things
>       * "physical" in Xen, as the GICv3/4 spec uses the term "physical LPI"
>       * in a different context to differentiate them from "virtual LPIs".
>       */
>      unsigned long long int max_host_lpi_ids;
> +    /*
> +     * Protects allocation and deallocation of host LPIs and next_free_lpi,
> +     * but not the actual data stored in the host_lpi entry.
> +     */
> +    spinlock_t host_lpis_lock;
> +    uint32_t next_free_lpi;
>      unsigned int flags;
>  } lpi_data;
>  
> @@ -53,6 +87,28 @@ struct lpi_redist_data {
>  static DEFINE_PER_CPU(struct lpi_redist_data, lpi_redist);
>  
>  #define MAX_NR_HOST_LPIS   (lpi_data.max_host_lpi_ids - LPI_OFFSET)
> +#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))
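
To put numbers on this: with 4 KiB pages and the 8-byte union host_lpi,
HOST_LPIS_PER_PAGE is 512. The default max_lpi_bits of 20 then requires
(2^20 - 8192) / 512 = 2032 first-level pointers (about 16 KiB), while the
4 KiB second-level pages are only allocated on demand.
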
> +
> +static union host_lpi *gic_get_host_lpi(uint32_t plpi)
> +{
> +    union host_lpi *block;
> +
> +    if ( !is_lpi(plpi) || plpi >= MAX_NR_HOST_LPIS + LPI_OFFSET )
> +        return NULL;
> +
> +    ASSERT(plpi >= LPI_OFFSET);
> +
> +    plpi -= LPI_OFFSET;
> +
> +    block = lpi_data.host_lpis[plpi / HOST_LPIS_PER_PAGE];
> +    if ( !block )
> +        return NULL;
> +
> +    /* Matches the write barrier in allocation code. */
> +    smp_rmb();
> +
> +    return &block[plpi % HOST_LPIS_PER_PAGE];
> +}
>  
>  /*
>   * An ITS can refer to redistributors in two ways: either by an ID (possibly
> @@ -220,8 +276,18 @@ int gicv3_lpi_init_rdist(void __iomem * rdist_base)
>  static unsigned int max_lpi_bits = 20;
>  integer_param("max_lpi_bits", max_lpi_bits);
>  
> +/*
> + * Allocate the first-level array for host LPIs, which holds pointers
> + * to the pages with the actual "union host_lpi" entries. Those pages
> + * are only allocated on demand, so our LPI limit avoids excessive
> + * memory usage.
> + */
>  int gicv3_lpi_init_host_lpis(unsigned int host_lpi_bits)
>  {
> +    unsigned int nr_lpi_ptrs;
> +
> +    /* We rely on the data structure being atomically accessible. */
> +    BUILD_BUG_ON(sizeof(union host_lpi) > sizeof(unsigned long));
> +
>      /* An implementation needs to support at least 14 bits of LPI IDs. */
>      max_lpi_bits = max(max_lpi_bits, 14U);
>      lpi_data.max_host_lpi_ids = BIT_ULL(min(host_lpi_bits, max_lpi_bits));
> @@ -234,11 +300,172 @@ int gicv3_lpi_init_host_lpis(unsigned int host_lpi_bits)
>      if ( lpi_data.max_host_lpi_ids > BIT(24) )
>          warning_add("Using high number of LPIs, limit memory usage with max_lpi_bits\n");
>  
> +    spin_lock_init(&lpi_data.host_lpis_lock);
> +    lpi_data.next_free_lpi = 0;
> +
> +    nr_lpi_ptrs = MAX_NR_HOST_LPIS / (PAGE_SIZE / sizeof(union host_lpi));
> +    lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs);
> +    if ( !lpi_data.host_lpis )
> +        return -ENOMEM;
> +
>      printk("GICv3: using at most %llu LPIs on the host.\n", MAX_NR_HOST_LPIS);
>  
>      return 0;
>  }
>  
> +static int find_unused_host_lpi(uint32_t start, uint32_t *index)
> +{
> +    unsigned int chunk;
> +    uint32_t i = *index;
> +
> +    ASSERT(spin_is_locked(&lpi_data.host_lpis_lock));
> +
> +    for ( chunk = start;
> +          chunk < MAX_NR_HOST_LPIS / HOST_LPIS_PER_PAGE;
> +          chunk++ )
> +    {
> +        /* If we hit an unallocated chunk, use entry 0 in that one. */
> +        if ( !lpi_data.host_lpis[chunk] )
> +        {
> +            *index = 0;
> +            return chunk;
> +        }
> +
> +        /* Find an unallocated entry in this chunk. */
> +        for ( ; i < HOST_LPIS_PER_PAGE; i += LPI_BLOCK )
> +        {
> +            if ( lpi_data.host_lpis[chunk][i].dom_id == DOMID_INVALID )
> +            {
> +                *index = i;
> +                return chunk;
> +            }
> +        }
> +        i = 0;
> +    }
> +
> +    return -1;
> +}
> +
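
To illustrate the search with concrete numbers: for next_free_lpi = 1056
the caller passes start = 1056 / 512 = 2 and *index = 1056 % 512 = 32, so
the scan resumes right behind the last allocation, walks chunk 2 in
LPI_BLOCK steps, and then falls through to the following chunks (with the
index reset to 0) until it finds a free slot or an unallocated chunk.
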
> +/*
> + * Allocate a block of LPI_BLOCK host LPIs for the given domain and hand
> + * back the first LPI number in @first_lpi. The whole block is marked as
> + * belonging to that domain, but no virtual LPI or VCPU is assigned yet.
> + */
> +int gicv3_allocate_host_lpi_block(struct domain *d, uint32_t *first_lpi)
> +{
> +    uint32_t lpi, lpi_idx;
> +    int chunk;
> +    int i;
> +
> +    spin_lock(&lpi_data.host_lpis_lock);
> +    lpi_idx = lpi_data.next_free_lpi % HOST_LPIS_PER_PAGE;
> +    chunk = find_unused_host_lpi(lpi_data.next_free_lpi / HOST_LPIS_PER_PAGE,
> +                                 &lpi_idx);
> +
> +    if ( chunk == -1 )          /* rescan for a hole from the beginning */
> +    {
> +        lpi_idx = 0;
> +        chunk = find_unused_host_lpi(0, &lpi_idx);
> +        if ( chunk == -1 )
> +        {
> +            spin_unlock(&lpi_data.host_lpis_lock);
> +            return -ENOSPC;
> +        }
> +    }
> +
> +    /* If we hit an unallocated chunk, we initialize it and use entry 0. */
> +    if ( !lpi_data.host_lpis[chunk] )
> +    {
> +        union host_lpi *new_chunk;
> +
> +        /* TODO: NUMA locality for quicker IRQ path? */
> +        new_chunk = alloc_xenheap_page();
> +        if ( !new_chunk )
> +        {
> +            spin_unlock(&lpi_data.host_lpis_lock);
> +            return -ENOMEM;
> +        }
> +
> +        for ( i = 0; i < HOST_LPIS_PER_PAGE; i += LPI_BLOCK )
> +            new_chunk[i].dom_id = DOMID_INVALID;
> +
> +        /*
> +         * Make sure all slots are really marked empty before publishing the
> +         * new chunk.
> +         */
> +        smp_wmb();
> +
> +        lpi_data.host_lpis[chunk] = new_chunk;
> +        lpi_idx = 0;
> +    }
> +
> +    lpi = chunk * HOST_LPIS_PER_PAGE + lpi_idx;
> +
> +    for ( i = 0; i < LPI_BLOCK; i++ )
> +    {
> +        union host_lpi hlpi;
> +
> +        /*
> +         * Mark this host LPI as belonging to the domain, but don't assign
> +         * any virtual LPI or a VCPU yet.
> +         */
> +        hlpi.virt_lpi = INVALID_LPI;
> +        hlpi.dom_id = d->domain_id;
> +        hlpi.vcpu_id = INVALID_VCPU_ID;
> +        write_u64_atomic(&lpi_data.host_lpis[chunk][lpi_idx + i].data,
> +                         hlpi.data);
> +
> +        /*
> +         * Enable this host LPI, so we don't have to do this during the
> +         * guest's runtime.
> +         */
> +        lpi_data.lpi_property[lpi + i] |= LPI_PROP_ENABLED;
> +    }
> +
> +    lpi_data.next_free_lpi = lpi + LPI_BLOCK;
> +
> +    /*
> +     * We have allocated and initialized the host LPI entries, so it's safe
> +     * to drop the lock now. Access to the structures can be done concurrently
> +     * as it involves only an atomic uint64_t access.
> +     */
> +    spin_unlock(&lpi_data.host_lpis_lock);
> +
> +    if ( lpi_data.flags & LPI_PROPTABLE_NEEDS_FLUSHING )
> +        clean_and_invalidate_dcache_va_range(&lpi_data.lpi_property[lpi],
> +                                             LPI_BLOCK);
> +
> +    *first_lpi = lpi + LPI_OFFSET;
> +
> +    return 0;
> +}
> +
> +void gicv3_free_host_lpi_block(uint32_t first_lpi)
> +{
> +    union host_lpi *hlpi, empty_lpi = { .dom_id = DOMID_INVALID };
> +    int i;
> +
> +    hlpi = gic_get_host_lpi(first_lpi);
> +    if ( !hlpi )
> +        return;         /* Nothing to free here. */

We should check that first_lpi is actually the first lpi in a block
before calling gic_get_host_lpi.
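Something like the following should do (sketch only; it relies on the
allocator above always handing out LPI_BLOCK-aligned blocks, which it does
since HOST_LPIS_PER_PAGE and LPI_OFFSET are both multiples of LPI_BLOCK):

    /* Refuse to free anything that cannot be the start of a block. */
    if ( (first_lpi - LPI_OFFSET) % LPI_BLOCK )
        return;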


> +    spin_lock(&lpi_data.host_lpis_lock);
> +
> +    for ( i = 0; i < LPI_BLOCK; i++ )
> +        write_u64_atomic(&hlpi[i].data, empty_lpi.data);
> +
> +    /*
> +     * Make sure the next allocation can reuse this block, as we do only
> +     * forward scanning when finding an unused block.
> +     */
> +    if ( lpi_data.next_free_lpi > first_lpi )
> +        lpi_data.next_free_lpi = first_lpi;
> +
> +    spin_unlock(&lpi_data.host_lpis_lock);
> +
> +    return;
> +}
> +
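
For completeness, a minimal alloc/free round trip with this API would look
roughly like this (sketch; d is any struct domain pointer, error handling
elided):

    uint32_t first_lpi;

    if ( !gicv3_allocate_host_lpi_block(d, &first_lpi) )
    {
        /* first_lpi ... first_lpi + LPI_BLOCK - 1 now belong to d. */
        gicv3_free_host_lpi_block(first_lpi);
    }
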
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/include/asm-arm/gic_v3_its.h b/xen/include/asm-arm/gic_v3_its.h
> index 7d7ec32..13b3e14 100644
> --- a/xen/include/asm-arm/gic_v3_its.h
> +++ b/xen/include/asm-arm/gic_v3_its.h
> @@ -108,6 +108,9 @@
>  #define HOST_ITS_FLUSH_CMD_QUEUE        (1U << 0)
>  #define HOST_ITS_USES_PTA               (1U << 1)
>  
> +/* We allocate LPIs on the hosts in chunks of 32 to reduce handling overhead. */
> +#define LPI_BLOCK                       32
> +
>  /* data structure for each hardware ITS */
>  struct host_its {
>      struct list_head entry;
> @@ -146,6 +149,9 @@ uint64_t gicv3_get_redist_address(unsigned int cpu, bool use_pta);
>  /* Map a collection for this host CPU to each host ITS. */
>  int gicv3_its_setup_collection(unsigned int cpu);
>  
> +int gicv3_allocate_host_lpi_block(struct domain *d, uint32_t *first_lpi);
> +void gicv3_free_host_lpi_block(uint32_t first_lpi);
> +
>  #else
>  
>  static LIST_HEAD(host_its_list);
> diff --git a/xen/include/asm-arm/irq.h b/xen/include/asm-arm/irq.h
> index f940092..7c76626 100644
> --- a/xen/include/asm-arm/irq.h
> +++ b/xen/include/asm-arm/irq.h
> @@ -28,6 +28,9 @@ struct arch_irq_desc {
>  
>  #define LPI_OFFSET      8192
>  
> +/* LPIs are always numbered starting at 8192, so 0 is a good invalid value. */
> +#define INVALID_LPI     0
> +
>  #define nr_irqs NR_IRQS
>  #define nr_static_irqs NR_IRQS
>  #define arch_hwdom_irqs(domid) NR_IRQS
> @@ -41,6 +44,11 @@ struct irq_desc *__irq_to_desc(int irq);
>  
>  void do_IRQ(struct cpu_user_regs *regs, unsigned int irq, int is_fiq);
>  
> +static inline bool is_lpi(unsigned int irq)
> +{
> +    return irq >= LPI_OFFSET;
> +}
> +
>  #define domain_pirq_to_irq(d, pirq) (pirq)
>  
>  bool_t is_assignable_irq(unsigned int irq);
> -- 
> 2.8.2
> 
