[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v3 9/9] vpci/msix: add MSI-X handlers



>>> On 27.04.17 at 16:35, <roger.pau@xxxxxxxxxx> wrote:
> +static int vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
> +                                   union vpci_val val, void *data)
> +{
> +    uint8_t seg = pdev->seg, bus = pdev->bus;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +    paddr_t table_base = 
> pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;
> +    struct vpci_msix *msix = data;

Wouldn't you better use this also to obtain the array index one line
earlier?

> +    bool new_masked, new_enabled;
> +    unsigned int i;
> +    uint32_t data32;
> +    int rc;
> +
> +    new_masked = val.word & PCI_MSIX_FLAGS_MASKALL;
> +    new_enabled = val.word & PCI_MSIX_FLAGS_ENABLE;
> +
> +    if ( new_enabled != msix->enabled && new_enabled )

    if ( !msix->enabled && new_enabled )

would likely be easier to read (similar for the "else if" below).

> +    {
> +        /* MSI-X enabled. */
> +        for ( i = 0; i < msix->max_entries; i++ )
> +        {
> +            if ( msix->entries[i].masked )
> +                continue;
> +
> +            rc = vpci_msix_enable(&msix->entries[i].arch, pdev,
> +                                  msix->entries[i].addr, 
> msix->entries[i].data,
> +                                  msix->entries[i].nr, table_base);
> +            if ( rc )
> +            {
> +                gdprintk(XENLOG_ERR,
> +                         "%04x:%02x:%02x.%u: unable to update entry %u: 
> %d\n",
> +                         seg, bus, slot, func, i, rc);
> +                return rc;
> +            }
> +
> +            vpci_msix_mask(&msix->entries[i].arch, false);
> +        }
> +    }
> +    else if ( new_enabled != msix->enabled && !new_enabled )
> +    {
> +        /* MSI-X disabled. */
> +        for ( i = 0; i < msix->max_entries; i++ )
> +        {
> +            rc = vpci_msix_disable(&msix->entries[i].arch);
> +            if ( rc )
> +            {
> +                gdprintk(XENLOG_ERR,
> +                         "%04x:%02x:%02x.%u: unable to disable entry %u: 
> %d\n",
> +                         seg, bus, slot, func, i, rc);
> +                return rc;
> +            }
> +        }
> +    }
> +
> +    data32 = val.word;
> +    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
> +         pci_msi_conf_write_intercept(pdev, reg, 2, &data32) >= 0 )
> +        pci_conf_write16(seg, bus, slot, func, reg, data32);

What's the intermediate variable "data32" good for here? Afaict you
could use val.word in its stead.

> +static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
> +{
> +    struct vpci_msix *msix;
> +
> +    ASSERT(vpci_locked(d));
> +    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
> +        if ( msix->pdev->vpci->header.command & PCI_COMMAND_MEMORY &&

Please parenthesize & within &&.

> +             addr >= msix->addr &&
> +             addr < msix->addr + msix->max_entries * PCI_MSIX_ENTRY_SIZE )
> +            return msix;
> +
> +    return NULL;
> +}

Looking ahead I'm getting the impression that you only allow
accesses to the MSI-X table entries, yet in vpci_modify_bars() you
(correctly) prevent mapping entire pages. While most other
registers are disallowed from sharing a page with the table, the PBA
is specifically named as an exception. Hence you need to support
at least reads from the entire range.

> +static int vpci_msix_table_accept(struct vcpu *v, unsigned long addr)
> +{
> +    int found;
> +
> +    vpci_lock(v->domain);
> +    found = !!vpci_msix_find(v->domain, addr);

At the risk of repeating a comment I gave on an earlier patch: Using
"bool" for "found" allows you to avoid the !! .

> +static int vpci_msix_access_check(struct pci_dev *pdev, unsigned long addr,
> +                                  unsigned int len)
> +{
> +    uint8_t seg = pdev->seg, bus = pdev->bus;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +
> +

No double blank lines please.

> +    /* Only allow 32/64b accesses. */
> +    if ( len != 4 && len != 8 )
> +    {
> +        gdprintk(XENLOG_ERR,
> +                 "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n",
> +                 seg, bus, slot, func, len);
> +        return -EINVAL;
> +    }
> +
> +    /* Do no allow accesses that span across multiple entries. */
> +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) + len > PCI_MSIX_ENTRY_SIZE )
> +    {
> +        gdprintk(XENLOG_ERR,
> +                 "%04x:%02x:%02x.%u: MSI-X access crosses entry boundary\n",
> +                 seg, bus, slot, func);
> +        return -EINVAL;
> +    }
> +
> +    /*
> +     * Only allow 64b accesses to the low message address field.
> +     *
> +     * NB: this is more restrictive than the specification, that allows 64b
> +     * accesses to other fields under certain circumstances, so this check 
> and
> +     * the code will have to be fixed in order to fully comply with the
> +     * specification.
> +     */
> +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) != 0 && len != 4 )
> +    {
> +        gdprintk(XENLOG_ERR,
> +                 "%04x:%02x:%02x.%u: 64bit MSI-X table access to 32bit field"
> +                 " (offset: %#lx len: %u)\n", seg, bus, slot, func,
> +                 addr & (PCI_MSIX_ENTRY_SIZE - 1), len);
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}

So you allow mis-aligned accesses, but you disallow 8-byte ones
to the upper half of an entry? I think both aspects need to be got
right from the beginning, the more that you BUG() in the switch()es
further down in such cases.

> +static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix,
> +                                                   unsigned long addr)
> +{
> +    return &msix->entries[(addr - msix->addr) / PCI_MSIX_ENTRY_SIZE];
> +}
> +
> +static int vpci_msix_table_read(struct vcpu *v, unsigned long addr,
> +                                unsigned int len, unsigned long *data)
> +{
> +    struct vpci_msix *msix;
> +    struct vpci_msix_entry *entry;
> +    unsigned int offset;
> +
> +    vpci_lock(v->domain);
> +    msix = vpci_msix_find(v->domain, addr);
> +    if ( !msix )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    if ( vpci_msix_access_check(msix->pdev, addr, len) )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    /* Get the table entry and offset. */
> +    entry = vpci_msix_get_entry(msix, addr);
> +    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
> +
> +    switch ( offset )
> +    {
> +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
> +        *data = entry->addr;

You're not clipping off the upper 32 bits here - is that reliably
happening elsewhere?

> +        break;
> +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
> +        *data = entry->addr >> 32;
> +        break;
> +    case PCI_MSIX_ENTRY_DATA_OFFSET:
> +        *data = entry->data;
> +        break;
> +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
> +        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;

What about the other 31 bits?

> +static int vpci_msix_table_write(struct vcpu *v, unsigned long addr,
> +                                 unsigned int len, unsigned long data)
> +{
> +    struct vpci_msix *msix;
> +    struct vpci_msix_entry *entry;
> +    unsigned int offset;
> +
> +    vpci_lock(v->domain);
> +    msix = vpci_msix_find(v->domain, addr);
> +    if ( !msix )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    if ( vpci_msix_access_check(msix->pdev, addr, len) )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    /* Get the table entry and offset. */
> +    entry = vpci_msix_get_entry(msix, addr);
> +    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
> +
> +    switch ( offset )
> +    {
> +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
> +        if ( len == 8 )
> +        {
> +            entry->addr = data;
> +            break;
> +        }
> +        entry->addr &= ~GENMASK(31, 0);
> +        entry->addr |= data;
> +        break;
> +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
> +        entry->addr &= ~GENMASK(63, 32);
> +        entry->addr |= data << 32;
> +        break;
> +    case PCI_MSIX_ENTRY_DATA_OFFSET:
> +        entry->data = data;
> +        break;
> +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
> +    {
> +        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
> +        struct pci_dev *pdev = msix->pdev;
> +        paddr_t table_base =
> +            pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;

Again simply "msix->bir"?

> +        int rc;
> +
> +        if ( !msix->enabled )
> +        {
> +            entry->masked = new_masked;
> +            break;
> +        }
> +
> +        if ( new_masked != entry->masked && !new_masked )
> +        {
> +            /* Unmasking an entry, update it. */
> +            rc = vpci_msix_enable(&entry->arch, msix->pdev, entry->addr,

And simply "pdev" here?

> +static int vpci_init_msix(struct pci_dev *pdev)
> +{
> +    struct domain *d = pdev->domain;
> +    uint8_t seg = pdev->seg, bus = pdev->bus;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +    struct vpci_msix *msix;
> +    unsigned int msix_offset, i, max_entries;
> +    paddr_t msix_paddr;
> +    uint16_t control;
> +    int rc;
> +
> +    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
> +    if ( !msix_offset )
> +        return 0;
> +
> +    if ( !vpci_msix_enabled(pdev->domain) )

This is a non-__init function, so it can't use dom0_msix (I'm saying
this just in case there really is a need to retain those command line
options).

> +    {
> +        xen_vpci_mask_capability(pdev, PCI_CAP_ID_MSIX);
> +        return 0;
> +    }
> +
> +    control = pci_conf_read16(seg, bus, slot, func,
> +                              msix_control_reg(msix_offset));
> +
> +    /* Get the maximum number of vectors the device supports. */
> +    max_entries = msix_table_size(control);
> +    if ( !max_entries )
> +        return 0;

This if() is never going to be true.

> +    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
> +    if ( !msix )
> +        return -ENOMEM;
> +
> +    msix->max_entries = max_entries;
> +    msix->pdev = pdev;
> +
> +    /* Find the MSI-X table address. */
> +    msix->offset = pci_conf_read32(seg, bus, slot, func,
> +                                   msix_table_offset_reg(msix_offset));
> +    msix->bir = msix->offset & PCI_MSIX_BIRMASK;
> +    msix->offset &= ~PCI_MSIX_BIRMASK;
> +
> +    ASSERT(pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM ||
> +           pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM64_LO);
> +    msix->addr = pdev->vpci->header.bars[msix->bir].mapped_addr + 
> msix->offset;
> +    msix_paddr = pdev->vpci->header.bars[msix->bir].paddr + msix->offset;

I can't seem to find where these addresses get updated in case
the BARs are being relocated by the Dom0 kernel.

> +    for ( i = 0; i < msix->max_entries; i++)
> +    {
> +        msix->entries[i].masked = true;
> +        msix->entries[i].nr = i;
> +        vpci_msix_arch_init(&msix->entries[i].arch);
> +    }
> +
> +    if ( list_empty(&d->arch.hvm_domain.msix_tables) )
> +        register_mmio_handler(d, &vpci_msix_table_ops);
> +
> +    list_add(&msix->next, &d->arch.hvm_domain.msix_tables);
> +
> +    rc = xen_vpci_add_register(pdev, vpci_msix_control_read,
> +                               vpci_msix_control_write,
> +                               msix_control_reg(msix_offset), 2, msix);
> +    if ( rc )
> +    {
> +        dprintk(XENLOG_ERR,
> +                "%04x:%02x:%02x.%u: failed to add handler for MSI-X control: 
> %d\n",
> +                seg, bus, slot, func, rc);
> +        goto error;
> +    }
> +
> +    if ( pdev->vpci->header.command & PCI_COMMAND_MEMORY )
> +    {
> +        /* Unmap this memory from the guest. */
> +        rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(msix->addr)),
> +                         _mfn(PFN_DOWN(msix_paddr)),
> +                         PFN_UP(msix->max_entries * PCI_MSIX_ENTRY_SIZE),
> +                         false);
> +        if ( rc )
> +        {
> +            dprintk(XENLOG_ERR,
> +                    "%04x:%02x:%02x.%u: unable to unmap MSI-X BAR region: 
> %d\n",
> +                    seg, bus, slot, func, rc);
> +            goto error;
> +        }
> +    }

Why is this unmapping conditional upon PCI_COMMAND_MEMORY?

> +static void vpci_dump_msix(unsigned char key)
> +{
> +    struct domain *d;
> +    struct pci_dev *pdev;
> +
> +    printk("Guest MSI-X information:\n");
> +
> +    for_each_domain ( d )
> +    {
> +        if ( !has_vpci(d) )
> +            continue;
> +
> +        vpci_lock(d);

Dump handlers, even if there are existing examples to the contrary,
should only try-lock any locks they mean to hold (and not dump
anything if they can't get hold of the lock).

> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -112,6 +112,33 @@ struct vpci {
>          /* Arch-specific data. */
>          struct vpci_arch_msi arch;
>      } *msi;
> +
> +    /* MSI-X data. */
> +    struct vpci_msix {
> +        struct pci_dev *pdev;
> +        /* Maximum number of vectors supported by the device. */
> +        unsigned int max_entries;
> +        /* MSI-X table offset. */
> +        unsigned int offset;
> +        /* MSI-X table BIR. */
> +        unsigned int bir;
> +        /* Table addr. */
> +        paddr_t addr;
> +        /* MSI-X enabled? */
> +        bool enabled;
> +        /* Masked? */
> +        bool masked;
> +        /* List link. */
> +        struct list_head next;
> +        /* Entries. */
> +        struct vpci_msix_entry {
> +                unsigned int nr;
> +                uint64_t addr;
> +                uint32_t data;
> +                bool masked;
> +                struct vpci_arch_msix_entry arch;

Indentation.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.