[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v1 4/6] vpci: add SR-IOV support for PVH Dom0



Le 25/07/2025 à 16:26, Mykyta Poturai a écrit :
> From: Stewart Hildebrand <stewart.hildebrand@xxxxxxx>
>
> This code is expected to only be used by privileged domains,
> unprivileged domains should not get access to the SR-IOV capability.
>
> Implement RW handlers for PCI_SRIOV_CTRL register to dynamically
> map/unmap VF BARS. Recalculate BAR sizes before mapping VFs to account
> for possible changes in the system page size register.
>
> Relies on dom0 to enable SR-IOV and PHYSDEVOP to inform Xen about
> addition/removal of VFs.
>
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@xxxxxxx>
> Signed-off-by: Mykyta Poturai <mykyta_poturai@xxxxxxxx>
> ---
>   CHANGELOG.md              |   3 +-
>   SUPPORT.md                |   2 -
>   xen/drivers/vpci/Makefile |   2 +-
>   xen/drivers/vpci/header.c |   3 +
>   xen/drivers/vpci/sriov.c  | 235 ++++++++++++++++++++++++++++++++++++++
>   xen/drivers/vpci/vpci.c   |   1 +
>   xen/include/xen/vpci.h    |   7 +-
>   7 files changed, 247 insertions(+), 6 deletions(-)
>   create mode 100644 xen/drivers/vpci/sriov.c
>
> diff --git a/CHANGELOG.md b/CHANGELOG.md
> index 5f31ca08fe..7b0e8beb76 100644
> --- a/CHANGELOG.md
> +++ b/CHANGELOG.md
> @@ -23,8 +23,7 @@ The format is based on [Keep a 
> Changelog](https://keepachangelog.com/en/1.0.0/)
>    - On x86:
>      - Option to attempt to fixup p2m page-faults on PVH dom0.
>      - Resizable BARs is supported for PVH dom0.
> -   - Support PCI passthrough for HVM domUs when dom0 is PVH (note SR-IOV
> -     capability usage is not yet supported on PVH dom0).
> +   - Support PCI passthrough for HVM domUs when dom0 is PVH.
>      - Smoke tests for the FreeBSD Xen builds in Cirrus CI.
>
>    - On Arm:
> diff --git a/SUPPORT.md b/SUPPORT.md
> index 6a82a92189..830b598714 100644
> --- a/SUPPORT.md
> +++ b/SUPPORT.md
> @@ -170,8 +170,6 @@ unexpected behavior or issues on some hardware.
>
>   At least the following features are missing on a PVH dom0:
>
> -  * PCI SR-IOV.
> -
>     * Native NMI forwarding (nmi=dom0 command line option).
>
>     * MCE handling.

I would prefer changelog/support changes to be a separate patch or
alternatively at the last patch as I don't think SR-IOV is fully working
without the 2 remaining ones.

> diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
> index a7c8a30a89..fe1e57b64d 100644
> --- a/xen/drivers/vpci/Makefile
> +++ b/xen/drivers/vpci/Makefile
> @@ -1,2 +1,2 @@
> -obj-y += vpci.o header.o rebar.o
> +obj-y += vpci.o header.o rebar.o sriov.o
>   obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index f947f652cd..0a840c6dcc 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -839,6 +839,9 @@ static int cf_check init_header(struct pci_dev *pdev)
>
>       ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
>
> +    if ( pdev->info.is_virtfn )
> +        return 0;
> +
>       switch ( pci_conf_read8(pdev->sbdf, PCI_HEADER_TYPE) & 0x7f )
>       {
>       case PCI_HEADER_TYPE_NORMAL:
> diff --git a/xen/drivers/vpci/sriov.c b/xen/drivers/vpci/sriov.c
> new file mode 100644
> index 0000000000..640430e3e9
> --- /dev/null
> +++ b/xen/drivers/vpci/sriov.c
> @@ -0,0 +1,235 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Handlers for accesses to the SR-IOV capability structure.
> + *
> + * Copyright (C) 2018 Citrix Systems R&D
> + */
> +
> +#include <xen/sched.h>
> +#include <xen/vpci.h>
> +
> +static int vf_init_bars(const struct pci_dev *vf_pdev)
> +{
> +    unsigned int i, sriov_pos;
> +    int vf_idx, rc;
> +    const struct pci_dev *pf_pdev = vf_pdev->pf_pdev;
> +    uint16_t offset, stride;
> +    struct vpci_bar *bars = vf_pdev->vpci->header.bars;
> +    struct vpci_bar *physfn_vf_bars = pf_pdev->vpci->sriov->vf_bars;
> +
> +    sriov_pos = pci_find_ext_capability(pf_pdev->sbdf, PCI_EXT_CAP_ID_SRIOV);
> +    offset = pci_conf_read16(pf_pdev->sbdf, sriov_pos + PCI_SRIOV_VF_OFFSET);
> +    stride = pci_conf_read16(pf_pdev->sbdf, sriov_pos + PCI_SRIOV_VF_STRIDE);
> +
> +    vf_idx = vf_pdev->sbdf.sbdf;
> +    vf_idx -= pf_pdev->sbdf.sbdf + offset;

We can probably rewrite it as

vf_idx = vf_pdev->sbdf.sbdf - (pf_pdev->sbdf.sbdf + offset);

especially with sbdf being potentially not representable as a int (even
though very unlikely).

> +    if ( vf_idx < 0 )
> +        return -EINVAL;
> +    if ( stride )
> +    {
> +        if ( vf_idx % stride )
> +            return -EINVAL;
> +        vf_idx /= stride;
> +    }
> +
> +    /*
> +     * Set up BARs for this VF out of PF's VF BARs taking into account
> +     * the index of the VF.
> +     */
> +    for ( i = 0; i < PCI_SRIOV_NUM_BARS; i++ )
> +    {
> +        bars[i].addr = physfn_vf_bars[i].addr + vf_idx * 
> physfn_vf_bars[i].size;
> +        bars[i].guest_addr = bars[i].addr;
> +        bars[i].size = physfn_vf_bars[i].size;
> +        bars[i].type = physfn_vf_bars[i].type;
> +        bars[i].prefetchable = physfn_vf_bars[i].prefetchable;
> +        rc = vpci_bar_add_rangeset(vf_pdev, &bars[i], i);
> +        if ( rc )
> +            return rc;
> +    }
> +
> +    return 0;
> +}
> +
> +static int map_vf(const struct pci_dev *vf_pdev, uint16_t cmd)
> +{
> +    int rc;
> +
> +    ASSERT(rw_is_write_locked(&vf_pdev->domain->pci_lock));
> +
> +    rc = vf_init_bars(vf_pdev);
> +    if ( rc )
> +        return rc;
> +
> +    return vpci_modify_bars(vf_pdev, cmd, false);
> +}
> +
> +static int size_vf_bars(struct pci_dev *pf_pdev, unsigned int sriov_pos)
> +{
> +    /*
> +     * NB: a non-const pci_dev of the PF is needed in order to update
> +     * vf_rlen.
> +     */
> +    struct vpci_bar *bars;
> +    unsigned int i;
> +    int rc = 0;
> +
> +    ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock));
> +    ASSERT(!pf_pdev->info.is_virtfn);
> +
> +    if ( !pf_pdev->vpci->sriov )
> +        return -EINVAL;
> +
> +    /* Read BARs for VFs out of PF's SR-IOV extended capability. */
> +    bars = pf_pdev->vpci->sriov->vf_bars;
> +    /* Set the BARs addresses and size. */
> +    for ( i = 0; i < PCI_SRIOV_NUM_BARS; i += rc )
> +    {
> +        unsigned int idx = sriov_pos + PCI_SRIOV_BAR + i * 4;
> +        uint32_t bar;
> +        uint64_t addr, size;
> +
> +        bar = pci_conf_read32(pf_pdev->sbdf, idx);
> +
> +        rc = pci_size_mem_bar(pf_pdev->sbdf, idx, &addr, &size,
> +                              PCI_BAR_VF |
> +                              ((i == PCI_SRIOV_NUM_BARS - 1) ? PCI_BAR_LAST
> +                                                             : 0));
> +
> +        /*
> +         * Update vf_rlen on the PF. According to the spec the size of
> +         * the BARs can change if the system page size register is
> +         * modified, so always update rlen when enabling VFs.
> +         */
> +        pf_pdev->physfn.vf_rlen[i] = size;
> +
> +        if ( !size )
> +        {
> +            bars[i].type = VPCI_BAR_EMPTY;
> +            continue;
> +        }
> +
> +        bars[i].addr = addr;
> +        bars[i].guest_addr = addr;
> +        bars[i].size = size;
> +        bars[i].prefetchable = bar & PCI_BASE_ADDRESS_MEM_PREFETCH;
> +
> +        switch ( rc )
> +        {
> +        case 1:
> +            bars[i].type = VPCI_BAR_MEM32;
> +            break;
> +
> +        case 2:
> +            bars[i].type = VPCI_BAR_MEM64_LO;
> +            bars[i + 1].type = VPCI_BAR_MEM64_HI;
> +            break;
> +
> +        default:
> +            ASSERT_UNREACHABLE();
> +        }
> +    }
> +
> +    rc = rc > 0 ? 0 : rc;
> +
> +    return rc;
> +}
> +
> +static void cf_check control_write(const struct pci_dev *pdev, unsigned int 
> reg,
> +                                   uint32_t val, void *data)
> +{
> +    unsigned int sriov_pos = reg - PCI_SRIOV_CTRL;
> +    uint16_t control = pci_conf_read16(pdev->sbdf, reg);
> +    bool mem_enabled = control & PCI_SRIOV_CTRL_MSE;
> +    bool new_mem_enabled = val & PCI_SRIOV_CTRL_MSE;
> +
> +    ASSERT(!pdev->info.is_virtfn);
> +
> +    if ( new_mem_enabled != mem_enabled )
> +    {
> +        struct pci_dev *vf_pdev;
> +        if ( new_mem_enabled )
> +        {
> +            /* FIXME casting away const-ness to modify vf_rlen */
> +            size_vf_bars((struct pci_dev *)pdev, sriov_pos);
> +
> +            list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
> +                map_vf(vf_pdev, PCI_COMMAND_MEMORY);
> +        }
> +        else
> +        {
> +            list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
> +                map_vf(vf_pdev, 0);
> +        }
> +    }
> +
> +    pci_conf_write16(pdev->sbdf, reg, val);
> +}
> +
> +static int vf_init_header(struct pci_dev *vf_pdev)
> +{
> +    const struct pci_dev *pf_pdev;
> +    unsigned int sriov_pos;
> +    int rc = 0;
> +    uint16_t ctrl;
> +
> +    ASSERT(rw_is_write_locked(&vf_pdev->domain->pci_lock));
> +
> +    if ( !vf_pdev->info.is_virtfn )
> +        return 0;
> +
> +    pf_pdev = vf_pdev->pf_pdev;
> +    ASSERT(pf_pdev);
> +
> +    sriov_pos = pci_find_ext_capability(pf_pdev->sbdf, PCI_EXT_CAP_ID_SRIOV);
> +    ctrl = pci_conf_read16(pf_pdev->sbdf, sriov_pos + PCI_SRIOV_CTRL);
> +
> +    if ( (pf_pdev->domain == vf_pdev->domain) && (ctrl & PCI_SRIOV_CTRL_MSE) 
> )
> +    {
> +        rc = map_vf(vf_pdev, PCI_COMMAND_MEMORY);
> +        if ( rc )
> +            return rc;
> +    }
> +
> +    return rc;
> +}
> +
> +static int init_sriov(struct pci_dev *pdev)
> +{
> +    unsigned int pos;
> +
> +    if ( pdev->info.is_virtfn )
> +        return vf_init_header(pdev);
> +
> +    pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_SRIOV);
> +
> +    if ( !pos )
> +        return 0;
> +
> +    if ( !is_hardware_domain(pdev->domain) )
> +    {
> +        printk(XENLOG_ERR
> +               "%pp: SR-IOV configuration unsupported for unpriv %pd\n",
> +               &pdev->sbdf, pdev->domain);
> +        return 0;
> +    }

Should it instead rely on xsm to know if it the operation is allowed or
not for this domain ?

> +
> +    pdev->vpci->sriov = xzalloc(struct vpci_sriov);
> +    if ( !pdev->vpci->sriov )
> +        return -ENOMEM;
> +
> +    return vpci_add_register(pdev->vpci, vpci_hw_read16, control_write,
> +                             pos + PCI_SRIOV_CTRL, 2, NULL);
> +}
> +
> +REGISTER_VPCI_INIT(init_sriov, VPCI_PRIORITY_LOW);
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 09988f04c2..7af6651831 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -120,6 +120,7 @@ void vpci_deassign_device(struct pci_dev *pdev)
>       for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
>           rangeset_destroy(pdev->vpci->header.bars[i].mem);
>
> +    xfree(pdev->vpci->sriov);
>       xfree(pdev->vpci->msix);
>       xfree(pdev->vpci->msi);
>       xfree(pdev->vpci);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 06f7039f20..9e8dcab17e 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -138,7 +138,6 @@ struct vpci {
>            * upon to know whether BARs are mapped into the guest p2m.
>            */
>           bool bars_mapped      : 1;
> -        /* FIXME: currently there's no support for SR-IOV. */
>       } header;
>
>       /* MSI data. */
> @@ -192,6 +191,12 @@ struct vpci {
>               struct vpci_arch_msix_entry arch;
>           } entries[];
>       } *msix;
> +
> +    struct vpci_sriov {
> +        /* PF only */
> +        struct vpci_bar vf_bars[PCI_SRIOV_NUM_BARS];
> +    } *sriov;
> +
>   #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
>       /* Guest SBDF of the device. */
>   #define INVALID_GUEST_SBDF ((pci_sbdf_t){ .sbdf = ~0U })



Teddy Astie | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech





 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.