[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v2 6/8] vpci: add SR-IOV support for PVH Dom0


  • To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Mykyta Poturai <Mykyta_Poturai@xxxxxxxx>
  • Date: Mon, 9 Mar 2026 11:08:27 +0000
  • Accept-language: en-US
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=epam.com; dmarc=pass action=none header.from=epam.com; dkim=pass header.d=epam.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector10001; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=jPd7eg99vjV5lo4tppv9T3tRyoUFlgTQOLxJi+eBzBY=; b=EKYqFMaPdpQTh/zZ2YFaEOLR70yWeWBwNJnhgjAUgLktNpTc/94GJVYvhfqjp+Ps2z+OzciGn99Mu1qI17/uymfDKIvC6uyNWwQ9scyEZOyDsJhY++bLI1B2iJKHcB+4UJu19bVfgU9HOFNGHWPJnABi5XBSfv4K9kTy01jcDn63ilY28lDKxErLcxOk9UpA7suJCLgjnu0BsCIhuCjh2WXbZrCWQMRAFo6OZ5k4xDP5u3flf7wLtZL4WeUkhmPQ4+kazVj6wX7fjtULLQtcT2Pb84Y7zFXhp8+UwXlCpoaawhdDJpiiCjGhFtzdZC1U9Flpq9PGgm8X1Ckqk7egbA==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector10001; d=microsoft.com; cv=none; b=drDPO9QznwPw0acjwL60yGPkwYzkpG+GVxUg97cIyKdC8Alk1uuH9lIjqjP3fCVRdLsYzf+YHNXhkSPGfvF8xzE7qc1So/gbYXSvYkWTrmLgUAl13UNrYYjfSZSqeMqy1nz08kZCruUqUFTOlt8QZWyde+NLu6OsPtmEbTnwj1TE1Tw2ZRQQgf31DqFFOKGHSlzF4ayo8/J1cyMdMkzFoZhpmdefd0B3IC0UzTHIMpEX1iNBxdxIWUtdz13jD/S6Lqg8fItYBahvzjoNZqTtLatmFZ+iagPnsreqBkK6aQzlatYz4zn/yUzaqFd74FfWRIwba/8y8nCvZAG9eo6GWQ==
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=epam.com;
  • Cc: Stewart Hildebrand <stewart.hildebrand@xxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, "Daniel P. Smith" <dpsmith@xxxxxxxxxxxxxxxxxxxx>, Mykyta Poturai <Mykyta_Poturai@xxxxxxxx>
  • Delivery-date: Mon, 09 Mar 2026 11:08:41 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>
  • Thread-index: AQHcr7UOJDdt9SAQzUinWXRod1IIzA==
  • Thread-topic: [PATCH v2 6/8] vpci: add SR-IOV support for PVH Dom0

From: Stewart Hildebrand <stewart.hildebrand@xxxxxxx>

This code is expected to only be used by privileged domains,
unprivileged domains should not get access to the SR-IOV capability.

Implement RW handlers for PCI_SRIOV_CTRL register to dynamically
map/unmap VF BARS. Recalculate BAR sizes before mapping VFs to account
for possible changes in the system page size register.

Allow forcing vpci_modify_bars to not defer the actual mapping changes,
which is needed to fix the sequential calls to vpci_modify_bars when
enabling VFs from Dom0.

Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
Signed-off-by: Stewart Hildebrand <stewart.hildebrand@xxxxxxx>
Signed-off-by: Mykyta Poturai <mykyta_poturai@xxxxxxxx>
---
v1->v2:
* switch to VF discovery by Xen
* fix sequential vpci_modify_bars calls
* move documentation changes to a separate commit
---
 xen/drivers/vpci/Makefile |   2 +-
 xen/drivers/vpci/header.c |  17 +-
 xen/drivers/vpci/sriov.c  | 363 ++++++++++++++++++++++++++++++++++++++
 xen/include/xen/vpci.h    |  12 +-
 4 files changed, 385 insertions(+), 9 deletions(-)
 create mode 100644 xen/drivers/vpci/sriov.c

diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index a7c8a30a89..fe1e57b64d 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1,2 +1,2 @@
-obj-y += vpci.o header.o rebar.o
+obj-y += vpci.o header.o rebar.o sriov.o
 obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 284964f0d4..c55c3380d4 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -264,7 +264,7 @@ bool vpci_process_pending(struct vcpu *v)
     return false;
 }
 
-static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
+static int apply_map(struct domain *d, const struct pci_dev *pdev,
                             uint16_t cmd)
 {
     struct vpci_header *header = &pdev->vpci->header;
@@ -323,7 +323,8 @@ static void defer_map(const struct pci_dev *pdev, uint16_t 
cmd, bool rom_only)
     raise_softirq(SCHEDULE_SOFTIRQ);
 }
 
-int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
+int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only,
+                     bool no_defer)
 {
     struct vpci_header *header = &pdev->vpci->header;
     struct pci_dev *tmp;
@@ -519,7 +520,7 @@ int vpci_modify_bars(const struct pci_dev *pdev, uint16_t 
cmd, bool rom_only)
         d = dom_xen;
     }
 
-    if ( system_state < SYS_STATE_active )
+    if ( system_state < SYS_STATE_active || no_defer )
     {
         /*
          * Mappings might be created when building Dom0 if the memory decoding
@@ -566,7 +567,7 @@ static void cf_check cmd_write(
          * memory decoding bit has not been changed, so leave everything as-is,
          * hoping the guest will realize and try again.
          */
-        vpci_modify_bars(pdev, cmd, false);
+        vpci_modify_bars(pdev, cmd, false, false);
     else
         pci_conf_write16(pdev->sbdf, reg, cmd);
 }
@@ -736,7 +737,7 @@ static void cf_check rom_write(
      */
     else if ( vpci_modify_bars(pdev,
                                new_enabled ? PCI_COMMAND_MEMORY : 0,
-                               true) )
+                               true, false) )
         /*
          * No memory has been added or removed from the p2m (because the actual
          * p2m changes are deferred in defer_map) and the ROM enable bit has
@@ -954,6 +955,9 @@ int vpci_init_header(struct pci_dev *pdev)
 
     header->guest_cmd = cmd;
 
+    if ( pdev->info.is_virtfn )
+        return vf_init_header(pdev);
+
     /* Disable memory decoding before sizing. */
     if ( !is_hwdom || (cmd & PCI_COMMAND_MEMORY) )
         pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY);
@@ -1062,7 +1066,8 @@ int vpci_init_header(struct pci_dev *pdev)
             goto fail;
     }
 
-    return (cmd & PCI_COMMAND_MEMORY) ? vpci_modify_bars(pdev, cmd, false) : 0;
+    return (cmd & PCI_COMMAND_MEMORY)
+                ? vpci_modify_bars(pdev, cmd, false, false) : 0;
 
  fail:
     pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
diff --git a/xen/drivers/vpci/sriov.c b/xen/drivers/vpci/sriov.c
new file mode 100644
index 0000000000..6f691149e9
--- /dev/null
+++ b/xen/drivers/vpci/sriov.c
@@ -0,0 +1,363 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Handlers for accesses to the SR-IOV capability structure.
+ *
+ * Copyright (C) 2026 Citrix Systems R&D
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+#include <xsm/xsm.h>
+
+static int vf_init_bars(const struct pci_dev *vf_pdev)
+{
+    int vf_idx;
+    unsigned int i;
+    const struct pci_dev *pf_pdev = vf_pdev->pf_pdev;
+    struct vpci_bar *bars = vf_pdev->vpci->header.bars;
+    struct vpci_bar *physfn_vf_bars = pf_pdev->vpci->sriov->vf_bars;
+    unsigned int sriov_pos = pci_find_ext_capability(pf_pdev,
+                                                     PCI_EXT_CAP_ID_SRIOV);
+    uint16_t offset = pci_conf_read16(pf_pdev->sbdf,
+                                      sriov_pos + PCI_SRIOV_VF_OFFSET);
+    uint16_t stride = pci_conf_read16(pf_pdev->sbdf,
+                                      sriov_pos + PCI_SRIOV_VF_STRIDE);
+
+    vf_idx = vf_pdev->sbdf.sbdf - (pf_pdev->sbdf.sbdf + offset);
+    if ( vf_idx < 0 )
+        return -EINVAL;
+
+    if ( stride )
+    {
+        if ( vf_idx % stride )
+            return -EINVAL;
+        vf_idx /= stride;
+    }
+
+    /*
+     * Set up BARs for this VF out of PF's VF BARs taking into account
+     * the index of the VF.
+     */
+    for ( i = 0; i < PCI_SRIOV_NUM_BARS; i++ )
+    {
+        bars[i].addr = physfn_vf_bars[i].addr + vf_idx * 
physfn_vf_bars[i].size;
+        bars[i].guest_addr = bars[i].addr;
+        bars[i].size = physfn_vf_bars[i].size;
+        bars[i].type = physfn_vf_bars[i].type;
+        bars[i].prefetchable = physfn_vf_bars[i].prefetchable;
+    }
+
+    return 0;
+}
+
+/* Must be called form vpci_process_pending context */
+static int map_vfs(const struct pci_dev *pf_pdev, uint16_t cmd)
+{
+    struct pci_dev *vf_pdev;
+    int rc;
+
+    ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock));
+
+    list_for_each_entry(vf_pdev, &pf_pdev->vf_list, vf_list) {
+        rc = vpci_modify_bars(vf_pdev, cmd, false, true);
+        if ( rc )
+        {
+            gprintk(XENLOG_ERR, "failed to %s VF %pp: %d\n",
+                    (cmd & PCI_COMMAND_MEMORY) ? "map" : "unmap",
+                    &vf_pdev->sbdf, rc);
+            return rc;
+        }
+    }
+
+    return 0;
+}
+
+
+static int size_vf_bars(struct pci_dev *pf_pdev, unsigned int sriov_pos)
+{
+    /*
+     * NB: a non-const pci_dev of the PF is needed in order to update
+     * vf_rlen.
+     */
+    struct vpci_bar *bars;
+    unsigned int i;
+    int rc = 0;
+
+    ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock));
+    ASSERT(!pf_pdev->info.is_virtfn);
+    ASSERT(pf_pdev->vpci->sriov);
+
+    /* Read BARs for VFs out of PF's SR-IOV extended capability. */
+    bars = pf_pdev->vpci->sriov->vf_bars;
+    /* Set the BARs addresses and size. */
+    for ( i = 0; i < PCI_SRIOV_NUM_BARS; i += rc )
+    {
+        unsigned int idx = sriov_pos + PCI_SRIOV_BAR + i * 4;
+        uint32_t bar;
+        uint64_t addr, size;
+
+        bar = pci_conf_read32(pf_pdev->sbdf, idx);
+
+        rc = pci_size_mem_bar(pf_pdev->sbdf, idx, &addr, &size,
+                              PCI_BAR_VF |
+                              ((i == PCI_SRIOV_NUM_BARS - 1) ? PCI_BAR_LAST
+                                                             : 0));
+
+        /*
+         * Update vf_rlen on the PF. According to the spec the size of
+         * the BARs can change if the system page size register is
+         * modified, so always update rlen when enabling VFs.
+         */
+        pf_pdev->physfn.vf_rlen[i] = size;
+
+        if ( !size )
+        {
+            bars[i].type = VPCI_BAR_EMPTY;
+            continue;
+        }
+
+        bars[i].addr = addr;
+        bars[i].guest_addr = addr;
+        bars[i].size = size;
+        bars[i].prefetchable = bar & PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+        switch ( rc )
+        {
+        case 1:
+            bars[i].type = VPCI_BAR_MEM32;
+            break;
+
+        case 2:
+            bars[i].type = VPCI_BAR_MEM64_LO;
+            bars[i + 1].type = VPCI_BAR_MEM64_HI;
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+        }
+    }
+
+    rc = rc > 0 ? 0 : rc;
+
+    return rc;
+}
+
+struct callback_data {
+    const struct pci_dev *pdev;
+    unsigned int pos;
+    uint32_t value;
+    bool enable : 1;
+    bool disable : 1;
+    bool map : 1;
+    bool unmap : 1;
+};
+
+static void cf_check control_write_cb(void *data)
+{
+    struct callback_data *cb = data;
+    const struct pci_dev *pdev = cb->pdev;
+    uint16_t offset = pci_conf_read16(pdev->sbdf, cb->pos + 
PCI_SRIOV_VF_OFFSET);
+    uint16_t stride = pci_conf_read16(pdev->sbdf, cb->pos + 
PCI_SRIOV_VF_STRIDE);
+    struct vpci_sriov *sriov = pdev->vpci->sriov;
+    int rc = 0;
+    unsigned int i;
+
+    if ( cb->unmap )
+    {
+        write_lock(&pdev->domain->pci_lock);
+        map_vfs(pdev, 0);
+        write_unlock(&pdev->domain->pci_lock);
+    }
+
+    if ( cb->enable || cb->disable )
+    {
+        for ( i = 0; i < sriov->num_vfs; i++ )
+        {
+            const pci_sbdf_t vf_sbdf = {
+                .sbdf = pdev->sbdf.sbdf + offset + stride * i,
+            };
+
+            if ( cb->enable )
+            {
+                const struct pci_dev_info info = {
+                    .is_virtfn = true,
+                    .is_extfn = false,
+                    .physfn.bus = pdev->sbdf.bus,
+                    .physfn.devfn = pdev->sbdf.devfn,
+                };
+                rc = pci_add_device(vf_sbdf.seg, vf_sbdf.bus, vf_sbdf.devfn,
+                                    &info, pdev->node);
+            }
+            if ( cb->disable )
+                rc = pci_remove_device(vf_sbdf.seg, vf_sbdf.bus, 
vf_sbdf.devfn);
+
+            if ( rc && rc != -ENODEV)
+                gprintk(XENLOG_ERR, "failed to %s VF %pp: %d\n",
+                        cb->enable ? "add" : "remove", &vf_sbdf, rc);
+        }
+    }
+
+    if ( cb->map )
+    {
+        write_lock(&pdev->domain->pci_lock);
+        rc = map_vfs(pdev, PCI_COMMAND_MEMORY);
+
+        if ( rc )
+            map_vfs(pdev, 0);
+        write_unlock(&pdev->domain->pci_lock);
+    }
+
+    pci_conf_write16(pdev->sbdf, cb->pos + PCI_SRIOV_CTRL, cb->value);
+    xfree(cb);
+}
+
+static void cf_check control_write(const struct pci_dev *pdev, unsigned int 
reg,
+                                   uint32_t val, void *data)
+{
+    unsigned int sriov_pos = reg - PCI_SRIOV_CTRL;
+    struct vpci_sriov *sriov = pdev->vpci->sriov;
+    struct callback_data *cb = NULL;
+    uint16_t control = pci_conf_read16(pdev->sbdf, reg);
+    bool mem_enabled = control & PCI_SRIOV_CTRL_MSE;
+    bool new_mem_enabled = val & PCI_SRIOV_CTRL_MSE;
+    bool enabled = control & PCI_SRIOV_CTRL_VFE;
+    bool new_enabled = val & PCI_SRIOV_CTRL_VFE;
+
+    ASSERT(!pdev->info.is_virtfn);
+
+    if ( new_enabled == enabled && new_mem_enabled == mem_enabled )
+    {
+        pci_conf_write16(pdev->sbdf, reg, val);
+        return;
+    }
+
+    cb = xzalloc(struct callback_data);
+
+    if ( !cb )
+    {
+        gprintk(XENLOG_ERR,
+                "%pp: Unable to allocate memory for SR-IOV enable\n",
+                pdev);
+        return;
+    }
+
+    cb->pdev = pdev;
+    cb->pos = sriov_pos;
+    cb->value = val;
+    cb->map = new_mem_enabled && !mem_enabled;
+    cb->unmap = !new_mem_enabled && mem_enabled;
+    cb->enable = new_enabled && !enabled;
+    cb->disable = !new_enabled && enabled;
+
+    current->vpci.task = WAIT;
+    current->vpci.wait.callback = control_write_cb;
+    current->vpci.wait.data = cb;
+    current->vpci.wait.end = NOW();
+
+    if ( cb->enable )
+    {
+        size_vf_bars((struct pci_dev *)pdev, sriov_pos);
+
+        /*
+         * Only update the number of active VFs when enabling, when
+         * disabling use the cached value in order to always remove the same
+         * number of VFs that were active.
+         */
+        sriov->num_vfs = pci_conf_read16(pdev->sbdf,
+                                         sriov_pos + PCI_SRIOV_NUM_VF);
+        /*
+         * NB: VFE needs to be enabled before calling pci_add_device so Xen
+         * can access the config space of VFs. FIXME casting away const-ness
+         * to modify vf_rlen
+         */
+        pci_conf_write16(pdev->sbdf, reg, control | PCI_SRIOV_CTRL_VFE);
+        /*
+         * The spec states that the software must wait at least 100ms before
+         * attempting to access VF registers when enabling virtual functions
+         * on the PF.
+         */
+
+        current->vpci.wait.end = NOW() + MILLISECS(100);
+    }
+}
+
+int vf_init_header(struct pci_dev *vf_pdev)
+{
+    const struct pci_dev *pf_pdev;
+    unsigned int sriov_pos;
+    int rc = 0;
+    uint16_t ctrl;
+
+    ASSERT(rw_is_write_locked(&vf_pdev->domain->pci_lock));
+
+    if ( !vf_pdev->info.is_virtfn )
+        return 0;
+
+    pf_pdev = vf_pdev->pf_pdev;
+    ASSERT(pf_pdev);
+
+    rc = vf_init_bars(vf_pdev);
+    if ( rc )
+        return rc;
+
+    sriov_pos = pci_find_ext_capability(pf_pdev, PCI_EXT_CAP_ID_SRIOV);
+    ctrl = pci_conf_read16(pf_pdev->sbdf, sriov_pos + PCI_SRIOV_CTRL);
+
+    if ( (pf_pdev->domain == vf_pdev->domain) && (ctrl & PCI_SRIOV_CTRL_MSE) )
+    {
+        rc = vpci_modify_bars(vf_pdev, PCI_COMMAND_MEMORY, false, false);
+        if ( rc )
+            return rc;
+    }
+
+    return rc;
+}
+
+static int cf_check init_sriov(struct pci_dev *pdev)
+{
+    unsigned int pos;
+
+    ASSERT(!pdev->info.is_virtfn);
+
+    pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+
+    if ( !pos )
+        return 0;
+
+    if ( xsm_resource_setup_pci(XSM_PRIV, pdev->sbdf.bdf) )
+    {
+        printk(XENLOG_ERR
+               "%pp: SR-IOV configuration unsupported for unpriv %pd\n",
+               &pdev->sbdf, pdev->domain);
+        return 0;
+    }
+
+    pdev->vpci->sriov = xzalloc(struct vpci_sriov);
+    if ( !pdev->vpci->sriov )
+        return -ENOMEM;
+
+    return vpci_add_register(pdev->vpci, vpci_hw_read16, control_write,
+                             pos + PCI_SRIOV_CTRL, 2, NULL);
+}
+
+static int cf_check cleanup_sriov(const struct pci_dev *pdev, bool hide)
+{
+    if ( hide )
+        return 0;
+
+    XFREE(pdev->vpci->sriov);
+
+    return 0;
+}
+
+REGISTER_VPCI_EXTCAP(SRIOV, init_sriov, cleanup_sriov);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 47cdb54d42..ae5f3b7274 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -45,6 +45,7 @@ typedef struct {
     REGISTER_VPCI_CAPABILITY(PCI_EXT_CAP_ID_##name, name, finit, fclean, true)
 
 int __must_check vpci_init_header(struct pci_dev *pdev);
+int __must_check vf_init_header(struct pci_dev *pdev);
 
 /* Assign vPCI to device by adding handlers. */
 int __must_check vpci_assign_device(struct pci_dev *pdev);
@@ -146,7 +147,6 @@ struct vpci {
          * upon to know whether BARs are mapped into the guest p2m.
          */
         bool bars_mapped      : 1;
-        /* FIXME: currently there's no support for SR-IOV. */
     } header;
 
     /* MSI data. */
@@ -200,6 +200,13 @@ struct vpci {
             struct vpci_arch_msix_entry arch;
         } entries[];
     } *msix;
+
+    struct vpci_sriov {
+        /* PF only */
+        struct vpci_bar vf_bars[PCI_SRIOV_NUM_BARS];
+        uint16_t num_vfs;
+    } *sriov;
+
 #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
     /* Guest SBDF of the device. */
 #define INVALID_GUEST_SBDF ((pci_sbdf_t){ .sbdf = ~0U })
@@ -323,7 +330,8 @@ bool vpci_ecam_read(pci_sbdf_t sbdf, unsigned int reg, 
unsigned int len,
                     unsigned long *data);
 
 /* Map/unmap the BARs of a vPCI device. */
-int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only);
+int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only,
+                     bool no_defer);
 
 #endif /* __XEN__ */
 
-- 
2.51.2

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.