[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v2 11/11] vpci/sriov: add support for SR-IOV capability



So that a PCI device that supports SR-IOV (PF) can enable the capability
and use the virtual functions.

This code is expected to be used only by privileged domains;
unprivileged domains should not get access to the SR-IOV capability.

The current code detects enabling of the virtual functions feature and
automatically adds the VFs to the domain. It also detects enabling of
memory space and maps the VFs BARs into the domain p2m. Disabling of
the VF enable bit removes the devices and the BAR memory map from the
domain p2m.

Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
---
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>
Cc: Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
Cc: Jan Beulich <jbeulich@xxxxxxxx>
Cc: Julien Grall <julien.grall@xxxxxxx>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Cc: Stefano Stabellini <sstabellini@xxxxxxxxxx>
Cc: Tim Deegan <tim@xxxxxxx>
Cc: Wei Liu <wei.liu2@xxxxxxxxxx>
---
Changes since v1:
 - Free sriov on teardown.
 - Use a recursive lock in order to prevent deadlocking when mapping a
   VF BAR and trying to acquire the PF lock.
 - Fix comment typo.
 - Do not check pci_size_mem_bar return value against <= 0.
---
Tested with a Linux PVH Dom0 with Intel I350 nic.
---
 xen/drivers/vpci/Makefile |   2 +-
 xen/drivers/vpci/header.c |  36 +++--
 xen/drivers/vpci/sriov.c  | 317 ++++++++++++++++++++++++++++++++++++++
 xen/drivers/vpci/vpci.c   |  10 +-
 xen/include/xen/vpci.h    |   9 +-
 5 files changed, 359 insertions(+), 15 deletions(-)
 create mode 100644 xen/drivers/vpci/sriov.c

diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 55d1bdfda0..6274f60e74 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@
-obj-y += vpci.o header.o msi.o msix.o
+obj-y += vpci.o header.o msi.o msix.o sriov.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index e9c7b6aa72..092a888751 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -236,7 +236,7 @@ static void defer_map(struct domain *d, struct pci_dev 
*pdev,
     }
 }
 
-static int modify_bars(const struct pci_dev *pdev, bool map, bool rom_only)
+int vpci_modify_bars(const struct pci_dev *pdev, bool map, bool rom_only)
 {
     struct vpci_header *header = &pdev->vpci->header;
     struct rangeset *mem = rangeset_new(NULL, NULL, 0);
@@ -318,10 +318,16 @@ static int modify_bars(const struct pci_dev *pdev, bool 
map, bool rom_only)
                 continue;
         }
 
-        spin_lock(&tmp->vpci_lock);
+        /*
+         * When mapping the BARs of a VF the parent PF is already locked and
+         * trying to lock it will result in a deadlock, so use a recursive
+         * lock. This is because vpci_modify_bars is called from the parent PF
+         * control_write register handler.
+         */
+        spin_lock_recursive(&tmp->vpci_lock);
         if ( !tmp->vpci )
         {
-            spin_unlock(&tmp->vpci_lock);
+            spin_unlock_recursive(&tmp->vpci_lock);
             continue;
         }
         for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
@@ -342,14 +348,14 @@ static int modify_bars(const struct pci_dev *pdev, bool 
map, bool rom_only)
             rc = rangeset_remove_range(mem, start, end);
             if ( rc )
             {
-                spin_unlock(&tmp->vpci_lock);
+                spin_unlock_recursive(&tmp->vpci_lock);
                 printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                        start, end, rc);
                 rangeset_destroy(mem);
                 return rc;
             }
         }
-        spin_unlock(&tmp->vpci_lock);
+        spin_unlock_recursive(&tmp->vpci_lock);
     }
 
     ASSERT(dev);
@@ -391,7 +397,7 @@ static void cmd_write(const struct pci_dev *pdev, unsigned 
int reg,
          * memory decoding bit has not been changed, so leave everything as-is,
          * hoping the guest will realize and try again.
          */
-        modify_bars(pdev, cmd & PCI_COMMAND_MEMORY, false);
+        vpci_modify_bars(pdev, cmd & PCI_COMMAND_MEMORY, false);
     else
         pci_conf_write16(pdev->seg, pdev->bus, slot, func, reg, cmd);
 }
@@ -472,13 +478,13 @@ static void rom_write(const struct pci_dev *pdev, 
unsigned int reg,
         header->rom_enabled = new_enabled;
         pci_conf_write32(pdev->seg, pdev->bus, slot, func, reg, val);
     }
-    else if ( modify_bars(pdev, new_enabled, true) )
+    else if ( vpci_modify_bars(pdev, new_enabled, true) )
         /*
          * No memory has been added or removed from the p2m (because the actual
          * p2m changes are deferred in defer_map) and the ROM enable bit has
          * not been changed, so leave everything as-is, hoping the guest will
          * realize and try again. It's important to not update rom->addr in the
-         * unmap case if modify_bars has failed, or future attempts would
+         * unmap case if vpci_modify_bars has failed, or future attempts would
          * attempt to unmap the wrong address.
          */
         return;
@@ -500,6 +506,16 @@ static int init_bars(struct pci_dev *pdev)
     };
     int rc;
 
+    if ( pdev->info.is_virtfn )
+        /*
+         * No need to set traps for the command register or the BAR registers
+         * because those are not used by VFs. Memory decoding and position of
+         * the VF BARs is controlled from the PF.
+         *
+         * TODO: add DomU support for VFs.
+         */
+        return 0;
+
     switch ( pci_conf_read8(pdev->seg, pdev->bus, slot, func, PCI_HEADER_TYPE)
              & 0x7f )
     {
@@ -608,7 +624,7 @@ static int init_bars(struct pci_dev *pdev)
             rom->type = VPCI_BAR_EMPTY;
     }
 
-    return (cmd & PCI_COMMAND_MEMORY) ? modify_bars(pdev, true, false) : 0;
+    return (cmd & PCI_COMMAND_MEMORY) ? vpci_modify_bars(pdev, true, false) : 
0;
 }
 
 static void teardown_bars(struct pci_dev *pdev)
@@ -619,7 +635,7 @@ static void teardown_bars(struct pci_dev *pdev)
     if ( cmd & PCI_COMMAND_MEMORY )
     {
         /* Unmap all BARs from guest p2m. */
-        modify_bars(pdev, false, false);
+        vpci_modify_bars(pdev, false, false);
         /*
          * Since this operation is deferred at the point when it finishes the
          * device might have been removed, so don't attempt to disable memory
diff --git a/xen/drivers/vpci/sriov.c b/xen/drivers/vpci/sriov.c
new file mode 100644
index 0000000000..64b7ae8203
--- /dev/null
+++ b/xen/drivers/vpci/sriov.c
@@ -0,0 +1,317 @@
+/*
+ * Handlers for accesses to the SR-IOV capability structure.
+ *
+ * Copyright (C) 2018 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/delay.h>
+#include <xen/sched.h>
+#include <xen/vpci.h>
+
+#define SRIOV_SIZE(num) offsetof(struct vpci_sriov, vf[num])
+
+static void modify_memory_mapping(const struct pci_dev *pdev, unsigned int pos,
+                                  bool enable)
+{
+    const struct vpci_sriov *sriov = pdev->vpci->sriov;
+    unsigned int i;
+    int rc;
+
+    if ( enable )
+    {
+        struct pci_dev *pf_dev;
+
+        pcidevs_lock();
+        /*
+         * NB: a non-const pci_dev of the PF is needed in order to update
+         * vf_rlen.
+         */
+        pf_dev = pci_get_pdev(pdev->seg, pdev->bus, pdev->devfn);
+        pcidevs_unlock();
+        ASSERT(pf_dev);
+
+        /* Set the BARs addresses and size. */
+        for ( i = 0; i < PCI_SRIOV_NUM_BARS; i += rc )
+        {
+            unsigned int j, idx = pos + PCI_SRIOV_BAR + i * 4;
+            const pci_sbdf_t sbdf = {
+                .sbdf = PCI_SBDF3(pdev->seg, pdev->bus, pdev->devfn),
+            };
+            uint32_t bar = pci_conf_read32(pdev->seg, pdev->bus,
+                                           PCI_SLOT(pdev->devfn),
+                                           PCI_FUNC(pdev->devfn), idx);
+            uint64_t addr, size;
+
+            rc = pci_size_mem_bar(sbdf, idx, &addr, &size,
+                                  PCI_BAR_VF |
+                                  ((i == PCI_SRIOV_NUM_BARS - 1) ?
+                                   PCI_BAR_LAST : 0));
+
+            /*
+             * Update vf_rlen on the PF. According to the spec the size of
+             * the BARs can change if the system page size register is
+             * modified, so always update rlen when enabling VFs.
+             */
+            pf_dev->vf_rlen[i] = size;
+
+            for ( j = 0; j < sriov->num_vfs; j++ )
+            {
+                struct vpci_header *header;
+
+                if ( !sriov->vf[j] )
+                    /* Can happen if pci_add_device fails. */
+                    continue;
+
+                spin_lock(&sriov->vf[j]->vpci_lock);
+                header = &sriov->vf[j]->vpci->header;
+
+                if ( !size )
+                {
+                    header->bars[i].type = VPCI_BAR_EMPTY;
+                    spin_unlock(&sriov->vf[j]->vpci_lock);
+                    continue;
+                }
+
+                header->bars[i].addr = addr + size * j;
+                header->bars[i].size = size;
+                header->bars[i].prefetchable =
+                    bar & PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+                switch ( rc )
+                {
+                case 1:
+                    header->bars[i].type = VPCI_BAR_MEM32;
+                    break;
+
+                case 2:
+                    header->bars[i].type = VPCI_BAR_MEM64_LO;
+                    header->bars[i + 1].type = VPCI_BAR_MEM64_HI;
+                    break;
+
+                default:
+                    ASSERT_UNREACHABLE();
+                    spin_unlock(&sriov->vf[j]->vpci_lock);
+                    domain_crash(pdev->domain);
+                    return;
+                }
+                spin_unlock(&sriov->vf[j]->vpci_lock);
+            }
+        }
+    }
+
+    /* Add/remove mappings for the VFs BARs into the p2m. */
+    for ( i = 0; i < sriov->num_vfs; i++ )
+    {
+        struct pci_dev *vf_pdev = sriov->vf[i];
+
+        spin_lock(&vf_pdev->vpci_lock);
+        rc = vpci_modify_bars(vf_pdev, enable, false);
+        spin_unlock(&vf_pdev->vpci_lock);
+        if ( rc )
+            gprintk(XENLOG_ERR,
+                    "failed to %smap BARs of VF %04x:%02x:%02x.%u: %d\n",
+                    enable ? "" : "un", vf_pdev->seg, vf_pdev->bus,
+                    PCI_SLOT(vf_pdev->devfn), PCI_FUNC(vf_pdev->devfn), rc);
+    }
+}
+
+static void enable_tail(const struct pci_dev *pdev, struct vpci_sriov *sriov,
+                        unsigned int pos, bool new_enabled,
+                        bool new_mem_enabled)
+{
+    uint16_t offset = pci_conf_read16(pdev->seg, pdev->bus,
+                                      PCI_SLOT(pdev->devfn),
+                                      PCI_FUNC(pdev->devfn),
+                                      pos + PCI_SRIOV_VF_OFFSET);
+    uint16_t stride = pci_conf_read16(pdev->seg, pdev->bus,
+                                      PCI_SLOT(pdev->devfn),
+                                      PCI_FUNC(pdev->devfn),
+                                      pos + PCI_SRIOV_VF_STRIDE);
+    unsigned int i;
+
+    for ( i = 0; i < sriov->num_vfs; i++ )
+    {
+        const pci_sbdf_t bdf = {
+            .bdf = PCI_BDF2(pdev->bus, pdev->devfn) + offset + stride * i,
+        };
+        int rc;
+
+        if ( new_enabled )
+        {
+            const struct pci_dev_info info = {
+                .is_virtfn = true,
+                .physfn.bus = pdev->bus,
+                .physfn.devfn = pdev->devfn,
+            };
+
+            rc = pci_add_device(pdev->seg, bdf.bus, bdf.extfunc, &info,
+                                pdev->node);
+        }
+        else
+            rc = pci_remove_device(pdev->seg, bdf.bus, bdf.extfunc);
+        if ( rc )
+            gprintk(XENLOG_ERR, "failed to %s VF %04x:%02x:%02x.%u: %d\n",
+                    new_enabled ? "add" : "remove", pdev->seg, bdf.bus,
+                    bdf.dev, bdf.func, rc);
+
+        pcidevs_lock();
+        sriov->vf[i] = pci_get_pdev(pdev->seg, bdf.bus, bdf.extfunc);
+        pcidevs_unlock();
+    }
+
+    if ( new_mem_enabled )
+        modify_memory_mapping(pdev, pos, true);
+}
+
+struct callback_data {
+    const struct pci_dev *pdev;
+    struct vpci_sriov *sriov;
+    unsigned int pos;
+    uint32_t value;
+    bool new_enabled;
+    bool new_mem_enabled;
+};
+
+static void enable_callback(void *data)
+{
+    struct callback_data *cb = data;
+    const struct pci_dev *pdev = cb->pdev;
+
+    enable_tail(pdev, cb->sriov, cb->pos, cb->new_enabled,
+                cb->new_mem_enabled);
+    pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                     PCI_FUNC(pdev->devfn), cb->pos + PCI_SRIOV_CTRL,
+                     cb->value);
+    xfree(cb);
+}
+
+static void control_write(const struct pci_dev *pdev, unsigned int reg,
+                          uint32_t val, void *data)
+{
+    struct vpci_sriov *sriov = data;
+    unsigned int pos = reg - PCI_SRIOV_CTRL;
+    uint16_t control = pci_conf_read16(pdev->seg, pdev->bus,
+                                       PCI_SLOT(pdev->devfn),
+                                       PCI_FUNC(pdev->devfn),
+                                       pos + PCI_SRIOV_CTRL);
+    bool enabled = control & PCI_SRIOV_CTRL_VFE;
+    bool mem_enabled = control & PCI_SRIOV_CTRL_MSE;
+    bool new_enabled = val & PCI_SRIOV_CTRL_VFE;
+    bool new_mem_enabled = val & PCI_SRIOV_CTRL_MSE;
+
+    if ( new_enabled != enabled )
+    {
+        if ( new_enabled )
+        {
+            struct callback_data *cb = xmalloc(struct callback_data);
+            struct vcpu *curr = current;
+
+            if ( !cb )
+            {
+                gprintk(XENLOG_WARNING, "%04x:%02x:%02x.%u: "
+                        "unable to allocate memory for SR-IOV enable\n",
+                        pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                        PCI_FUNC(pdev->devfn));
+                return;
+            }
+
+            /*
+             * Only update the number of active VFs when enabling, when
+             * disabling use the cached value in order to always remove the
+             * same number of VFs that were active.
+             */
+            sriov->num_vfs = pci_conf_read16(pdev->seg, pdev->bus,
+                                             PCI_SLOT(pdev->devfn),
+                                             PCI_FUNC(pdev->devfn),
+                                             pos + PCI_SRIOV_NUM_VF);
+
+            /*
+             * NB: VFE needs to be enabled before calling pci_add_device so Xen
+             * can access the config space of VFs.
+             */
+            pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                             PCI_FUNC(pdev->devfn), reg,
+                             control | PCI_SRIOV_CTRL_VFE);
+
+            /*
+             * The spec states that the software must wait at least 100ms
+             * before attempting to access VF registers when enabling virtual
+             * functions on the PF.
+             */
+            cb->pdev = pdev;
+            cb->sriov = sriov;
+            cb->pos = pos;
+            cb->value = val;
+            cb->new_enabled = new_enabled;
+            cb->new_mem_enabled = new_mem_enabled;
+            curr->vpci.task = WAIT;
+            curr->vpci.wait.callback = enable_callback;
+            curr->vpci.wait.data = cb;
+            curr->vpci.wait.end = get_cycles() + 100 * cpu_khz;
+            return;
+        }
+
+        enable_tail(pdev, sriov, pos, new_enabled, new_mem_enabled);
+    }
+    else if ( new_mem_enabled != mem_enabled && new_enabled )
+        modify_memory_mapping(pdev, pos, new_mem_enabled);
+
+    pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                     PCI_FUNC(pdev->devfn), reg, val);
+}
+
+static int init_sriov(struct pci_dev *pdev)
+{
+    unsigned int pos = pci_find_ext_capability(pdev->seg, pdev->bus,
+                                               pdev->devfn,
+                                               PCI_EXT_CAP_ID_SRIOV);
+    uint16_t total_vfs;
+
+    if ( !pos )
+        return 0;
+
+    total_vfs = pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                                PCI_FUNC(pdev->devfn),
+                                pos + PCI_SRIOV_TOTAL_VF);
+
+    pdev->vpci->sriov = xzalloc_bytes(SRIOV_SIZE(total_vfs));
+    if ( !pdev->vpci->sriov )
+        return -ENOMEM;
+
+    return vpci_add_register(pdev->vpci, vpci_hw_read16, control_write,
+                             pos + PCI_SRIOV_CTRL, 2, pdev->vpci->sriov);
+}
+
+static void teardown_sriov(struct pci_dev *pdev)
+{
+    if ( pdev->vpci->sriov )
+    {
+        /* TODO: removing PFs is not currently supported. */
+        ASSERT_UNREACHABLE();
+        xfree(pdev->vpci->sriov);
+        domain_crash(pdev->domain);
+    }
+}
+REGISTER_VPCI_INIT(init_sriov, teardown_sriov, VPCI_PRIORITY_LOW);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 732d3525ae..8734076d39 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -459,10 +459,14 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, 
unsigned int size,
         return;
     }
 
-    spin_lock(&pdev->vpci_lock);
+    /*
+     * NB: use the recursive variant here so that mapping and unmapping of the
+     * VF BARs works correctly and can recursively take the PF lock.
+     */
+    spin_lock_recursive(&pdev->vpci_lock);
     if ( !pdev->vpci )
     {
-        spin_unlock(&pdev->vpci_lock);
+        spin_unlock_recursive(&pdev->vpci_lock);
         vpci_write_hw(sbdf, reg, size, data);
         return;
     }
@@ -501,7 +505,7 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned 
int size,
             break;
         ASSERT(data_offset < size);
     }
-    spin_unlock(&pdev->vpci_lock);
+    spin_unlock_recursive(&pdev->vpci_lock);
 
     if ( data_offset < size )
         /* Tailing gap, write the remaining. */
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 5479fe57d2..6fe03e9944 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -94,7 +94,6 @@ struct vpci {
          * is mapped into guest p2m) if there's a ROM BAR on the device.
          */
         bool rom_enabled      : 1;
-        /* FIXME: currently there's no support for SR-IOV. */
     } header;
 
     /* MSI data. */
@@ -144,6 +143,11 @@ struct vpci {
             struct vpci_arch_msix_entry arch;
         } entries[];
     } *msix;
+
+    struct vpci_sriov {
+        uint16_t num_vfs;
+        struct pci_dev *vf[];
+    } *sriov;
 #endif
 };
 
@@ -229,6 +233,9 @@ static inline unsigned int vmsix_entry_nr(const struct 
vpci_msix *msix,
 {
     return entry - msix->entries;
 }
+
+/* Map/unmap the BARs of a vPCI device. */
+int vpci_modify_bars(const struct pci_dev *pdev, bool map, bool rom_only);
 #endif /* __XEN__ */
 
 #else /* !CONFIG_HAS_VPCI */
-- 
2.17.1


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.