As was discussed a couple of times on this list, SR-IOV virtual functions have their BARs read as zero - the physical function's SR-IOV capability structure must be consulted instead. The patch is only a first draft, and may not apply as-is to current -unstable. On some machines it in particular depends on some additional changes to the MMCONFIG initialization, which I have yet to post (still missing some parts of it), as accessing the SR-IOV capability structure requires the extended config space to be readable by Xen. Issues: Can we expect doing the BAR sizing from pci_add_device() to be compatible with all Dom0 OSes? In particular, is the assumption that virtual functions aren't enabled at this point reasonable? Is using dev->devfn for obtaining the VF index really correct (this is how I see Linux handling it)? How would VFs beyond 255 be expressed? This is only compile tested, as I have no SR-IOV device available for testing. --- 2011-07-19.orig/xen/arch/x86/msi.c 2011-06-16 01:21:02.000000000 -0600 +++ 2011-07-19/xen/arch/x86/msi.c 2011-07-28 11:09:01.000000000 -0600 @@ -522,12 +522,39 @@ static int msi_capability_init(struct pc return 0; } -static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir) +static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir, int vf) { u8 limit; - u32 addr; + u32 addr, base = PCI_BASE_ADDRESS_0, disp = 0; - switch ( pci_conf_read8(bus, slot, func, PCI_HEADER_TYPE) & 0x7f ) + if ( vf >= 0 ) + { + struct pci_dev *pdev = pci_get_pdev(bus, PCI_DEVFN(slot, func)); + unsigned int pos = pci_find_ext_capability(0, bus, + PCI_DEVFN(slot, func), + PCI_EXT_CAP_ID_SRIOV); + u16 ctrl = pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_CTRL); + + if ( !pdev || !pos || + !(ctrl & PCI_SRIOV_CTRL_VFE) || + !(ctrl & PCI_SRIOV_CTRL_MSE) || + bir >= PCI_SRIOV_NUM_BARS || + !pdev->vf_rlen[bir] ) + return 0; + base = pos + PCI_SRIOV_BAR; + vf -= PCI_BDF(bus, slot, func) + + pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_VF_OFFSET); + if ( vf < 0 
|| + vf % pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_VF_STRIDE) ) + return 0; + vf /= pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_VF_STRIDE); + if ( vf >= pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_NUM_VF) ) + return 0; + BUILD_BUG_ON(ARRAY_SIZE(pdev->vf_rlen) != PCI_SRIOV_NUM_BARS); + disp = vf * pdev->vf_rlen[bir]; + limit = PCI_SRIOV_NUM_BARS; + } + else switch ( pci_conf_read8(bus, slot, func, PCI_HEADER_TYPE) & 0x7f ) { case PCI_HEADER_TYPE_NORMAL: limit = 6; @@ -544,7 +571,7 @@ static u64 read_pci_mem_bar(u8 bus, u8 s if ( bir >= limit ) return 0; - addr = pci_conf_read32(bus, slot, func, PCI_BASE_ADDRESS_0 + bir * 4); + addr = pci_conf_read32(bus, slot, func, base + bir * 4); if ( (addr & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO ) return 0; if ( (addr & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64 ) @@ -552,11 +579,10 @@ static u64 read_pci_mem_bar(u8 bus, u8 s addr &= PCI_BASE_ADDRESS_MEM_MASK; if ( ++bir >= limit ) return 0; - return addr | - ((u64)pci_conf_read32(bus, slot, func, - PCI_BASE_ADDRESS_0 + bir * 4) << 32); + return addr + disp + + ((u64)pci_conf_read32(bus, slot, func, base + bir * 4) << 32); } - return addr & PCI_BASE_ADDRESS_MEM_MASK; + return (addr & PCI_BASE_ADDRESS_MEM_MASK) + disp; } /** @@ -629,11 +655,29 @@ static int msix_capability_init(struct p if ( !dev->msix_nr_entries ) { + u8 pbus, pslot, pfunc; + int vf; u64 pba_paddr; u32 pba_offset; + if ( !dev->info.is_virtfn ) + { + pbus = bus; + pslot = slot; + pfunc = func; + vf = -1; + } + else + { + pbus = dev->info.physfn.bus; + pslot = PCI_SLOT(dev->info.physfn.devfn); + pfunc = PCI_FUNC(dev->info.physfn.devfn); + vf = PCI_BDF2(dev->bus, dev->devfn); + } + ASSERT(!dev->msix_used_entries); - WARN_ON(msi->table_base != read_pci_mem_bar(bus, slot, func, bir)); + WARN_ON(msi->table_base != + read_pci_mem_bar(pbus, pslot, pfunc, bir, vf)); dev->msix_nr_entries = nr_entries; dev->msix_table.first = PFN_DOWN(table_paddr); @@ -645,7 +689,7 @@ 
static int msix_capability_init(struct p pba_offset = pci_conf_read32(bus, slot, func, msix_pba_offset_reg(pos)); bir = (u8)(pba_offset & PCI_MSIX_BIRMASK); - pba_paddr = read_pci_mem_bar(bus, slot, func, bir); + pba_paddr = read_pci_mem_bar(pbus, pslot, pfunc, bir, vf); WARN_ON(!pba_paddr); pba_paddr += pba_offset & ~PCI_MSIX_BIRMASK; --- 2011-07-19.orig/xen/drivers/passthrough/pci.c 2011-07-19 00:44:57.000000000 -0600 +++ 2011-07-19/xen/drivers/passthrough/pci.c 2011-07-28 11:07:52.000000000 -0600 @@ -146,6 +146,7 @@ void pci_enable_acs(struct pci_dev *pdev int pci_add_device(u8 bus, u8 devfn, const struct pci_dev_info *info) { struct pci_dev *pdev; + unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn); const char *pdev_type; int ret = -ENOMEM; @@ -154,7 +155,14 @@ int pci_add_device(u8 bus, u8 devfn, con else if (info->is_extfn) pdev_type = "extended function"; else if (info->is_virtfn) + { + spin_lock(&pcidevs_lock); + pdev = pci_get_pdev(info->physfn.bus, info->physfn.devfn); + spin_unlock(&pcidevs_lock); + if ( !pdev ) + pci_add_device(info->physfn.bus, info->physfn.devfn, NULL); pdev_type = "virtual function"; + } else return -EINVAL; @@ -165,6 +173,70 @@ int pci_add_device(u8 bus, u8 devfn, con if ( info ) pdev->info = *info; + else if ( !pdev->vf_rlen[0] ) + { + unsigned int pos = pci_find_ext_capability(0, bus, devfn, + PCI_EXT_CAP_ID_SRIOV); + u16 ctrl = pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_CTRL); + + if ( !pos ) + /* Nothing */; + else if ( !(ctrl & (PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE)) ) + { + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(pdev->vf_rlen) != PCI_SRIOV_NUM_BARS); + for ( i = 0; i < PCI_SRIOV_NUM_BARS; ++i ) + { + unsigned int idx = pos + PCI_SRIOV_BAR + i * 4; + u32 bar = pci_conf_read32(bus, slot, func, idx); + u32 hi = 0; + + if ( (bar & PCI_BASE_ADDRESS_SPACE) == + PCI_BASE_ADDRESS_SPACE_IO ) + { + printk(XENLOG_WARNING "SR-IOV device %02x:%02x.%x with vf" + " BAR%u in IO space\n", + bus, slot, func, i); + 
continue; + } + pci_conf_write32(bus, slot, func, idx, ~0); + if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == + PCI_BASE_ADDRESS_MEM_TYPE_64 ) + { + if ( i >= PCI_SRIOV_NUM_BARS ) + { + printk(XENLOG_WARNING "SR-IOV device %02x:%02x.%x with" + " 64-bit vf BAR in last slot\n", + bus, slot, func); + break; + } + hi = pci_conf_read32(bus, slot, func, idx + 4); + pci_conf_write32(bus, slot, func, idx + 4, ~0); + } + pdev->vf_rlen[i] = pci_conf_read32(bus, slot, func, idx) & + PCI_BASE_ADDRESS_MEM_MASK; + if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == + PCI_BASE_ADDRESS_MEM_TYPE_64 ) + { + pdev->vf_rlen[i] |= (u64)pci_conf_read32(bus, slot, func, + idx + 4) << 32; + pci_conf_write32(bus, slot, func, idx + 4, hi); + } + else if ( pdev->vf_rlen[i] ) + pdev->vf_rlen[i] |= (u64)~0 << 32; + pci_conf_write32(bus, slot, func, idx, bar); + pdev->vf_rlen[i] = -pdev->vf_rlen[i]; + if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == + PCI_BASE_ADDRESS_MEM_TYPE_64 ) + ++i; + } + } + else + printk(XENLOG_WARNING "SR-IOV device %02x:%02x.%x has its virtual" + " functions already enabled (%04x)\n", + bus, slot, func, ctrl); + } ret = 0; if ( !pdev->domain ) @@ -184,7 +256,7 @@ int pci_add_device(u8 bus, u8 devfn, con out: spin_unlock(&pcidevs_lock); printk(XENLOG_DEBUG "PCI add %s %02x:%02x.%x\n", pdev_type, - bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + bus, slot, func); return ret; } --- 2011-07-19.orig/xen/include/xen/pci.h 2011-07-19 00:44:57.000000000 -0600 +++ 2011-07-19/xen/include/xen/pci.h 2011-07-19 00:55:02.000000000 -0600 @@ -57,6 +57,7 @@ struct pci_dev { const u8 bus; const u8 devfn; struct pci_dev_info info; + u64 vf_rlen[6]; }; #define for_each_pdev(domain, pdev) \ --- 2011-07-19.orig/xen/include/xen/pci_regs.h 2010-08-06 00:44:34.000000000 -0600 +++ 2011-07-19/xen/include/xen/pci_regs.h 2011-07-19 00:55:02.000000000 -0600 @@ -425,7 +425,7 @@ #define PCI_EXT_CAP_ID_ACS 13 #define PCI_EXT_CAP_ID_ARI 14 #define PCI_EXT_CAP_ID_ATS 15 -#define PCI_EXT_CAP_ID_IOV 16 +#define 
PCI_EXT_CAP_ID_SRIOV 16 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ @@ -545,4 +545,35 @@ #define PCI_ACS_CTRL 0x06 /* ACS Control Register */ #define PCI_ACS_EGRESS_CTL_V 0x08 /* ACS Egress Control Vector */ +/* Single Root I/O Virtualization */ +#define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ +#define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */ +#define PCI_SRIOV_CAP_INTR(x) ((x) >> 21) /* Interrupt Message Number */ +#define PCI_SRIOV_CTRL 0x08 /* SR-IOV Control */ +#define PCI_SRIOV_CTRL_VFE 0x01 /* VF Enable */ +#define PCI_SRIOV_CTRL_VFM 0x02 /* VF Migration Enable */ +#define PCI_SRIOV_CTRL_INTR 0x04 /* VF Migration Interrupt Enable */ +#define PCI_SRIOV_CTRL_MSE 0x08 /* VF Memory Space Enable */ +#define PCI_SRIOV_CTRL_ARI 0x10 /* ARI Capable Hierarchy */ +#define PCI_SRIOV_STATUS 0x0a /* SR-IOV Status */ +#define PCI_SRIOV_STATUS_VFM 0x01 /* VF Migration Status */ +#define PCI_SRIOV_INITIAL_VF 0x0c /* Initial VFs */ +#define PCI_SRIOV_TOTAL_VF 0x0e /* Total VFs */ +#define PCI_SRIOV_NUM_VF 0x10 /* Number of VFs */ +#define PCI_SRIOV_FUNC_LINK 0x12 /* Function Dependency Link */ +#define PCI_SRIOV_VF_OFFSET 0x14 /* First VF Offset */ +#define PCI_SRIOV_VF_STRIDE 0x16 /* Following VF Stride */ +#define PCI_SRIOV_VF_DID 0x1a /* VF Device ID */ +#define PCI_SRIOV_SUP_PGSIZE 0x1c /* Supported Page Sizes */ +#define PCI_SRIOV_SYS_PGSIZE 0x20 /* System Page Size */ +#define PCI_SRIOV_BAR 0x24 /* VF BAR0 */ +#define PCI_SRIOV_NUM_BARS 6 /* Number of VF BARs */ +#define PCI_SRIOV_VFM 0x3c /* VF Migration State Array Offset*/ +#define PCI_SRIOV_VFM_BIR(x) ((x) & 7) /* State BIR */ +#define PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7) /* State Offset */ +#define PCI_SRIOV_VFM_UA 0x0 /* Inactive.Unavailable */ +#define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */ +#define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */ +#define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */ + #endif /* LINUX_PCI_REGS_H */