x86/PCI: intercept all PV Dom0 MMCFG writes

... to hook up pci_conf_write_intercept() even for Dom0 not using
method 1 accesses for the base part of PCI device config space.

Signed-off-by: Jan Beulich

---
Not entirely sure whether the complicated logging logic in x86/mm.c is
actually worth it.

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -734,6 +734,46 @@ static int update_xen_mappings(unsigned
     return err;
 }
 
+#ifndef NDEBUG
+struct mmio_emul_range_ctxt {
+    const struct domain *d;
+    unsigned long mfn;
+};
+
+static int print_mmio_emul_range(unsigned long s, unsigned long e, void *arg)
+{
+    const struct mmio_emul_range_ctxt *ctxt = arg;
+
+    if ( ctxt->mfn > e )
+        return 0;
+
+    if ( ctxt->mfn >= s )
+    {
+        static DEFINE_SPINLOCK(last_lock);
+        static const struct domain *last_d;
+        static unsigned long last_s = ~0UL, last_e;
+        bool_t print = 0;
+
+        spin_lock(&last_lock);
+        if ( last_d != ctxt->d || last_s != s || last_e != e )
+        {
+            last_d = ctxt->d;
+            last_s = s;
+            last_e = e;
+            print = 1;
+        }
+        spin_unlock(&last_lock);
+
+        if ( print )
+            printk(XENLOG_G_INFO
+                   "d%d: Forcing write emulation on MFNs %lx-%lx\n",
+                   ctxt->d->domain_id, s, e);
+    }
+
+    return 1;
+}
+#endif
+
 int
 get_page_from_l1e(
     l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
@@ -757,6 +797,11 @@ get_page_from_l1e(
     if ( !mfn_valid(mfn) ||
          (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
     {
+#ifndef NDEBUG
+        const unsigned long *ro_map;
+        unsigned int seg, bdf;
+#endif
+
         /* Only needed the reference to confirm dom_io ownership. */
         if ( mfn_valid(mfn) )
             put_page(page);
@@ -792,9 +837,20 @@
         if ( !(l1f & _PAGE_RW) ||
              !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
             return 0;
-        dprintk(XENLOG_G_WARNING,
-                "d%d: Forcing read-only access to MFN %lx\n",
-                l1e_owner->domain_id, mfn);
+#ifndef NDEBUG
+        if ( !pci_mmcfg_decode(mfn, &seg, &bdf) ||
+             ((ro_map = pci_get_ro_map(seg)) != NULL &&
+              test_bit(bdf, ro_map)) )
+            printk(XENLOG_G_WARNING
+                   "d%d: Forcing read-only access to MFN %lx\n",
+                   l1e_owner->domain_id, mfn);
+        else
+            rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL,
+                                   print_mmio_emul_range,
+                                   &(struct mmio_emul_range_ctxt){
+                                      .d = l1e_owner,
+                                      .mfn = mfn });
+#endif
         return 1;
     }
 
@@ -5145,6 +5201,7 @@ int ptwr_do_page_fault(struct vcpu *v, u
     /* We are looking only for read-only mappings of p.t. pages. */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) !=
           _PAGE_PRESENT) ||
+         rangeset_contains_singleton(mmio_ro_ranges, l1e_get_pfn(pte)) ||
          !get_page_from_pagenr(l1e_get_pfn(pte), d) )
         goto bail;
 
@@ -5192,6 +5249,7 @@ int ptwr_do_page_fault(struct vcpu *v, u
 struct mmio_ro_emulate_ctxt {
     struct x86_emulate_ctxt ctxt;
     unsigned long cr2;
+    unsigned int seg, bdf;
 };
 
 static int mmio_ro_emulated_read(
@@ -5231,6 +5289,44 @@ static const struct x86_emulate_ops mmio
     .write = mmio_ro_emulated_write,
 };
 
+static int mmio_intercept_write(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    struct mmio_ro_emulate_ctxt *mmio_ctxt =
+        container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt);
+
+    /*
+     * Only allow naturally-aligned stores no wider than 4 bytes to the
+     * original %cr2 address.
+     */
+    if ( ((bytes | offset) & (bytes - 1)) || bytes > 4 ||
+         offset != mmio_ctxt->cr2 )
+    {
+        MEM_LOG("mmio_intercept: bad write (cr2=%lx, addr=%lx, bytes=%u)",
+                mmio_ctxt->cr2, offset, bytes);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    offset &= 0xfff;
+    pci_conf_write_intercept(mmio_ctxt->seg, mmio_ctxt->bdf, offset, bytes,
+                             p_data);
+    pci_mmcfg_write(mmio_ctxt->seg, PCI_BUS(mmio_ctxt->bdf),
+                    PCI_DEVFN2(mmio_ctxt->bdf), offset, bytes,
+                    *(uint32_t *)p_data);
+
+    return X86EMUL_OKAY;
+}
+
+static const struct x86_emulate_ops mmio_intercept_ops = {
+    .read = mmio_ro_emulated_read,
+    .insn_fetch = ptwr_emulated_read,
+    .write = mmio_intercept_write,
+};
+
 /* Check if guest is trying to modify a r/o MMIO page. */
 int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
                           struct cpu_user_regs *regs)
@@ -5245,6 +5341,7 @@ int mmio_ro_do_page_fault(struct vcpu *v
         .ctxt.swint_emulate = x86_swint_emulate_none,
         .cr2 = addr
     };
+    const unsigned long *ro_map;
     int rc;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
@@ -5269,7 +5366,12 @@ int mmio_ro_do_page_fault(struct vcpu *v
     if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
         return 0;
 
-    rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
+    if ( pci_mmcfg_decode(mfn, &mmio_ro_ctxt.seg, &mmio_ro_ctxt.bdf) &&
+         ((ro_map = pci_get_ro_map(mmio_ro_ctxt.seg)) == NULL ||
+          !test_bit(mmio_ro_ctxt.bdf, ro_map)) )
+        rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_intercept_ops);
+    else
+        rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
 
     return rc != X86EMUL_UNHANDLEABLE ? EXCRET_fault_fixed : 0;
 }
--- a/xen/arch/x86/x86_64/mmconfig_64.c
+++ b/xen/arch/x86/x86_64/mmconfig_64.c
@@ -134,30 +134,10 @@ static void __iomem *mcfg_ioremap(const
     return (void __iomem *) virt;
 }
 
-void arch_pci_ro_device(int seg, int bdf)
-{
-    unsigned int idx, bus = PCI_BUS(bdf);
-
-    for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
-        const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
-        unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf;
-
-        if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg ||
-            cfg->start_bus_number > bus || cfg->end_bus_number < bus)
-            continue;
-
-        if (rangeset_add_singleton(mmio_ro_ranges, mfn))
-            printk(XENLOG_ERR
-                   "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) read-only\n",
-                   cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf),
-                   mfn);
-    }
-}
-
 int pci_mmcfg_arch_enable(unsigned int idx)
 {
     const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
-    const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment);
+    unsigned long start_mfn, end_mfn;
 
     if (pci_mmcfg_virt[idx].virt)
         return 0;
@@ -169,16 +149,15 @@ int pci_mmcfg_arch_enable(unsigned int i
     }
     printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n",
            cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
-    if (ro_map) {
-        unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0);
-        unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1);
-
-        while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) {
-            arch_pci_ro_device(cfg->pci_segment, bdf);
-            if (bdf++ == end)
-                break;
-        }
-    }
+
+    start_mfn = PFN_DOWN(cfg->address) + PCI_BDF(cfg->start_bus_number, 0, 0);
+    end_mfn = PFN_DOWN(cfg->address) + PCI_BDF(cfg->end_bus_number, ~0, ~0);
+    if ( rangeset_add_range(mmio_ro_ranges, start_mfn, end_mfn) )
+        printk(XENLOG_ERR
+               "%04x:%02x-%02x: could not mark MCFG (mfns %lx-%lx) read-only\n",
+               cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number,
+               start_mfn, end_mfn);
+
     return 0;
 }
 
@@ -197,6 +176,28 @@ void pci_mmcfg_arch_disable(unsigned int
            cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
 }
 
+bool_t pci_mmcfg_decode(unsigned long mfn, unsigned int *seg,
+                        unsigned int *bdf)
+{
+    unsigned int idx;
+
+    for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
+        const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
+
+        if (pci_mmcfg_virt[idx].virt &&
+            mfn >= PFN_DOWN(cfg->address) + PCI_BDF(cfg->start_bus_number,
+                                                    0, 0) &&
+            mfn <= PFN_DOWN(cfg->address) + PCI_BDF(cfg->end_bus_number,
+                                                    ~0, ~0)) {
+            *seg = cfg->pci_segment;
+            *bdf = mfn - PFN_DOWN(cfg->address);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 int __init pci_mmcfg_arch_init(void)
 {
     int i;
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -447,7 +447,6 @@ int __init pci_ro_device(int seg, int bu
     }
 
     __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
-    arch_pci_ro_device(seg, PCI_BDF2(bus, devfn));
     _pci_hide_device(pdev);
 
     return 0;
--- a/xen/include/asm-x86/pci.h
+++ b/xen/include/asm-x86/pci.h
@@ -20,5 +20,7 @@ int pci_conf_write_intercept(unsigned in
                              uint32_t *data);
 int pci_msi_conf_write_intercept(struct pci_dev *, unsigned int reg,
                                  unsigned int size, uint32_t *data);
+bool_t pci_mmcfg_decode(unsigned long mfn, unsigned int *seg,
+                        unsigned int *bdf);
 
 #endif /* __X86_PCI_H__ */
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -110,7 +110,6 @@ int pci_add_device(u16 seg, u8 bus, u8 d
                    const struct pci_dev_info *, nodeid_t node);
 int pci_remove_device(u16 seg, u8 bus, u8 devfn);
 int pci_ro_device(int seg, int bus, int devfn);
-void arch_pci_ro_device(int seg, int bdf);
 int pci_hide_device(int bus, int devfn);
 struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
 struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn);
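
For reference, the MFN <-> BDF relationship that pci_mmcfg_decode() and the
rangeset population in pci_mmcfg_arch_enable() rely on follows directly from
the MMCFG (ECAM) layout: each PCI function owns exactly one 4KiB page of
config space at cfg->address + (bdf << 12), so a page's index within an MCFG
window equals the function's BDF. Below is a minimal user-space sketch of
that arithmetic, for illustration only; mmcfg_mfn() and the example base
address are made up here and are not part of the patch.

#include <stdio.h>

#define PAGE_SHIFT 12

/* ECAM/MMCFG: one 4KiB config-space page per (bus, dev, fn). */
static unsigned long mmcfg_mfn(unsigned long base_mfn, unsigned int bus,
                               unsigned int dev, unsigned int fn)
{
    unsigned int bdf = (bus << 8) | (dev << 3) | fn;   /* cf. PCI_BDF() */

    return base_mfn + bdf;       /* page index within the window == BDF */
}

int main(void)
{
    unsigned long base_mfn = 0xe0000000UL >> PAGE_SHIFT; /* example base */
    unsigned long mfn = mmcfg_mfn(base_mfn, 0x03, 0x1f, 0x7);
    unsigned long bdf = mfn - base_mfn;  /* what pci_mmcfg_decode() recovers */

    printf("mfn %#lx -> %02lx:%02lx.%lx\n", mfn,
           bdf >> 8, (bdf >> 3) & 0x1f, bdf & 7);
    return 0;
}

This same property is why pci_mmcfg_arch_enable() can now cover a whole MCFG
window with a single rangeset_add_range() call spanning
PCI_BDF(start_bus_number, 0, 0) through PCI_BDF(end_bus_number, ~0, ~0).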