IOTLB flushes are very expensive on VT-d, up to 10ms in some cases.

This patch decouples the flushing from the modification of the page-table
entries, so we don't have to flush on each page when we want to map a
range.  The RMRR mapping code is changed to use set_entry for each page,
followed by a single iotlb_flush for the whole range.

Signed-off-by: Jean Guyader
---

diff -r 87f248de5230 xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c	Mon Nov 15 09:31:38 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.c	Mon Nov 15 11:52:14 2010 +0000
@@ -22,6 +22,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -47,6 +49,7 @@
 
 static void setup_dom0_devices(struct domain *d);
 static void setup_dom0_rmrr(struct domain *d);
+int intel_iommu_flush_iotlb(struct domain *d, unsigned long gfn, unsigned int nr);
 
 static int domain_iommu_domid(struct domain *d,
                               struct iommu *iommu)
@@ -583,77 +586,6 @@
     }
 }
 
-/* clear one page's page table */
-static void dma_pte_clear_one(struct domain *domain, u64 addr)
-{
-    struct hvm_iommu *hd = domain_hvm_iommu(domain);
-    struct acpi_drhd_unit *drhd;
-    struct iommu *iommu;
-    struct dma_pte *page = NULL, *pte = NULL;
-    u64 pg_maddr;
-    int flush_dev_iotlb;
-    int iommu_domid;
-    struct list_head *rmrr_list, *tmp;
-    struct mapped_rmrr *mrmrr;
-
-    spin_lock(&hd->mapping_lock);
-    /* get last level pte */
-    pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
-    if ( pg_maddr == 0 )
-    {
-        spin_unlock(&hd->mapping_lock);
-        return;
-    }
-
-    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-    pte = page + address_level_offset(addr, 1);
-
-    if ( !dma_pte_present(*pte) )
-    {
-        spin_unlock(&hd->mapping_lock);
-        unmap_vtd_domain_page(page);
-        return;
-    }
-
-    dma_clear_pte(*pte);
-    spin_unlock(&hd->mapping_lock);
-    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
-
-    /* No need pcidevs_lock here since do that on assign/deassign device*/
-    for_each_drhd_unit ( drhd )
-    {
-        iommu = drhd->iommu;
-        if ( test_bit(iommu->index, &hd->iommu_bitmap) )
-        {
-            flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
-            iommu_domid= domain_iommu_domid(domain, iommu);
-            if ( iommu_domid == -1 )
-                continue;
-            if ( iommu_flush_iotlb_psi(iommu, iommu_domid,
-                                       addr, 1, 0, flush_dev_iotlb) )
-                iommu_flush_write_buffer(iommu);
-        }
-    }
-
-    unmap_vtd_domain_page(page);
-
-    /* if the cleared address is between mapped RMRR region,
-     * remove the mapped RMRR
-     */
-    spin_lock(&pcidevs_lock);
-    list_for_each_safe ( rmrr_list, tmp, &hd->mapped_rmrrs )
-    {
-        mrmrr = list_entry(rmrr_list, struct mapped_rmrr, list);
-        if ( addr >= mrmrr->base && addr <= mrmrr->end )
-        {
-            list_del(&mrmrr->list);
-            xfree(mrmrr);
-            break;
-        }
-    }
-    spin_unlock(&pcidevs_lock);
-}
-
 static void iommu_free_pagetable(u64 pt_maddr, int level)
 {
     int i;
@@ -1633,86 +1565,98 @@
     spin_unlock(&hd->mapping_lock);
 }
 
+int intel_iommu_set_entry(struct domain *d, unsigned long gfn, unsigned long mfn,
+                          unsigned int flags, int map)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    u64 pg_maddr;
+    struct dma_pte *page = NULL, *pte = NULL;
+    int rc;
+
+    if ( iommu_passthrough && (d->domain_id == 0) )
+        return -EINVAL;
+
+    spin_lock(&hd->mapping_lock);
+
+    rc = -ENOMEM;
+    pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, map);
+    if ( pg_maddr == 0 )
+        goto out;
+
+    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
+    pte = page + (gfn & LEVEL_MASK);
+
+    rc = -EINVAL;
+    if ( !map && !dma_pte_present(*pte) )
+        goto out;
+
+    dma_clear_pte(*pte);
+    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+
+    if ( map )
+    {
+        dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
+        dma_set_pte_prot(*pte,
+                         ((flags & IOMMUF_readable) ? DMA_PTE_READ : 0) |
+                         ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
+        /* Set the SNP on leaf page table if Snoop Control available */
+        if ( iommu_snoop )
+            dma_set_pte_snp(*pte);
+        iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+    }
+    else
+    {
+        struct list_head *rmrr_list, *tmp;
+        struct mapped_rmrr *mrmrr;
+
+        /* If the cleared address falls within a mapped RMRR region,
+         * remove the mapped RMRR.
+         */
+        spin_lock(&pcidevs_lock);
+        list_for_each_safe ( rmrr_list, tmp, &hd->mapped_rmrrs )
+        {
+            mrmrr = list_entry(rmrr_list, struct mapped_rmrr, list);
+            if ( ((paddr_t)gfn << PAGE_SHIFT_4K) >= mrmrr->base &&
+                 ((paddr_t)gfn << PAGE_SHIFT_4K) <= mrmrr->end )
+            {
+                list_del(&mrmrr->list);
+                xfree(mrmrr);
+                break;
+            }
+        }
+        spin_unlock(&pcidevs_lock);
+    }
+    rc = 0;
+
+out:
+    if ( page )
+        unmap_vtd_domain_page(page);
+    spin_unlock(&hd->mapping_lock);
+    return rc;
+}
+
 static int intel_iommu_map_page(
     struct domain *d, unsigned long gfn, unsigned long mfn,
     unsigned int flags)
 {
-    struct hvm_iommu *hd = domain_hvm_iommu(d);
-    struct acpi_drhd_unit *drhd;
-    struct iommu *iommu;
-    struct dma_pte *page = NULL, *pte = NULL, old, new = { 0 };
-    u64 pg_maddr;
-    int flush_dev_iotlb;
-    int iommu_domid;
+    int rc;
 
-    /* do nothing if dom0 and iommu supports pass thru */
-    if ( iommu_passthrough && (d->domain_id == 0) )
-        return 0;
+    rc = intel_iommu_set_entry(d, gfn, mfn, flags, 1);
+    if ( !rc )
+        intel_iommu_flush_iotlb(d, gfn, 1);
 
-    spin_lock(&hd->mapping_lock);
-
-    pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
-    if ( pg_maddr == 0 )
-    {
-        spin_unlock(&hd->mapping_lock);
-        return -ENOMEM;
-    }
-    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-    pte = page + (gfn & LEVEL_MASK);
-    old = *pte;
-    dma_set_pte_addr(new, (paddr_t)mfn << PAGE_SHIFT_4K);
-    dma_set_pte_prot(new,
-                     ((flags & IOMMUF_readable) ? DMA_PTE_READ : 0) |
-                     ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
-
-    /* Set the SNP on leaf page table if Snoop Control available */
-    if ( iommu_snoop )
-        dma_set_pte_snp(new);
-
-    if ( old.val == new.val )
-    {
-        spin_unlock(&hd->mapping_lock);
-        return 0;
-    }
-    *pte = new;
-
-    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
-    spin_unlock(&hd->mapping_lock);
-    unmap_vtd_domain_page(page);
-
-    /*
-     * No need pcideves_lock here because we have flush
-     * when assign/deassign device
-     */
-    for_each_drhd_unit ( drhd )
-    {
-        iommu = drhd->iommu;
-
-        if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
-            continue;
-
-        flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
-        iommu_domid= domain_iommu_domid(d, iommu);
-        if ( iommu_domid == -1 )
-            continue;
-        if ( iommu_flush_iotlb_psi(iommu, iommu_domid,
-                                   (paddr_t)gfn << PAGE_SHIFT_4K, 1,
-                                   !dma_pte_present(old), flush_dev_iotlb) )
-            iommu_flush_write_buffer(iommu);
-    }
-
-    return 0;
+    return rc;
 }
 
 static int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
 {
-    /* Do nothing if dom0 and iommu supports pass thru. */
-    if ( iommu_passthrough && (d->domain_id == 0) )
-        return 0;
+    int rc;
 
-    dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
+    rc = intel_iommu_set_entry(d, gfn, 0, 0, 0);
+    if ( !rc )
+        intel_iommu_flush_iotlb(d, gfn, 1);
 
-    return 0;
+    return rc;
 }
 
 static int domain_rmrr_mapped(struct domain *d,
@@ -1735,9 +1679,10 @@
                               struct acpi_rmrr_unit *rmrr)
 {
     u64 base, end;
-    unsigned long base_pfn, end_pfn;
+    unsigned long base_pfn, origin_base_pfn, end_pfn;
     struct mapped_rmrr *mrmrr;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
+    int rc;
 
     ASSERT(spin_is_locked(&pcidevs_lock));
     ASSERT(rmrr->base_address < rmrr->end_address);
@@ -1746,18 +1691,21 @@
         return 0;
 
     base = rmrr->base_address & PAGE_MASK_4K;
-    base_pfn = base >> PAGE_SHIFT_4K;
+    origin_base_pfn = base_pfn = base >> PAGE_SHIFT_4K;
     end = PAGE_ALIGN_4K(rmrr->end_address);
     end_pfn = end >> PAGE_SHIFT_4K;
 
     while ( base_pfn < end_pfn )
     {
-        if ( intel_iommu_map_page(d, base_pfn, base_pfn,
-                                  IOMMUF_readable|IOMMUF_writable) )
+        rc = intel_iommu_set_entry(d, base_pfn, base_pfn,
+                                   IOMMUF_readable|IOMMUF_writable, 1);
+        base_pfn++;
+        if ( rc )
             return -1;
-        base_pfn++;
     }
 
+    intel_iommu_flush_iotlb(d, origin_base_pfn, base_pfn - origin_base_pfn);
+
     mrmrr = xmalloc(struct mapped_rmrr);
     if ( !mrmrr )
         return -ENOMEM;
@@ -1768,6 +1716,33 @@
     return 0;
 }
 
+int intel_iommu_flush_iotlb(struct domain *d, unsigned long gfn, unsigned int nr)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int flush_dev_iotlb;
+    int iommu_domid;
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+
+        if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
+            continue;
+
+        flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
+        iommu_domid = domain_iommu_domid(d, iommu);
+        if ( iommu_domid == -1 )
+            continue;
+        if ( iommu_flush_iotlb_psi(iommu, iommu_domid,
+                                   (paddr_t)gfn << PAGE_SHIFT_4K, nr,
+                                   0, flush_dev_iotlb) ||
+             iommu_flush_iotlb_psi(iommu, iommu_domid,
+                                   (paddr_t)gfn << PAGE_SHIFT_4K, nr,
+                                   1, flush_dev_iotlb) )
+            iommu_flush_write_buffer(iommu);
+    }
+
+    return 0;
+}
+
 static int intel_iommu_add_device(struct pci_dev *pdev)
 {
     struct acpi_rmrr_unit *rmrr;
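
Note (illustration only, not part of the patch): the calling pattern the new
split API enables looks roughly like the sketch below. map_range_example() is
a hypothetical caller standing in for code such as the RMRR mapping loop
above; only intel_iommu_set_entry() and intel_iommu_flush_iotlb() come from
the patch itself.

/* Illustrative sketch only: map a contiguous range of pages and issue a
 * single IOTLB flush at the end, instead of flushing once per page. */
static int map_range_example(struct domain *d, unsigned long gfn,
                             unsigned long mfn, unsigned int nr)
{
    unsigned int i;
    int rc;

    for ( i = 0; i < nr; i++ )
    {
        /* Update the page-table entry only; no per-page IOTLB flush. */
        rc = intel_iommu_set_entry(d, gfn + i, mfn + i,
                                   IOMMUF_readable | IOMMUF_writable, 1);
        if ( rc )
            return rc;
    }

    /* One flush covers the whole range. */
    return intel_iommu_flush_iotlb(d, gfn, nr);
}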