WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH] Support for AGP aperture as IOMMU in AMD64 mode [1/2]

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH] Support for AGP aperture as IOMMU in AMD64 mode [1/2]
From: "Langsdorf, Mark" <mark.langsdorf@xxxxxxx>
Date: Mon, 16 Jan 2006 17:50:29 -0600
Delivery-date: Mon, 16 Jan 2006 23:58:37 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcYa8FH5dw2MpbsrQLSOc7Qw5xK/DwAAIC1Q
Thread-topic: [PATCH] Support for AGP aperture as IOMMU in AMD64 mode [1/2]
This patch adds support for using the AGP aperture built 
into every Athlon 64 and Opteron memory controller as an 
IOMMU.

The patch is fairly large because it adds the entire aperture.c,
pci-dma.c, and pci-gart.c files from the mainline x86-64
architecture to Xen, even though it makes only relatively small
changes to them.  I'm including the diffs for those changes
(against the pristine mainline files) in a separate file for
easier comparison.

Default operation is for the SWIOTLB to be enabled unless
the user specifically requests IOMMU support.

I'm still working on benchmarking the performance change,
but now that the code is working and appears stable, I thought
I should post it for comments.

Signed-Off-By: Mark Langsdorf <mark.langsdorf@xxxxxxx>

# HG changeset patch
# User root@xxxxxxxxxxxxxx
# Node ID 32a2559f95bd3ffb0ff7701315965501f96eaa00
# Parent  a9ead230cc6038dd4905574b70328d3889c22c03
Support for AGP aperture as IOMMU in AMD64 mode

diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/i386/kernel/swiotlb.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/swiotlb.c       Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/swiotlb.c       Mon Jan 16 23:04:02 2006
@@ -26,6 +26,9 @@
 #include <asm/dma.h>
 #include <asm/uaccess.h>
 #include <asm-xen/xen-public/memory.h>
+#ifdef CONFIG_X86_64
+#include <asm-xen/asm-x86_64/dma-mapping.h>
+#endif
 
 #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
 
@@ -384,6 +387,70 @@
        __sync_single(buffer, dma_addr, size, dir);
 }
 
+void *
+swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+                       dma_addr_t *dma_handle, int flags)
+{
+        unsigned long dev_addr;
+        void *ret;
+        int order = get_order(size);
+
+        /*
+         * XXX fix me: the DMA API should pass us an explicit DMA mask
+         * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a
~32
+         * bit range instead of a 16MB one).
+         */
+        flags |= GFP_DMA;
+
+        ret = (void *)__get_free_pages(flags, order);
+        if (ret && address_needs_mapping(hwdev, virt_to_phys(ret))) {
+                /*
+                 * The allocated memory isn't reachable by the device.
+                 * Fall back on swiotlb_map_single().
+                 */
+                free_pages((unsigned long) ret, order);
+                ret = NULL;
+        }
+        if (!ret) {
+                /*
+                 * We are either out of memory or the device can't DMA
+                 * to GFP_DMA memory; fall back on
+                 * swiotlb_map_single(), which will grab memory from
+                 * the lowest available address range.
+                 */
+                dma_addr_t handle;
+                handle = swiotlb_map_single(NULL, NULL, size,
DMA_FROM_DEVICE);
+                if (dma_mapping_error(handle))
+                        return NULL;
+
+                ret = phys_to_virt(handle);
+        }
+
+        memset(ret, 0, size);
+        dev_addr = virt_to_phys(ret);
+
+        /* Confirm address can be DMA'd by device */
+        if (address_needs_mapping(hwdev, dev_addr)) {
+                printk("hwdev DMA mask = 0x%016Lx, dev_addr =
0x%016lx\n",
+                       (unsigned long long)*hwdev->dma_mask, dev_addr);
+                panic("swiotlb_alloc_coherent: allocated memory is out
of "
+                      "range for device");
+        }
+        *dma_handle = dev_addr;
+        return ret;
+}
+
+void
+swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+                      dma_addr_t dma_handle)
+{
+        if (in_swiotlb_aperture((dma_addr_t) vaddr))
+                free_pages((unsigned long) vaddr, get_order(size));
+        else
+                /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+                swiotlb_unmap_single (hwdev, dma_handle, size, 
+DMA_TO_DEVICE); }
+
 static void
 swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)  {
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile      Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile      Mon Jan 16 23:04:02 2006
@@ -35,7 +35,7 @@
 #obj-$(CONFIG_SOFTWARE_SUSPEND)        += suspend_asm.o
 #obj-$(CONFIG_CPU_FREQ)                += cpufreq/
 #obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
-#obj-$(CONFIG_GART_IOMMU)      += pci-gart.o aperture.o
+obj-$(CONFIG_GART_IOMMU)       += pci-gart.o aperture.o 
 obj-$(CONFIG_DUMMY_IOMMU)      += pci-nommu.o
 i386-obj-$(CONFIG_DUMMY_IOMMU) += pci-dma.o
 i386-obj-$(CONFIG_SWIOTLB)     += swiotlb.o
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c        Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c        Mon Jan 16 23:04:02 2006
@@ -524,6 +524,30 @@
 extern unsigned long xen_override_max_pfn;
 extern union xen_start_info_union xen_start_info_union;
 
+
+int __init e820_mapped(unsigned long start, unsigned long end, unsigned

+type) {
+        int i;
+        dom0_op_t op;
+        struct dom0_memory_map_entry *map;
+
+       op.cmd = DOM0_PHYSICAL_MEMORY_MAP;
+        op.u.physical_memory_map.memory_map = map;
+        op.u.physical_memory_map.max_map_entries =
+                PAGE_SIZE / sizeof(struct dom0_memory_map_entry);
+        BUG_ON(HYPERVISOR_dom0_op(&op));
+
+
+        for (i = 0; i < op.u.physical_memory_map.nr_map_entries; i++) {
+                if ((type == E820_RAM) && !map[i].is_ram)
+                        continue;
+                if (map[i].start >= end || map[i].end < start)
+                        continue;
+                return 1;
+        }
+        return 0;
+}
+
 unsigned long __init e820_end_of_ram(void)
 {
        unsigned long max_end_pfn;
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c   Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c   Mon Jan 16 23:04:02 2006
@@ -19,7 +19,6 @@
 int iommu_sac_force = 0;
 EXPORT_SYMBOL(iommu_sac_force);
 
-#if 0
 /* 
  * Dummy IO MMU functions
  */
@@ -59,9 +58,7 @@
        free_pages((unsigned long)vaddr, get_order(size));
 }
 EXPORT_SYMBOL(dma_free_coherent);
-#endif
 
-#if 0
 int dma_supported(struct device *hwdev, u64 mask)
 {
         /*
@@ -77,7 +74,6 @@
        return 1;
 } 
 EXPORT_SYMBOL(dma_supported);
-#endif
 
 int dma_get_cache_alignment(void)
 {
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c       Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c       Mon Jan 16 23:04:02 2006
@@ -848,10 +848,6 @@
 
        e820_setup_gap();
 
-#ifdef CONFIG_GART_IOMMU
-       iommu_hole_init();
-#endif
-
 #ifdef CONFIG_XEN
        {
               physdev_op_t op;
@@ -896,6 +892,10 @@
 #endif
 
 #endif /* !CONFIG_XEN */
+
+#ifdef CONFIG_GART_IOMMU
+       iommu_hole_init();
+#endif
 }
 
 static int __init get_model_name(struct cpuinfo_x86 *c)
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c    Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c    Mon Jan 16 23:04:02 2006
@@ -677,12 +677,14 @@
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue; 
+#ifndef CONFIG_XEN
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
                        /* Could handle this, but it should not happen
currently. */
                        printk(KERN_ERR 
               "clear_kernel_mapping: mapping has been split. will leak
memory\n"); 
                        pmd_ERROR(*pmd); 
                }
+#endif
                set_pmd(pmd, __pmd(0));                 
        }
        __flush_tlb_all();
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h     Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h     Mon Jan 16 23:04:02 2006
@@ -1,1 +1,180 @@
-#include <asm-i386/dma-mapping.h>
+#ifndef _X8664_DMA_MAPPING_H
+#define _X8664_DMA_MAPPING_H 1
+
+/*
+ * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt 
+for
+ * documentation.
+ */
+
+#include <linux/config.h>
+
+#include <asm/scatterlist.h>
+#include <asm/io.h>
+#include <asm/swiotlb.h>
+
+extern dma_addr_t bad_dma_address;
+
+void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t
*dma_handle,
+                        unsigned gfp);
+void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
+                        dma_addr_t dma_handle);
+
+#define dma_mapping_error(x) \
+        (swiotlb ? swiotlb_dma_mapping_error(x) : ((x) == 
+bad_dma_address))
+
+#define dma_map_page(dev,page,offset,size,dir) \
+        dma_map_single((dev), page_address(page)+(offset), (size), 
+(dir))
+
+
+static inline int
+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+{
+        dma_addr_t mask = 0xffffffff;
+        /* If the device has a mask, use it, otherwise default to 32
bits */
+        if (hwdev && hwdev->dma_mask)
+                mask = *hwdev->dma_mask;
+        return (addr & ~mask) != 0;
+}
+
+static inline int
+range_straddles_page_boundary(void *p, size_t size)
+{
+        extern unsigned long *contiguous_bitmap;
+        return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE)
&&
+                !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap)); }
+
+#ifdef CONFIG_GART_IOMMU
+
+extern dma_addr_t dma_map_single(struct device *hwdev, void *ptr,
size_t size,
+                                int direction);
+extern void dma_unmap_single(struct device *dev, dma_addr_t addr,size_t
size,
+                            int direction);
+
+#else
+/* No IOMMU */
+
+#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY 1
+
+#if 0
+static inline int
+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+{
+        dma_addr_t mask = 0xffffffff;
+        /* If the device has a mask, use it, otherwise default to 32
bits */
+        if (hwdev && hwdev->dma_mask)
+                mask = *hwdev->dma_mask;
+        return (addr & ~mask) != 0;
+}
+
+static inline int
+range_straddles_page_boundary(void *p, size_t size)
+{
+        extern unsigned long *contiguous_bitmap;
+        return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE)
&&
+                !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap)); }
+#endif
+
+static inline dma_addr_t dma_map_single(struct device *hwdev, void
*ptr,
+                                       size_t size, int direction)
+{
+       dma_addr_t addr;
+
+       if (direction == DMA_NONE)
+               out_of_line_bug();
+       addr = virt_to_bus(ptr);
+
+       if ((addr+size) & ~*hwdev->dma_mask)
+               out_of_line_bug();
+       return addr;
+}
+
+
+static inline void dma_unmap_single(struct device *hwdev, dma_addr_t
dma_addr,
+                                   size_t size, int direction)
+{
+       if (direction == DMA_NONE)
+               out_of_line_bug();
+       /* Nothing to do */
+}
+
+#endif
+
+static inline void dma_sync_single_for_cpu(struct device *hwdev,
+                                              dma_addr_t dma_handle,
+                                              size_t size, int
direction)
+{
+       if (direction == DMA_NONE)
+               out_of_line_bug();
+
+       if (swiotlb)
+               return
swiotlb_sync_single_for_cpu(hwdev,dma_handle,size,direction);
+
+       flush_write_buffers();
+}
+
+static inline void dma_sync_single_for_device(struct device *hwdev,
+                                                 dma_addr_t dma_handle,
+                                                 size_t size, int
direction)
+{
+        if (direction == DMA_NONE)
+               out_of_line_bug();
+
+       if (swiotlb)
+               return 
+swiotlb_sync_single_for_device(hwdev,dma_handle,size,direction);
+
+       flush_write_buffers();
+}
+
+static inline void dma_sync_sg_for_cpu(struct device *hwdev,
+                                      struct scatterlist *sg,
+                                      int nelems, int direction)
+{
+       if (direction == DMA_NONE)
+               out_of_line_bug();
+
+       if (swiotlb)
+               return
swiotlb_sync_sg_for_cpu(hwdev,sg,nelems,direction);
+
+       flush_write_buffers();
+}
+
+static inline void dma_sync_sg_for_device(struct device *hwdev,
+                                         struct scatterlist *sg,
+                                         int nelems, int direction)
+{
+       if (direction == DMA_NONE)
+               out_of_line_bug();
+
+       if (swiotlb)
+               return
swiotlb_sync_sg_for_device(hwdev,sg,nelems,direction);
+
+       flush_write_buffers();
+}
+
+extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+                     int nents, int direction);
+extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+                        int nents, int direction);
+
+#define dma_unmap_page dma_unmap_single
+
+extern int dma_supported(struct device *hwdev, u64 mask); extern int 
+dma_get_cache_alignment(void); #define dma_is_consistent(h) 1
+
+static inline int dma_set_mask(struct device *dev, u64 mask)
+{
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+       *dev->dma_mask = mask;
+       return 0;
+}
+
+static inline void dma_cache_sync(void *vaddr, size_t size, enum 
+dma_data_direction dir) {
+       flush_write_buffers();
+}
+
+#endif
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/kernel/aperture.c
--- /dev/null   Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/aperture.c    Mon Jan 16 23:04:02 2006
@@ -0,0 +1,286 @@
+/*
+ * Firmware replacement code.
+ * 
+ * Work around broken BIOSes that don't set an aperture or only set the
+ * aperture in the AGP bridge. 
+ * If all fails map the aperture over some low memory.  This is cheaper
than 
+ * doing bounce buffering. The memory is lost. This is done at early
boot 
+ * because only the bootmem allocator can allocate 32+MB. 
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/pci-direct.h>
+
+int iommu_aperture;
+int iommu_aperture_disabled __initdata = 0;
+int iommu_aperture_allowed __initdata = 0;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata = 0;
+
+int fix_aperture __initdata = 1;
+
+/* This code runs before the PCI subsystem is initialized, so just
+   access the northbridge directly. */
+
+#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
+
+static u32 __init allocate_aperture(void)
+{
+#ifdef CONFIG_DISCONTIGMEM
+       pg_data_t *nd0 = NODE_DATA(0);
+#else
+       pg_data_t *nd0 = &contig_page_data;
+#endif 
+       u32 aper_size;
+       void *p; 
+
+       if (fallback_aper_order > 7) 
+               fallback_aper_order = 7; 
+       aper_size = (32 * 1024 * 1024) << fallback_aper_order;
+
+       /* 
+        * Aperture has to be naturally aligned. This means an 2GB
aperture won't
+        * have much chances to find a place in the lower 4GB of memory.
+        * Unfortunately we cannot move it up because that would make
the
+        * IOMMU useless.
+        */
+       p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); 
+       if (!p || __pa(p)+aper_size > 0xffffffff) {
+               printk("Cannot allocate aperture memory hole
(%p,%uK)\n",
+                      p, aper_size>>10);
+               if (p)
+                       free_bootmem_node(nd0, (unsigned long)p,
aper_size); 
+               return 0;
+       }
+       printk("Mapping aperture over %d KB of RAM @ %lx\n",
+              aper_size >> 10, __pa(p)); 
+       return (u32)__pa(p);
+}
+
+static int __init aperture_valid(char *name, u64 aper_base, u32 
+aper_size)
+{ 
+       if (!aper_base) 
+               return 0;
+       if (aper_size < 64*1024*1024) { 
+               printk("Aperture from %s too small (%d MB)\n", name,
aper_size>>20); 
+               return 0;
+       }
+       if (aper_base + aper_size >= 0xffffffff) { 
+               printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
+               return 0; 
+       }
+       if (0 && e820_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {  
+               printk("Aperture from %s pointing to e820 RAM.
Ignoring.\n",name);
+               return 0; 
+       } 
+       return 1;
+} 
+
+/* Find a PCI capability */
+static __u32 __init find_cap(int num, int slot, int func, int cap)
+{ 
+       u8 pos;
+       int bytes;
+       if (!(read_pci_config_16(num,slot,func,PCI_STATUS) &
PCI_STATUS_CAP_LIST))
+               return 0;
+       pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
+       for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 
+               u8 id;
+               pos &= ~3; 
+               id =
read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
+               if (id == 0xff)
+                       break;
+               if (id == cap) 
+                       return pos; 
+               pos =
read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 
+       } 
+       return 0;
+} 
+
+/* Read a standard AGPv3 bridge header */
+static __u32 __init read_agp(int num, int slot, int func, int cap, u32 
+*order) {
+       u32 apsize;
+       u32 apsizereg;
+       int nbits;
+       u32 aper_low, aper_hi;
+       u64 aper;
+
+       printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
+       apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
+       if (apsizereg == 0xffffffff) {
+               printk("APSIZE in AGP bridge unreadable\n");
+               return 0;
+       }
+
+       apsize = apsizereg & 0xfff;
+       /* Some BIOS use weird encodings not in the AGPv3 table. */
+       if (apsize & 0xff) 
+               apsize |= 0xf00; 
+       nbits = hweight16(apsize);
+       *order = 7 - nbits;
+       if ((int)*order < 0) /* < 32MB */
+               *order = 0;
+       
+       aper_low = read_pci_config(num,slot,func, 0x10);
+       aper_hi = read_pci_config(num,slot,func,0x14);
+       aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+       printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 
+              aper, 32 << *order, apsizereg);
+
+       if (!aperture_valid("AGP bridge", aper, (32*1024*1024) <<
*order))
+           return 0;
+       return (u32)aper;
+} 
+
+/* Look for an AGP bridge. Windows only expects the aperture in the
+   AGP bridge and some BIOS forget to initialize the Northbridge too.
+   Work around this here.
+
+   Do an PCI bus scan by hand because we're running before the PCI
+   subsystem.
+
+   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+   generically. It's probably overkill to always scan all slots because
+   the AGP bridges should be always an own bus on the HT hierarchy, 
+   but do it here for future safety. */
+static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) {
+       int num, slot, func;
+
+       /* Poor man's PCI discovery */
+       for (num = 0; num < 32; num++) { 
+               for (slot = 0; slot < 32; slot++) { 
+                       for (func = 0; func < 8; func++) { 
+                               u32 class, cap;
+                               u8 type;
+                               class = read_pci_config(num,slot,func,
+
PCI_CLASS_REVISION);
+                               if (class == 0xffffffff)
+                                       break;
+                               
+                               switch (class >> 16) { 
+                               case PCI_CLASS_BRIDGE_HOST:
+                               case PCI_CLASS_BRIDGE_OTHER: /* needed?
*/
+                                       /* AGP bridge? */
+                                       cap =
find_cap(num,slot,func,PCI_CAP_ID_AGP);
+                                       if (!cap)
+                                               break;
+                                       *valid_agp = 1; 
+                                       return
read_agp(num,slot,func,cap,order);
+                               }
+                               
+                               /* No multi-function device? */
+                               type =
read_pci_config_byte(num,slot,func,
+
PCI_HEADER_TYPE);
+                               if (!(type & 0x80))
+                                       break;
+                       } 
+               } 
+       }
+       printk("No AGP bridge found\n"); 
+       return 0;
+}
+
+void __init iommu_hole_init(void)
+{ 
+       int fix, num; 
+       u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
+       u64 aper_base, last_aper_base = 0;
+       int valid_agp = 0;
+
+       if (iommu_aperture_disabled || !fix_aperture)
+               return;
+
+       printk("Checking aperture...\n");
+
+       fix = 0;
+       for (num = 24; num < 32; num++) {               
+               char name[30];
+               if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) 
+                       continue;       
+
+               iommu_aperture = 1;
+
+               aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) &
7; 
+               aper_size = (32 * 1024 * 1024) << aper_order; 
+               aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
+               aper_base <<= 25;
+
+               printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, 
+                      aper_base, aper_size>>20);
+               
+               sprintf(name, "northbridge cpu %d", num-24);
+
+               if (!aperture_valid(name, aper_base, aper_size)) { 
+                       fix = 1; 
+                       break; 
+               }
+
+               if ((last_aper_order && aper_order != last_aper_order)
||
+                   (last_aper_base && aper_base != last_aper_base)) {
+                       fix = 1;
+                       break;
+               }
+               last_aper_order = aper_order;
+               last_aper_base = aper_base;
+       }
+
+       if (!fix && !fallback_aper_force) 
+               return;
+
+       if (!fallback_aper_force)
+               aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+               
+       if (aper_alloc) { 
+               /* Got the aperture from the AGP bridge */
+       } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) ||
+                  force_iommu ||
+                  valid_agp ||
+                  fallback_aper_force) { 
+               printk("Your BIOS doesn't leave a aperture memory
hole\n");
+               printk("Please enable the IOMMU option in the BIOS
setup\n");
+               printk("This costs you %d MB of RAM\n",
+                      32 << fallback_aper_order);
+
+               aper_order = fallback_aper_order;
+               aper_alloc = allocate_aperture();
+               if (!aper_alloc) { 
+                       /* Could disable AGP and IOMMU here, but it's
probably
+                          not worth it. But the later users cannot deal
with
+                          bad apertures and turning on the aperture
over memory
+                          causes very strange problems, so it's better
to 
+                          panic early. */
+                       panic("Not enough memory for aperture");
+               }
+       } else { 
+               return; 
+       }
+
+       /* Fix up the north bridges */
+       for (num = 24; num < 32; num++) {               
+               if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) 
+                       continue;       
+
+               /* Don't enable translation yet. That is done later. 
+                  Assume this BIOS didn't initialise the GART so 
+                  just overwrite all previous bits */ 
+               write_pci_config(0, num, 3, 0x90, aper_order<<1); 
+               write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 
+       }
+} 
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-dma.c
--- /dev/null   Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-dma.c     Mon Jan 16 23:04:02 2006
@@ -0,0 +1,60 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <asm/io.h>
+
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA.  This is the scatter-gather version of the
+ * above pci_map_single interface.  Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length.  They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+              int nents, int direction)
+{
+       int i;
+
+       BUG_ON(direction == DMA_NONE);
+       for (i = 0; i < nents; i++ ) {
+               struct scatterlist *s = &sg[i];
+               BUG_ON(!s->page); 
+               s->dma_address = virt_to_bus(page_address(s->page)
+s->offset);
+               s->dma_length = s->length;
+       }
+       return nents;
+}
+
+EXPORT_SYMBOL(dma_map_sg);
+
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+                 int nents, int dir)
+{
+       int i;
+       for (i = 0; i < nents; i++) { 
+               struct scatterlist *s = &sg[i];
+               BUG_ON(s->page == NULL); 
+               BUG_ON(s->dma_address == 0); 
+               dma_unmap_single(dev, s->dma_address, s->dma_length,
dir);
+       }
+}
+
+EXPORT_SYMBOL(dma_unmap_sg);
diff -r a9ead230cc60 -r 32a2559f95bd linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-gart.c
--- /dev/null   Mon Jan 16 13:47:31 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-gart.c    Mon Jan 16 23:04:02 2006
@@ -0,0 +1,993 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ *
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows PCI devices that only support 32bit addresses to be used on
+ * systems with more than 4GB.
+ *
+ * See Documentation/DMA-mapping.txt for the interface specification.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <asm/atomic.h>
+#include <asm/io.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/cacheflush.h>
+#include <asm/kdebug.h>
+#include <asm-xen/xen-public/memory.h>
+
+/* Address handed back to callers when a GART mapping could not be made. */
+dma_addr_t bad_dma_address;
+
+unsigned long iommu_bus_base;  /* GART remapping area (physical) */
+static unsigned long iommu_size;       /* size of remapping area in bytes */
+static unsigned long iommu_pages;      /* .. and in pages */
+
+u32 *iommu_gatt_base;          /* Remapping table */
+
+/*
+ * gart remapping.  These two macros are defined ahead of the agp.h include
+ * below; each preprocessor directive must sit on its own line.
+ */
+#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x)))
+#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x)))
+#include <asm-xen/asm-i386/agp.h>
+
+int no_iommu;
+static int no_agp;
+/* With CONFIG_IOMMU_DEBUG the IOMMU is forced and overflows panic. */
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow = 1;
+int force_iommu = 1;
+#else
+int panic_on_overflow = 0;
+int force_iommu = 0;
+#endif
+int iommu_merge = 1;
+int iommu_sac_force = 0;
+
+/* If this is disabled the IOMMU will use an optimized flushing strategy
+   of only flushing when a mapping is reused. With it true the GART is
+   flushed for every mapping. Problem is that doing the lazy flush seems
+   to trigger bugs with some popular PCI cards, in particular 3ware (but
+   has also been seen with Qlogic at least). */
+int iommu_fullflush = 1;
+
+/* This tells the BIO block layer to assume merging. Default to off
+   because we cannot guarantee merging later. */
+int iommu_bio_merge = 0;
+
+/* Number of slots in the northbridge tables. */
+#define MAX_NB 8
+
+/* Allocation bitmap for the remapping area */
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
+
+/* GATT entry value written into slots that currently map nothing. */
+static u32 gart_unmapped_entry;
+
+/*
+ * GART page table entry (GPTE) flag bits and the conversion between a
+ * physical address and its 32bit entry: address bits 12..31 stay in place,
+ * the bits above bit 31 are stored starting at entry bit 4.
+ */
+#define GPTE_VALID    1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+       (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+/* Pages needed for 'size' bytes starting at the in-page offset of 'addr'
+   (assuming the usual PAGE_MASK definition). */
+#define to_pages(addr,size) \
+       (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+/* Loop 'dev' over the AMD 0x1103 PCI functions on bus 0, slots 24..31.
+   Expands to several statements: use only as a full statement prefix. */
+#define for_all_nb(dev) \
+       dev = NULL;     \
+       while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\
+            if (dev->bus->number == 0 &&                                   \
+                   (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31))
+
+/* Northbridge devices and the flush word read from each of them. */
+static struct pci_dev *northbridges[MAX_NB];
+static u32 northbridge_flush_word[MAX_NB];
+
+/* Pages reserved at the start of the IOMMU area for overflow handling. */
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
+static int need_flush;          /* global flush state. set for each gart wrap */
+static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
+                              size_t size, int dir, int do_panic);
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+   be probably a smaller DMA mask, but this is bug-to-bug compatible to
+   i386. */
+static struct device fallback_dev = {
+       .bus_id = "fallback device",
+       .coherent_dma_mask = 0xffffffff,
+       .dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+/*
+ * Reserve 'size' consecutive pages in the GART allocation bitmap.
+ *
+ * The search starts at next_bit; if nothing is found up to iommu_pages it
+ * wraps and searches [0, next_bit), marking a flush as needed.  A flush is
+ * also requested when the next search position wraps to 0, and on every
+ * call when iommu_fullflush is set.  Runs under iommu_bitmap_lock.
+ *
+ * Returns the index of the first reserved page, or -1 (converted to
+ * unsigned long) when no free run was found.
+ */
+static unsigned long alloc_iommu(int size)
+{
+       unsigned long offset, flags;
+
+       spin_lock_irqsave(&iommu_bitmap_lock, flags);
+       offset = find_next_zero_string(iommu_gart_bitmap, next_bit,
+                                      iommu_pages, size);
+       if (offset == -1) {
+               /* Nothing free behind next_bit: retry from the start. */
+               need_flush = 1;
+               offset = find_next_zero_string(iommu_gart_bitmap, 0,
+                                              next_bit, size);
+       }
+       if (offset != -1) {
+               set_bit_string(iommu_gart_bitmap, offset, size);
+               next_bit = offset + size;
+               if (next_bit >= iommu_pages) {
+                       next_bit = 0;
+                       need_flush = 1;
+               }
+       }
+       if (iommu_fullflush)
+               need_flush = 1;
+       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+       return offset;
+}
+
+/*
+ * Give 'size' pages starting at 'offset' back to the GART allocation bitmap.
+ * A single page is cleared with clear_bit() without taking the lock; longer
+ * runs are cleared with __clear_bit_string() under iommu_bitmap_lock.
+ */
+static void free_iommu(unsigned long offset, int size)
+{
+       unsigned long irq_state;
+
+       if (size != 1) {
+               spin_lock_irqsave(&iommu_bitmap_lock, irq_state);
+               __clear_bit_string(iommu_gart_bitmap, offset, size);
+               spin_unlock_irqrestore(&iommu_bitmap_lock, irq_state);
+               return;
+       }
+       clear_bit(offset, iommu_gart_bitmap);
+}
+
+/*
+ * Flush the GART on every recorded northbridge if a flush is pending.
+ * Use global flush state (need_flush, under iommu_bitmap_lock) to avoid
+ * races with multiple flushers.  The 'dev' argument is not used.
+ */
+static void flush_gart(struct device *dev)
+{
+       unsigned long flags;
+       int flushed = 0;
+       int i, max;
+
+       spin_lock_irqsave(&iommu_bitmap_lock, flags);
+       if (need_flush) {
+               max = 0;
+               /* First request the flush on all northbridges ... */
+               for (i = 0; i < MAX_NB; i++) {
+                       if (!northbridges[i])
+                               continue;
+                       pci_write_config_dword(northbridges[i], 0x9c,
+                                              northbridge_flush_word[i] | 1);
+                       flushed++;
+                       max = i;
+               }
+               /* ... then wait until each one reports completion. */
+               for (i = 0; i <= max; i++) {
+                       u32 w;
+
+                       if (!northbridges[i])
+                               continue;
+                       /* Make sure the hardware actually executed the flush. */
+                       do {
+                               pci_read_config_dword(northbridges[i], 0x9c, &w);
+                       } while (w & 1);
+               }
+               if (!flushed)
+                       printk("nothing to flush?\n");
+               need_flush = 0;
+       }
+       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+}
+
+/*
+ * Allocate DMA memory on node near device.
+ * For a PCI device the node of the first CPU in the bus's cpumask is used,
+ * otherwise the current node.  Returns the address of the pages obtained
+ * from alloc_pages_node(), or NULL when the allocation fails.
+ */
+noinline
+static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
+{
+       struct page *page;
+       int node;
+
+       if (dev->bus == &pci_bus_type) {
+               cpumask_t mask;
+
+               mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
+               node = cpu_to_node(first_cpu(mask));
+       } else
+               node = numa_node_id();
+       page = alloc_pages_node(node, gfp, order);
+       return page ? page_address(page) : NULL;
+}
+
+/*
+ * Allocate memory for a coherent mapping.
+ *
+ * dev:        device the memory is for; NULL selects fallback_dev.
+ * size:       number of bytes wanted.
+ * dma_handle: receives the bus address the device has to use.
+ * gfp:        allocation flags; GFP_DMA may be added for a retry.
+ *
+ * Returns the zeroed memory, or NULL on failure.  When the memory lies
+ * above the effective DMA mask it is either remapped through the GART or,
+ * when the IOMMU cannot be used, replaced by a swiotlb or GFP_DMA
+ * allocation.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+                  unsigned gfp)
+{
+       void *memory;
+       unsigned long dma_mask = 0;
+       u64 bus;
+
+       if (!dev)
+               dev = &fallback_dev;
+       dma_mask = dev->coherent_dma_mask;
+       if (dma_mask == 0)
+               dma_mask = 0xffffffff;
+
+       /* Kludge to make it bug-to-bug compatible with i386. i386
+          uses the normal dma_mask for alloc_coherent. */
+       dma_mask &= *dev->dma_mask;
+
+ again:
+       memory = dma_alloc_pages(dev, gfp, get_order(size));
+       if (memory == NULL)
+               return NULL;
+
+       {
+               int high, mmu;
+               bus = virt_to_bus(memory);
+               high = (bus + size) >= dma_mask;
+               mmu = high;
+               if (force_iommu && !(gfp & GFP_DMA))
+                       mmu = 1;
+               if (no_iommu || dma_mask < 0xffffffffUL) {
+                       if (high) {
+                               /* Unusable and not remappable: drop it and
+                                  fall back to swiotlb or a GFP_DMA retry. */
+                               free_pages((unsigned long)memory,
+                                          get_order(size));
+
+                               if (swiotlb) {
+                                       return swiotlb_alloc_coherent(dev, size,
+                                                                     dma_handle,
+                                                                     gfp);
+                               }
+
+                               if (!(gfp & GFP_DMA)) {
+                                       gfp |= GFP_DMA;
+                                       goto again;
+                               }
+                               return NULL;
+                       }
+                       mmu = 0;
+               }
+               memset(memory, 0, size);
+               if (!mmu) {
+                       *dma_handle = virt_to_bus(memory);
+                       return memory;
+               }
+       }
+
+       *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0);
+       if (*dma_handle == bad_dma_address)
+               goto error;
+       flush_gart(dev);
+       return memory;
+
+error:
+       if (panic_on_overflow)
+               panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size);
+       free_pages((unsigned long)memory, get_order(size));
+       return NULL;
+}
+
+/*
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
+ * With swiotlb active the release is delegated to swiotlb_free_coherent();
+ * otherwise the mapping is torn down and the pages are freed.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+                        void *vaddr, dma_addr_t bus)
+{
+       if (swiotlb) {
+               swiotlb_free_coherent(dev, size, vaddr, bus);
+               return;
+       }
+
+       dma_unmap_single(dev, bus, size, 0);
+       free_pages((unsigned long)vaddr, get_order(size));
+}
+
+#ifdef CONFIG_IOMMU_LEAK
+
+/* Remember / forget the caller's return address for GART page x while
+   leak tracing is active (iommu_leak_tab allocated). */
+#define SET_LEAK(x) do { \
+               if (iommu_leak_tab) \
+                       iommu_leak_tab[x] = __builtin_return_address(0); \
+       } while (0)
+#define CLEAR_LEAK(x) do { \
+               if (iommu_leak_tab) \
+                       iommu_leak_tab[x] = NULL; \
+       } while (0)
+
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static void **iommu_leak_tab;
+static int leak_trace;
+int iommu_leak_pages = 20;
+
+/*
+ * Print the recorded owners of entries at the end of the IOMMU table.
+ * Only the first call produces output; later calls return immediately.
+ */
+void dump_leak(void)
+{
+       int i;
+       static int dump;
+
+       if (dump || !iommu_leak_tab)
+               return;
+       dump = 1;
+       show_stack(NULL,NULL);
+       /* Very crude. dump some from the end of the table too */
+       printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
+       /* The table has iommu_pages entries, so the last valid index is
+          iommu_pages - 1; never walk below index 0 either. */
+       for (i = 0; i < iommu_leak_pages && i < iommu_pages; i+=2) {
+               unsigned long slot = iommu_pages - 1 - i;
+
+               printk("%lu: ", slot);
+               printk_address((unsigned long) iommu_leak_tab[slot]);
+               printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
+       }
+       printk("\n");
+}
+#else
+#define SET_LEAK(x) do { } while (0)
+#define CLEAR_LEAK(x) do { } while (0)
+#endif
+
+/*
+ * Report that no IOMMU space was left for a mapping of 'size' bytes.
+ * Logs an error and, when do_panic is set and the request is larger than
+ * the emergency area, panics according to the transfer direction.
+ */
+static void iommu_full(struct device *dev, size_t size, int dir, int do_panic)
+{
+       /*
+        * Ran out of IOMMU space for this operation. This is very bad.
+        * Unfortunately the drivers cannot handle this operation properly.
+        * Return some non mapped prereserved space in the aperture and
+        * let the Northbridge deal with it. This will result in garbage
+        * in the IO operation. When the size exceeds the prereserved space
+        * memory corruption will occur or random memory will be DMAed
+        * out. Hopefully no network devices use single mappings that big.
+        */
+
+       printk(KERN_ERR
+  "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
+              size, dev->bus_id);
+
+       if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) {
+               if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+                       panic("PCI-DMA: Memory would be corrupted\n");
+               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+                       panic("PCI-DMA: Random memory would be DMAed\n");
+       }
+
+#ifdef CONFIG_IOMMU_LEAK
+       dump_leak();
+#endif
+}
+
+/*
+ * Decide whether [addr, addr + size) has to go through the IOMMU for 'dev'.
+ * Returns 1 when the range reaches the device's DMA mask or the IOMMU is
+ * forced, 0 otherwise.  With the IOMMU disabled a range reaching the mask
+ * cannot be handled and causes a panic.
+ */
+static inline int need_iommu(struct device *dev, unsigned long addr,
+                            size_t size)
+{
+       const u64 limit = *dev->dma_mask;
+       const int above_mask = addr + size >= limit;
+
+       if (no_iommu) {
+               if (above_mask)
+                       panic("PCI-DMA: high address but no IOMMU.\n"); 
+               return 0;
+       }
+       if (force_iommu)
+               return 1;
+       return above_mask;
+}
+
+/*
+ * Like need_iommu() but ignoring force_iommu: returns 1 only when
+ * [addr, addr + size) reaches the device's DMA mask.  With the IOMMU
+ * disabled such a range causes a panic, otherwise 0 is returned.
+ */
+static inline int nonforced_iommu(struct device *dev, unsigned long addr,
+                                 size_t size)
+{
+       const u64 limit = *dev->dma_mask;
+       const int above_mask = addr + size >= limit;
+
+       if (!no_iommu)
+               return above_mask;
+       if (above_mask)
+               panic("PCI-DMA: high address but no IOMMU.\n"); 
+       return 0;
+}
+
+/*
+ * Map a single continuous physical area into the IOMMU.
+ * Caller needs to check if the iommu is needed and flush.
+ *
+ * Returns the bus address inside the GART window.  When no GART space is
+ * left, returns phys_mem unchanged if the area does not actually need
+ * remapping, otherwise reports the overflow and returns bad_dma_address.
+ */
+static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
+                               size_t size, int dir, int do_panic)
+{
+       unsigned long npages = to_pages(phys_mem, size);
+       unsigned long iommu_page = alloc_iommu(npages);
+       int i;
+
+       if (iommu_page == -1) {
+               if (!nonforced_iommu(dev, phys_mem, size))
+                       return phys_mem;
+               if (panic_on_overflow)
+                       panic("dma_map_area overflow %lu bytes\n", size);
+               iommu_full(dev, size, dir, do_panic);
+               return bad_dma_address;
+       }
+
+       for (i = 0; i < npages; i++) {
+               iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
+               SET_LEAK(iommu_page + i);
+               phys_mem += PAGE_SIZE;
+       }
+       /* phys_mem advanced by whole pages only, so its in-page offset is
+          still that of the original address. */
+       return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
+}
+
+/*
+ * Map a single area into the IOMMU.
+ * Returns the bus address for the device: the swiotlb mapping when swiotlb
+ * is active, the plain bus address when no remapping is needed, otherwise
+ * the result of dma_map_area().  A NULL dev selects fallback_dev.
+ */
+dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size,
+                         int dir)
+{
+       unsigned long bus_addr, mapped;
+       struct device *target = dev ? dev : &fallback_dev;
+
+       BUG_ON(dir == DMA_NONE);
+
+       if (swiotlb)
+               return swiotlb_map_single(dev,addr,size,dir);
+
+       bus_addr = virt_to_bus(addr);
+       if (need_iommu(target, bus_addr, size)) {
+               mapped = dma_map_area(target, bus_addr, size, dir, 1);
+               flush_gart(target);
+               return mapped;
+       }
+       return bus_addr;
+}
+
+/*
+ * Fallback for dma_map_sg in case of overflow.
+ * Maps every element on its own and only sends those through the GART
+ * that really need it.  Returns nents on success; on failure the elements
+ * mapped so far are unmapped again and 0 is returned.
+ */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+                              int nents, int dir)
+{
+       int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+       printk(KERN_DEBUG "dma_map_sg overflow\n");
+#endif
+
+       for (i = 0; i < nents; i++) {
+               struct scatterlist *s = &sg[i];
+               unsigned long addr = page_to_phys(s->page) + s->offset;
+
+               if (nonforced_iommu(dev, addr, s->length)) {
+                       addr = dma_map_area(dev, addr, s->length, dir, 0);
+                       if (addr == bad_dma_address) {
+                               if (i > 0)
+                                       dma_unmap_sg(dev, sg, i, dir);
+                               nents = 0;
+                               sg[0].dma_length = 0;
+                               break;
+                       }
+               }
+               s->dma_address = addr;
+               s->dma_length = s->length;
+       }
+       flush_gart(dev);
+       return nents;
+}
+
+/*
+ * Map multiple scatterlist entries continuous into the first.
+ *
+ * Entries sg[start..stopat) are placed back to back in one GART range of
+ * 'pages' pages; *sout receives a copy of the first entry with the DMA
+ * address of the range and the summed length.  Each entry's physical
+ * address is taken from its dma_address field.
+ * Returns 0 on success, -1 when no GART space is available.
+ */
+static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
+                     struct scatterlist *sout, unsigned long pages)
+{
+       unsigned long iommu_start = alloc_iommu(pages);
+       unsigned long iommu_page = iommu_start;
+       int i;
+
+       if (iommu_start == -1)
+               return -1;
+
+       for (i = start; i < stopat; i++) {
+               struct scatterlist *s = &sg[i];
+               unsigned long chunk_pages, addr;
+               /* Read before *sout is written: sout may alias s. */
+               unsigned long phys_addr = s->dma_address;
+
+               BUG_ON(i > start && s->offset);
+               if (i == start) {
+                       *sout = *s;
+                       sout->dma_address = iommu_bus_base;
+                       sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
+                       sout->dma_length = s->length;
+               } else {
+                       sout->dma_length += s->length;
+               }
+
+               addr = phys_addr;
+               chunk_pages = to_pages(s->offset, s->length);
+               while (chunk_pages--) {
+                       iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
+                       SET_LEAK(iommu_page);
+                       addr += PAGE_SIZE;
+                       iommu_page++;
+               }
+       }
+       /* Every page of the reservation must have been filled. */
+       BUG_ON(iommu_page - iommu_start != pages);
+       return 0;
+}
+
+/*
+ * Map the run sg[start..stopat) into *sout.  When no remapping is needed
+ * the run must consist of one entry, which is copied as is; otherwise the
+ * work is done by __dma_map_cont().  Returns 0 on success, <0 on failure.
+ */
+static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
+                     struct scatterlist *sout,
+                     unsigned long pages, int need)
+{
+       if (!need) {
+               BUG_ON(stopat - start != 1);
+               *sout = sg[start];
+               sout->dma_length = sg[start].length;
+               return 0;
+       }
+       return __dma_map_cont(sg, start, stopat, sout, pages);
+}
+               
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page aligned sizes into a continuous mapping.
+ *
+ * Returns the number of mapped entries written to the front of sg (the
+ * entry after them gets dma_length 0 as terminator), or 0 on failure.
+ * With swiotlb active the request is passed to swiotlb_map_sg().
+ */
+int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+       int i;
+       int out;
+       int start;
+       unsigned long pages = 0;
+       int need = 0, nextneed;
+
+       BUG_ON(dir == DMA_NONE);
+       if (nents == 0)
+               return 0;
+
+       if (swiotlb)
+               return swiotlb_map_sg(dev,sg,nents,dir);
+       if (!dev)
+               dev = &fallback_dev;
+
+       out = 0;
+       start = 0;
+       for (i = 0; i < nents; i++) {
+               struct scatterlist *s = &sg[i];
+               dma_addr_t addr = page_to_phys(s->page) + s->offset;
+               s->dma_address = addr;
+               BUG_ON(s->length == 0);
+
+               nextneed = need_iommu(dev, addr, s->length);
+
+               /* Handle the previous not yet processed entries */
+               if (i > start) {
+                       struct scatterlist *ps = &sg[i-1];
+                       /* Can only merge when the last chunk ends on a page
+                          boundary and the new one doesn't have an offset. */
+                       if (!iommu_merge || !nextneed || !need || s->offset ||
+                           (ps->offset + ps->length) % PAGE_SIZE) {
+                               if (dma_map_cont(sg, start, i, sg+out, pages,
+                                                need) < 0)
+                                       goto error;
+                               out++;
+                               pages = 0;
+                               start = i;
+                       }
+               }
+
+               need = nextneed;
+               pages += to_pages(s->offset, s->length);
+       }
+       /* Map the run that is still pending after the loop. */
+       if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
+               goto error;
+       out++;
+       flush_gart(dev);
+       if (out < nents)
+               sg[out].dma_length = 0;
+       return out;
+
+error:
+       flush_gart(NULL);
+       dma_unmap_sg(dev, sg, nents, dir);
+       /* When it was forced try again unforced */
+       if (force_iommu)
+               return dma_map_sg_nonforce(dev, sg, nents, dir);
+       if (panic_on_overflow)
+               panic("dma_map_sg: overflow on %lu pages\n", pages);
+       iommu_full(dev, pages << PAGE_SHIFT, dir, 0);
+       for (i = 0; i < nents; i++)
+               sg[i].dma_address = bad_dma_address;
+       return 0;
+}
+
+/*
+ * Free a DMA mapping.
+ * With swiotlb active the call is forwarded to swiotlb_unmap_single().
+ * Addresses outside the GART window, or inside its leading emergency
+ * pages, are ignored.  Otherwise the covered GATT entries are reset to
+ * gart_unmapped_entry and the pages are returned to the allocator.
+ */
+void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
+                     size_t size, int direction)
+{
+       unsigned long first_page;
+       int count;
+       int n;
+
+       if (swiotlb) {
+               swiotlb_unmap_single(dev,dma_addr,size,direction);
+               return;
+       }
+
+       if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE)
+               return;
+       if (dma_addr >= iommu_bus_base + iommu_size)
+               return;
+
+       first_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+       count = to_pages(dma_addr, size);
+       for (n = 0; n < count; n++) {
+               iommu_gatt_base[first_page + n] = gart_unmapped_entry;
+               CLEAR_LEAK(first_page + n);
+       }
+       free_iommu(first_page, count);
+}
+
+/*
+ * Wrapper for pci_unmap_single working with scatterlists.
+ * Stops at the first entry whose dma_length or length is 0, which is how
+ * the end of the mapped entries is marked.
+ */
+void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+                 int dir)
+{
+       int i;
+
+       if (swiotlb) {
+               swiotlb_unmap_sg(dev,sg,nents,dir);
+               return;
+       }
+       for (i = 0; i < nents; i++) {
+               struct scatterlist *s = &sg[i];
+
+               if (!s->dma_length || !s->length)
+                       break;
+               dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
+       }
+}
+
+/*
+ * Tell whether DMA with the given address mask is supported for 'dev'.
+ * Returns 1 if it is, 0 if the mask is below 24 bits or if SAC is being
+ * forced and the mask is 40 bits or wider.
+ */
+int dma_supported(struct device *dev, u64 mask)
+{
+       /* Copied from i386. Doesn't make much sense, because it will
+          only work for pci_alloc_coherent.
+          The caller just has to use GFP_DMA in this case. */
+        if (mask < 0x00ffffff)
+                return 0;
+
+       /* Tell the device to use SAC when IOMMU force is on.
+          This allows the driver to use cheaper accesses in some cases.
+
+          Problem with this is that if we overflow the IOMMU area
+          and return DAC as fallback address the device may not handle it
+          correctly.
+
+          As a special case some controllers have a 39bit address mode
+          that is as efficient as 32bit (aic79xx). Don't force SAC for these.
+          Assume all masks <= 40 bits are of this type. Normally this doesn't
+          make any difference, but gives more gentle handling of IOMMU
+          overflow. */
+       if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
+               printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
+                      dev->bus_id,mask);
+               return 0;
+       }
+
+       return 1;
+}
+
+/* Report the DMA cache alignment, taken from the boot CPU's clflush size. */
+int dma_get_cache_alignment(void)
+{
+       const int alignment = boot_cpu_data.x86_clflush_size;
+
+       return alignment;
+}
+
+EXPORT_SYMBOL(dma_unmap_sg);
+EXPORT_SYMBOL(dma_map_sg);
+EXPORT_SYMBOL(dma_map_single); EXPORT_SYMBOL(dma_unmap_single);
+EXPORT_SYMBOL(dma_supported);
+EXPORT_SYMBOL(no_iommu);
+EXPORT_SYMBOL(force_iommu); 
+EXPORT_SYMBOL(bad_dma_address);
+EXPORT_SYMBOL(iommu_bio_merge);
+EXPORT_SYMBOL(iommu_sac_force);
+EXPORT_SYMBOL(dma_get_cache_alignment);
+EXPORT_SYMBOL(dma_alloc_coherent);
+EXPORT_SYMBOL(dma_free_coherent);
+
+/*
+ * Settle the size of the IOMMU area inside the aperture.
+ * Unless a size was already set, the whole aperture is used, or half of
+ * it when AGP is in use.  The size is then reduced by the distance from
+ * aper + size up to the next LARGE_PAGE_SIZE boundary.
+ * Warns when less than 64MB remain.  Returns the final size in bytes.
+ */
+static __init unsigned long check_iommu_size(unsigned long aper,
+                                            u64 aper_size)
+{
+       unsigned long a;
+
+       if (!iommu_size) {
+               iommu_size = aper_size;
+               if (!no_agp)
+                       iommu_size /= 2;
+       }
+
+       a = aper + iommu_size;
+       iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
+
+       if (iommu_size < 64*1024*1024)
+               printk(KERN_WARNING
+  "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",
+                      iommu_size>>20);
+
+       return iommu_size;
+}
+
+/*
+ * Read the aperture setup of one northbridge from its config space.
+ * The base comes from the low 15 bits of register 0x94 (in 32MB units),
+ * the size order from bits 1..3 of register 0x90 (32MB << order).
+ * Stores the size in *size and returns the base; the base is 0 when the
+ * aperture does not end below 4GB or the size is 0.
+ */
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
+{
+       unsigned aper_size = 0, aper_base_32;
+       u64 aper_base;
+       unsigned aper_order;
+
+       pci_read_config_dword(dev, 0x94, &aper_base_32);
+       pci_read_config_dword(dev, 0x90, &aper_order);
+       aper_order = (aper_order >> 1) & 7;
+
+       aper_base = aper_base_32 & 0x7fff;
+       aper_base <<= 25;
+
+       /* Shift an unsigned value: 32MB << 6 and 32MB << 7 do not fit in a
+          signed int.  Order 7 wraps to 0 and is rejected just below. */
+       aper_size = (32U * 1024 * 1024) << aper_order;
+       if (aper_base + aper_size >= 0xffffffff || !aper_size)
+               aper_base = 0;
+
+       *size = aper_size;
+       return aper_base;
+}
+
+/*
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.
+ *
+ * All northbridges must report the same non-zero aperture base and size.
+ * A zeroed GATT is then allocated, published in agp_gatt_table and
+ * programmed into every northbridge.  Fills info->aper_base and
+ * info->aper_size (in MB).  Returns 0 on success, -1 when the aperture
+ * is unusable or a GATT table already exists.
+ */
+static __init int init_k8_gatt(struct agp_kern_info *info)
+{
+       struct pci_dev *dev;
+       void *gatt;
+       unsigned aper_base, new_aper_base;
+       unsigned aper_size, gatt_size, new_aper_size;
+
+       printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
+
+       aper_size = aper_base = info->aper_size = 0;
+       for_all_nb(dev) {
+               new_aper_base = read_aperture(dev, &new_aper_size);
+               if (!new_aper_base)
+                       goto nommu;
+
+               if (!aper_base) {
+                       aper_size = new_aper_size;
+                       aper_base = new_aper_base;
+               }
+               if (aper_size != new_aper_size || aper_base != new_aper_base)
+                       goto nommu;
+       }
+       if (!aper_base)
+               goto nommu;
+       info->aper_base = aper_base;
+       info->aper_size = aper_size>>20;
+
+       /* Refuse before allocating, so a fresh table is never leaked. */
+       if (agp_gatt_table)
+               goto nommu;
+
+       gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
+       gatt = (void *) alloc_gatt_pages(get_order(gatt_size));
+       if (!gatt)
+               panic("Cannot allocate GATT table");
+       memset(gatt, 0, gatt_size);
+       agp_gatt_table = gatt;
+
+       for_all_nb(dev) {
+               u32 ctl;
+               u32 gatt_reg;
+
+               /* Register 0x98 takes the table address >> 12, shifted
+                  left by 4. */
+               gatt_reg = (0xffffffff & virt_to_gart(gatt)) >> 12;
+               gatt_reg <<= 4;
+               pci_write_config_dword(dev, 0x98, gatt_reg);
+               pci_read_config_dword(dev, 0x90, &ctl);
+
+               /* Set bit 0 and clear bits 4 and 5 of register 0x90. */
+               ctl |= 1;
+               ctl &= ~((1<<4) | (1<<5));
+
+               pci_write_config_dword(dev, 0x90, ctl);
+       }
+       flush_gart(NULL);
+
+       printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base,
+              aper_size>>10);
+       return 0;
+
+ nommu:
+       /* Should not happen anymore */
+       printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+              KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
+       return -1;
+}
+
+extern int agp_amd64_init(void);
+
+/*
+ * Set up the GART as IOMMU.
+ *
+ * Does nothing but report and return -1 when swiotlb is in use, when the
+ * IOMMU is switched off or not needed (the hypervisor reports a maximum
+ * RAM page below 0xfffff and the IOMMU is not forced), when no aperture
+ * is available, or when the private GATT setup fails.  Otherwise sizes
+ * the IOMMU area, allocates the allocation bitmap (and the leak table
+ * when tracing), reserves the emergency pages, points all other entries
+ * at a scratch page, records the northbridges and flushes the GART.
+ * Returns 0 on success.
+ */
+static int __init pci_iommu_init(void)
+{
+       struct agp_kern_info info;
+       unsigned long aper_size;
+       unsigned long iommu_start;
+       struct pci_dev *dev;
+       unsigned long scratch;
+       long i;
+       long ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+
+#ifndef CONFIG_AGP_AMD64
+       no_agp = 1;
+#else
+       /* Makefile puts PCI initialization via subsys_initcall first. */
+       /* Add other K8 AGP bridge drivers here */
+       no_agp = no_agp ||
+               (agp_amd64_init() < 0) ||
+               (agp_copy_info(agp_bridge, &info) < 0);
+#endif
+
+       if (swiotlb) {
+               no_iommu = 1;
+               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+               return -1;
+       }
+
+       if (no_iommu ||
+           (!force_iommu && ram_end < 0xfffff) ||
+           !iommu_aperture ||
+           (no_agp && init_k8_gatt(&info) < 0)) {
+               printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
+               no_iommu = 1;
+               return -1;
+       }
+
+       aper_size = info.aper_size * 1024 * 1024;
+       iommu_size = check_iommu_size(info.aper_base, aper_size);
+       iommu_pages = iommu_size >> PAGE_SHIFT;
+
+       /* One bit per IOMMU page. */
+       iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
+                                                   get_order(iommu_pages/8));
+       if (!iommu_gart_bitmap)
+               panic("Cannot allocate iommu bitmap\n");
+       memset(iommu_gart_bitmap, 0, iommu_pages/8);
+
+#ifdef CONFIG_IOMMU_LEAK
+       if (leak_trace) {
+               iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
+                                 get_order(iommu_pages*sizeof(void *)));
+               /* Clear exactly what was sized above: one pointer per page. */
+               if (iommu_leak_tab)
+                       memset(iommu_leak_tab, 0, iommu_pages * sizeof(void *));
+               else
+                       printk("PCI-DMA: Cannot allocate leak trace area\n");
+       }
+#endif
+
+       /*
+        * Out of IOMMU space handling.
+        * Reserve some invalid pages at the beginning of the GART.
+        */
+       set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+
+       agp_memory_reserved = iommu_size;
+       printk(KERN_INFO
+              "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+              iommu_size>>20);
+
+       /* The IOMMU area occupies the end of the aperture. */
+       iommu_start = aper_size - iommu_size;
+       iommu_bus_base = info.aper_base + iommu_start;
+       bad_dma_address = iommu_bus_base;
+       iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+       /*
+        * Unmap the IOMMU part of the GART. The alias of the page is
+        * always mapped with cache enabled and there is no full cache
+        * coherency across the GART remapping. The unmapping avoids
+        * automatic prefetches from the CPU allocating cache lines in
+        * there. All CPU accesses are done via the direct mapping to
+        * the backing memory. The GART address is only used by PCI
+        * devices.
+        */
+       clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
+
+       /*
+        * Try to workaround a bug (thanks to BenH)
+        * Set unmapped entries to a scratch page instead of 0.
+        * Any prefetches that hit unmapped entries won't get a bus abort
+        * then.
+        */
+       scratch = get_zeroed_page(GFP_KERNEL);
+       if (!scratch)
+               panic("Cannot allocate iommu scratch page");
+       gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
+       for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
+               iommu_gatt_base[i] = gart_unmapped_entry;
+
+       /* Remember each northbridge, indexed by PCI slot - 24. */
+       for_all_nb(dev) {
+               u32 flag;
+               int cpu = PCI_SLOT(dev->devfn) - 24;
+
+               if (cpu >= MAX_NB)
+                       continue;
+               northbridges[cpu] = dev;
+               pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
+               northbridge_flush_word[cpu] = flag;
+       }
+
+       flush_gart(NULL);
+
+       return 0;
+}
+
+/* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init);
+
+/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
+         [,forcesac][,fullflush][,nomerge][,biomerge]
+   size  set size of iommu (in bytes)
+   noagp don't initialize the AGP driver and use full aperture.
+   off   don't use the IOMMU
+   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
+   memaper[=order] allocate an own aperture over RAM with size 32MB<<order.
+   noforce don't force IOMMU usage. Default.
+   force  Force IOMMU.
+   merge  Do lazy merging. This may improve performance on some block devices.
+          Implies force (experimental)
+   biomerge Do merging at the BIO layer. This is more efficient than merge,
+            but should be only done with very big IOMMUs. Implies merge,force.
+   nomerge Don't do SG merging.
+   forcesac Force SAC mode for masks <40bits  (experimental)
+   fullflush Flush IOMMU on each allocation (default)
+   nofullflush Don't use IOMMU fullflush
+   allowed  overwrite iommu off workarounds for specific chipsets.
+   soft         Use software bounce buffering (default for Intel machines)
+   noaperture Don't touch the aperture for AGP.
+*/
+/*
+ * iommu_setup - parse the comma-separated "iommu=" option list.
+ * @p: option string; a NULL or empty string parses nothing.
+ *
+ * Each comma-separated token is compared against the known option names
+ * with strncmp(), i.e. by prefix. The checks are independent, so a single
+ * token can trigger more than one of them. Numeric values ("memaper=N",
+ * "leak=N", a bare size) are read with get_option().
+ *
+ * Always returns 1.
+ */
+__init int iommu_setup(char *p)
+{
+    int arg;
+
+    while (p && *p) {
+        /*
+         * Find the start of the following token before looking at this
+         * one. Numeric values are parsed through the scratch pointer
+         * 'val' so that get_option() moving its cursor (it may step
+         * over a trailing comma) can never make the loop skip the
+         * option that comes next.
+         */
+        char *next = p + strcspn(p, ",");
+        char *val;
+
+        if (*next == ',')
+            ++next;
+
+        if (!strncmp(p, "noagp", 5))
+            no_agp = 1;
+        if (!strncmp(p, "off", 3))
+            no_iommu = 1;
+        /* NOTE(review): prefix match, so "forcesac" takes this branch too. */
+        if (!strncmp(p, "force", 5)) {
+            force_iommu = 1;
+            iommu_aperture_allowed = 1;
+        }
+        if (!strncmp(p, "allowed", 7))
+            iommu_aperture_allowed = 1;
+        if (!strncmp(p, "noforce", 7)) {
+            iommu_merge = 0;
+            force_iommu = 0;
+        }
+        if (!strncmp(p, "memaper", 7)) {
+            fallback_aper_force = 1;
+            /* Optional "=order" suffix. */
+            val = p + 7;
+            if (*val == '=') {
+                ++val;
+                if (get_option(&val, &arg))
+                    fallback_aper_order = arg;
+            }
+        }
+        if (!strncmp(p, "biomerge", 8)) {
+            iommu_bio_merge = 4096;
+            iommu_merge = 1;
+            force_iommu = 1;
+        }
+        if (!strncmp(p, "panic", 5))
+            panic_on_overflow = 1;
+        if (!strncmp(p, "nopanic", 7))
+            panic_on_overflow = 0;
+        if (!strncmp(p, "merge", 5)) {
+            iommu_merge = 1;
+            force_iommu = 1;
+        }
+        if (!strncmp(p, "nomerge", 7))
+            iommu_merge = 0;
+        if (!strncmp(p, "forcesac", 8))
+            iommu_sac_force = 1;
+        /* Compare all nine characters of "fullflush". */
+        if (!strncmp(p, "fullflush", 9))
+            iommu_fullflush = 1;
+        if (!strncmp(p, "nofullflush", 11))
+            iommu_fullflush = 0;
+        if (!strncmp(p, "soft", 4))
+            swiotlb = 1;
+        if (!strncmp(p, "noaperture", 10))
+            fix_aperture = 0;
+#ifdef CONFIG_IOMMU_LEAK
+        if (!strncmp(p, "leak", 4)) {
+            leak_trace = 1;
+            /* Optional page count, written as "leak=N" or "leakN". */
+            val = p + 4;
+            if (*val == '=')
+                ++val;
+            if (isdigit(*val) && get_option(&val, &arg))
+                iommu_leak_pages = arg;
+        }
+#endif
+        /* A token that starts with a digit is the IOMMU size. */
+        val = p;
+        if (isdigit(*val) && get_option(&val, &arg))
+            iommu_size = arg;
+
+        p = next;
+    }
+
+    /*
+     * Forcing the IOMMU or a fallback aperture sets swiotlb to -1,
+     * replacing any value stored by "soft" above.
+     */
+    if (force_iommu || fallback_aper_force)
+        swiotlb = -1;
+    return 1;
+}


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH] Support for AGP aperture as IOMMU in AMD64 mode [1/2], Langsdorf, Mark <=