WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Intel vt-d specific changes in arch/x86/hvm/vmx/vtd.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Wed, 26 Sep 2007 03:40:24 -0700
Delivery-date: Wed, 26 Sep 2007 04:30:10 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1189784449 -3600
# Node ID f4bbd3f327e4308aa2aebf5484fc32d1d1ff4b41
# Parent  acfa9290746f9c00e30dca7a62e9f7a96702b3b5
Intel vt-d specific changes in arch/x86/hvm/vmx/vtd.

Signed-off-by: Allen Kay <allen.m.kay@xxxxxxxxx>
Signed-off-by: Guy Zana <guy@xxxxxxxxxxxx>
---
 xen/arch/x86/hvm/vmx/vtd/Makefile      |    4 
 xen/arch/x86/hvm/vmx/vtd/dmar.c        |  494 ++++++++
 xen/arch/x86/hvm/vmx/vtd/dmar.h        |   90 +
 xen/arch/x86/hvm/vmx/vtd/intel-iommu.c | 1927 +++++++++++++++++++++++++++++++++
 xen/arch/x86/hvm/vmx/vtd/io.c          |  120 ++
 xen/arch/x86/hvm/vmx/vtd/msi.h         |  128 ++
 xen/arch/x86/hvm/vmx/vtd/pci-direct.h  |   48 
 xen/arch/x86/hvm/vmx/vtd/pci_regs.h    |  449 +++++++
 xen/arch/x86/hvm/vmx/vtd/utils.c       |  302 +++++
 9 files changed, 3562 insertions(+)
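
For orientation, the rough boot-time flow through the new files is sketched
below (illustrative only: vtd_boot_sketch() is a hypothetical caller; the
other symbols are the ones introduced by this patch):

    #include <asm/iommu.h>   /* assumed to declare acpi_dmar_init(), iommu_alloc() */
    #include "dmar.h"        /* for_each_drhd_unit(), struct acpi_drhd_unit */

    static void vtd_boot_sketch(void)
    {
        struct acpi_drhd_unit *drhd;

        /* dmar.c: parse the ACPI DMAR table into DRHD/RMRR/ATSR lists. */
        if (acpi_dmar_init())
            return;              /* no DRHD units found: VT-d stays disabled */

        /* intel-iommu.c: map registers and set up one iommu per DRHD unit. */
        for_each_drhd_unit(drhd)
            if (!drhd->iommu)
                iommu_alloc(drhd);
    }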

diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/Makefile Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,4 @@
+obj-y += intel-iommu.o
+obj-y += dmar.o
+obj-y += utils.o
+obj-y += io.o
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/dmar.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/dmar.c   Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@xxxxxxxxx>
+ * Copyright (C) Shaohua Li <shaohua.li@xxxxxxxxx>
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx> - adapted to xen
+ */
+
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/kernel.h>
+#include <xen/acpi.h>
+#include <xen/mm.h>
+#include <xen/xmalloc.h>
+#include <asm/string.h>
+#include "dmar.h"
+#include "pci-direct.h"
+#include "pci_regs.h"
+
+#undef PREFIX
+#define PREFIX VTDPREFIX "ACPI DMAR:"
+#define DEBUG
+
+#define MIN_SCOPE_LEN (sizeof(struct acpi_pci_path) + sizeof(struct acpi_dev_scope))
+
+LIST_HEAD(acpi_drhd_units);
+LIST_HEAD(acpi_rmrr_units);
+LIST_HEAD(acpi_atsr_units);
+LIST_HEAD(acpi_ioapic_units);
+
+u8 dmar_host_address_width;
+
+static int __init acpi_register_drhd_unit(struct acpi_drhd_unit *drhd)
+{
+    /*
+     * Add the INCLUDE_ALL unit at the tail, so that a scan of the
+     * list finds it last.
+     */
+    if (drhd->include_all)
+        list_add_tail(&drhd->list, &acpi_drhd_units);
+    else
+        list_add(&drhd->list, &acpi_drhd_units);
+    return 0;
+}
+
+static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr)
+{
+    list_add(&rmrr->list, &acpi_rmrr_units);
+    return 0;
+}
+
+static int acpi_pci_device_match(struct pci_dev *devices, int cnt,
+                 struct pci_dev *dev)
+{
+    int i;
+
+    for (i = 0; i < cnt; i++) {
+        if ((dev->bus == devices->bus) &&
+            (dev->devfn == devices->devfn))
+            return 1;
+        devices++;
+    }
+    return 0;
+}
+
+static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr)
+{
+    /*
+     * Add the ALL_PORTS unit at the tail, so that a scan of the
+     * list finds it last.
+     */
+    if (atsr->all_ports)
+        list_add_tail(&atsr->list, &acpi_atsr_units);
+    else
+        list_add(&atsr->list, &acpi_atsr_units);
+    return 0;
+}
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(struct pci_dev *dev)
+{
+    struct acpi_drhd_unit *drhd;
+    struct acpi_drhd_unit *include_all_drhd;
+
+    include_all_drhd = NULL;
+    list_for_each_entry(drhd, &acpi_drhd_units, list) {
+        if (drhd->include_all)
+            include_all_drhd = drhd;
+        if (acpi_pci_device_match(drhd->devices,
+                        drhd->devices_cnt, dev))
+        {
+            gdprintk(XENLOG_INFO VTDPREFIX, 
+                     "acpi_find_matched_drhd_unit: drhd->address = %lx\n",
+                     drhd->address);
+            return drhd;
+        }
+    }
+
+    if (include_all_drhd) {
+        gdprintk(XENLOG_INFO VTDPREFIX, 
+                 "acpi_find_matched_drhd_unit:include_all_drhd->addr = %lx\n",
+                 include_all_drhd->address);
+        return include_all_drhd;
+    }
+
+    return(NULL);
+}
+
+struct acpi_rmrr_unit * acpi_find_matched_rmrr_unit(struct pci_dev *dev)
+{
+    struct acpi_rmrr_unit *rmrr;
+
+    list_for_each_entry(rmrr, &acpi_rmrr_units, list) {
+        if (acpi_pci_device_match(rmrr->devices,
+                        rmrr->devices_cnt, dev))
+            goto out;
+    }
+    rmrr = NULL;
+out:
+    return rmrr;
+}
+
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(struct pci_dev *dev)
+{
+    struct acpi_atsr_unit *atsru;
+    struct acpi_atsr_unit *all_ports_atsru;
+
+    all_ports_atsru = NULL;
+    list_for_each_entry(atsru, &acpi_atsr_units, list) {
+        if (atsru->all_ports)
+            all_ports_atsru = atsru;
+        if (acpi_pci_device_match(atsru->devices, atsru->devices_cnt, dev))
+            return atsru;
+    }
+    if (all_ports_atsru) {
+        gdprintk(XENLOG_INFO VTDPREFIX, 
+                 "acpi_find_matched_atsr_unit: all_ports_atsru\n");
+        return all_ports_atsru;
+    }
+    return(NULL);
+}
+
+static int __init acpi_parse_dev_scope(void *start, void *end, int *cnt,
+                       struct pci_dev **devices)
+{
+    struct acpi_dev_scope *scope;
+    u8 bus, sub_bus, sec_bus;
+    struct acpi_pci_path *path;
+    struct acpi_ioapic_unit *acpi_ioapic_unit = NULL;
+    int count, dev_count=0;
+    struct pci_dev *pdev;
+    u8 dev, func;
+    u32 l;
+    void *tmp;
+
+    *cnt = 0;
+    tmp = start;
+    while (start < end) {
+        scope = start;
+        if (scope->length < MIN_SCOPE_LEN ||
+            (scope->dev_type != ACPI_DEV_ENDPOINT &&
+            scope->dev_type != ACPI_DEV_P2PBRIDGE)) {
+            printk(KERN_WARNING PREFIX "Invalid device scope\n");
+            return -EINVAL;
+        }
+        (*cnt)++;
+        start += scope->length;
+    }
+
+    start = tmp;
+    while (start < end) {
+        scope = start;
+        path = (struct acpi_pci_path *)(scope + 1);
+        count = (scope->length - sizeof(struct acpi_dev_scope))
+                   /sizeof(struct acpi_pci_path);
+        bus = scope->start_bus;
+
+        while (--count) {
+            bus = read_pci_config_byte(bus, path->dev,
+                                       path->fn, PCI_SECONDARY_BUS);
+            path++;
+        }
+
+        if (scope->dev_type == ACPI_DEV_ENDPOINT) {
+            printk(KERN_WARNING PREFIX
+                "found endpoint: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+                dev_count++;
+        } else if (scope->dev_type == ACPI_DEV_P2PBRIDGE) {
+            printk(KERN_WARNING PREFIX
+                "found bridge: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+
+            sec_bus = read_pci_config_byte(bus, path->dev,
+                                       path->fn, PCI_SECONDARY_BUS);
+            sub_bus = read_pci_config_byte(bus, path->dev,
+                                       path->fn, PCI_SUBORDINATE_BUS);
+            while (sec_bus <= sub_bus) {
+                for (dev = 0; dev < 32; dev++) {
+                    for (func = 0; func < 8; func++) {
+                        l = read_pci_config(sec_bus, dev, func, PCI_VENDOR_ID);
+
+                        /* some broken boards return 0 or ~0 if a slot is empty: */
+                        if (l == 0xffffffff || l == 0x00000000 ||
+                            l == 0x0000ffff || l == 0xffff0000)
+                            break;
+                        dev_count++;
+                    }
+                }
+                sec_bus++;
+            }
+        } else if (scope->dev_type == ACPI_DEV_IOAPIC) {
+            printk(KERN_WARNING PREFIX
+                "found IOAPIC: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+            dev_count++;
+        } else {
+            printk(KERN_WARNING PREFIX
+                "found MSI HPET: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+            dev_count++;
+        }
+
+        start += scope->length;
+    }
+
+    *cnt = dev_count;
+    *devices = xmalloc_array(struct pci_dev,  *cnt);
+    if (!*devices)
+        return -ENOMEM;
+    memset(*devices, 0, sizeof(struct pci_dev) * (*cnt));
+
+    pdev = *devices;
+    start = tmp;
+    while (start < end) {
+        scope = start;
+        path = (struct acpi_pci_path *)(scope + 1);
+        count = (scope->length - sizeof(struct acpi_dev_scope))
+                   /sizeof(struct acpi_pci_path);
+        bus = scope->start_bus;
+
+        while (--count) {
+            bus = read_pci_config_byte(bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+            path++;
+        }
+
+        if (scope->dev_type == ACPI_DEV_ENDPOINT) {
+            printk(KERN_WARNING PREFIX
+                "found endpoint: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+
+            pdev->bus = bus;
+            pdev->devfn = PCI_DEVFN(path->dev, path->fn);
+            pdev++;
+        } else if (scope->dev_type == ACPI_DEV_P2PBRIDGE) {
+            printk(KERN_WARNING PREFIX
+                "found bridge: bus = %x dev = %x func = %x\n", bus, path->dev, path->fn);
+
+            sec_bus = read_pci_config_byte(bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+            sub_bus = read_pci_config_byte(bus, path->dev, path->fn, PCI_SUBORDINATE_BUS);
+
+            while (sec_bus <= sub_bus) {
+                for (dev = 0; dev < 32; dev++) {
+                    for (func = 0; func < 8; func++) {
+                        l = read_pci_config(sec_bus, dev, func, PCI_VENDOR_ID);
+
+                        /* some broken boards return 0 or ~0 if a slot is empty: */
+                        if (l == 0xffffffff || l == 0x00000000 ||
+                            l == 0x0000ffff || l == 0xffff0000)
+                            break;
+
+                        pdev->bus = sec_bus;
+                        pdev->devfn = PCI_DEVFN(dev, func);
+                        pdev++;
+                    }
+                }
+                sec_bus++;
+            }
+        } else if (scope->dev_type == ACPI_DEV_IOAPIC) {
+            acpi_ioapic_unit = xmalloc(struct acpi_ioapic_unit);
+            if (!acpi_ioapic_unit)
+                return -ENOMEM;
+            acpi_ioapic_unit->apic_id = scope->enum_id;
+            acpi_ioapic_unit->ioapic.bdf.bus = bus;
+            acpi_ioapic_unit->ioapic.bdf.dev = path->dev;
+            acpi_ioapic_unit->ioapic.bdf.func = path->fn;
+            list_add(&acpi_ioapic_unit->list, &acpi_ioapic_units);
+            printk(KERN_WARNING PREFIX
+                "found IOAPIC: bus = %x dev = %x func = %x\n", bus, path->dev, path->fn);
+        } else {
+            printk(KERN_WARNING PREFIX
+                "found MSI HPET: bus = %x dev = %x func = %x\n", bus, path->dev, path->fn);
+        }
+        
+        start += scope->length;
+    }
+
+    return 0;
+}
+
+static int __init
+acpi_parse_one_drhd(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_drhd * drhd = (struct acpi_table_drhd *)header;
+    struct acpi_drhd_unit *dmaru;
+    int ret = 0;
+    static int include_all;
+
+    dmaru = xmalloc(struct acpi_drhd_unit);
+    if (!dmaru)
+        return -ENOMEM;
+    memset(dmaru, 0, sizeof(struct acpi_drhd_unit));
+
+    dmaru->address = drhd->address;
+    dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
+    printk(KERN_WARNING PREFIX "dmaru->address = %lx\n", dmaru->address);
+
+    if (!dmaru->include_all) {
+        ret = acpi_parse_dev_scope((void *)(drhd + 1),
+                ((void *)drhd) + header->length,
+                &dmaru->devices_cnt, &dmaru->devices);
+    }
+    else {
+        printk(KERN_WARNING PREFIX "found INCLUDE_ALL\n");
+        /* Only allow one INCLUDE_ALL */
+        if (include_all) {
+            printk(KERN_WARNING PREFIX "Only one INCLUDE_ALL "
+                "device scope is allowed\n");
+            ret = -EINVAL;
+        }
+        include_all = 1;
+    }
+
+    if (ret)
+        xfree(dmaru);
+    else
+        acpi_register_drhd_unit(dmaru);
+    return ret;
+}
+
+static int __init
+acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_rmrr *rmrr = (struct acpi_table_rmrr *)header;
+    struct acpi_rmrr_unit *rmrru;
+    int ret = 0;
+
+    rmrru = xmalloc(struct acpi_rmrr_unit);
+    if (!rmrru)
+        return -ENOMEM;
+    memset(rmrru, 0, sizeof(struct acpi_rmrr_unit));
+
+#ifdef VTD_DEBUG
+    gdprintk(XENLOG_INFO VTDPREFIX,
+        "acpi_parse_one_rmrr: base = %lx end = %lx\n",
+        rmrr->base_address, rmrr->end_address);
+#endif
+
+    rmrru->base_address = rmrr->base_address;
+    rmrru->end_address = rmrr->end_address;
+    ret = acpi_parse_dev_scope((void *)(rmrr + 1),
+            ((void*)rmrr) + header->length,
+            &rmrru->devices_cnt, &rmrru->devices);
+
+    if (ret || (rmrru->devices_cnt == 0))
+        xfree(rmrru);
+    else
+        acpi_register_rmrr_unit(rmrru);
+    return ret;
+}
+
+static int __init
+acpi_parse_one_atsr(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_atsr *atsr = (struct acpi_table_atsr *)header;
+    struct acpi_atsr_unit *atsru;
+    int ret = 0;
+    static int all_ports;
+
+    atsru = xmalloc(struct acpi_atsr_unit);
+    if (!atsru)
+        return -ENOMEM;
+    memset(atsru, 0, sizeof(struct acpi_atsr_unit));
+
+    atsru->all_ports = atsr->flags & 1; /* BIT0: ALL_PORTS */
+    if (!atsru->all_ports) {
+        ret = acpi_parse_dev_scope((void *)(atsr + 1),
+                ((void *)atsr) + header->length,
+                &atsru->devices_cnt, &atsru->devices);
+    }
+    else {
+        printk(KERN_WARNING PREFIX "found ALL_PORTS\n");
+        /* Only allow one ALL_PORTS */
+        if (all_ports) {
+            printk(KERN_WARNING PREFIX "Only one ALL_PORTS "
+                "device scope is allowed\n");
+            ret = -EINVAL;
+        }
+        all_ports = 1;
+    }
+
+    if (ret)
+        xfree(atsru);
+    else
+        acpi_register_atsr_unit(atsru);
+    return ret;
+}
+
+static void __init
+acpi_table_print_dmar_entry(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_drhd *drhd;
+    struct acpi_table_rmrr *rmrr;
+
+    switch (header->type) {
+    case ACPI_DMAR_DRHD:
+        drhd = (struct acpi_table_drhd *)header;
+        break;
+    case ACPI_DMAR_RMRR:
+        rmrr = (struct acpi_table_rmrr *)header;
+        break;
+    }
+}
+
+static int __init
+acpi_parse_dmar(unsigned long phys_addr, unsigned long size)
+{
+    struct acpi_table_dmar *dmar = NULL;
+    struct acpi_dmar_entry_header *entry_header;
+    int ret = 0;
+
+    if (!phys_addr || !size)
+        return -EINVAL;
+
+    dmar = (struct acpi_table_dmar *)__acpi_map_table(phys_addr, size);
+    if (!dmar) {
+        printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
+        return -ENODEV;
+    }
+
+    if (!dmar->haw) {
+        printk (KERN_WARNING PREFIX "Zero: Invalid DMAR haw\n");
+        return -EINVAL;
+    }
+
+    dmar_host_address_width = dmar->haw;
+    printk (KERN_INFO PREFIX "Host address width %d\n",
+        dmar_host_address_width);
+
+    entry_header = (struct acpi_dmar_entry_header *)(dmar + 1);
+    while (((unsigned long)entry_header) < (((unsigned long)dmar) + size)) {
+        acpi_table_print_dmar_entry(entry_header);
+
+        switch (entry_header->type) {
+        case ACPI_DMAR_DRHD:
+            printk (KERN_INFO PREFIX "found ACPI_DMAR_DRHD\n");
+            ret = acpi_parse_one_drhd(entry_header);
+            break;
+        case ACPI_DMAR_RMRR:
+            printk (KERN_INFO PREFIX "found ACPI_DMAR_RMRR\n");
+            ret = acpi_parse_one_rmrr(entry_header);
+            break;
+        case ACPI_DMAR_ATSR:
+            printk (KERN_INFO PREFIX "found ACPI_DMAR_ATSR\n");
+            ret = acpi_parse_one_atsr(entry_header);
+            break;
+        default:
+            printk(KERN_WARNING PREFIX "Unknown DMAR structure type\n");
+            ret = -EINVAL;
+            break;
+        }
+        if (ret)
+            break;
+
+        entry_header = ((void *)entry_header + entry_header->length);
+    }
+    return ret;
+}
+
+int acpi_dmar_init(void)
+{
+    acpi_table_parse(ACPI_DMAR, acpi_parse_dmar);
+    if (list_empty(&acpi_drhd_units)) {
+        printk(KERN_ERR PREFIX "No DMAR devices found\n");
+        return -ENODEV;
+    } else
+        vtd_enabled = 1;
+    return 0;
+}
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/dmar.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/dmar.h   Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@xxxxxxxxx>
+ * Copyright (C) Shaohua Li <shaohua.li@xxxxxxxxx>
+ */
+
+#ifndef _DMAR_H_
+#define _DMAR_H_
+
+#include <xen/list.h>
+#include <asm/iommu.h>
+
+extern u8 dmar_host_address_width;
+
+struct acpi_drhd_unit {
+    struct list_head list;
+    unsigned long    address; /* register base address of the unit */
+    struct    pci_dev *devices; /* target devices */
+    int    devices_cnt;
+    u8    include_all:1;
+    struct iommu *iommu;
+};
+
+struct acpi_rmrr_unit {
+    struct list_head list;
+    unsigned long base_address;
+    unsigned long end_address;
+    struct pci_dev *devices; /* target devices */
+    int    devices_cnt;
+    u8    allow_all:1;
+};
+
+struct acpi_atsr_unit {
+    struct list_head list;
+    struct    pci_dev *devices; /* target devices */
+    int    devices_cnt;
+    u8    all_ports:1;
+};
+
+#define for_each_iommu(domain, iommu) \
+    list_for_each_entry(iommu, \
+        &(domain->arch.hvm_domain.hvm_iommu.iommu_list), list)
+
+#define for_each_pdev(domain, pdev) \
+    list_for_each_entry(pdev, \
+         &(domain->arch.hvm_domain.hvm_iommu.pdev_list), list)
+
+#define for_each_drhd_unit(drhd) \
+    list_for_each_entry(drhd, &acpi_drhd_units, list)
+#define for_each_rmrr_device(rmrr, pdev) \
+    list_for_each_entry(rmrr, &acpi_rmrr_units, list) { \
+        int _i; \
+        for (_i = 0; _i < rmrr->devices_cnt; _i++) { \
+            pdev = &(rmrr->devices[_i]);
+#define end_for_each_rmrr_device(rmrr, pdev) \
+        } \
+    }
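+/*
+ * Usage sketch (illustrative only): the two macros above bracket the loop
+ * body, e.g.
+ *
+ *     for_each_rmrr_device(rmrr, pdev)
+ *         ...use rmrr and pdev here...
+ *     end_for_each_rmrr_device(rmrr, pdev)
+ */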
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(struct pci_dev *dev);
+struct acpi_rmrr_unit * acpi_find_matched_rmrr_unit(struct pci_dev *dev);
+
+/* This one is for interrupt remapping */
+struct acpi_ioapic_unit {
+    struct list_head list;
+    int apic_id;
+    union {
+        u16 info;
+        struct {
+            u16 bus: 8,
+                dev: 5,
+                func: 3;
+        }bdf;
+    }ioapic;
+};
+
+#endif // _DMAR_H_
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/intel-iommu.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/intel-iommu.c    Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,1927 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@xxxxxxxxx>
+ * Copyright (C) Shaohua Li <shaohua.li@xxxxxxxxx>
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx> - adapted to xen
+ */
+
+#include <xen/init.h>
+#include <xen/irq.h>
+#include <xen/spinlock.h>
+#include <xen/sched.h>
+#include <xen/xmalloc.h>
+#include <xen/domain_page.h>
+#include <asm/delay.h>
+#include <asm/string.h>
+#include <asm/iommu.h>
+#include <asm/hvm/vmx/intel-iommu.h>
+#include "dmar.h"
+#include "pci-direct.h"
+#include "pci_regs.h"
+#include "msi.h"
+
+extern void print_iommu_regs(struct acpi_drhd_unit *drhd);
+extern void print_vtd_entries(struct domain *d, int bus, int devfn,
+                       unsigned long gmfn);
+extern void (*interrupt[])(void);
+
+#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
+
+#define time_after(a,b)         \
+        (typecheck(unsigned long, a) && \
+         typecheck(unsigned long, b) && \
+         ((long)(b) - (long)(a) < 0))
+
+unsigned int x86_clflush_size;
+void clflush_cache_range(void *adr, int size)
+{
+    int i;
+    for (i = 0; i < size; i += x86_clflush_size)
+        clflush(adr + i);
+}
+
+static void __iommu_flush_cache(struct iommu *iommu, void *addr, int size)
+{
+    if (!ecap_coherent(iommu->ecap))
+        clflush_cache_range(addr, size);
+}
+
+#define iommu_flush_cache_entry(iommu, addr) \
+       __iommu_flush_cache(iommu, addr, 8)
+#define iommu_flush_cache_page(iommu, addr) \
+       __iommu_flush_cache(iommu, addr, PAGE_SIZE_4K)
+
+int nr_iommus;
+/* context entry handling */
+static struct context_entry * device_to_context_entry(struct iommu *iommu,
+        u8 bus, u8 devfn)
+{
+    struct root_entry *root;
+    struct context_entry *context;
+    unsigned long phy_addr;
+    unsigned long flags;
+
+    spin_lock_irqsave(&iommu->lock, flags);
+    root = &iommu->root_entry[bus];
+    if (!root_present(*root)) {
+        phy_addr = (unsigned long) alloc_xenheap_page();
+        if (!phy_addr) {
+            spin_unlock_irqrestore(&iommu->lock, flags);
+            return NULL;
+        }
+        memset((void *) phy_addr, 0, PAGE_SIZE);
+        iommu_flush_cache_page(iommu, (void *)phy_addr);
+        phy_addr = virt_to_maddr((void *)phy_addr);
+        set_root_value(*root, phy_addr);
+        set_root_present(*root);
+        iommu_flush_cache_entry(iommu, root);
+    }
+    phy_addr = (unsigned long) get_context_addr(*root);
+    context = (struct context_entry *)maddr_to_virt(phy_addr);
+    spin_unlock_irqrestore(&iommu->lock, flags);
+    return &context[devfn];
+}
+
+static int device_context_mapped(struct iommu *iommu, u8 bus, u8 devfn)
+{
+    struct root_entry *root;
+    struct context_entry *context;
+    unsigned long phy_addr;
+    int ret;
+    unsigned long flags;
+
+    spin_lock_irqsave(&iommu->lock, flags);
+    root = &iommu->root_entry[bus];
+    if (!root_present(*root)) {
+        ret = 0;
+        goto out;
+    }
+    phy_addr = get_context_addr(*root);
+    context = (struct context_entry *)maddr_to_virt(phy_addr);
+    ret = context_present(context[devfn]);
+out:
+    spin_unlock_irqrestore(&iommu->lock, flags);
+    return ret;
+}
+
+/* page table handling */
+#define LEVEL_STRIDE        (9)
+#define LEVEL_MASK        ((1 << LEVEL_STRIDE) - 1)
+#define agaw_to_level(val) ((val) + 2)
+#define agaw_to_width(val) (30 + val * LEVEL_STRIDE)
+#define width_to_agaw(w)  ((w - 30)/LEVEL_STRIDE)
+#define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE)
+#define address_level_offset(addr, level) \
+    ((addr >> level_to_offset_bits(level)) & LEVEL_MASK)
+#define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l))
+#define level_size(l) (1 << level_to_offset_bits(l))
+#define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l))
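+/*
+ * Worked example (illustrative, not used by the code): for a 48-bit
+ * adjusted guest address width, width_to_agaw(48) = (48 - 30) / 9 = 2,
+ * agaw_to_level(2) = 4 page-table levels, and level_to_offset_bits(1) = 12,
+ * i.e. the last level maps 4K pages (level_size(1) = 1 << 12).
+ */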
+static struct dma_pte * addr_to_dma_pte(struct domain *domain, u64 addr)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int addr_width = agaw_to_width(hd->agaw);
+    struct dma_pte *parent, *pte = NULL, *pgd;
+    int level = agaw_to_level(hd->agaw);
+    int offset;
+    unsigned long flags;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    addr &= (((u64)1) << addr_width) - 1;
+    spin_lock_irqsave(&hd->mapping_lock, flags);
+    if (!hd->pgd) {
+        pgd = (struct dma_pte *)alloc_xenheap_page();
+        if (!pgd && !hd->pgd) {
+            spin_unlock_irqrestore(&hd->mapping_lock, flags);
+            return NULL;
+        }
+        if (pgd) {
+            memset((u8*)pgd, 0, PAGE_SIZE);
+            if (!hd->pgd)
+                hd->pgd = pgd;
+            else /* somebody is fast */
+                free_xenheap_page((void *) pgd);
+        }
+    }
+    parent = hd->pgd;
+    while (level > 0) {
+        u8 *tmp;
+        offset = address_level_offset(addr, level);
+        pte = &parent[offset];
+        if (level == 1)
+            break;
+        if (dma_pte_addr(*pte) == 0) {
+            tmp = alloc_xenheap_page();
+            if (tmp == NULL) {
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                    "addr_to_dma_pte: tmp == NULL\n");
+                spin_unlock_irqrestore(&hd->mapping_lock, flags);
+                return NULL;
+            }
+
+            memset(tmp, 0, PAGE_SIZE);
+            iommu_flush_cache_page(iommu, tmp);
+
+            if (dma_pte_addr(*pte) == 0) {
+                dma_set_pte_addr(*pte,
+                    virt_to_maddr(tmp));
+                /*
+                 * Higher-level tables always set r/w; the last-level
+                 * page table controls read/write.
+                 */
+                dma_set_pte_readable(*pte);
+                dma_set_pte_writable(*pte);
+                iommu_flush_cache_entry(iommu, pte);
+            } else /* somebody is fast */
+                free_xenheap_page(tmp);
+        }
+        parent = maddr_to_virt(dma_pte_addr(*pte));
+        level--;
+    }
+    spin_unlock_irqrestore(&hd->mapping_lock, flags);
+    return pte;
+}
+
+/* return address's pte at specific level */
+static struct dma_pte *dma_addr_level_pte(struct domain *domain, u64 addr,
+        int level)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct dma_pte *parent, *pte = NULL;
+    int total = agaw_to_level(hd->agaw);
+    int offset;
+
+    parent = hd->pgd;
+    while (level <= total) {
+        offset = address_level_offset(addr, total);
+        pte = &parent[offset];
+        if (level == total)
+            return pte;
+
+        if (dma_pte_addr(*pte) == 0)
+            break;
+        parent = maddr_to_virt(dma_pte_addr(*pte));
+        total--;
+    }
+    return NULL;
+}
+
+static void iommu_flush_write_buffer(struct iommu *iommu)
+{
+       u32 val;
+       unsigned long flag;
+       unsigned long start_time;
+
+       if (!cap_rwbf(iommu->cap))
+               return;
+       val = iommu->gcmd | DMA_GCMD_WBF;
+
+       spin_lock_irqsave(&iommu->register_lock, flag);
+       dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
+
+       /* Make sure hardware complete it */
+       start_time = jiffies;
+       while (1) {
+               val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+               if (!(val & DMA_GSTS_WBFS))
+                       break;
+               if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))
+                       panic("DMAR hardware is malfunctioning, please disable IOMMU\n");
+               cpu_relax();
+       }
+       spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* the return value determines whether we need a write buffer flush */
+static int __iommu_flush_context(struct iommu *iommu,
+       u16 did, u16 source_id, u8 function_mask, u64 type,
+       int non_present_entry_flush)
+{
+       u64 val = 0;
+       unsigned long flag;
+       unsigned long start_time;
+
+       /*
+        * In the non-present entry flush case: if the hardware doesn't cache
+        * non-present entries we do nothing; if it does, we flush the entries
+        * of domain 0 (the domain id used to cache any non-present entries).
+        */
+       if (non_present_entry_flush) {
+               if (!cap_caching_mode(iommu->cap))
+                       return 1;
+               else
+                       did = 0;
+       }
+
+        /* use register invalidation */
+        switch (type)
+        {
+            case DMA_CCMD_GLOBAL_INVL:
+                val = DMA_CCMD_GLOBAL_INVL;
+                break;
+            case DMA_CCMD_DOMAIN_INVL:
+                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+                break;
+            case DMA_CCMD_DEVICE_INVL:
+                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+                  |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
+                break;
+            default:
+                BUG();
+        }
+        val |= DMA_CCMD_ICC;
+
+        spin_lock_irqsave(&iommu->register_lock, flag);
+        dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
+
+        /* Make sure hardware complete it */
+        start_time = jiffies;
+        while (1) {
+            val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
+            if (!(val & DMA_CCMD_ICC))
+                break;
+            if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))
+                panic("DMAR hardware is malfunctioning, please disable IOMMU\n");
+            cpu_relax();
+        }
+        spin_unlock_irqrestore(&iommu->register_lock, flag);
+       /* flushing a context entry will implicitly flush the write buffer */
+       return 0;
+}
+
+static int inline iommu_flush_context_global(struct iommu *iommu,
+       int non_present_entry_flush)
+{
+       return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
+               non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_domain(struct iommu *iommu, u16 did,
+       int non_present_entry_flush)
+{
+       return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
+               non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_device(struct iommu *iommu,
+       u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
+{
+       return __iommu_flush_context(iommu, did, source_id, function_mask,
+               DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
+}
+
+/* the return value determines whether we need a write buffer flush */
+static int __iommu_flush_iotlb(struct iommu *iommu, u16 did,
+       u64 addr, unsigned int size_order, u64 type,
+       int non_present_entry_flush)
+{
+       int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+       u64 val = 0, val_iva = 0;
+       unsigned long flag;
+       unsigned long start_time;
+
+       /*
+        * In the non-present entry flush case: if the hardware doesn't cache
+        * non-present entries we do nothing; if it does, we flush the entries
+        * of domain 0 (the domain id used to cache any non-present entries).
+        */
+       if (non_present_entry_flush) {
+               if (!cap_caching_mode(iommu->cap))
+                       return 1;
+               else
+                       did = 0;
+       }
+
+        /* use register invalidation */
+        switch (type) {
+            case DMA_TLB_GLOBAL_FLUSH:
+                /* global flush doesn't need set IVA_REG */
+                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+                break;
+            case DMA_TLB_DSI_FLUSH:
+                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+                break;
+            case DMA_TLB_PSI_FLUSH:
+                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+                /* Note: always flush non-leaf currently */
+                val_iva = size_order | addr;
+                break;
+            default:
+                BUG();
+        }
+        /* Note: set drain read/write */
+#if 0
+        /*
+         * This is probably only needed to be extra safe; it looks like
+         * we can ignore it without any impact.
+         */
+        if (cap_read_drain(iommu->cap))
+            val |= DMA_TLB_READ_DRAIN;
+#endif
+        if (cap_write_drain(iommu->cap))
+            val |= DMA_TLB_WRITE_DRAIN;
+
+        spin_lock_irqsave(&iommu->register_lock, flag);
+        /* Note: Only uses first TLB reg currently */
+        if (val_iva)
+            dmar_writeq(iommu->reg, tlb_offset, val_iva);
+        dmar_writeq(iommu->reg, tlb_offset + 8, val);
+
+        /* Make sure hardware complete it */
+        start_time = jiffies;
+        while (1) {
+            val = dmar_readq(iommu->reg, tlb_offset + 8);
+            if (!(val & DMA_TLB_IVT))
+                break;
+            if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))
+                panic("DMAR hardware is malfunctioning, please disable IOMMU\n");
+            cpu_relax();
+        }
+        spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+        /* check IOTLB invalidation granularity */
+        if (DMA_TLB_IAIG(val) == 0)
+            printk(KERN_ERR VTDPREFIX "IOMMU: flush IOTLB failed\n");
+        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+            printk(KERN_ERR VTDPREFIX "IOMMU: tlb flush request %x, actual %x\n",
+              (u32)DMA_TLB_IIRG(type), (u32)DMA_TLB_IAIG(val));
+       /* flushing an IOTLB entry will implicitly flush the write buffer */
+       return 0;
+}
+
+static int inline iommu_flush_iotlb_global(struct iommu *iommu,
+       int non_present_entry_flush)
+{
+       return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
+               non_present_entry_flush);
+}
+
+static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
+       int non_present_entry_flush)
+{
+       return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
+               non_present_entry_flush);
+}
+
+static int inline get_alignment(u64 base, unsigned int size)
+{
+       int t = 0;
+       u64 end;
+
+       end = base + size - 1;
+       while (base != end) {
+               t++;
+               base >>= 1;
+               end >>= 1;
+       }
+       return t;
+}
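+/*
+ * Example (illustrative only): get_alignment(0x10, 8) computes end = 0x17;
+ * base and end become equal after 3 right shifts, so it returns 3, i.e. the
+ * range is covered by an order-3 (8-page) naturally aligned block.
+ */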
+
+static int inline iommu_flush_iotlb_psi(struct iommu *iommu, u16 did,
+       u64 addr, unsigned int pages, int non_present_entry_flush)
+{
+       unsigned int align;
+
+       BUG_ON(addr & (~PAGE_MASK_4K));
+       BUG_ON(pages == 0);
+
+       /* Fallback to domain selective flush if no PSI support */
+       if (!cap_pgsel_inv(iommu->cap))
+               return iommu_flush_iotlb_dsi(iommu, did,
+                       non_present_entry_flush);
+
+       /*
+        * PSI requires the page count to be a power of two and the base
+        * address to be naturally aligned to the size.
+        */
+       align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
+       /* Fallback to domain selective flush if size is too big */
+       if (align > cap_max_amask_val(iommu->cap))
+               return iommu_flush_iotlb_dsi(iommu, did,
+                       non_present_entry_flush);
+
+       addr >>= PAGE_SHIFT_4K + align;
+       addr <<= PAGE_SHIFT_4K + align;
+
+       return __iommu_flush_iotlb(iommu, did, addr, align,
+               DMA_TLB_PSI_FLUSH, non_present_entry_flush);
+}
+
+void flush_all(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int i = 0;
+
+    wbinvd();
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        iommu_flush_context_global(iommu, 0);
+        iommu_flush_iotlb_global(iommu, 0);
+        i++;
+    }
+}
+
+/* clear one page's page table */
+static void dma_pte_clear_one(struct domain *domain, u64 addr)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct dma_pte *pte = NULL;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+
+    /* get last level pte */
+    pte = dma_addr_level_pte(domain, addr, 1);
+
+    if (pte) {
+        dma_clear_pte(*pte);
+        iommu_flush_cache_entry(drhd->iommu, pte);
+
+        for_each_drhd_unit(drhd) {
+            iommu = drhd->iommu;
+            if (cap_caching_mode(iommu->cap))
+            {
+                iommu_flush_iotlb_psi(iommu, domain->domain_id, addr, 1, 0);
+            }
+            else if (cap_rwbf(iommu->cap))
+                iommu_flush_write_buffer(iommu);
+        }
+    }
+}
+
+/* clear last level pte, a tlb flush should be followed */
+static void dma_pte_clear_range(struct domain *domain, u64 start, u64 end)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    int addr_width = agaw_to_width(hd->agaw);
+
+    start &= (((u64)1) << addr_width) - 1;
+    end &= (((u64)1) << addr_width) - 1;
+    /* in case it's partial page */
+    start = PAGE_ALIGN_4K(start);
+    end &= PAGE_MASK_4K;
+
+    /* we don't need lock here, nobody else touches the iova range */
+    while (start < end) {
+        dma_pte_clear_one(domain, start);
+        start += PAGE_SIZE_4K;
+    }
+}
+
+/* free page table pages. last level pte should already be cleared */
+// static void dma_pte_free_pagetable(struct domain *domain, u64 start, u64 end)
+void dma_pte_free_pagetable(struct domain *domain, u64 start, u64 end)
+{
+    struct acpi_drhd_unit *drhd;
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct iommu *iommu;
+    int addr_width = agaw_to_width(hd->agaw);
+    struct dma_pte *pte;
+    int total = agaw_to_level(hd->agaw);
+    int level;
+    u32 tmp;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    start &= (((u64)1) << addr_width) - 1;
+    end &= (((u64)1) << addr_width) - 1;
+
+    /* we don't need lock here, nobody else touches the iova range */
+    level = 2;
+    while (level <= total) {
+        tmp = align_to_level(start, level);
+        if (tmp >= end || (tmp + level_size(level) > end))
+            return;
+
+        while (tmp < end) {
+            pte = dma_addr_level_pte(domain, tmp, level);
+            if (pte) {
+                free_xenheap_page((void *) maddr_to_virt(dma_pte_addr(*pte)));
+                dma_clear_pte(*pte);
+                iommu_flush_cache_entry(iommu, pte);
+            }
+            tmp += level_size(level);
+        }
+        level++;
+    }
+    /* free pgd */
+    if (start == 0 && end == ((((u64)1) << addr_width) - 1)) {
+        free_xenheap_page((void *)hd->pgd);
+        hd->pgd = NULL;
+    }
+}
+
+/* iommu handling */
+static int iommu_set_root_entry(struct iommu *iommu)
+{
+    void *addr;
+    u32 cmd, sts;
+    struct root_entry *root;
+    unsigned long flags;
+
+    if (iommu == NULL)
+        gdprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_set_root_entry: iommu == NULL\n");
+
+    spin_lock_irqsave(&iommu->lock, flags);
+    if (!iommu->root_entry) {
+        spin_unlock_irqrestore(&iommu->lock, flags);
+        root = (struct root_entry *)alloc_xenheap_page();
+        if (root) {
+            memset((u8*)root, 0, PAGE_SIZE);
+            iommu_flush_cache_page(iommu, root);
+        }
+        spin_lock_irqsave(&iommu->lock, flags);
+
+        if (!root && !iommu->root_entry) {
+            spin_unlock_irqrestore(&iommu->lock, flags);
+            return -ENOMEM;
+        }
+
+        if (!iommu->root_entry)
+            iommu->root_entry = root;
+        else /* somebody is fast */
+            free_xenheap_page((void *)root);
+    }
+    spin_unlock_irqrestore(&iommu->lock, flags);
+
+    addr = iommu->root_entry;
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writeq(iommu->reg, DMAR_RTADDR_REG, virt_to_maddr(addr));
+    cmd = iommu->gcmd | DMA_GCMD_SRTP;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
+
+    /* Make sure hardware complete it */
+    while (1) {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if (sts & DMA_GSTS_RTPS)
+            break;
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+    return 0;
+}
+
+static int iommu_enable_translation(struct iommu *iommu)
+{
+    u32 sts;
+    unsigned long flags;
+
+    dprintk(XENLOG_INFO VTDPREFIX,
+        "iommu_enable_translation: enabling vt-d translation\n");
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    iommu->gcmd |= DMA_GCMD_TE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+    /* Make sure hardware complete it */
+    while (1) {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if (sts & DMA_GSTS_TES) {
+            break;
+        }
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return 0;
+}
+
+int iommu_disable_translation(struct iommu *iommu)
+{
+    u32 sts;
+    unsigned long flags;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    iommu->gcmd &= ~ DMA_GCMD_TE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    /* Make sure hardware complete it */
+    while(1) {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if (!(sts & DMA_GSTS_TES))
+                break;
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return 0;
+}
+
+static struct iommu *vector_to_iommu[NR_VECTORS];
+static int iommu_page_fault_do_one(struct iommu *iommu, int type,
+        u8 fault_reason, u16 source_id, u32 addr)
+{
+    dprintk(XENLOG_WARNING VTDPREFIX,
+        "iommu_page_fault:%s: DEVICE %x:%x.%x addr %x REASON %x\n",
+        (type ? "DMA Read" : "DMA Write"),
+        (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+        PCI_FUNC(source_id & 0xFF), addr, fault_reason);
+
+    print_vtd_entries(current->domain, (source_id >> 8),(source_id & 0xff),
+                      (addr >> PAGE_SHIFT)); 
+    return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+static void iommu_page_fault(int vector, void *dev_id,
+        struct cpu_user_regs *regs)
+{
+    struct iommu *iommu = dev_id;
+    int reg, fault_index;
+    u32 fault_status;
+    unsigned long flags;
+
+    dprintk(XENLOG_WARNING VTDPREFIX,
+        "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+    /* FIXME: ignore advanced fault log */
+    if (!(fault_status & DMA_FSTS_PPF))
+        return;
+    fault_index = dma_fsts_fault_record_index(fault_status);
+    reg = cap_fault_reg_offset(iommu->cap);
+    while (1) {
+        u8 fault_reason;
+        u16 source_id;
+        u32 guest_addr;
+        int type;
+        u32 data;
+
+        /* highest 32 bits */
+        spin_lock_irqsave(&iommu->register_lock, flags);
+        data = dmar_readl(iommu->reg, reg +
+                fault_index * PRIMARY_FAULT_REG_LEN + 12);
+        if (!(data & DMA_FRCD_F)) {
+            spin_unlock_irqrestore(&iommu->register_lock, flags);
+            break;
+        }
+
+        fault_reason = dma_frcd_fault_reason(data);
+        type = dma_frcd_type(data);
+
+        data = dmar_readl(iommu->reg, reg +
+                fault_index * PRIMARY_FAULT_REG_LEN + 8);
+        source_id = dma_frcd_source_id(data);
+
+        guest_addr = dmar_readq(iommu->reg, reg +
+                fault_index * PRIMARY_FAULT_REG_LEN);
+        guest_addr = dma_frcd_page_addr(guest_addr);
+        /* clear the fault */
+        dmar_writel(iommu->reg, reg +
+            fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
+        spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+        iommu_page_fault_do_one(iommu, type, fault_reason,
+                source_id, guest_addr);
+
+        fault_index++;
+        if (fault_index > cap_num_fault_regs(iommu->cap))
+            fault_index = 0;
+    }
+    /* clear primary fault overflow */
+    if (fault_status & DMA_FSTS_PFO) {
+        spin_lock_irqsave(&iommu->register_lock, flags);
+        dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
+        spin_unlock_irqrestore(&iommu->register_lock, flags);
+    }
+    return;
+}
+
+static void dma_msi_unmask(unsigned int vector)
+{
+    struct iommu *iommu = vector_to_iommu[vector];
+    unsigned long flags;
+
+    /* unmask it */
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_mask(unsigned int vector)
+{
+    unsigned long flags;
+    struct iommu *iommu = vector_to_iommu[vector];
+
+    /* mask it */
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static unsigned int dma_msi_startup(unsigned int vector)
+{
+    dma_msi_unmask(vector);
+    return 0;
+}
+
+static void dma_msi_end(unsigned int vector)
+{
+    dma_msi_unmask(vector);
+    ack_APIC_irq();
+}
+
+static void dma_msi_data_init(struct iommu *iommu, int vector)
+{
+    u32 msi_data = 0;
+    unsigned long flags;
+
+    /* Fixed, edge, assert mode. Follow MSI setting */
+    msi_data |= vector & 0xff;
+    msi_data |= 1 << 14;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
+{
+    u64 msi_address;
+    unsigned long flags;
+
+    /* Physical, dedicated cpu. Follow MSI setting */
+    msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
+    msi_address |= MSI_PHYSICAL_MODE << 2;
+    msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
+    msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
+    dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
+{
+    struct iommu *iommu = vector_to_iommu[vector];
+    dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
+}
+
+static struct hw_interrupt_type dma_msi_type = {
+    .typename = "DMA_MSI",
+    .startup = dma_msi_startup,
+    .shutdown = dma_msi_mask,
+    .enable = dma_msi_unmask,
+    .disable = dma_msi_mask,
+    .ack = dma_msi_mask,
+    .end = dma_msi_end,
+    .set_affinity = dma_msi_set_affinity,
+};
+
+int iommu_set_interrupt(struct iommu *iommu)
+{
+    int vector, ret;
+    unsigned long flags;
+
+    vector = assign_irq_vector(AUTO_ASSIGN);
+    if (!vector) {
+        gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
+        return -EINVAL;
+    }
+
+    vector_to_iommu[vector] = iommu;
+
+    /* VT-d fault is a MSI, make irq == vector */
+    irq_vector[vector] = vector;
+    vector_irq[vector] = vector;
+
+    spin_lock_irqsave(&irq_desc[vector].lock, flags);
+    irq_desc[vector].handler = &dma_msi_type;
+    spin_unlock_irqrestore(&irq_desc[vector].lock, flags);
+    set_intr_gate(vector, interrupt[vector]);
+    ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu);
+    if (ret)
+        gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
+    return vector;
+}
+
+struct iommu *iommu_alloc(void *hw_data)
+{
+    struct acpi_drhd_unit *drhd = (struct acpi_drhd_unit *) hw_data;
+    struct iommu *iommu;
+    
+    if (nr_iommus > MAX_IOMMUS) {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+            "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
+        return NULL;
+    }
+        
+    iommu = xmalloc(struct iommu);
+    if (!iommu)
+        return NULL;
+    memset(iommu, 0, sizeof(struct iommu));
+
+    set_fixmap_nocache(FIX_IOMMU_REGS_BASE_0 + nr_iommus, drhd->address);
+    iommu->reg = (void *) fix_to_virt(FIX_IOMMU_REGS_BASE_0 + nr_iommus);
+    dprintk(XENLOG_INFO VTDPREFIX,
+        "iommu_alloc: iommu->reg = %p drhd->address = %lx\n",
+        iommu->reg, drhd->address);
+    nr_iommus++;
+
+    if (!iommu->reg) {
+        printk(KERN_ERR VTDPREFIX "IOMMU: can't map the region\n");
+        goto error;
+    }
+
+    iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
+    iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
+
+    spin_lock_init(&iommu->lock);
+    spin_lock_init(&iommu->register_lock);
+
+    drhd->iommu = iommu;
+    return iommu;
+error:
+    xfree(iommu);
+    return NULL;
+}
+
+static void free_iommu(struct iommu *iommu)
+{
+    if (!iommu)
+        return;
+    if (iommu->root_entry)
+        free_xenheap_page((void *)iommu->root_entry);
+    if (iommu->reg)
+        iounmap(iommu->reg);
+    free_irq(iommu->vector);
+    xfree(iommu);
+}
+
+#define guestwidth_to_adjustwidth(gaw) ({ \
+    int agaw; \
+    int r = (gaw - 12) % 9; \
+    if (r == 0) \
+        agaw = gaw; \
+    else \
+        agaw = gaw + 9 - r; \
+    if (agaw > 64) \
+        agaw = 64; \
+    agaw; })
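+/*
+ * Example (illustrative only): guestwidth_to_adjustwidth(32) gives
+ * r = (32 - 12) % 9 = 2, so agaw = 32 + 9 - 2 = 39; widths that are already
+ * of the form 12 + 9*n, such as 39 or 48, are returned unchanged.
+ */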
+int iommu_domain_init(struct domain *domain)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct iommu *iommu = NULL;
+    int guest_width = DEFAULT_DOMAIN_ADDRESS_WIDTH;
+    int adjust_width, agaw;
+    unsigned long sagaw;
+    struct acpi_drhd_unit *drhd;
+
+    if (list_empty(&acpi_drhd_units))
+        return 0;
+    spin_lock_init(&hd->mapping_lock);
+    spin_lock_init(&hd->iommu_list_lock);
+    INIT_LIST_HEAD(&hd->pdev_list);
+
+    for_each_drhd_unit(drhd) {
+        if (drhd->iommu)
+            iommu = drhd->iommu;
+        else
+            iommu = iommu_alloc(drhd);
+    }
+
+    /* calculate AGAW */
+    if (guest_width > cap_mgaw(iommu->cap))
+        guest_width = cap_mgaw(iommu->cap);
+    adjust_width = guestwidth_to_adjustwidth(guest_width);
+    agaw = width_to_agaw(adjust_width);
+    /* FIXME: hardware doesn't support it, choose a bigger one? */
+    sagaw = cap_sagaw(iommu->cap);
+    if (!test_bit(agaw, &sagaw)) {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+            "IOMMU: hardware doesn't support the agaw\n");
+        agaw = find_next_bit(&sagaw, 5, agaw);
+        if (agaw >= 5)
+            return -ENODEV;
+    }
+    hd->agaw = agaw;
+    return 0;
+}
+
+static int domain_context_mapping_one(
+    struct domain *domain,
+    struct iommu *iommu,
+    u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct context_entry *context;
+    unsigned long flags;
+    int ret = 0;
+
+    context = device_to_context_entry(iommu, bus, devfn);
+    if (!context) {
+        gdprintk(XENLOG_INFO VTDPREFIX,
+            "domain_context_mapping_one:context == NULL:bdf = %x:%x:%x \n",
+            bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        return -ENOMEM;
+    }
+    spin_lock_irqsave(&iommu->lock, flags);
+    if (context_present(*context)) {
+        spin_unlock_irqrestore(&iommu->lock, flags);
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_mapping_one:context present:bdf=%x:%x:%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        return 0;
+    }
+
+#ifdef VTD_DEBUG
+    dprintk(XENLOG_INFO VTDPREFIX,
+        "context_mapping_one_1-%x:%x:%x-*context = %lx %lx\n",
+        bus, PCI_SLOT(devfn), PCI_FUNC(devfn), context->hi, context->lo);
+#endif
+
+    /*
+     * domain_id 0 is not valid on Intel's IOMMU; domain IDs must be
+     * 1-based, as required by the Intel IOMMU hardware.
+     */
+    context_set_domain_id(*context, domain->domain_id);
+    context_set_address_width(*context, hd->agaw);
+
+    if (ecap_pass_thru(iommu->ecap))
+        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+    else {
+        context_set_address_root(*context, virt_to_maddr(hd->pgd));
+        context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+    }
+
+    context_set_fault_enable(*context);
+    context_set_present(*context);
+    iommu_flush_cache_entry(iommu, context);
+
+#ifdef VTD_DEBUG
+    dprintk(XENLOG_INFO VTDPREFIX,
+        "context_mapping_one_2-%x:%x:%x-*context=%lx %lx hd->pgd = %p\n",
+        bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+        context->hi, context->lo, hd->pgd);
+#endif
+
+    if (iommu_flush_context_device(iommu, domain->domain_id,
+                    (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
+        iommu_flush_write_buffer(iommu);
+    else
+        iommu_flush_iotlb_dsi(iommu, domain->domain_id, 0);
+    spin_unlock_irqrestore(&iommu->lock, flags);
+    return ret;
+}
+
+static int __pci_find_next_cap(u8 bus, unsigned int devfn, u8 pos, int cap)
+{
+    u8 id;
+    int ttl = 48;
+
+    while (ttl--) {
+        pos = read_pci_config_byte(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos);
+        if (pos < 0x40)
+            break;
+        pos &= ~3;
+        id = read_pci_config_byte(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                 pos + PCI_CAP_LIST_ID);
+
+        if (id == 0xff)
+            break;
+        if (id == cap)
+            return pos;
+        pos += PCI_CAP_LIST_NEXT;
+    }
+    return 0;
+}
+
+#define PCI_BASE_CLASS_BRIDGE    0x06
+#define PCI_CLASS_BRIDGE_PCI     0x0604
+
+#define DEV_TYPE_PCIe_ENDPOINT   1
+#define DEV_TYPE_PCI_BRIDGE      2
+#define DEV_TYPE_PCI             3
+
+int pdev_type(struct pci_dev *dev)
+{
+    u16 class_device;
+    u16 status;
+
+    class_device = read_pci_config_16(dev->bus, PCI_SLOT(dev->devfn),
+                 PCI_FUNC(dev->devfn), PCI_CLASS_DEVICE);
+    if (class_device == PCI_CLASS_BRIDGE_PCI)
+        return DEV_TYPE_PCI_BRIDGE;
+
+    status = read_pci_config_16(dev->bus, PCI_SLOT(dev->devfn),
+                 PCI_FUNC(dev->devfn), PCI_STATUS);
+
+    if (!(status & PCI_STATUS_CAP_LIST))
+        return DEV_TYPE_PCI;
+
+    if (__pci_find_next_cap(dev->bus, dev->devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP))
+        return DEV_TYPE_PCIe_ENDPOINT;
+
+    return DEV_TYPE_PCI;
+}
+
+#define MAX_BUSES 256
+struct pci_dev bus2bridge[MAX_BUSES];
+
+static int domain_context_mapping(
+    struct domain *domain,
+    struct iommu *iommu,
+    struct pci_dev *pdev)
+{
+    int ret = 0;
+    int dev, func, sec_bus, sub_bus;
+    u32 type;
+
+    type = pdev_type(pdev);
+    if (type == DEV_TYPE_PCI_BRIDGE) {
+        sec_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+                      PCI_FUNC(pdev->devfn), PCI_SECONDARY_BUS);
+
+        if (bus2bridge[sec_bus].bus == 0) {
+            bus2bridge[sec_bus].bus   =  pdev->bus;
+            bus2bridge[sec_bus].devfn =  pdev->devfn;
+        }
+
+        sub_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+                      PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
+
+        if (sec_bus != sub_bus) {
+            dprintk(XENLOG_INFO VTDPREFIX,
+                "context_mapping: nested PCI bridge not supported\n");
+            dprintk(XENLOG_INFO VTDPREFIX,
+                "    bdf = %x:%x:%x sec_bus = %x sub_bus = %x\n",
+                pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn),
+                sec_bus, sub_bus);
+        }
+    }
+
+    if (type == DEV_TYPE_PCIe_ENDPOINT) {
+        gdprintk(XENLOG_INFO VTDPREFIX,
+            "domain_context_mapping:PCIe : bdf = %x:%x:%x\n",
+            pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+        ret = domain_context_mapping_one(domain, iommu,
+                  (u8)(pdev->bus), (u8) (pdev->devfn));
+    }
+
+    /* PCI devices */
+    if (type == DEV_TYPE_PCI) {
+        gdprintk(XENLOG_INFO VTDPREFIX,
+            "domain_context_mapping:PCI: bdf = %x:%x:%x\n",
+            pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+        if (pdev->bus == 0)
+            ret = domain_context_mapping_one(domain, iommu,
+                      (u8)(pdev->bus), (u8) (pdev->devfn));
+        else {
+            if (bus2bridge[pdev->bus].bus == 0)
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                    "domain_context_mapping:bus2bridge[pdev->bus].bus == 0\n");
+
+            ret = domain_context_mapping_one(domain, iommu,
+                      (u8)(bus2bridge[pdev->bus].bus),
+                      (u8)(bus2bridge[pdev->bus].devfn));
+
+            /* now map everything behind the PCI bridge */
+            for (dev = 0; dev < 32; dev++) {
+                for (func = 0; func < 8; func++) {
+                    ret = domain_context_mapping_one(domain, iommu,
+                              pdev->bus, (u8)PCI_DEVFN(dev, func));
+                    if (ret)
+                        return ret;
+                }
+            }
+        }
+    }
+    return ret;
+}
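
bus2bridge[] records, for each secondary bus, the bus/devfn of the PCI-PCI
bridge in front of it, so a conventional PCI device on a non-zero bus is
context-mapped through its bridge's BDF (the requester id the IOMMU actually
sees). A minimal sketch of that lookup as a hypothetical helper (not in the
patch), treating bus2bridge[bus].bus == 0 as "no bridge recorded" just as the
warning above does:

    static int bridge_bdf_for(u8 bus, u8 *bridge_bus, u8 *bridge_devfn)
    {
        if (bus == 0 || bus2bridge[bus].bus == 0)
            return -1;                  /* root bus, or bridge never seen */
        *bridge_bus   = bus2bridge[bus].bus;
        *bridge_devfn = bus2bridge[bus].devfn;
        return 0;
    }
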
+
+static int domain_context_unmap_one(
+    struct domain *domain,
+    struct iommu *iommu,
+    u8 bus, u8 devfn)
+{
+    struct context_entry *context;
+    unsigned long flags;
+
+    context = device_to_context_entry(iommu, bus, devfn);
+    if (!context) {
+        gdprintk(XENLOG_INFO VTDPREFIX,
+            "domain_context_unmap_one-%x:%x:%x- context == NULL:return\n",
+            bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        return -ENOMEM;
+    }
+    spin_lock_irqsave(&iommu->lock, flags);
+    if (!context_present(*context)) {
+        spin_unlock_irqrestore(&iommu->lock, flags);
+        gdprintk(XENLOG_INFO VTDPREFIX,
+            "domain_context_unmap_one-%x:%x:%x- context NOT present:return\n",
+            bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        return 0;
+    }
+    gdprintk(XENLOG_INFO VTDPREFIX,
+        "domain_context_unmap_one_1:bdf = %x:%x:%x\n",
+        bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+    context_clear_present(*context);
+    context_clear_entry(*context);
+    iommu_flush_cache_entry(iommu, context);
+    iommu_flush_context_global(iommu, 0);
+    iommu_flush_iotlb_global(iommu, 0);
+    spin_unlock_irqrestore(&iommu->lock, flags);
+
+    gdprintk(XENLOG_INFO VTDPREFIX,
+        "domain_context_unmap_one_2:bdf = %x:%x:%x\n",
+        bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+    return 0;
+}
+
+static int domain_context_unmap(
+    struct domain *domain,
+    struct iommu *iommu,
+    struct pci_dev *pdev)
+{
+    int ret = 0;
+    int dev, func, sec_bus, sub_bus;
+    u32 type;
+
+    type = pdev_type(pdev);
+    if (type == DEV_TYPE_PCI_BRIDGE) {
+        sec_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+                      PCI_FUNC(pdev->devfn), PCI_SECONDARY_BUS);
+        sub_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+                      PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
+
+        gdprintk(XENLOG_INFO VTDPREFIX,
+            "domain_context_unmap:BRIDGE:%x:%x:%x sec_bus=%x sub_bus=%x\n",
+            pdev->bus, PCI_SLOT(pdev->devfn),
+            PCI_FUNC(pdev->devfn), sec_bus, sub_bus);
+    }
+
+    if (type == DEV_TYPE_PCIe_ENDPOINT) {
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCIe : bdf = %x:%x:%x\n",
+                 pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+        ret = domain_context_unmap_one(domain, iommu,
+                  (u8)(pdev->bus), (u8) (pdev->devfn));
+    }
+
+    /* PCI devices */
+    if (type == DEV_TYPE_PCI) {
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCI: bdf = %x:%x:%x\n",
+                 pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+        if (pdev->bus == 0)
+            ret = domain_context_unmap_one(domain, iommu,
+                      (u8)(pdev->bus), (u8) (pdev->devfn));
+        else {
+            if (bus2bridge[pdev->bus].bus == 0)
+                gdprintk(XENLOG_INFO VTDPREFIX,
+                    "domain_context_unmap:bus2bridge[pdev->bus].bus == 0\n");
+
+            ret = domain_context_unmap_one(domain, iommu,
+                      (u8)(bus2bridge[pdev->bus].bus),
+                      (u8)(bus2bridge[pdev->bus].devfn));
+
+            /* now unmap everything behind the PCI bridge */
+            for (dev = 0; dev < 32; dev++) {
+                for (func = 0; func < 8; func++) {
+                    ret = domain_context_unmap_one(domain, iommu,
+                              pdev->bus, (u8)PCI_DEVFN(dev, func));
+                    if (ret)
+                        return ret;
+                }
+            }
+        }
+    }
+    return ret;
+}
+
+void reassign_device_ownership(
+    struct domain *source,
+    struct domain *target,
+    u8 bus, u8 devfn)
+{
+    struct hvm_iommu *source_hd = domain_hvm_iommu(source);
+    struct hvm_iommu *target_hd = domain_hvm_iommu(target);
+    struct pci_dev *pdev;
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int status;
+    unsigned long flags;
+
+    gdprintk(XENLOG_ERR VTDPREFIX,
+        "reassign_device-%x:%x:%x- source = %d target = %d\n",
+        bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+        source->domain_id, target->domain_id);
+
+    for_each_pdev(source, pdev) {
+        if ( (pdev->bus != bus) || (pdev->devfn != devfn) )
+            continue;
+
+        pdev->bus = bus;
+        pdev->devfn = devfn;
+        drhd = acpi_find_matched_drhd_unit(pdev);
+        iommu = drhd->iommu;
+        domain_context_unmap(source, iommu, pdev);
+
+        /*
+         * move pci device from the source domain to target domain.
+         */
+        spin_lock_irqsave(&source_hd->iommu_list_lock, flags);
+        spin_lock_irqsave(&target_hd->iommu_list_lock, flags);
+        list_move(&pdev->list, &target_hd->pdev_list);
+        spin_unlock_irqrestore(&target_hd->iommu_list_lock, flags);
+        spin_unlock_irqrestore(&source_hd->iommu_list_lock, flags);
+
+        status = domain_context_mapping(target, iommu, pdev);
+        if (status != 0)
+            gdprintk(XENLOG_ERR VTDPREFIX, "domain_context_mapping failed\n");
+
+        /*
+         * We are done.
+         */
+        break;
+    }
+}
+
+void return_devices_to_dom0(struct domain *d)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    struct pci_dev *pdev;
+
+    while (!list_empty(&hd->pdev_list)) {
+        pdev = list_entry(hd->pdev_list.next, typeof(*pdev), list);
+        dprintk(XENLOG_INFO VTDPREFIX,
+            "return_devices_to_dom0: bdf = %x:%x:%x\n",
+            pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+        reassign_device_ownership(d, dom0, pdev->bus, pdev->devfn);
+    }
+
+#ifdef VTD_DEBUG
+    for_each_pdev(dom0, pdev) {
+        dprintk(XENLOG_INFO VTDPREFIX,
+            "return_devices_to_dom0:%x: bdf = %x:%x:%x\n",
+            dom0->domain_id, pdev->bus,
+            PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+    }
+#endif
+}
+
+void iommu_domain_teardown(struct domain *d)
+{
+  if (list_empty(&acpi_drhd_units))
+      return;
+
+#if CONFIG_PAGING_LEVELS == 3
+  {
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    int level = agaw_to_level(hd->agaw);
+    struct dma_pte *pgd = NULL;
+
+    switch (level)
+    {
+        case VTD_PAGE_TABLE_LEVEL_3:
+            if ( hd->pgd )
+                free_xenheap_page((void *)hd->pgd);
+            break;
+        case VTD_PAGE_TABLE_LEVEL_4:
+            if ( hd->pgd )
+            {
+                pgd = hd->pgd;
+                if ( pgd[0].val != 0 )
+                    free_xenheap_page((void*)maddr_to_virt(
+                        dma_pte_addr(pgd[0])));
+            }
+            break;
+        default:
+            gdprintk(XENLOG_ERR VTDPREFIX,
+                "Unsupported p2m table sharing level!\n");
+            break;
+    }
+  }
+#endif
+    return_devices_to_dom0(d);
+}
+
+static int domain_context_mapped(struct domain *domain, struct pci_dev *pdev)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int ret;
+
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        ret = device_context_mapped(iommu, pdev->bus, pdev->devfn);
+        if (ret)
+            return ret;
+    }
+    return 0;
+}
+
+int iommu_map_page(struct domain *d, paddr_t gfn, paddr_t mfn)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct dma_pte *pte = NULL;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* do nothing for dom0 if the IOMMU supports pass-through */
+    if (ecap_pass_thru(iommu->ecap) && (d->domain_id == 0))
+        return 0;
+
+    pte = addr_to_dma_pte(d, gfn << PAGE_SHIFT_4K);
+    if (!pte)
+        return -ENOMEM;
+    dma_set_pte_addr(*pte, mfn << PAGE_SHIFT_4K);
+    dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
+    iommu_flush_cache_entry(iommu, pte);
+
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        if (cap_caching_mode(iommu->cap))
+            iommu_flush_iotlb_psi(iommu, d->domain_id,
+                                  gfn << PAGE_SHIFT_4K, 1, 0);
+        else if (cap_rwbf(iommu->cap))
+            iommu_flush_write_buffer(iommu);
+    }
+    return 0;
+}
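
A caller needing a contiguous range would simply iterate; a minimal sketch of
such a (hypothetical) wrapper around iommu_map_page():

    static int iommu_map_range(struct domain *d, unsigned long gfn,
                               unsigned long mfn, unsigned long nr_pages)
    {
        unsigned long i;
        int rc;

        for (i = 0; i < nr_pages; i++)
            if ((rc = iommu_map_page(d, gfn + i, mfn + i)) != 0)
                return rc;              /* earlier mappings are left in place */
        return 0;
    }
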
+
+int iommu_unmap_page(struct domain *d, dma_addr_t gfn)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct dma_pte *pte = NULL;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* do nothing for dom0 if the IOMMU supports pass-through */
+    if (ecap_pass_thru(iommu->ecap) && (d->domain_id == 0))
+        return 0;
+
+    /* get last level pte */
+    pte = dma_addr_level_pte(d, gfn << PAGE_SHIFT_4K, 1);
+    dma_pte_clear_one(d, gfn << PAGE_SHIFT_4K);
+    
+    return 0;
+}
+
+int iommu_page_mapping(struct domain *domain, dma_addr_t iova,
+            void *hpa, size_t size, int prot)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    unsigned long start_pfn, end_pfn;
+    struct dma_pte *pte = NULL;
+    int index;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+    if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+        return -EINVAL;
+    iova = (iova >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K;
+    start_pfn = (unsigned long)(((unsigned long) hpa) >> PAGE_SHIFT_4K);
+    end_pfn = (unsigned long)
+              ((PAGE_ALIGN_4K(((unsigned long)hpa) + size)) >> PAGE_SHIFT_4K);
+    index = 0;
+    while (start_pfn < end_pfn) {
+        pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
+        if (!pte)
+            return -ENOMEM;
+        dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
+        dma_set_pte_prot(*pte, prot);
+        iommu_flush_cache_entry(iommu, pte);
+        start_pfn++;
+        index++;
+    }
+
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        if (cap_caching_mode(iommu->cap))
+            iommu_flush_iotlb_psi(iommu, domain->domain_id, iova, size, 0);
+        else if (cap_rwbf(iommu->cap))
+            iommu_flush_write_buffer(iommu);
+    }
+    return 0;
+}
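
To make the pfn arithmetic above concrete, a worked example with illustrative
values: for hpa = 0x12345 and size = 0x2000, start_pfn = 0x12345 >> 12 = 0x12
and end_pfn = PAGE_ALIGN_4K(0x14345) >> 12 = 0x15, so three 4K ptes are
written, covering iova, iova + 0x1000 and iova + 0x2000 (iova itself having
first been truncated down to a 4K boundary).
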
+
+int iommu_page_unmapping(struct domain *domain, dma_addr_t addr, size_t size)
+{
+    struct dma_pte *pte = NULL;
+
+    /* get last level pte */
+    pte = dma_addr_level_pte(domain, addr, 1);
+    dma_pte_clear_range(domain, addr, addr + size);
+    
+    return 0;
+}
+
+void iommu_flush(struct domain *d, dma_addr_t gfn, u64 *p2m_entry)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu = NULL;
+    struct dma_pte *pte = (struct dma_pte *) p2m_entry;
+
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        if (cap_caching_mode(iommu->cap))
+            iommu_flush_iotlb_psi(iommu, d->domain_id,
+                gfn << PAGE_SHIFT_4K, 1, 0);
+        else if (cap_rwbf(iommu->cap))
+            iommu_flush_write_buffer(iommu);
+    }
+    iommu_flush_cache_entry(iommu, pte);
+}
+
+int
+prepare_device(struct domain *domain, struct pci_dev dev)
+{
+    return 0;
+}
+
+static int iommu_prepare_rmrr_dev(
+    struct domain *d,
+    struct acpi_rmrr_unit *rmrr,
+    struct pci_dev *pdev)
+{
+    struct acpi_drhd_unit *drhd;
+    unsigned long size;
+    int ret;
+
+    /* page table init */
+    size = rmrr->end_address - rmrr->base_address + 1;
+    ret = iommu_page_mapping(d, rmrr->base_address,
+        (void *)rmrr->base_address, size,
+        DMA_PTE_READ|DMA_PTE_WRITE);
+    if (ret)
+        return ret;
+
+    if (domain_context_mapped(d, pdev) == 0) {
+        drhd = acpi_find_matched_drhd_unit(pdev);
+        ret = domain_context_mapping(d, drhd->iommu, pdev);
+        if (!ret)
+            return 0;
+    }
+    return ret;
+}
+
+void __init setup_dom0_devices(void)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(dom0);
+    struct acpi_drhd_unit *drhd;
+    struct pci_dev *pdev;
+    int bus, dev, func;
+    u32 l;
+    u8 hdr_type;
+    int ret;
+
+#ifdef DEBUG_VTD_CONTEXT_ENTRY
+    for (bus = 0; bus < 256; bus++) {
+        for (dev = 0; dev < 32; dev++) { 
+            for (func = 0; func < 8; func++) { 
+                struct context_entry *context;
+                struct pci_dev device;
+
+                device.bus = bus; 
+                device.devfn = PCI_DEVFN(dev, func); 
+                drhd = acpi_find_matched_drhd_unit(&device);
+                context = device_to_context_entry(drhd->iommu,
+                    bus, PCI_DEVFN(dev, func));
+                if ((context->lo != 0) || (context->hi != 0))
+                    dprintk(XENLOG_INFO VTDPREFIX,
+                        "setup_dom0_devices-%x:%x:%x- context not 0\n",
+                        bus, dev, func);
+            }
+        }    
+    }        
+#endif
+
+    for (bus = 0; bus < 256; bus++) {
+        for (dev = 0; dev < 32; dev++) { 
+            for (func = 0; func < 8; func++) { 
+                l = read_pci_config(bus, dev, func, PCI_VENDOR_ID);
+                /* some broken boards return 0 or ~0 if a slot is empty: */
+                if (l == 0xffffffff || l == 0x00000000 ||
+                    l == 0x0000ffff || l == 0xffff0000)
+                    continue;
+                pdev = xmalloc(struct pci_dev);
+                pdev->bus = bus;
+                pdev->devfn = PCI_DEVFN(dev, func);
+                list_add_tail(&pdev->list, &hd->pdev_list);
+
+                drhd = acpi_find_matched_drhd_unit(pdev);
+                ret = domain_context_mapping(dom0, drhd->iommu, pdev);
+                if (ret != 0)
+                    gdprintk(XENLOG_ERR VTDPREFIX,
+                        "domain_context_mapping failed\n");
+
+                hdr_type = read_pci_config(bus, dev, func, PCI_HEADER_TYPE);
+                // if ((hdr_type & 0x8) == 0)
+                //      break;
+            }
+        }
+    }
+    for_each_pdev(dom0, pdev) {
+        dprintk(XENLOG_INFO VTDPREFIX,
+            "setup_dom0_devices: bdf = %x:%x:%x\n",
+            pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+    }
+}
+
+void clear_fault_bit(struct iommu *iommu)
+{
+    u64 val;
+
+    val = dmar_readq(
+            iommu->reg,
+            cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
+    dmar_writeq(
+            iommu->reg,
+            cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
+            val);
+    dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
+}
+
+/*
+ * Called from the ACPI discovery code once all DMARs and RMRRs have been
+ * scanned; we then run through and initialize as much as necessary.
+ */
+int vtd_enable = 1;
+static void setup_vtd_enable(char *s)
+{
+    if ( !strcmp(s, "0") )
+        vtd_enable = 0;
+    else if ( !strcmp(s, "1") )
+        vtd_enable = 1;
+    else
+        dprintk(XENLOG_INFO VTDPREFIX,
+            "Unknown vtd_enable value specified: '%s'\n", s);
+    dprintk(XENLOG_INFO VTDPREFIX, "vtd_enable = %x\n", vtd_enable);
+}
+custom_param("vtd", setup_vtd_enable);
+
+static int init_vtd_hw(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int ret;
+
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        ret = iommu_set_root_entry(iommu);
+        if (ret) {
+            gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
+            return -EIO;
+        }
+    }
+    return 0;
+}
+
+static int enable_vtd_translation(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int vector = 0;
+
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        vector = iommu_set_interrupt(iommu);
+        dma_msi_data_init(iommu, vector);
+        dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
+        iommu->vector = vector;
+        clear_fault_bit(iommu);
+        if (vtd_enable && iommu_enable_translation(iommu))
+            return -EIO;
+    }
+    return 0;
+}
+
+static void setup_dom0_rmrr(void)
+{
+    struct acpi_rmrr_unit *rmrr;
+    struct pci_dev *pdev;
+    int ret;
+
+    for_each_rmrr_device(rmrr, pdev)
+        ret = iommu_prepare_rmrr_dev(dom0, rmrr, pdev);
+        if (ret)
+            gdprintk(XENLOG_ERR VTDPREFIX,
+                "IOMMU: mapping reserved region failed\n");
+    end_for_each_rmrr_device(rmrr, pdev)
+}
+
+int iommu_setup(void)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(dom0);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+
+    if (list_empty(&acpi_drhd_units))
+        return 0;
+
+    INIT_LIST_HEAD(&hd->pdev_list);
+
+    /* start from scratch */
+    flush_all();
+
+    /* setup clflush size */
+    x86_clflush_size = ((cpuid_ebx(1) >> 8) & 0xff) * 8;
+
+    /*
+     * allocate IO page directory page for the domain.
+     */
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    hd->pgd = (struct dma_pte *)alloc_xenheap_page();
+    memset((u8*)hd->pgd, 0, PAGE_SIZE);
+
+    if (init_vtd_hw())
+        goto error;
+    setup_dom0_devices();
+    setup_dom0_rmrr();
+    if (enable_vtd_translation())
+        goto error;
+
+    return 0;
+
+error:
+    printk("iommu_setup() failed\n");
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        free_iommu(iommu);
+    }
+    return -EIO;
+}
+
+int assign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    struct acpi_rmrr_unit *rmrr;
+    struct pci_dev *pdev;
+    int ret = 0;
+
+    if (list_empty(&acpi_drhd_units))
+        return ret;
+
+    dprintk(XENLOG_INFO VTDPREFIX,
+        "assign_device: bus = %x dev = %x func = %x\n",
+        bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+    reassign_device_ownership(dom0, d, bus, devfn);
+
+    /* set up RMRR identity mappings just once per domain */
+    if (list_empty(&hd->pdev_list))
+        for_each_rmrr_device(rmrr, pdev)
+            ret = iommu_prepare_rmrr_dev(d, rmrr, pdev);
+            if (ret)
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                    "IOMMU: mapping reserved region failed\n");
+        end_for_each_rmrr_device(rmrr, pdev)
+    return ret;
+}
+
+void iommu_set_pgd(struct domain *d)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    unsigned long p2m_table;
+
+    if (hd->pgd) {
+        gdprintk(XENLOG_INFO VTDPREFIX,
+            "iommu_set_pgd_1: hd->pgd = %p\n", hd->pgd);
+        hd->pgd = NULL;
+    }
+    p2m_table = mfn_x(pagetable_get_mfn(d->arch.phys_table));
+
+#if CONFIG_PAGING_LEVELS == 3
+    if ( !hd->pgd )
+    {
+        int level = agaw_to_level(hd->agaw);
+        struct dma_pte *pmd = NULL;
+        struct dma_pte *pgd = NULL;
+        struct dma_pte *pte = NULL;
+        l3_pgentry_t *l3e;
+        unsigned long flags;
+        int i;
+
+        spin_lock_irqsave(&hd->mapping_lock, flags);
+        if (!hd->pgd) {
+            pgd = (struct dma_pte *)alloc_xenheap_page();
+            memset((u8*)pgd, 0, PAGE_SIZE);
+            if (!hd->pgd)
+                hd->pgd = pgd;
+            else /* somebody is fast */
+                free_xenheap_page((void *) pgd);
+        }
+
+        l3e = map_domain_page(p2m_table);
+        switch(level)
+        {
+            case VTD_PAGE_TABLE_LEVEL_3:        /* Weybridge */
+                /* We only support 8 entries for the PAE L3 p2m table */
+                for ( i = 0; i < 8 ; i++ )
+                {
+                    /* Don't create new L2 entry, use ones from p2m table */
+                    pgd[i].val = l3e[i].l3 | _PAGE_PRESENT | _PAGE_RW;
+                }
+                break;
+
+            case VTD_PAGE_TABLE_LEVEL_4:        /* Stoakley */
+                /* We allocate one more page for the top vtd page table. */
+                pmd = (struct dma_pte *)alloc_xenheap_page();
+                memset((u8*)pmd, 0, PAGE_SIZE);
+                pte = &pgd[0];
+                dma_set_pte_addr(*pte, virt_to_maddr(pmd));
+                dma_set_pte_readable(*pte);
+                dma_set_pte_writable(*pte);
+
+                for ( i = 0; i < 8; i++ )
+                {
+                    /* Don't create new L2 entry, use ones from p2m table */
+                    pmd[i].val = l3e[i].l3 | _PAGE_PRESENT | _PAGE_RW;
+                }
+                break;
+            default:
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                    "iommu_set_pgd:Unsupported p2m table sharing level!\n");
+                break;
+        }
+        unmap_domain_page(l3e);
+        spin_unlock_irqrestore(&hd->mapping_lock, flags);
+    }
+#elif CONFIG_PAGING_LEVELS == 4
+    if ( !hd->pgd )
+    {
+        int level = agaw_to_level(hd->agaw);
+        l3_pgentry_t *l3e;
+        mfn_t pgd_mfn;
+
+        switch (level)
+        {
+            case VTD_PAGE_TABLE_LEVEL_3:
+                l3e = map_domain_page(p2m_table);
+                if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+                {
+                    gdprintk(XENLOG_ERR VTDPREFIX,
+                        "iommu_set_pgd: second level wasn't there\n");
+                    unmap_domain_page(l3e);
+                    return;
+                }
+                pgd_mfn = _mfn(l3e_get_pfn(*l3e));
+                unmap_domain_page(l3e);
+                hd->pgd = maddr_to_virt(pagetable_get_paddr(
+                      pagetable_from_mfn(pgd_mfn)));
+                break;
+
+            case VTD_PAGE_TABLE_LEVEL_4:
+                pgd_mfn = _mfn(p2m_table);
+                hd->pgd = maddr_to_virt(pagetable_get_paddr(
+                      pagetable_from_mfn(pgd_mfn)));
+                break;
+            default:
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                    "iommu_set_pgd:Unsupported p2m table sharing level!\n");
+                break;
+        }
+    }
+#endif
+    gdprintk(XENLOG_INFO VTDPREFIX,
+        "iommu_set_pgd: hd->pgd = %p\n", hd->pgd);
+}
+
+
+u8 iommu_state[MAX_IOMMU_REGS * MAX_IOMMUS];
+int iommu_suspend(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int i = 0;
+
+    if (!vtd_enable)
+        return 0;
+
+    flush_all();
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        iommu_state[DMAR_RTADDR_REG * i] =
+            (u64) dmar_readq(iommu->reg, DMAR_RTADDR_REG);
+        iommu_state[DMAR_FECTL_REG * i] =
+            (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
+        iommu_state[DMAR_FEDATA_REG * i] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
+        iommu_state[DMAR_FEADDR_REG * i] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
+        iommu_state[DMAR_FEUADDR_REG * i] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
+        iommu_state[DMAR_PLMBASE_REG * i] =
+            (u32) dmar_readl(iommu->reg, DMAR_PLMBASE_REG);
+        iommu_state[DMAR_PLMLIMIT_REG * i] =
+            (u32) dmar_readl(iommu->reg, DMAR_PLMLIMIT_REG);
+        iommu_state[DMAR_PHMBASE_REG * i] =
+            (u64) dmar_readq(iommu->reg, DMAR_PHMBASE_REG);
+        iommu_state[DMAR_PHMLIMIT_REG * i] =
+            (u64) dmar_readq(iommu->reg, DMAR_PHMLIMIT_REG);
+        i++;
+    }
+
+    return 0;
+}
+
+int iommu_resume(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    int i = 0;
+
+    if (!vtd_enable)
+        return 0;
+
+    flush_all();
+
+    init_vtd_hw();
+    for_each_drhd_unit(drhd) {
+        iommu = drhd->iommu;
+        dmar_writeq( iommu->reg, DMAR_RTADDR_REG,
+            (u64) iommu_state[DMAR_RTADDR_REG * i]);
+        dmar_writel(iommu->reg, DMAR_FECTL_REG,
+            (u32) iommu_state[DMAR_FECTL_REG * i]);
+        dmar_writel(iommu->reg, DMAR_FEDATA_REG,
+            (u32) iommu_state[DMAR_FEDATA_REG * i]);
+        dmar_writel(iommu->reg, DMAR_FEADDR_REG,
+            (u32) iommu_state[DMAR_FEADDR_REG * i]);
+        dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
+            (u32) iommu_state[DMAR_FEUADDR_REG * i]);
+        dmar_writel(iommu->reg, DMAR_PLMBASE_REG,
+            (u32) iommu_state[DMAR_PLMBASE_REG * i]);
+        dmar_writel(iommu->reg, DMAR_PLMLIMIT_REG,
+            (u32) iommu_state[DMAR_PLMLIMIT_REG * i]);
+        dmar_writeq(iommu->reg, DMAR_PHMBASE_REG,
+            (u64) iommu_state[DMAR_PHMBASE_REG * i]);
+        dmar_writeq(iommu->reg, DMAR_PHMLIMIT_REG,
+            (u64) iommu_state[DMAR_PHMLIMIT_REG * i]);
+
+        if (iommu_enable_translation(iommu))
+            return -EIO;
+        i++;
+    }
+    return 0;
+}
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/io.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/io.c     Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx>
+ * Copyright (C) Xiaohui Xin <xiaohui.xin@xxxxxxxxx>
+ */
+
+#include <xen/init.h>
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+#include <xen/event.h>
+#include <xen/hypercall.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <asm/paging.h>
+#include <asm/shadow.h>
+#include <asm/p2m.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/support.h>
+#include <asm/hvm/vpt.h>
+#include <asm/hvm/vpic.h>
+#include <asm/hvm/vlapic.h>
+#include <public/sched.h>
+#include <xen/iocap.h>
+#include <public/hvm/ioreq.h>
+
+int hvm_do_IRQ_dpci(struct domain *d, unsigned int mirq)
+{
+    uint32_t device, intx;
+    uint32_t link, isa_irq;
+    struct hvm_irq *hvm_irq;
+
+    if (!vtd_enabled || (d == dom0))
+        return 0;
+
+    if (d->arch.hvm_domain.irq.mirq[mirq].valid)
+    {
+        device = d->arch.hvm_domain.irq.mirq[mirq].device;
+        intx = d->arch.hvm_domain.irq.mirq[mirq].intx;
+        link = hvm_pci_intx_link(device, intx);
+        hvm_irq = &d->arch.hvm_domain.irq;
+        isa_irq = hvm_irq->pci_link.route[link];
+
+        if ( !d->arch.hvm_domain.irq.girq[isa_irq].valid )
+        {
+            d->arch.hvm_domain.irq.girq[isa_irq].valid = 1;
+            d->arch.hvm_domain.irq.girq[isa_irq].device = device;
+            d->arch.hvm_domain.irq.girq[isa_irq].intx = intx;
+            d->arch.hvm_domain.irq.girq[isa_irq].machine_gsi = mirq;
+        }
+
+        if ( !test_and_set_bit(mirq, d->arch.hvm_domain.irq.dirq_mask) )
+        {
+            vcpu_kick(d->vcpu[0]);
+            return 1;
+        }
+        else
+            dprintk(XENLOG_INFO, "Want to pending mirq, but failed\n");
+    }
+    return 0;
+}
+
+void hvm_dpci_eoi(unsigned int guest_gsi, union vioapic_redir_entry *ent)
+{
+    struct domain *d = current->domain;
+    uint32_t device, intx, machine_gsi;
+    irq_desc_t *desc;
+
+    if (d->arch.hvm_domain.irq.girq[guest_gsi].valid)
+    {
+        device = d->arch.hvm_domain.irq.girq[guest_gsi].device;
+        intx = d->arch.hvm_domain.irq.girq[guest_gsi].intx;
+        machine_gsi = d->arch.hvm_domain.irq.girq[guest_gsi].machine_gsi;
+        gdprintk(XENLOG_INFO, "hvm_dpci_eoi:: device %x intx %x\n",
+            device, intx);
+        hvm_pci_intx_deassert(d, device, intx);
+        if ( (ent == NULL) || (ent && ent->fields.mask == 0) ) {
+            desc = &irq_desc[irq_to_vector(machine_gsi)];
+            desc->handler->end(irq_to_vector(machine_gsi));
+        }
+    }
+}
+
+int release_devices(struct domain *d)
+{
+    struct hvm_domain *hd = &d->arch.hvm_domain;
+    uint32_t i;
+    int ret = 0;
+
+    if (!vtd_enabled)
+        return ret;
+
+    /* unbind irq */
+    for (i = 0; i < NR_IRQS; i++) {
+        if (hd->irq.mirq[i].valid)
+            ret = pirq_guest_unbind(d, i);
+    }
+    iommu_domain_teardown(d);
+    return ret;
+}
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/msi.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/msi.h    Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@xxxxxxxxx)
+ */
+
+#ifndef MSI_H
+#define MSI_H
+
+/*
+ * Assume the maximum number of hot-plug slots supported by the system is
+ * about ten.  The worst case is that each of these slots is hot-added with
+ * a device that has two MSI/MSI-X capable functions.  To guard against an
+ * MSI-X driver that attempts to request all available vectors,
+ * NR_HP_RESERVED_VECTORS is defined as below to ensure that at least one
+ * message is assigned to each detected MSI/MSI-X device function.
+ */
+#define NR_HP_RESERVED_VECTORS         20
+
+extern int vector_irq[NR_VECTORS];
+extern void (*interrupt[NR_IRQS])(void);
+extern int pci_vector_resources(int last, int nr_released);
+
+/*
+ * MSI-X Address Register
+ */
+#define PCI_MSIX_FLAGS_QSIZE           0x7FF
+#define PCI_MSIX_FLAGS_ENABLE          (1 << 15)
+#define PCI_MSIX_FLAGS_BIRMASK         (7 << 0)
+#define PCI_MSIX_FLAGS_BITMASK         (1 << 0)
+
+#define PCI_MSIX_ENTRY_SIZE                    16
+#define  PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET      0
+#define  PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET      4
+#define  PCI_MSIX_ENTRY_DATA_OFFSET            8
+#define  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET     12
+
+#define msi_control_reg(base)          (base + PCI_MSI_FLAGS)
+#define msi_lower_address_reg(base)    (base + PCI_MSI_ADDRESS_LO)
+#define msi_upper_address_reg(base)    (base + PCI_MSI_ADDRESS_HI)
+#define msi_data_reg(base, is64bit)    \
+       ( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
+#define msi_mask_bits_reg(base, is64bit) \
+       ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
+#define msi_disable(control)           control &= ~PCI_MSI_FLAGS_ENABLE
+#define multi_msi_capable(control) \
+       (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
+#define multi_msi_enable(control, num) \
+       control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+#define is_64bit_address(control)      (control & PCI_MSI_FLAGS_64BIT)
+#define is_mask_bit_support(control)   (control & PCI_MSI_FLAGS_MASKBIT)
+#define msi_enable(control, num) multi_msi_enable(control, num); \
+       control |= PCI_MSI_FLAGS_ENABLE
+
+#define msix_table_offset_reg(base)    (base + 0x04)
+#define msix_pba_offset_reg(base)      (base + 0x08)
+#define msix_enable(control)           control |= PCI_MSIX_FLAGS_ENABLE
+#define msix_disable(control)          control &= ~PCI_MSIX_FLAGS_ENABLE
+#define msix_table_size(control)       ((control & PCI_MSIX_FLAGS_QSIZE)+1)
+#define multi_msix_capable             msix_table_size
+#define msix_unmask(address)           (address & ~PCI_MSIX_FLAGS_BITMASK)
+#define msix_mask(address)             (address | PCI_MSIX_FLAGS_BITMASK)
+#define msix_is_pending(address)       (address & PCI_MSIX_FLAGS_PENDMASK)
+
+/*
+ * MSI Defined Data Structures
+ */
+#define MSI_ADDRESS_HEADER             0xfee
+#define MSI_ADDRESS_HEADER_SHIFT       12
+#define MSI_ADDRESS_HEADER_MASK                0xfff000
+#define MSI_ADDRESS_DEST_ID_MASK       0xfff0000f
+#define MSI_TARGET_CPU_MASK            0xff
+#define MSI_TARGET_CPU_SHIFT           12
+#define MSI_DELIVERY_MODE              0
+#define MSI_LEVEL_MODE                 1       /* Edge always assert */
+#define MSI_TRIGGER_MODE               0       /* MSI is edge sensitive */
+#define MSI_PHYSICAL_MODE              0
+#define MSI_LOGICAL_MODE               1
+#define MSI_REDIRECTION_HINT_MODE      0
+
+#define __LITTLE_ENDIAN_BITFIELD       1
+
+struct msg_data {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u32   vector          :  8;
+       __u32   delivery_mode   :  3;   /* 000b: FIXED | 001b: lowest prior */
+       __u32   reserved_1      :  3;
+       __u32   level           :  1;   /* 0: deassert | 1: assert */
+       __u32   trigger         :  1;   /* 0: edge | 1: level */
+       __u32   reserved_2      : 16;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u32   reserved_2      : 16;
+       __u32   trigger         :  1;   /* 0: edge | 1: level */
+       __u32   level           :  1;   /* 0: deassert | 1: assert */
+       __u32   reserved_1      :  3;
+       __u32   delivery_mode   :  3;   /* 000b: FIXED | 001b: lowest prior */
+       __u32   vector          :  8;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+} __attribute__ ((packed));
+
+struct msg_address {
+       union {
+               struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+                       __u32   reserved_1      :  2;
+                       __u32   dest_mode       :  1;   /*0:physic | 1:logic */
+                       __u32   redirection_hint:  1;   /*0: dedicated CPU
+                                                         1: lowest priority */
+                       __u32   reserved_2      :  4;
+                       __u32   dest_id         : 24;   /* Destination ID */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+                       __u32   dest_id         : 24;   /* Destination ID */
+                       __u32   reserved_2      :  4;
+                       __u32   redirection_hint:  1;   /*0: dedicated CPU
+                                                         1: lowest priority */
+                       __u32   dest_mode       :  1;   /*0:physic | 1:logic */
+                       __u32   reserved_1      :  2;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+               }u;
+                       __u32  value;
+       }lo_address;
+       __u32   hi_address;
+} __attribute__ ((packed));
+
+#endif /* MSI_H */
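
As a rough illustration of how the msg_data layout above is consumed (a sketch
under assumptions only; dma_msi_data_init() elsewhere in this changeset is the
code actually used), a fixed-delivery, edge-triggered message for a given
vector could be composed like this before being written to DMAR_FEDATA_REG:

    static u32 make_dma_msi_data(u8 vector)
    {
        union {
            struct msg_data msg;
            u32 raw;
        } u = { .msg = {
            .vector        = vector,
            .delivery_mode = MSI_DELIVERY_MODE,  /* 000b: fixed delivery */
            .level         = MSI_LEVEL_MODE,     /* edge: always assert */
            .trigger       = MSI_TRIGGER_MODE,   /* edge sensitive */
        } };

        return u.raw;                            /* value for DMAR_FEDATA_REG */
    }
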
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/pci-direct.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/pci-direct.h     Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,48 @@
+#ifndef ASM_PCI_DIRECT_H
+#define ASM_PCI_DIRECT_H 1
+
+#include <xen/types.h>
+#include <asm/io.h>
+
+/* Direct PCI access. This is used for PCI accesses in early boot before
+   the PCI subsystem works. */ 
+
+#define PDprintk(x...)
+
+static inline u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
+{
+    u32 v; 
+    outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+    v = inl(0xcfc); 
+    if (v != 0xffffffff)
+        PDprintk("%x reading 4 from %x: %x\n", slot, offset, v);
+    return v;
+}
+
+static inline u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
+{
+    u8 v; 
+    outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+    v = inb(0xcfc + (offset&3)); 
+    PDprintk("%x reading 1 from %x: %x\n", slot, offset, v);
+    return v;
+}
+
+static inline u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
+{
+    u16 v; 
+    outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+    v = inw(0xcfc + (offset&2)); 
+    PDprintk("%x reading 2 from %x: %x\n", slot, offset, v);
+    return v;
+}
+
+static inline void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
+                    u32 val)
+{
+    PDprintk("%x writing to %x: %x\n", slot, offset, val); 
+    outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+    outl(val, 0xcfc); 
+}
+
+#endif
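
Together with the register offsets from pci_regs.h below, these raw accessors
are how the VT-d code probes the bus before Xen's PCI subsystem is available.
An illustrative sketch (not part of the patch) of the presence test that
setup_dom0_devices() performs:

    static int function_present(u8 bus, u8 dev, u8 func)
    {
        u32 vendor = read_pci_config(bus, dev, func, PCI_VENDOR_ID);

        /* empty slots read back as all-ones (or 0 on some broken boards) */
        return !(vendor == 0xffffffff || vendor == 0x00000000 ||
                 vendor == 0x0000ffff || vendor == 0xffff0000);
    }
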
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/pci_regs.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/pci_regs.h       Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,449 @@
+/*
+ *     pci_regs.h
+ *
+ *     PCI standard defines
+ *     Copyright 1994, Drew Eckhardt
+ *     Copyright 1997--1999 Martin Mares <mj@xxxxxx>
+ *
+ *     For more information, please consult the following manuals (look at
+ *     http://www.pcisig.com/ for how to get them):
+ *
+ *     PCI BIOS Specification
+ *     PCI Local Bus Specification
+ *     PCI to PCI Bridge Specification
+ *     PCI System Design Guide
+ */
+
+#ifndef LINUX_PCI_REGS_H
+#define LINUX_PCI_REGS_H
+
+/*
+ * Under PCI, each device has 256 bytes of configuration address space,
+ * of which the first 64 bytes are standardized as follows:
+ */
+#define PCI_VENDOR_ID          0x00    /* 16 bits */
+#define PCI_DEVICE_ID          0x02    /* 16 bits */
+#define PCI_COMMAND            0x04    /* 16 bits */
+#define  PCI_COMMAND_IO                0x1     /* Enable response in I/O space */
+#define  PCI_COMMAND_MEMORY    0x2     /* Enable response in Memory space */
+#define  PCI_COMMAND_MASTER    0x4     /* Enable bus mastering */
+#define  PCI_COMMAND_SPECIAL   0x8     /* Enable response to special cycles */
+#define  PCI_COMMAND_INVALIDATE        0x10    /* Use memory write and invalidate */
+#define  PCI_COMMAND_VGA_PALETTE 0x20  /* Enable palette snooping */
+#define  PCI_COMMAND_PARITY    0x40    /* Enable parity checking */
+#define  PCI_COMMAND_WAIT      0x80    /* Enable address/data stepping */
+#define  PCI_COMMAND_SERR      0x100   /* Enable SERR */
+#define  PCI_COMMAND_FAST_BACK 0x200   /* Enable back-to-back writes */
+#define  PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */
+
+#define PCI_STATUS             0x06    /* 16 bits */
+#define  PCI_STATUS_CAP_LIST   0x10    /* Support Capability List */
+#define  PCI_STATUS_66MHZ      0x20    /* Support 66 Mhz PCI 2.1 bus */
+#define  PCI_STATUS_UDF                0x40    /* Support User Definable Features [obsolete] */
+#define  PCI_STATUS_FAST_BACK  0x80    /* Accept fast-back to back */
+#define  PCI_STATUS_PARITY     0x100   /* Detected parity error */
+#define  PCI_STATUS_DEVSEL_MASK        0x600   /* DEVSEL timing */
+#define  PCI_STATUS_DEVSEL_FAST                0x000
+#define  PCI_STATUS_DEVSEL_MEDIUM      0x200
+#define  PCI_STATUS_DEVSEL_SLOW                0x400
+#define  PCI_STATUS_SIG_TARGET_ABORT   0x800 /* Set on target abort */
+#define  PCI_STATUS_REC_TARGET_ABORT   0x1000 /* Master ack of " */
+#define  PCI_STATUS_REC_MASTER_ABORT   0x2000 /* Set on master abort */
+#define  PCI_STATUS_SIG_SYSTEM_ERROR   0x4000 /* Set when we drive SERR */
+#define  PCI_STATUS_DETECTED_PARITY    0x8000 /* Set on parity error */
+
+#define PCI_CLASS_REVISION     0x08    /* High 24 bits are class, low 8 revision */
+#define PCI_REVISION_ID                0x08    /* Revision ID */
+#define PCI_CLASS_PROG         0x09    /* Reg. Level Programming Interface */
+#define PCI_CLASS_DEVICE       0x0a    /* Device class */
+
+#define PCI_CACHE_LINE_SIZE    0x0c    /* 8 bits */
+#define PCI_LATENCY_TIMER      0x0d    /* 8 bits */
+#define PCI_HEADER_TYPE                0x0e    /* 8 bits */
+#define  PCI_HEADER_TYPE_NORMAL                0
+#define  PCI_HEADER_TYPE_BRIDGE                1
+#define  PCI_HEADER_TYPE_CARDBUS       2
+
+#define PCI_BIST               0x0f    /* 8 bits */
+#define  PCI_BIST_CODE_MASK    0x0f    /* Return result */
+#define  PCI_BIST_START                0x40    /* 1 to start BIST, 2 secs or less */
+#define  PCI_BIST_CAPABLE      0x80    /* 1 if BIST capable */
+
+/*
+ * Base addresses specify locations in memory or I/O space.
+ * Decoded size can be determined by writing a value of
+ * 0xffffffff to the register, and reading it back.  Only
+ * 1 bits are decoded.
+ */
+#define PCI_BASE_ADDRESS_0     0x10    /* 32 bits */
+#define PCI_BASE_ADDRESS_1     0x14    /* 32 bits [htype 0,1 only] */
+#define PCI_BASE_ADDRESS_2     0x18    /* 32 bits [htype 0 only] */
+#define PCI_BASE_ADDRESS_3     0x1c    /* 32 bits */
+#define PCI_BASE_ADDRESS_4     0x20    /* 32 bits */
+#define PCI_BASE_ADDRESS_5     0x24    /* 32 bits */
+#define  PCI_BASE_ADDRESS_SPACE                0x01    /* 0 = memory, 1 = I/O */
+#define  PCI_BASE_ADDRESS_SPACE_IO     0x01
+#define  PCI_BASE_ADDRESS_SPACE_MEMORY 0x00
+#define  PCI_BASE_ADDRESS_MEM_TYPE_MASK        0x06
+#define  PCI_BASE_ADDRESS_MEM_TYPE_32  0x00    /* 32 bit address */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_1M  0x02    /* Below 1M [obsolete] */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_64  0x04    /* 64 bit address */
+#define  PCI_BASE_ADDRESS_MEM_PREFETCH 0x08    /* prefetchable? */
+#define  PCI_BASE_ADDRESS_MEM_MASK     (~0x0fUL)
+#define  PCI_BASE_ADDRESS_IO_MASK      (~0x03UL)
+/* bit 1 is reserved if address_space = 1 */
+
+/* Header type 0 (normal devices) */
+#define PCI_CARDBUS_CIS                0x28
+#define PCI_SUBSYSTEM_VENDOR_ID        0x2c
+#define PCI_SUBSYSTEM_ID       0x2e
+#define PCI_ROM_ADDRESS                0x30    /* Bits 31..11 are address, 10..1 reserved */
+#define  PCI_ROM_ADDRESS_ENABLE        0x01
+#define PCI_ROM_ADDRESS_MASK   (~0x7ffUL)
+
+#define PCI_CAPABILITY_LIST    0x34    /* Offset of first capability list entry */
+
+/* 0x35-0x3b are reserved */
+#define PCI_INTERRUPT_LINE     0x3c    /* 8 bits */
+#define PCI_INTERRUPT_PIN      0x3d    /* 8 bits */
+#define PCI_MIN_GNT            0x3e    /* 8 bits */
+#define PCI_MAX_LAT            0x3f    /* 8 bits */
+
+/* Header type 1 (PCI-to-PCI bridges) */
+#define PCI_PRIMARY_BUS                0x18    /* Primary bus number */
+#define PCI_SECONDARY_BUS      0x19    /* Secondary bus number */
+#define PCI_SUBORDINATE_BUS    0x1a    /* Highest bus number behind the bridge */
+#define PCI_SEC_LATENCY_TIMER  0x1b    /* Latency timer for secondary interface */
+#define PCI_IO_BASE            0x1c    /* I/O range behind the bridge */
+#define PCI_IO_LIMIT           0x1d
+#define  PCI_IO_RANGE_TYPE_MASK        0x0fUL  /* I/O bridging type */
+#define  PCI_IO_RANGE_TYPE_16  0x00
+#define  PCI_IO_RANGE_TYPE_32  0x01
+#define  PCI_IO_RANGE_MASK     (~0x0fUL)
+#define PCI_SEC_STATUS         0x1e    /* Secondary status register, only bit 14 used */
+#define PCI_MEMORY_BASE                0x20    /* Memory range behind */
+#define PCI_MEMORY_LIMIT       0x22
+#define  PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL
+#define  PCI_MEMORY_RANGE_MASK (~0x0fUL)
+#define PCI_PREF_MEMORY_BASE   0x24    /* Prefetchable memory range behind */
+#define PCI_PREF_MEMORY_LIMIT  0x26
+#define  PCI_PREF_RANGE_TYPE_MASK 0x0fUL
+#define  PCI_PREF_RANGE_TYPE_32        0x00
+#define  PCI_PREF_RANGE_TYPE_64        0x01
+#define  PCI_PREF_RANGE_MASK   (~0x0fUL)
+#define PCI_PREF_BASE_UPPER32  0x28    /* Upper half of prefetchable memory range */
+#define PCI_PREF_LIMIT_UPPER32 0x2c
+#define PCI_IO_BASE_UPPER16    0x30    /* Upper half of I/O addresses */
+#define PCI_IO_LIMIT_UPPER16   0x32
+/* 0x34 same as for htype 0 */
+/* 0x35-0x3b is reserved */
+#define PCI_ROM_ADDRESS1       0x38    /* Same as PCI_ROM_ADDRESS, but for htype 1 */
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_BRIDGE_CONTROL     0x3e
+#define  PCI_BRIDGE_CTL_PARITY 0x01    /* Enable parity detection on secondary interface */
+#define  PCI_BRIDGE_CTL_SERR   0x02    /* The same for SERR forwarding */
+#define  PCI_BRIDGE_CTL_NO_ISA 0x04    /* Disable bridging of ISA ports */
+#define  PCI_BRIDGE_CTL_VGA    0x08    /* Forward VGA addresses */
+#define  PCI_BRIDGE_CTL_MASTER_ABORT   0x20  /* Report master aborts */
+#define  PCI_BRIDGE_CTL_BUS_RESET      0x40    /* Secondary bus reset */
+#define  PCI_BRIDGE_CTL_FAST_BACK      0x80    /* Fast Back2Back enabled on secondary interface */
+
+/* Header type 2 (CardBus bridges) */
+#define PCI_CB_CAPABILITY_LIST 0x14
+/* 0x15 reserved */
+#define PCI_CB_SEC_STATUS      0x16    /* Secondary status */
+#define PCI_CB_PRIMARY_BUS     0x18    /* PCI bus number */
+#define PCI_CB_CARD_BUS                0x19    /* CardBus bus number */
+#define PCI_CB_SUBORDINATE_BUS 0x1a    /* Subordinate bus number */
+#define PCI_CB_LATENCY_TIMER   0x1b    /* CardBus latency timer */
+#define PCI_CB_MEMORY_BASE_0   0x1c
+#define PCI_CB_MEMORY_LIMIT_0  0x20
+#define PCI_CB_MEMORY_BASE_1   0x24
+#define PCI_CB_MEMORY_LIMIT_1  0x28
+#define PCI_CB_IO_BASE_0       0x2c
+#define PCI_CB_IO_BASE_0_HI    0x2e
+#define PCI_CB_IO_LIMIT_0      0x30
+#define PCI_CB_IO_LIMIT_0_HI   0x32
+#define PCI_CB_IO_BASE_1       0x34
+#define PCI_CB_IO_BASE_1_HI    0x36
+#define PCI_CB_IO_LIMIT_1      0x38
+#define PCI_CB_IO_LIMIT_1_HI   0x3a
+#define  PCI_CB_IO_RANGE_MASK  (~0x03UL)
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_CB_BRIDGE_CONTROL  0x3e
+#define  PCI_CB_BRIDGE_CTL_PARITY      0x01    /* Similar to standard bridge control register */
+#define  PCI_CB_BRIDGE_CTL_SERR                0x02
+#define  PCI_CB_BRIDGE_CTL_ISA         0x04
+#define  PCI_CB_BRIDGE_CTL_VGA         0x08
+#define  PCI_CB_BRIDGE_CTL_MASTER_ABORT        0x20
+#define  PCI_CB_BRIDGE_CTL_CB_RESET    0x40    /* CardBus reset */
+#define  PCI_CB_BRIDGE_CTL_16BIT_INT   0x80    /* Enable interrupt for 16-bit cards */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100 /* Prefetch enable for both memory regions */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200
+#define  PCI_CB_BRIDGE_CTL_POST_WRITES 0x400
+#define PCI_CB_SUBSYSTEM_VENDOR_ID     0x40
+#define PCI_CB_SUBSYSTEM_ID            0x42
+#define PCI_CB_LEGACY_MODE_BASE                0x44    /* 16-bit PC Card legacy mode base address (ExCa) */
+/* 0x48-0x7f reserved */
+
+/* Capability lists */
+
+#define PCI_CAP_LIST_ID                0       /* Capability ID */
+#define  PCI_CAP_ID_PM         0x01    /* Power Management */
+#define  PCI_CAP_ID_AGP                0x02    /* Accelerated Graphics Port */
+#define  PCI_CAP_ID_VPD                0x03    /* Vital Product Data */
+#define  PCI_CAP_ID_SLOTID     0x04    /* Slot Identification */
+#define  PCI_CAP_ID_MSI                0x05    /* Message Signalled Interrupts */
+#define  PCI_CAP_ID_CHSWP      0x06    /* CompactPCI HotSwap */
+#define  PCI_CAP_ID_PCIX       0x07    /* PCI-X */
+#define  PCI_CAP_ID_HT_IRQCONF 0x08    /* HyperTransport IRQ Configuration */
+#define  PCI_CAP_ID_SHPC       0x0C    /* PCI Standard Hot-Plug Controller */
+#define  PCI_CAP_ID_EXP        0x10    /* PCI Express */
+#define  PCI_CAP_ID_MSIX       0x11    /* MSI-X */
+#define PCI_CAP_LIST_NEXT      1       /* Next capability in the list */
+#define PCI_CAP_FLAGS          2       /* Capability defined flags (16 bits) */
+#define PCI_CAP_SIZEOF         4
+
+/* Power Management Registers */
+
+#define PCI_PM_PMC             2       /* PM Capabilities Register */
+#define  PCI_PM_CAP_VER_MASK   0x0007  /* Version */
+#define  PCI_PM_CAP_PME_CLOCK  0x0008  /* PME clock required */
+#define  PCI_PM_CAP_RESERVED    0x0010  /* Reserved field */
+#define  PCI_PM_CAP_DSI                0x0020  /* Device specific initialization */
+#define  PCI_PM_CAP_AUX_POWER  0x01C0  /* Auxilliary power support mask */
+#define  PCI_PM_CAP_D1         0x0200  /* D1 power state support */
+#define  PCI_PM_CAP_D2         0x0400  /* D2 power state support */
+#define  PCI_PM_CAP_PME                0x0800  /* PME pin supported */
+#define  PCI_PM_CAP_PME_MASK   0xF800  /* PME Mask of all supported states */
+#define  PCI_PM_CAP_PME_D0     0x0800  /* PME# from D0 */
+#define  PCI_PM_CAP_PME_D1     0x1000  /* PME# from D1 */
+#define  PCI_PM_CAP_PME_D2     0x2000  /* PME# from D2 */
+#define  PCI_PM_CAP_PME_D3     0x4000  /* PME# from D3 (hot) */
+#define  PCI_PM_CAP_PME_D3cold 0x8000  /* PME# from D3 (cold) */
+#define PCI_PM_CTRL            4       /* PM control and status register */
+#define  PCI_PM_CTRL_STATE_MASK        0x0003  /* Current power state (D0 to D3) */
+#define  PCI_PM_CTRL_NO_SOFT_RESET     0x0004  /* No reset for D3hot->D0 */
+#define  PCI_PM_CTRL_PME_ENABLE        0x0100  /* PME pin enable */
+#define  PCI_PM_CTRL_DATA_SEL_MASK     0x1e00  /* Data select (??) */
+#define  PCI_PM_CTRL_DATA_SCALE_MASK   0x6000  /* Data scale (??) */
+#define  PCI_PM_CTRL_PME_STATUS        0x8000  /* PME pin status */
+#define PCI_PM_PPB_EXTENSIONS  6       /* PPB support extensions (??) */
+#define  PCI_PM_PPB_B2_B3      0x40    /* Stop clock when in D3hot (??) */
+#define  PCI_PM_BPCC_ENABLE    0x80    /* Bus power/clock control enable (??) */
+#define PCI_PM_DATA_REGISTER   7       /* (??) */
+#define PCI_PM_SIZEOF          8
+
+/* AGP registers */
+
+#define PCI_AGP_VERSION                2       /* BCD version number */
+#define PCI_AGP_RFU            3       /* Rest of capability flags */
+#define PCI_AGP_STATUS         4       /* Status register */
+#define  PCI_AGP_STATUS_RQ_MASK        0xff000000      /* Maximum number of requests - 1 */
+#define  PCI_AGP_STATUS_SBA    0x0200  /* Sideband addressing supported */
+#define  PCI_AGP_STATUS_64BIT  0x0020  /* 64-bit addressing supported */
+#define  PCI_AGP_STATUS_FW     0x0010  /* FW transfers supported */
+#define  PCI_AGP_STATUS_RATE4  0x0004  /* 4x transfer rate supported */
+#define  PCI_AGP_STATUS_RATE2  0x0002  /* 2x transfer rate supported */
+#define  PCI_AGP_STATUS_RATE1  0x0001  /* 1x transfer rate supported */
+#define PCI_AGP_COMMAND                8       /* Control register */
+#define  PCI_AGP_COMMAND_RQ_MASK 0xff000000  /* Master: Maximum number of requests */
+#define  PCI_AGP_COMMAND_SBA   0x0200  /* Sideband addressing enabled */
+#define  PCI_AGP_COMMAND_AGP   0x0100  /* Allow processing of AGP transactions */
+#define  PCI_AGP_COMMAND_64BIT 0x0020  /* Allow processing of 64-bit addresses */
+#define  PCI_AGP_COMMAND_FW    0x0010  /* Force FW transfers */
+#define  PCI_AGP_COMMAND_RATE4 0x0004  /* Use 4x rate */
+#define  PCI_AGP_COMMAND_RATE2 0x0002  /* Use 2x rate */
+#define  PCI_AGP_COMMAND_RATE1 0x0001  /* Use 1x rate */
+#define PCI_AGP_SIZEOF         12
+
+/* Vital Product Data */
+
+#define PCI_VPD_ADDR           2       /* Address to access (15 bits!) */
+#define  PCI_VPD_ADDR_MASK     0x7fff  /* Address mask */
+#define  PCI_VPD_ADDR_F                0x8000  /* Write 0, 1 indicates completion */
+#define PCI_VPD_DATA           4       /* 32-bits of data returned here */
+
+/* Slot Identification */
+
+#define PCI_SID_ESR            2       /* Expansion Slot Register */
+#define  PCI_SID_ESR_NSLOTS    0x1f    /* Number of expansion slots available */
+#define  PCI_SID_ESR_FIC       0x20    /* First In Chassis Flag */
+#define PCI_SID_CHASSIS_NR     3       /* Chassis Number */
+
+/* Message Signalled Interrupts registers */
+
+#define PCI_MSI_FLAGS          2       /* Various flags */
+#define  PCI_MSI_FLAGS_64BIT   0x80    /* 64-bit addresses allowed */
+#define  PCI_MSI_FLAGS_QSIZE   0x70    /* Message queue size configured */
+#define  PCI_MSI_FLAGS_QMASK   0x0e    /* Maximum queue size available */
+#define  PCI_MSI_FLAGS_ENABLE  0x01    /* MSI feature enabled */
+#define  PCI_MSI_FLAGS_MASKBIT 0x100   /* 64-bit mask bits allowed */
+#define PCI_MSI_RFU            3       /* Rest of capability flags */
+#define PCI_MSI_ADDRESS_LO     4       /* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI     8       /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32                8       /* 16 bits of data for 32-bit devices */
+#define PCI_MSI_DATA_64                12      /* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT       16      /* Mask bits register */
+
+/* CompactPCI Hotswap Register */
+
+#define PCI_CHSWP_CSR          2       /* Control and Status Register */
+#define  PCI_CHSWP_DHA         0x01    /* Device Hiding Arm */
+#define  PCI_CHSWP_EIM         0x02    /* ENUM# Signal Mask */
+#define  PCI_CHSWP_PIE         0x04    /* Pending Insert or Extract */
+#define  PCI_CHSWP_LOO         0x08    /* LED On / Off */
+#define  PCI_CHSWP_PI          0x30    /* Programming Interface */
+#define  PCI_CHSWP_EXT         0x40    /* ENUM# status - extraction */
+#define  PCI_CHSWP_INS         0x80    /* ENUM# status - insertion */
+
+/* PCI-X registers */
+
+#define PCI_X_CMD              2       /* Modes & Features */
+#define  PCI_X_CMD_DPERR_E     0x0001  /* Data Parity Error Recovery Enable */
+#define  PCI_X_CMD_ERO         0x0002  /* Enable Relaxed Ordering */
+#define  PCI_X_CMD_MAX_READ    0x000c  /* Max Memory Read Byte Count */
+#define  PCI_X_CMD_MAX_SPLIT   0x0070  /* Max Outstanding Split Transactions */
+#define  PCI_X_CMD_VERSION(x)  (((x) >> 12) & 3) /* Version */
+#define PCI_X_STATUS           4       /* PCI-X capabilities */
+#define  PCI_X_STATUS_DEVFN    0x000000ff      /* A copy of devfn */
+#define  PCI_X_STATUS_BUS      0x0000ff00      /* A copy of bus nr */
+#define  PCI_X_STATUS_64BIT    0x00010000      /* 64-bit device */
+#define  PCI_X_STATUS_133MHZ   0x00020000      /* 133 MHz capable */
+#define  PCI_X_STATUS_SPL_DISC 0x00040000      /* Split Completion Discarded */
+#define  PCI_X_STATUS_UNX_SPL  0x00080000      /* Unexpected Split Completion */
+#define  PCI_X_STATUS_COMPLEX  0x00100000      /* Device Complexity */
+#define  PCI_X_STATUS_MAX_READ 0x00600000      /* Designed Max Memory Read Count */
+#define  PCI_X_STATUS_MAX_SPLIT        0x03800000      /* Designed Max Outstanding Split Transactions */
+#define  PCI_X_STATUS_MAX_CUM  0x1c000000      /* Designed Max Cumulative Read Size */
+#define  PCI_X_STATUS_SPL_ERR  0x20000000      /* Rcvd Split Completion Error Msg */
+#define  PCI_X_STATUS_266MHZ   0x40000000      /* 266 MHz capable */
+#define  PCI_X_STATUS_533MHZ   0x80000000      /* 533 MHz capable */
+
+/* PCI Express capability registers */
+
+#define PCI_EXP_FLAGS          2       /* Capabilities register */
+#define PCI_EXP_FLAGS_VERS     0x000f  /* Capability version */
+#define PCI_EXP_FLAGS_TYPE     0x00f0  /* Device/Port type */
+#define  PCI_EXP_TYPE_ENDPOINT 0x0     /* Express Endpoint */
+#define  PCI_EXP_TYPE_LEG_END  0x1     /* Legacy Endpoint */
+#define  PCI_EXP_TYPE_ROOT_PORT 0x4    /* Root Port */
+#define  PCI_EXP_TYPE_UPSTREAM 0x5     /* Upstream Port */
+#define  PCI_EXP_TYPE_DOWNSTREAM 0x6   /* Downstream Port */
+#define  PCI_EXP_TYPE_PCI_BRIDGE 0x7   /* PCI/PCI-X Bridge */
+#define PCI_EXP_FLAGS_SLOT     0x0100  /* Slot implemented */
+#define PCI_EXP_FLAGS_IRQ      0x3e00  /* Interrupt message number */
+#define PCI_EXP_DEVCAP         4       /* Device capabilities */
+#define  PCI_EXP_DEVCAP_PAYLOAD        0x07    /* Max_Payload_Size */
+#define  PCI_EXP_DEVCAP_PHANTOM        0x18    /* Phantom functions */
+#define  PCI_EXP_DEVCAP_EXT_TAG        0x20    /* Extended tags */
+#define  PCI_EXP_DEVCAP_L0S    0x1c0   /* L0s Acceptable Latency */
+#define  PCI_EXP_DEVCAP_L1     0xe00   /* L1 Acceptable Latency */
+#define  PCI_EXP_DEVCAP_ATN_BUT        0x1000  /* Attention Button Present */
+#define  PCI_EXP_DEVCAP_ATN_IND        0x2000  /* Attention Indicator Present */
+#define  PCI_EXP_DEVCAP_PWR_IND        0x4000  /* Power Indicator Present */
+#define  PCI_EXP_DEVCAP_PWR_VAL        0x3fc0000 /* Slot Power Limit Value */
+#define  PCI_EXP_DEVCAP_PWR_SCL        0xc000000 /* Slot Power Limit Scale */
+#define PCI_EXP_DEVCTL         8       /* Device Control */
+#define  PCI_EXP_DEVCTL_CERE   0x0001  /* Correctable Error Reporting En. */
+#define  PCI_EXP_DEVCTL_NFERE  0x0002  /* Non-Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_FERE   0x0004  /* Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_URRE   0x0008  /* Unsupported Request Reporting En. */
+#define  PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */
+#define  PCI_EXP_DEVCTL_PAYLOAD        0x00e0  /* Max_Payload_Size */
+#define  PCI_EXP_DEVCTL_EXT_TAG        0x0100  /* Extended Tag Field Enable */
+#define  PCI_EXP_DEVCTL_PHANTOM        0x0200  /* Phantom Functions Enable */
+#define  PCI_EXP_DEVCTL_AUX_PME        0x0400  /* Auxiliary Power PM Enable */
+#define  PCI_EXP_DEVCTL_NOSNOOP_EN 0x0800  /* Enable No Snoop */
+#define  PCI_EXP_DEVCTL_READRQ 0x7000  /* Max_Read_Request_Size */
+#define PCI_EXP_DEVSTA         10      /* Device Status */
+#define  PCI_EXP_DEVSTA_CED    0x01    /* Correctable Error Detected */
+#define  PCI_EXP_DEVSTA_NFED   0x02    /* Non-Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_FED    0x04    /* Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_URD    0x08    /* Unsupported Request Detected */
+#define  PCI_EXP_DEVSTA_AUXPD  0x10    /* AUX Power Detected */
+#define  PCI_EXP_DEVSTA_TRPND  0x20    /* Transactions Pending */
+#define PCI_EXP_LNKCAP         12      /* Link Capabilities */
+#define PCI_EXP_LNKCTL         16      /* Link Control */
+#define PCI_EXP_LNKSTA         18      /* Link Status */
+#define PCI_EXP_SLTCAP         20      /* Slot Capabilities */
+#define PCI_EXP_SLTCTL         24      /* Slot Control */
+#define PCI_EXP_SLTSTA         26      /* Slot Status */
+#define PCI_EXP_RTCTL          28      /* Root Control */
+#define  PCI_EXP_RTCTL_SECEE   0x01    /* System Error on Correctable Error */
+#define  PCI_EXP_RTCTL_SENFEE  0x02    /* System Error on Non-Fatal Error */
+#define  PCI_EXP_RTCTL_SEFEE   0x04    /* System Error on Fatal Error */
+#define  PCI_EXP_RTCTL_PMEIE   0x08    /* PME Interrupt Enable */
+#define  PCI_EXP_RTCTL_CRSSVE  0x10    /* CRS Software Visibility Enable */
+#define PCI_EXP_RTCAP          30      /* Root Capabilities */
+#define PCI_EXP_RTSTA          32      /* Root Status */
+
+/* Extended Capabilities (PCI-X 2.0 and Express) */
+#define PCI_EXT_CAP_ID(header)         (header & 0x0000ffff)
+#define PCI_EXT_CAP_VER(header)                ((header >> 16) & 0xf)
+#define PCI_EXT_CAP_NEXT(header)       ((header >> 20) & 0xffc)
+
+#define PCI_EXT_CAP_ID_ERR     1
+#define PCI_EXT_CAP_ID_VC      2
+#define PCI_EXT_CAP_ID_DSN     3
+#define PCI_EXT_CAP_ID_PWR     4
+
+/* Advanced Error Reporting */
+#define PCI_ERR_UNCOR_STATUS   4       /* Uncorrectable Error Status */
+#define  PCI_ERR_UNC_TRAIN     0x00000001      /* Training */
+#define  PCI_ERR_UNC_DLP       0x00000010      /* Data Link Protocol */
+#define  PCI_ERR_UNC_POISON_TLP        0x00001000      /* Poisoned TLP */
+#define  PCI_ERR_UNC_FCP       0x00002000      /* Flow Control Protocol */
+#define  PCI_ERR_UNC_COMP_TIME 0x00004000      /* Completion Timeout */
+#define  PCI_ERR_UNC_COMP_ABORT        0x00008000      /* Completer Abort */
+#define  PCI_ERR_UNC_UNX_COMP  0x00010000      /* Unexpected Completion */
+#define  PCI_ERR_UNC_RX_OVER   0x00020000      /* Receiver Overflow */
+#define  PCI_ERR_UNC_MALF_TLP  0x00040000      /* Malformed TLP */
+#define  PCI_ERR_UNC_ECRC      0x00080000      /* ECRC Error Status */
+#define  PCI_ERR_UNC_UNSUP     0x00100000      /* Unsupported Request */
+#define PCI_ERR_UNCOR_MASK     8       /* Uncorrectable Error Mask */
+       /* Same bits as above */
+#define PCI_ERR_UNCOR_SEVER    12      /* Uncorrectable Error Severity */
+       /* Same bits as above */
+#define PCI_ERR_COR_STATUS     16      /* Correctable Error Status */
+#define  PCI_ERR_COR_RCVR      0x00000001      /* Receiver Error Status */
+#define  PCI_ERR_COR_BAD_TLP   0x00000040      /* Bad TLP Status */
+#define  PCI_ERR_COR_BAD_DLLP  0x00000080      /* Bad DLLP Status */
+#define  PCI_ERR_COR_REP_ROLL  0x00000100      /* REPLAY_NUM Rollover */
+#define  PCI_ERR_COR_REP_TIMER 0x00001000      /* Replay Timer Timeout */
+#define PCI_ERR_COR_MASK       20      /* Correctable Error Mask */
+       /* Same bits as above */
+#define PCI_ERR_CAP            24      /* Advanced Error Capabilities */
+#define  PCI_ERR_CAP_FEP(x)    ((x) & 31)      /* First Error Pointer */
+#define  PCI_ERR_CAP_ECRC_GENC 0x00000020      /* ECRC Generation Capable */
+#define  PCI_ERR_CAP_ECRC_GENE 0x00000040      /* ECRC Generation Enable */
+#define  PCI_ERR_CAP_ECRC_CHKC 0x00000080      /* ECRC Check Capable */
+#define  PCI_ERR_CAP_ECRC_CHKE 0x00000100      /* ECRC Check Enable */
+#define PCI_ERR_HEADER_LOG     28      /* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND   44      /* Root Error Command */
+#define PCI_ERR_ROOT_STATUS    48
+#define PCI_ERR_ROOT_COR_SRC   52
+#define PCI_ERR_ROOT_SRC       54
+
+/* Virtual Channel */
+#define PCI_VC_PORT_REG1       4
+#define PCI_VC_PORT_REG2       8
+#define PCI_VC_PORT_CTRL       12
+#define PCI_VC_PORT_STATUS     14
+#define PCI_VC_RES_CAP         16
+#define PCI_VC_RES_CTRL                20
+#define PCI_VC_RES_STATUS      26
+
+/* Power Budgeting */
+#define PCI_PWR_DSR            4       /* Data Select Register */
+#define PCI_PWR_DATA           8       /* Data Register */
+#define  PCI_PWR_DATA_BASE(x)  ((x) & 0xff)        /* Base Power */
+#define  PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3)    /* Data Scale */
+#define  PCI_PWR_DATA_PM_SUB(x)        (((x) >> 10) & 7)   /* PM Sub State */
+#define  PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
+#define  PCI_PWR_DATA_TYPE(x)  (((x) >> 15) & 7)   /* Type */
+#define  PCI_PWR_DATA_RAIL(x)  (((x) >> 18) & 7)   /* Power Rail */
+#define PCI_PWR_CAP            12      /* Capability */
+#define  PCI_PWR_CAP_BUDGET(x) ((x) & 1)       /* Included in system budget */
+
+#endif /* LINUX_PCI_REGS_H */
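
The PCI_EXT_CAP_ID/VER/NEXT macros above decode the 32-bit extended capability header that sits at configuration offset 0x100 and chains onward through 12-bit "next" pointers. A minimal sketch of such a walk (not part of this changeset; the config-space accessor is a hypothetical caller-supplied callback, since how offsets >= 0x100 are reached is outside the scope of this header):

#include <stdint.h>

/*
 * Sketch only: locate a PCI Express extended capability by following the
 * header chain with the PCI_EXT_CAP_ID()/PCI_EXT_CAP_NEXT() macros above.
 * The 32-bit config read is deliberately a caller-supplied callback.
 */
typedef uint32_t (*cfg_read32_t)(unsigned int bus, unsigned int devfn,
                                 unsigned int reg);

static unsigned int find_ext_cap(cfg_read32_t read32, unsigned int bus,
                                 unsigned int devfn, unsigned int cap_id)
{
    unsigned int pos = 0x100;            /* extended caps start at 0x100 */
    int ttl = (4096 - 0x100) / 8;        /* bound on possible list length */
    uint32_t header;

    while (ttl-- > 0) {
        header = read32(bus, devfn, pos);
        if (header == 0 || header == 0xffffffff)
            return 0;                    /* device absent or empty list */
        if (PCI_EXT_CAP_ID(header) == cap_id)
            return pos;                  /* offset of the matching capability */
        pos = PCI_EXT_CAP_NEXT(header);
        if (pos < 0x100)
            return 0;                    /* a next pointer of 0 ends the chain */
    }
    return 0;
}

With such a callback in hand, find_ext_cap(..., PCI_EXT_CAP_ID_ERR) would yield the base of the Advanced Error Reporting block, to which the PCI_ERR_* offsets above are then added.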
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/utils.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/utils.c  Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx>
+ */
+
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/irq.h>
+#include <xen/spinlock.h>
+#include <xen/sched.h>
+#include <asm/delay.h>
+#include <asm/iommu.h>
+#include <asm/hvm/vmx/intel-iommu.h>
+#include "dmar.h"
+#include "pci-direct.h"
+#include "pci_regs.h"
+#include "msi.h"
+
+#include <xen/mm.h>
+#include <xen/xmalloc.h>
+
+#if defined(__x86_64__)
+void print_iommu_regs(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu = drhd->iommu;
+ 
+    printk("---- print_iommu_regs ----\n"); 
+    printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
+    printk("print_iommu_regs: DMAR_VER_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_VER_REG));
+    printk("print_iommu_regs: DMAR_CAP_REG = %lx\n",
+                   dmar_readq(iommu->reg,DMAR_CAP_REG));
+    printk("print_iommu_regs: n_fault_reg = %lx\n",
+                   cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_l = %lx\n",
+                   cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_h = %lx\n",
+                   cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) 
+ 8);
+    printk("print_iommu_regs: fault_recording_reg_l = %lx\n",
+        dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, 
DMAR_CAP_REG))));
+    printk("print_iommu_regs: fault_recording_reg_h = %lx\n",
+        dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, 
DMAR_CAP_REG)) + 8));
+    printk("print_iommu_regs: DMAR_ECAP_REG = %lx\n",
+                   dmar_readq(iommu->reg,DMAR_ECAP_REG));
+    printk("print_iommu_regs: DMAR_GCMD_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_GCMD_REG));
+    printk("print_iommu_regs: DMAR_GSTS_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_GSTS_REG));
+    printk("print_iommu_regs: DMAR_RTADDR_REG = %lx\n",
+                   dmar_readq(iommu->reg,DMAR_RTADDR_REG));
+    printk("print_iommu_regs: DMAR_CCMD_REG = %lx\n",
+                   dmar_readq(iommu->reg,DMAR_CCMD_REG));
+    printk("print_iommu_regs: DMAR_FSTS_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FSTS_REG));
+    printk("print_iommu_regs: DMAR_FECTL_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FECTL_REG));
+    printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FEDATA_REG));
+    printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FEADDR_REG));
+    printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FEUADDR_REG));
+}
+
+void print_vtd_entries(struct domain *d, int bus, int devfn,
+                       unsigned long gmfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct context_entry *ctxt_entry;
+    struct root_entry *root_entry;
+    u64 *l3, *l2, *l1;
+    u32 l3_index, l2_index, l1_index;
+    u32 i = 0;
+
+    printk("print_vtd_entries: domain_id = %x bdf = %x:%x:%x devfn = %x, gmfn 
= %lx\n", d->domain_id, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), devfn, gmfn);
+
+    for_each_drhd_unit(drhd) {
+        printk("---- print_vtd_entries %d ----\n", i++);
+
+        if (hd->pgd == NULL) {
+            printk("    hd->pgd == NULL\n");
+            return;
+        }
+
+        iommu = drhd->iommu;
+        root_entry = iommu->root_entry;
+        printk("    hd->pgd = %p virt_to_maddr(hd->pgd) = %lx\n",
+               hd->pgd, virt_to_maddr(hd->pgd));
+
+        printk("    root_entry = %p\n", root_entry);
+        if (root_entry == NULL) {
+            printk("    root_entry == NULL\n");
+            return;
+        }
+
+        printk("    root_entry[%x] = %lx\n", bus, root_entry[bus].val);
+        printk("    maddr_to_virt(root_entry[%x]) = %p\n",
+            bus, maddr_to_virt(root_entry[bus].val));
+
+        if (root_entry[bus].val == 0) {
+            printk("    root_entry[%x].lo == 0\n", bus);
+            return;
+        }
+ 
+        ctxt_entry = maddr_to_virt((root_entry[bus].val >> PAGE_SHIFT) << PAGE_SHIFT);
+        if (ctxt_entry == NULL) {
+            printk("    ctxt_entry == NULL\n");
+            return;
+        }
+
+        if (ctxt_entry[devfn].lo == 0) {
+            printk("    ctxt_entry[%x].lo == 0\n", devfn);
+            return;
+        }
+
+        printk("    context = %p\n", ctxt_entry);
+        printk("    context[%x] = %lx %lx\n",
+               devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo);
+        printk("    maddr_to_virt(context[%x].lo) = %p\n",
+               devfn, maddr_to_virt(ctxt_entry[devfn].lo));
+        printk("    context[%x] = %lx\n", devfn, ctxt_entry[devfn].lo); 
+
+        l3 = maddr_to_virt(ctxt_entry[devfn].lo);
+        l3 = (u64*)(((u64) l3 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        printk("    l3 = %p\n", l3); 
+        if (l3 == NULL) return;
+
+        l3_index = (gmfn >> 9 >> 9) & 0x1ff;
+        printk("    l3_index = %x\n", l3_index);
+        printk("    l3[%x] = %lx\n", l3_index, l3[l3_index]);
+
+        l2 = maddr_to_virt(l3[l3_index]);
+        l2 = (u64*)(((u64) l2 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        printk("    l2 = %p\n", l2); 
+        if (l2 == NULL) return;
+
+        l2_index = (gmfn >> 9) & 0x1ff;
+        printk("    gmfn = %lx\n", gmfn);
+        printk("    gmfn >> 9= %lx\n", gmfn >> 9);
+        printk("    l2_index = %x\n", l2_index);
+        printk("    l2[%x] = %lx\n", l2_index, l2[l2_index]);
+
+        l1 = maddr_to_virt(l2[l2_index]);
+        l1 = (u64*)(((u64) l1 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        if (l1 == NULL) return;
+        l1_index = gmfn & 0x1ff;
+        printk("    l1 = %p\n", l1); 
+        printk("    l1_index = %x\n", l1_index);
+        printk("    l1[%x] = %lx\n", l1_index, l1[l1_index]); 
+    }
+}
+
+#else    // !m64
+
+void print_iommu_regs(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu = drhd->iommu;
+ 
+    printk("---- print_iommu_regs ----\n"); 
+    printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
+    printk("print_iommu_regs: DMAR_VER_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_VER_REG));
+    printk("print_iommu_regs: DMAR_CAP_REG = %llx\n",
+                   dmar_readq(iommu->reg,DMAR_CAP_REG));
+    printk("print_iommu_regs: n_fault_reg = %llx\n",
+                   cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_l = %llx\n",
+                   cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_h = %llx\n",
+                   cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) 
+ 8);
+    printk("print_iommu_regs: fault_recording_reg_l = %llx\n",
+        dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, 
DMAR_CAP_REG))));
+    printk("print_iommu_regs: fault_recording_reg_h = %llx\n",
+        dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, 
DMAR_CAP_REG)) + 8));
+    printk("print_iommu_regs: DMAR_ECAP_REG = %llx\n",
+                   dmar_readq(iommu->reg,DMAR_ECAP_REG));
+    printk("print_iommu_regs: DMAR_GCMD_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_GCMD_REG));
+    printk("print_iommu_regs: DMAR_GSTS_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_GSTS_REG));
+    printk("print_iommu_regs: DMAR_RTADDR_REG = %llx\n",
+                   dmar_readq(iommu->reg,DMAR_RTADDR_REG));
+    printk("print_iommu_regs: DMAR_CCMD_REG = %llx\n",
+                   dmar_readq(iommu->reg,DMAR_CCMD_REG));
+    printk("print_iommu_regs: DMAR_FSTS_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FSTS_REG));
+    printk("print_iommu_regs: DMAR_FECTL_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FECTL_REG));
+    printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FEDATA_REG));
+    printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FEADDR_REG));
+    printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n",
+                   dmar_readl(iommu->reg,DMAR_FEUADDR_REG));
+}
+
+void print_vtd_entries(struct domain *d, int bus, int devfn,
+                       unsigned long gmfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct context_entry *ctxt_entry;
+    struct root_entry *root_entry;
+    u64 *l3, *l2, *l1;
+    u32 l3_index, l2_index, l1_index;
+    u32 i = 0;
+
+    printk("print_vtd_entries: domain_id = %x bdf = %x:%x:%x devfn = %x, gmfn 
= %lx\n", d->domain_id, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), devfn, gmfn);
+
+    for_each_drhd_unit(drhd) {
+        printk("---- print_vtd_entries %d ----\n", i++);
+
+        if (hd->pgd == NULL) {
+            printk("    hd->pgd == NULL\n");
+            return;
+        }
+
+        iommu = drhd->iommu;
+        root_entry = iommu->root_entry;
+        printk("    d->pgd = %p virt_to_maddr(hd->pgd) = %lx\n",
+               hd->pgd, virt_to_maddr(hd->pgd));
+
+        printk("    root_entry = %p\n", root_entry);
+        if (root_entry == NULL) {
+            printk("    root_entry == NULL\n");
+            return;
+        }
+
+        printk("    root_entry[%x] = %llx\n", bus, root_entry[bus].val);
+        printk("    maddr_to_virt(root_entry[%x]) = %p\n",
+            bus, maddr_to_virt(root_entry[bus].val));
+
+        if (root_entry[bus].val == 0) {
+            printk("    root_entry[%x].lo == 0\n", bus);
+            return;
+        }
+ 
+        ctxt_entry = maddr_to_virt((root_entry[bus].val >> PAGE_SHIFT) << PAGE_SHIFT);
+        if (ctxt_entry == NULL) {
+            printk("    ctxt_entry == NULL\n");
+            return;
+        }
+
+        if (ctxt_entry[devfn].lo == 0) {
+            printk("    ctxt_entry[%x].lo == 0\n", devfn);
+            return;
+        }
+
+        printk("    context = %p\n", ctxt_entry);
+        printk("    context[%x] = %llx %llx\n",
+               devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo);
+        printk("    maddr_to_virt(context[%x].lo) = %p\n",
+               devfn, maddr_to_virt(ctxt_entry[devfn].lo));
+        printk("    context[%x] = %llx\n", devfn, ctxt_entry[devfn].lo); 
+
+        l3 = maddr_to_virt(ctxt_entry[devfn].lo);
+        l3 = (u64*)(((u32) l3 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        printk("    l3 = %p\n", l3); 
+        if (l3 == NULL) return;
+
+        l3_index = (gmfn >> 9 >> 9) & 0x1ff;
+        printk("    l3_index = %x\n", l3_index);
+        printk("    l3[%x] = %llx\n", l3_index, l3[l3_index]);
+
+        l2 = maddr_to_virt(l3[l3_index]);
+        l2 = (u64*)(((u32) l2 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        printk("    l2 = %p\n", l2); 
+        if (l2 == NULL) return;
+
+        l2_index = (gmfn >> 9) & 0x1ff;
+        printk("    gmfn = %lx\n", gmfn);
+        printk("    gmfn >> 9= %lx\n", gmfn >> 9);
+        printk("    l2_index = %x\n", l2_index);
+        printk("    l2[%x] = %llx\n", l2_index, l2[l2_index]);
+
+        l1 = maddr_to_virt(l2[l2_index]);
+        l1 = (u64*)(((u32) l1 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        if (l1 == NULL) return;
+        l1_index = gmfn & 0x1ff;
+        printk("    l1 = %p\n", l1); 
+        printk("    l1_index = %x\n", l1_index);
+        printk("    l1[%x] = %llx\n", l1_index, l1[l1_index]); 
+    }
+}
+#endif    // !m64
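
print_vtd_entries() above walks a 3-level VT-d page table (a 39-bit guest address width: 3 levels of 9 bits each plus 12 bits of page offset), splitting the guest frame number into three 9-bit indices with the shift/mask pattern seen in the l3_index/l2_index/l1_index assignments. A standalone illustration of just that arithmetic (not part of the changeset):

#include <stdio.h>

int main(void)
{
    unsigned long gpa  = 0x12345678UL;   /* arbitrary guest-physical address */
    unsigned long gmfn = gpa >> 12;      /* 4K guest frame number */

    unsigned int l3_index = (gmfn >> 18) & 0x1ff;  /* gpa bits 38..30 */
    unsigned int l2_index = (gmfn >> 9)  & 0x1ff;  /* gpa bits 29..21 */
    unsigned int l1_index = gmfn & 0x1ff;          /* gpa bits 20..12 */

    printf("gmfn = %lx  l3 = %x  l2 = %x  l1 = %x\n",
           gmfn, l3_index, l2_index, l1_index);
    return 0;
}

For gpa = 0x12345678 this prints gmfn = 12345, l3 = 0, l2 = 91, l1 = 145, matching the index computations used in the debug walk above.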

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
