# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1189784449 -3600
# Node ID f4bbd3f327e4308aa2aebf5484fc32d1d1ff4b41
# Parent acfa9290746f9c00e30dca7a62e9f7a96702b3b5
Intel VT-d specific changes in arch/x86/hvm/vmx/vtd.
Signed-off-by: Allen Kay <allen.m.kay@xxxxxxxxx>
Signed-off-by: Guy Zana <guy@xxxxxxxxxxxx>
---
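Note (not part of the diff below): dmar.c parses the ACPI DMAR table into the
acpi_drhd_units / acpi_rmrr_units / acpi_atsr_units lists, and dmar.h adds
iteration helpers over them. A minimal usage sketch follows, assuming a
hypothetical caller inside intel-iommu.c; the macros come from dmar.h and
iommu_prepare_rmrr_dev from intel-iommu.c in this patch, while dom0, gdprintk
and VTDPREFIX are existing Xen symbols. Note that for_each_rmrr_device() opens
two scopes which end_for_each_rmrr_device() must close:

    static void setup_dom0_rmrr_sketch(void)
    {
        struct acpi_rmrr_unit *rmrr;
        struct pci_dev *pdev;
        int ret;

        /* walk every RMRR and every device covered by it */
        for_each_rmrr_device(rmrr, pdev)
            ret = iommu_prepare_rmrr_dev(dom0, rmrr, pdev);
            if (ret)
                gdprintk(XENLOG_ERR VTDPREFIX,
                    "IOMMU: mapping reserved region failed\n");
        end_for_each_rmrr_device(rmrr, pdev)
    }
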
xen/arch/x86/hvm/vmx/vtd/Makefile | 4
xen/arch/x86/hvm/vmx/vtd/dmar.c | 494 ++++++++
xen/arch/x86/hvm/vmx/vtd/dmar.h | 90 +
xen/arch/x86/hvm/vmx/vtd/intel-iommu.c | 1927 +++++++++++++++++++++++++++++++++
xen/arch/x86/hvm/vmx/vtd/io.c | 120 ++
xen/arch/x86/hvm/vmx/vtd/msi.h | 128 ++
xen/arch/x86/hvm/vmx/vtd/pci-direct.h | 48
xen/arch/x86/hvm/vmx/vtd/pci_regs.h | 449 +++++++
xen/arch/x86/hvm/vmx/vtd/utils.c | 302 +++++
9 files changed, 3562 insertions(+)
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/Makefile Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,4 @@
+obj-y += intel-iommu.o
+obj-y += dmar.o
+obj-y += utils.o
+obj-y += io.o
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/dmar.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/dmar.c Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@xxxxxxxxx>
+ * Copyright (C) Shaohua Li <shaohua.li@xxxxxxxxx>
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx> - adapted to xen
+ */
+
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/kernel.h>
+#include <xen/acpi.h>
+#include <xen/mm.h>
+#include <xen/xmalloc.h>
+#include <asm/string.h>
+#include "dmar.h"
+#include "pci-direct.h"
+#include "pci_regs.h"
+
+#undef PREFIX
+#define PREFIX VTDPREFIX "ACPI DMAR:"
+#define DEBUG
+
+#define MIN_SCOPE_LEN (sizeof(struct acpi_pci_path) + sizeof(struct acpi_dev_scope))
+
+LIST_HEAD(acpi_drhd_units);
+LIST_HEAD(acpi_rmrr_units);
+LIST_HEAD(acpi_atsr_units);
+LIST_HEAD(acpi_ioapic_units);
+
+u8 dmar_host_address_width;
+
+static int __init acpi_register_drhd_unit(struct acpi_drhd_unit *drhd)
+{
+ /*
+     * add INCLUDE_ALL at the tail, so a scan of the list will find it at
+ * the very end.
+ */
+ if (drhd->include_all)
+ list_add_tail(&drhd->list, &acpi_drhd_units);
+ else
+ list_add(&drhd->list, &acpi_drhd_units);
+ return 0;
+}
+
+static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr)
+{
+ list_add(&rmrr->list, &acpi_rmrr_units);
+ return 0;
+}
+
+static int acpi_pci_device_match(struct pci_dev *devices, int cnt,
+ struct pci_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ if ((dev->bus == devices->bus) &&
+ (dev->devfn == devices->devfn))
+ return 1;
+ devices++;
+ }
+ return 0;
+}
+
+static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr)
+{
+ /*
+     * add ALL_PORTS at the tail, so a scan of the list will find it at
+ * the very end.
+ */
+ if (atsr->all_ports)
+ list_add_tail(&atsr->list, &acpi_atsr_units);
+ else
+ list_add(&atsr->list, &acpi_atsr_units);
+ return 0;
+}
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(struct pci_dev *dev)
+{
+ struct acpi_drhd_unit *drhd;
+ struct acpi_drhd_unit *include_all_drhd;
+
+ include_all_drhd = NULL;
+ list_for_each_entry(drhd, &acpi_drhd_units, list) {
+ if (drhd->include_all)
+ include_all_drhd = drhd;
+ if (acpi_pci_device_match(drhd->devices,
+ drhd->devices_cnt, dev))
+ {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "acpi_find_matched_drhd_unit: drhd->address = %lx\n",
+ drhd->address);
+ return drhd;
+ }
+ }
+
+ if (include_all_drhd) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "acpi_find_matched_drhd_unit:include_all_drhd->addr = %lx\n",
+ include_all_drhd->address);
+        return include_all_drhd;
+ }
+
+ return(NULL);
+}
+
+struct acpi_rmrr_unit * acpi_find_matched_rmrr_unit(struct pci_dev *dev)
+{
+ struct acpi_rmrr_unit *rmrr;
+
+ list_for_each_entry(rmrr, &acpi_rmrr_units, list) {
+ if (acpi_pci_device_match(rmrr->devices,
+ rmrr->devices_cnt, dev))
+ goto out;
+ }
+ rmrr = NULL;
+out:
+ return rmrr;
+}
+
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(struct pci_dev *dev)
+{
+ struct acpi_atsr_unit *atsru;
+ struct acpi_atsr_unit *all_ports_atsru;
+
+ all_ports_atsru = NULL;
+ list_for_each_entry(atsru, &acpi_atsr_units, list) {
+ if (atsru->all_ports)
+ all_ports_atsru = atsru;
+ if (acpi_pci_device_match(atsru->devices, atsru->devices_cnt, dev))
+ return atsru;
+ }
+ if (all_ports_atsru) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "acpi_find_matched_atsr_unit: all_ports_atsru\n");
+        return all_ports_atsru;
+ }
+ return(NULL);
+}
+
+static int __init acpi_parse_dev_scope(void *start, void *end, int *cnt,
+ struct pci_dev **devices)
+{
+ struct acpi_dev_scope *scope;
+ u8 bus, sub_bus, sec_bus;
+ struct acpi_pci_path *path;
+ struct acpi_ioapic_unit *acpi_ioapic_unit = NULL;
+ int count, dev_count=0;
+ struct pci_dev *pdev;
+ u8 dev, func;
+ u32 l;
+ void *tmp;
+
+ *cnt = 0;
+ tmp = start;
+ while (start < end) {
+ scope = start;
+ if (scope->length < MIN_SCOPE_LEN ||
+ (scope->dev_type != ACPI_DEV_ENDPOINT &&
+ scope->dev_type != ACPI_DEV_P2PBRIDGE)) {
+ printk(KERN_WARNING PREFIX "Invalid device scope\n");
+ return -EINVAL;
+ }
+ (*cnt)++;
+ start += scope->length;
+ }
+
+ start = tmp;
+ while (start < end) {
+ scope = start;
+ path = (struct acpi_pci_path *)(scope + 1);
+ count = (scope->length - sizeof(struct acpi_dev_scope))
+ /sizeof(struct acpi_pci_path);
+ bus = scope->start_bus;
+
+ while (--count) {
+ bus = read_pci_config_byte(bus, path->dev,
+ path->fn, PCI_SECONDARY_BUS);
+ path++;
+ }
+
+ if (scope->dev_type == ACPI_DEV_ENDPOINT) {
+ printk(KERN_WARNING PREFIX
+ "found endpoint: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+ dev_count++;
+ } else if (scope->dev_type == ACPI_DEV_P2PBRIDGE) {
+ printk(KERN_WARNING PREFIX
+ "found bridge: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+
+ sec_bus = read_pci_config_byte(bus, path->dev,
+ path->fn, PCI_SECONDARY_BUS);
+ sub_bus = read_pci_config_byte(bus, path->dev,
+ path->fn, PCI_SUBORDINATE_BUS);
+ while (sec_bus <= sub_bus) {
+ for (dev = 0; dev < 32; dev++) {
+ for (func = 0; func < 8; func++) {
+ l = read_pci_config(sec_bus, dev, func, PCI_VENDOR_ID);
+
+                    /* some broken boards return 0 or ~0 if a slot is empty: */
+ if (l == 0xffffffff || l == 0x00000000 ||
+ l == 0x0000ffff || l == 0xffff0000)
+ break;
+ dev_count++;
+ }
+ }
+ sec_bus++;
+ }
+ } else if (scope->dev_type == ACPI_DEV_IOAPIC) {
+ printk(KERN_WARNING PREFIX
+ "found IOAPIC: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+ dev_count++;
+ } else {
+ printk(KERN_WARNING PREFIX
+ "found MSI HPET: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+ dev_count++;
+ }
+
+ start += scope->length;
+ }
+
+ *cnt = dev_count;
+ *devices = xmalloc_array(struct pci_dev, *cnt);
+ if (!*devices)
+ return -ENOMEM;
+ memset(*devices, 0, sizeof(struct pci_dev) * (*cnt));
+
+ pdev = *devices;
+ start = tmp;
+ while (start < end) {
+ scope = start;
+ path = (struct acpi_pci_path *)(scope + 1);
+ count = (scope->length - sizeof(struct acpi_dev_scope))
+ /sizeof(struct acpi_pci_path);
+ bus = scope->start_bus;
+
+ while (--count) {
+            bus = read_pci_config_byte(bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+ path++;
+ }
+
+ if (scope->dev_type == ACPI_DEV_ENDPOINT) {
+ printk(KERN_WARNING PREFIX
+ "found endpoint: bdf = %x:%x:%x\n", bus, path->dev, path->fn);
+
+ pdev->bus = bus;
+ pdev->devfn = PCI_DEVFN(path->dev, path->fn);
+ pdev++;
+ } else if (scope->dev_type == ACPI_DEV_P2PBRIDGE) {
+ printk(KERN_WARNING PREFIX
+                "found bridge: bus = %x dev = %x func = %x\n", bus, path->dev, path->fn);
+
+            sec_bus = read_pci_config_byte(bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+            sub_bus = read_pci_config_byte(bus, path->dev, path->fn, PCI_SUBORDINATE_BUS);
+
+ while (sec_bus <= sub_bus) {
+ for (dev = 0; dev < 32; dev++) {
+ for (func = 0; func < 8; func++) {
+ l = read_pci_config(sec_bus, dev, func, PCI_VENDOR_ID);
+
+                    /* some broken boards return 0 or ~0 if a slot is empty: */
+ if (l == 0xffffffff || l == 0x00000000 ||
+ l == 0x0000ffff || l == 0xffff0000)
+ break;
+
+ pdev->bus = sec_bus;
+ pdev->devfn = PCI_DEVFN(dev, func);
+ pdev++;
+ }
+ }
+ sec_bus++;
+ }
+ } else if (scope->dev_type == ACPI_DEV_IOAPIC) {
+ acpi_ioapic_unit = xmalloc(struct acpi_ioapic_unit);
+ acpi_ioapic_unit->apic_id = scope->enum_id;
+ acpi_ioapic_unit->ioapic.bdf.bus = bus;
+ acpi_ioapic_unit->ioapic.bdf.dev = path->dev;
+ acpi_ioapic_unit->ioapic.bdf.func = path->fn;
+ list_add(&acpi_ioapic_unit->list, &acpi_ioapic_units);
+ printk(KERN_WARNING PREFIX
+                "found IOAPIC: bus = %x dev = %x func = %x\n", bus, path->dev, path->fn);
+ } else {
+ printk(KERN_WARNING PREFIX
+                "found MSI HPET: bus = %x dev = %x func = %x\n", bus, path->dev, path->fn);
+ }
+
+ start += scope->length;
+ }
+
+ return 0;
+}
+
+static int __init
+acpi_parse_one_drhd(struct acpi_dmar_entry_header *header)
+{
+ struct acpi_table_drhd * drhd = (struct acpi_table_drhd *)header;
+ struct acpi_drhd_unit *dmaru;
+ int ret = 0;
+ static int include_all;
+
+ dmaru = xmalloc(struct acpi_drhd_unit);
+ if (!dmaru)
+ return -ENOMEM;
+ memset(dmaru, 0, sizeof(struct acpi_drhd_unit));
+
+ dmaru->address = drhd->address;
+ dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
+ printk(KERN_WARNING PREFIX "dmaru->address = %lx\n", dmaru->address);
+
+ if (!dmaru->include_all) {
+ ret = acpi_parse_dev_scope((void *)(drhd + 1),
+ ((void *)drhd) + header->length,
+ &dmaru->devices_cnt, &dmaru->devices);
+ }
+ else {
+ printk(KERN_WARNING PREFIX "found INCLUDE_ALL\n");
+ /* Only allow one INCLUDE_ALL */
+ if (include_all) {
+ printk(KERN_WARNING PREFIX "Only one INCLUDE_ALL "
+ "device scope is allowed\n");
+ ret = -EINVAL;
+ }
+ include_all = 1;
+ }
+
+ if (ret)
+ xfree(dmaru);
+ else
+ acpi_register_drhd_unit(dmaru);
+ return ret;
+}
+
+static int __init
+acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header)
+{
+ struct acpi_table_rmrr *rmrr = (struct acpi_table_rmrr *)header;
+ struct acpi_rmrr_unit *rmrru;
+ int ret = 0;
+
+ rmrru = xmalloc(struct acpi_rmrr_unit);
+ if (!rmrru)
+ return -ENOMEM;
+ memset(rmrru, 0, sizeof(struct acpi_rmrr_unit));
+
+#ifdef VTD_DEBUG
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "acpi_parse_one_rmrr: base = %lx end = %lx\n",
+ rmrr->base_address, rmrr->end_address);
+#endif
+
+ rmrru->base_address = rmrr->base_address;
+ rmrru->end_address = rmrr->end_address;
+ ret = acpi_parse_dev_scope((void *)(rmrr + 1),
+ ((void*)rmrr) + header->length,
+ &rmrru->devices_cnt, &rmrru->devices);
+
+ if (ret || (rmrru->devices_cnt == 0))
+ xfree(rmrru);
+ else
+ acpi_register_rmrr_unit(rmrru);
+ return ret;
+}
+
+static int __init
+acpi_parse_one_atsr(struct acpi_dmar_entry_header *header)
+{
+ struct acpi_table_atsr *atsr = (struct acpi_table_atsr *)header;
+ struct acpi_atsr_unit *atsru;
+ int ret = 0;
+ static int all_ports;
+
+ atsru = xmalloc(struct acpi_atsr_unit);
+ if (!atsru)
+ return -ENOMEM;
+ memset(atsru, 0, sizeof(struct acpi_atsr_unit));
+
+ atsru->all_ports = atsr->flags & 1; /* BIT0: ALL_PORTS */
+ if (!atsru->all_ports) {
+ ret = acpi_parse_dev_scope((void *)(atsr + 1),
+ ((void *)atsr) + header->length,
+ &atsru->devices_cnt, &atsru->devices);
+ }
+ else {
+ printk(KERN_WARNING PREFIX "found ALL_PORTS\n");
+ /* Only allow one ALL_PORTS */
+ if (all_ports) {
+ printk(KERN_WARNING PREFIX "Only one ALL_PORTS "
+ "device scope is allowed\n");
+ ret = -EINVAL;
+ }
+ all_ports = 1;
+ }
+
+ if (ret)
+        xfree(atsru);
+ else
+ acpi_register_atsr_unit(atsru);
+ return ret;
+}
+
+static void __init
+acpi_table_print_dmar_entry(struct acpi_dmar_entry_header *header)
+{
+ struct acpi_table_drhd *drhd;
+ struct acpi_table_rmrr *rmrr;
+
+ switch (header->type) {
+ case ACPI_DMAR_DRHD:
+ drhd = (struct acpi_table_drhd *)header;
+ break;
+ case ACPI_DMAR_RMRR:
+ rmrr = (struct acpi_table_rmrr *)header;
+ break;
+ }
+}
+
+static int __init
+acpi_parse_dmar(unsigned long phys_addr, unsigned long size)
+{
+ struct acpi_table_dmar *dmar = NULL;
+ struct acpi_dmar_entry_header *entry_header;
+ int ret = 0;
+
+ if (!phys_addr || !size)
+ return -EINVAL;
+
+ dmar = (struct acpi_table_dmar *)__acpi_map_table(phys_addr, size);
+ if (!dmar) {
+ printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
+ return -ENODEV;
+ }
+
+ if (!dmar->haw) {
+ printk (KERN_WARNING PREFIX "Zero: Invalid DMAR haw\n");
+ return -EINVAL;
+ }
+
+ dmar_host_address_width = dmar->haw;
+ printk (KERN_INFO PREFIX "Host address width %d\n",
+ dmar_host_address_width);
+
+ entry_header = (struct acpi_dmar_entry_header *)(dmar + 1);
+ while (((unsigned long)entry_header) < (((unsigned long)dmar) + size)) {
+ acpi_table_print_dmar_entry(entry_header);
+
+ switch (entry_header->type) {
+ case ACPI_DMAR_DRHD:
+ printk (KERN_INFO PREFIX "found ACPI_DMAR_DRHD\n");
+ ret = acpi_parse_one_drhd(entry_header);
+ break;
+ case ACPI_DMAR_RMRR:
+ printk (KERN_INFO PREFIX "found ACPI_DMAR_RMRR\n");
+ ret = acpi_parse_one_rmrr(entry_header);
+ break;
+ case ACPI_DMAR_ATSR:
+            printk (KERN_INFO PREFIX "found ACPI_DMAR_ATSR\n");
+ ret = acpi_parse_one_atsr(entry_header);
+ break;
+ default:
+ printk(KERN_WARNING PREFIX "Unknown DMAR structure type\n");
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ break;
+
+ entry_header = ((void *)entry_header + entry_header->length);
+ }
+ return ret;
+}
+
+int acpi_dmar_init(void)
+{
+ acpi_table_parse(ACPI_DMAR, acpi_parse_dmar);
+ if (list_empty(&acpi_drhd_units)) {
+ printk(KERN_ERR PREFIX "No DMAR devices found\n");
+ return -ENODEV;
+ } else
+ vtd_enabled = 1;
+ return 0;
+}
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/dmar.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/dmar.h Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@xxxxxxxxx>
+ * Copyright (C) Shaohua Li <shaohua.li@xxxxxxxxx>
+ */
+
+#ifndef _DMAR_H_
+#define _DMAR_H_
+
+#include <xen/list.h>
+#include <asm/iommu.h>
+
+extern u8 dmar_host_address_width;
+
+struct acpi_drhd_unit {
+ struct list_head list;
+ unsigned long address; /* register base address of the unit */
+ struct pci_dev *devices; /* target devices */
+ int devices_cnt;
+ u8 include_all:1;
+ struct iommu *iommu;
+};
+
+struct acpi_rmrr_unit {
+ struct list_head list;
+ unsigned long base_address;
+ unsigned long end_address;
+ struct pci_dev *devices; /* target devices */
+ int devices_cnt;
+ u8 allow_all:1;
+};
+
+struct acpi_atsr_unit {
+ struct list_head list;
+ struct pci_dev *devices; /* target devices */
+ int devices_cnt;
+ u8 all_ports:1;
+};
+
+#define for_each_iommu(domain, iommu) \
+ list_for_each_entry(iommu, \
+ &(domain->arch.hvm_domain.hvm_iommu.iommu_list), list)
+
+#define for_each_pdev(domain, pdev) \
+ list_for_each_entry(pdev, \
+ &(domain->arch.hvm_domain.hvm_iommu.pdev_list), list)
+
+#define for_each_drhd_unit(drhd) \
+ list_for_each_entry(drhd, &acpi_drhd_units, list)
+#define for_each_rmrr_device(rmrr, pdev) \
+ list_for_each_entry(rmrr, &acpi_rmrr_units, list) { \
+ int _i; \
+ for (_i = 0; _i < rmrr->devices_cnt; _i++) { \
+ pdev = &(rmrr->devices[_i]);
+#define end_for_each_rmrr_device(rmrr, pdev) \
+ } \
+ }
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(struct pci_dev *dev);
+struct acpi_rmrr_unit * acpi_find_matched_rmrr_unit(struct pci_dev *dev);
+
+/* This one is for interrupt remapping */
+struct acpi_ioapic_unit {
+ struct list_head list;
+ int apic_id;
+ union {
+ u16 info;
+ struct {
+ u16 bus: 8,
+ dev: 5,
+ func: 3;
+ }bdf;
+ }ioapic;
+};
+
+#endif // _DMAR_H_
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/intel-iommu.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/intel-iommu.c Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,1927 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@xxxxxxxxx>
+ * Copyright (C) Shaohua Li <shaohua.li@xxxxxxxxx>
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx> - adapted to xen
+ */
+
+#include <xen/init.h>
+#include <xen/irq.h>
+#include <xen/spinlock.h>
+#include <xen/sched.h>
+#include <xen/xmalloc.h>
+#include <xen/domain_page.h>
+#include <asm/delay.h>
+#include <asm/string.h>
+#include <asm/iommu.h>
+#include <asm/hvm/vmx/intel-iommu.h>
+#include "dmar.h"
+#include "pci-direct.h"
+#include "pci_regs.h"
+#include "msi.h"
+
+extern void print_iommu_regs(struct acpi_drhd_unit *drhd);
+extern void print_vtd_entries(struct domain *d, int bus, int devfn,
+ unsigned long gmfn);
+extern void (*interrupt[])(void);
+
+#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
+
+#define time_after(a,b) \
+ (typecheck(unsigned long, a) && \
+ typecheck(unsigned long, b) && \
+ ((long)(b) - (long)(a) < 0))
+
+unsigned int x86_clflush_size;
+void clflush_cache_range(void *adr, int size)
+{
+ int i;
+ for (i = 0; i < size; i += x86_clflush_size)
+ clflush(adr + i);
+}
+
+static void __iommu_flush_cache(struct iommu *iommu, void *addr, int size)
+{
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(addr, size);
+}
+
+#define iommu_flush_cache_entry(iommu, addr) \
+ __iommu_flush_cache(iommu, addr, 8)
+#define iommu_flush_cache_page(iommu, addr) \
+ __iommu_flush_cache(iommu, addr, PAGE_SIZE_4K)
+
+int nr_iommus;
+/* context entry handling */
+static struct context_entry * device_to_context_entry(struct iommu *iommu,
+ u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ unsigned long phy_addr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ if (!root_present(*root)) {
+ phy_addr = (unsigned long) alloc_xenheap_page();
+ if (!phy_addr) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return NULL;
+ }
+ memset((void *) phy_addr, 0, PAGE_SIZE);
+ iommu_flush_cache_page(iommu, (void *)phy_addr);
+ phy_addr = virt_to_maddr((void *)phy_addr);
+ set_root_value(*root, phy_addr);
+ set_root_present(*root);
+ iommu_flush_cache_entry(iommu, root);
+ }
+ phy_addr = (unsigned long) get_context_addr(*root);
+ context = (struct context_entry *)maddr_to_virt(phy_addr);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return &context[devfn];
+}
+
+static int device_context_mapped(struct iommu *iommu, u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ unsigned long phy_addr;
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ if (!root_present(*root)) {
+ ret = 0;
+ goto out;
+ }
+ phy_addr = get_context_addr(*root);
+ context = (struct context_entry *)maddr_to_virt(phy_addr);
+ ret = context_present(context[devfn]);
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return ret;
+}
+
+/* page table handling */
+#define LEVEL_STRIDE (9)
+#define LEVEL_MASK ((1 << LEVEL_STRIDE) - 1)
+#define agaw_to_level(val) ((val) + 2)
+#define agaw_to_width(val) (30 + val * LEVEL_STRIDE)
+#define width_to_agaw(w) ((w - 30)/LEVEL_STRIDE)
+#define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE)
+#define address_level_offset(addr, level) \
+ ((addr >> level_to_offset_bits(level)) & LEVEL_MASK)
+#define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l))
+#define level_size(l) (1 << level_to_offset_bits(l))
+#define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l))
+static struct dma_pte * addr_to_dma_pte(struct domain *domain, u64 addr)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(domain);
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int addr_width = agaw_to_width(hd->agaw);
+ struct dma_pte *parent, *pte = NULL, *pgd;
+ int level = agaw_to_level(hd->agaw);
+ int offset;
+ unsigned long flags;
+
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+ iommu = drhd->iommu;
+
+ addr &= (((u64)1) << addr_width) - 1;
+ spin_lock_irqsave(&hd->mapping_lock, flags);
+ if (!hd->pgd) {
+ pgd = (struct dma_pte *)alloc_xenheap_page();
+ if (!pgd && !hd->pgd) {
+ spin_unlock_irqrestore(&hd->mapping_lock, flags);
+ return NULL;
+ }
+ memset((u8*)pgd, 0, PAGE_SIZE);
+ if (!hd->pgd)
+ hd->pgd = pgd;
+ else /* somebody is fast */
+ free_xenheap_page((void *) pgd);
+ }
+ parent = hd->pgd;
+ while (level > 0) {
+ u8 *tmp;
+ offset = address_level_offset(addr, level);
+ pte = &parent[offset];
+ if (level == 1)
+ break;
+ if (dma_pte_addr(*pte) == 0) {
+ tmp = alloc_xenheap_page();
+ if (tmp == NULL)
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "addr_to_dma_pte: tmp == NULL\n");
+
+ memset(tmp, 0, PAGE_SIZE);
+ iommu_flush_cache_page(iommu, tmp);
+
+ if (!tmp && dma_pte_addr(*pte) == 0) {
+ spin_unlock_irqrestore(&hd->mapping_lock, flags);
+ return NULL;
+ }
+ if (dma_pte_addr(*pte) == 0) {
+ dma_set_pte_addr(*pte,
+ virt_to_maddr(tmp));
+ /*
+ * high level table always sets r/w, last level
+ * page table control read/write
+ */
+ dma_set_pte_readable(*pte);
+ dma_set_pte_writable(*pte);
+ iommu_flush_cache_entry(iommu, pte);
+ } else /* somebody is fast */
+ free_xenheap_page(tmp);
+ }
+ parent = maddr_to_virt(dma_pte_addr(*pte));
+ level--;
+ }
+ spin_unlock_irqrestore(&hd->mapping_lock, flags);
+ return pte;
+}
+
+/* return address's pte at specific level */
+static struct dma_pte *dma_addr_level_pte(struct domain *domain, u64 addr,
+ int level)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(domain);
+ struct dma_pte *parent, *pte = NULL;
+ int total = agaw_to_level(hd->agaw);
+ int offset;
+
+ parent = hd->pgd;
+ while (level <= total) {
+ offset = address_level_offset(addr, total);
+ pte = &parent[offset];
+ if (level == total)
+ return pte;
+
+ if (dma_pte_addr(*pte) == 0)
+ break;
+ parent = maddr_to_virt(dma_pte_addr(*pte));
+ total--;
+ }
+ return NULL;
+}
+
+static void iommu_flush_write_buffer(struct iommu *iommu)
+{
+ u32 val;
+ unsigned long flag;
+ unsigned long start_time;
+
+ if (!cap_rwbf(iommu->cap))
+ return;
+ val = iommu->gcmd | DMA_GCMD_WBF;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
+
+ /* Make sure hardware complete it */
+ start_time = jiffies;
+ while (1) {
+ val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+ if (!(val & DMA_GSTS_WBFS))
+ break;
+ if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))
+            panic("DMAR hardware is malfunctioning, please disable IOMMU\n");
+ cpu_relax();
+ }
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determines if we need a write buffer flush */
+static int __iommu_flush_context(struct iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask, u64 type,
+ int non_present_entry_flush)
+{
+ u64 val = 0;
+ unsigned long flag;
+ unsigned long start_time;
+
+ /*
+     * In the non-present entry flush case, if hardware doesn't cache
+     * non-present entries we do nothing; if hardware does cache non-present
+     * entries, we flush the entries of domain 0 (the domain id used to
+     * cache any non-present entries).
+ */
+ if (non_present_entry_flush) {
+ if (!cap_caching_mode(iommu->cap))
+ return 1;
+ else
+ did = 0;
+ }
+
+ /* use register invalidation */
+ switch (type)
+ {
+ case DMA_CCMD_GLOBAL_INVL:
+ val = DMA_CCMD_GLOBAL_INVL;
+ break;
+ case DMA_CCMD_DOMAIN_INVL:
+ val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+ break;
+ case DMA_CCMD_DEVICE_INVL:
+ val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+ |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
+ break;
+ default:
+ BUG();
+ }
+ val |= DMA_CCMD_ICC;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
+
+ /* Make sure hardware complete it */
+ start_time = jiffies;
+ while (1) {
+ val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
+ if (!(val & DMA_CCMD_ICC))
+ break;
+ if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))
+            panic("DMAR hardware is malfunctioning, please disable IOMMU\n");
+ cpu_relax();
+ }
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+    /* flushing the context entry will implicitly flush the write buffer */
+ return 0;
+}
+
+static int inline iommu_flush_context_global(struct iommu *iommu,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
+ non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_domain(struct iommu *iommu, u16 did,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
+ non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_device(struct iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, did, source_id, function_mask,
+ DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
+}
+
+/* return value determines if we need a write buffer flush */
+static int __iommu_flush_iotlb(struct iommu *iommu, u16 did,
+ u64 addr, unsigned int size_order, u64 type,
+ int non_present_entry_flush)
+{
+ int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+ u64 val = 0, val_iva = 0;
+ unsigned long flag;
+ unsigned long start_time;
+
+ /*
+     * In the non-present entry flush case, if hardware doesn't cache
+     * non-present entries we do nothing; if hardware does cache non-present
+     * entries, we flush the entries of domain 0 (the domain id used to
+     * cache any non-present entries).
+ */
+ if (non_present_entry_flush) {
+ if (!cap_caching_mode(iommu->cap))
+ return 1;
+ else
+ did = 0;
+ }
+
+ /* use register invalidation */
+ switch (type) {
+ case DMA_TLB_GLOBAL_FLUSH:
+        /* global flush doesn't need to set IVA_REG */
+ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+ break;
+ case DMA_TLB_DSI_FLUSH:
+ val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ break;
+ case DMA_TLB_PSI_FLUSH:
+ val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ /* Note: always flush non-leaf currently */
+ val_iva = size_order | addr;
+ break;
+ default:
+ BUG();
+ }
+ /* Note: set drain read/write */
+#if 0
+ /*
+     * This is probably just to be extra safe. Looks like we can
+ * ignore it without any impact.
+ */
+ if (cap_read_drain(iommu->cap))
+ val |= DMA_TLB_READ_DRAIN;
+#endif
+ if (cap_write_drain(iommu->cap))
+ val |= DMA_TLB_WRITE_DRAIN;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ /* Note: Only uses first TLB reg currently */
+ if (val_iva)
+ dmar_writeq(iommu->reg, tlb_offset, val_iva);
+ dmar_writeq(iommu->reg, tlb_offset + 8, val);
+
+ /* Make sure hardware complete it */
+ start_time = jiffies;
+ while (1) {
+ val = dmar_readq(iommu->reg, tlb_offset + 8);
+ if (!(val & DMA_TLB_IVT))
+ break;
+ if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))
+            panic("DMAR hardware is malfunctioning, please disable IOMMU\n");
+ cpu_relax();
+ }
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ /* check IOTLB invalidation granularity */
+ if (DMA_TLB_IAIG(val) == 0)
+ printk(KERN_ERR VTDPREFIX "IOMMU: flush IOTLB failed\n");
+ if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+        printk(KERN_ERR VTDPREFIX "IOMMU: tlb flush request %x, actual %x\n",
+ (u32)DMA_TLB_IIRG(type), (u32)DMA_TLB_IAIG(val));
+    /* flushing the iotlb will implicitly flush the write buffer */
+ return 0;
+}
+
+static int inline iommu_flush_iotlb_global(struct iommu *iommu,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
+ non_present_entry_flush);
+}
+
+static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
+ non_present_entry_flush);
+}
+
+static int inline get_alignment(u64 base, unsigned int size)
+{
+ int t = 0;
+ u64 end;
+
+ end = base + size - 1;
+ while (base != end) {
+ t++;
+ base >>= 1;
+ end >>= 1;
+ }
+ return t;
+}
+
+static int inline iommu_flush_iotlb_psi(struct iommu *iommu, u16 did,
+ u64 addr, unsigned int pages, int non_present_entry_flush)
+{
+ unsigned int align;
+
+ BUG_ON(addr & (~PAGE_MASK_4K));
+ BUG_ON(pages == 0);
+
+ /* Fallback to domain selective flush if no PSI support */
+ if (!cap_pgsel_inv(iommu->cap))
+ return iommu_flush_iotlb_dsi(iommu, did,
+ non_present_entry_flush);
+
+ /*
+     * PSI requires the number of pages to be 2 ^ x, and the base address
+     * to be naturally aligned to that size
+ */
+ align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
+ /* Fallback to domain selective flush if size is too big */
+ if (align > cap_max_amask_val(iommu->cap))
+ return iommu_flush_iotlb_dsi(iommu, did,
+ non_present_entry_flush);
+
+ addr >>= PAGE_SHIFT_4K + align;
+ addr <<= PAGE_SHIFT_4K + align;
+
+ return __iommu_flush_iotlb(iommu, did, addr, align,
+ DMA_TLB_PSI_FLUSH, non_present_entry_flush);
+}
+
+void flush_all(void)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int i = 0;
+
+ wbinvd();
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ iommu_flush_context_global(iommu, 0);
+ iommu_flush_iotlb_global(iommu, 0);
+ i++;
+ }
+}
+
+/* clear one page's page table */
+static void dma_pte_clear_one(struct domain *domain, u64 addr)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ struct dma_pte *pte = NULL;
+
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+
+ /* get last level pte */
+ pte = dma_addr_level_pte(domain, addr, 1);
+
+ if (pte) {
+ dma_clear_pte(*pte);
+ iommu_flush_cache_entry(drhd->iommu, pte);
+
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ if (cap_caching_mode(iommu->cap))
+ {
+ iommu_flush_iotlb_psi(iommu, domain->domain_id, addr, 1, 0);
+ }
+ else if (cap_rwbf(iommu->cap))
+ iommu_flush_write_buffer(iommu);
+ }
+ }
+}
+
+/* clear last level ptes; a tlb flush should follow */
+static void dma_pte_clear_range(struct domain *domain, u64 start, u64 end)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(domain);
+ int addr_width = agaw_to_width(hd->agaw);
+
+ start &= (((u64)1) << addr_width) - 1;
+ end &= (((u64)1) << addr_width) - 1;
+    /* in case it's a partial page */
+ start = PAGE_ALIGN_4K(start);
+ end &= PAGE_MASK_4K;
+
+ /* we don't need lock here, nobody else touches the iova range */
+ while (start < end) {
+ dma_pte_clear_one(domain, start);
+ start += PAGE_SIZE_4K;
+ }
+}
+
+/* free page table pages; the last level ptes should already be cleared */
+// static void dma_pte_free_pagetable(struct domain *domain, u64 start, u64 end)
+void dma_pte_free_pagetable(struct domain *domain, u64 start, u64 end)
+{
+ struct acpi_drhd_unit *drhd;
+ struct hvm_iommu *hd = domain_hvm_iommu(domain);
+ struct iommu *iommu;
+ int addr_width = agaw_to_width(hd->agaw);
+ struct dma_pte *pte;
+ int total = agaw_to_level(hd->agaw);
+ int level;
+ u32 tmp;
+
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+ iommu = drhd->iommu;
+
+ start &= (((u64)1) << addr_width) - 1;
+ end &= (((u64)1) << addr_width) - 1;
+
+ /* we don't need lock here, nobody else touches the iova range */
+ level = 2;
+ while (level <= total) {
+ tmp = align_to_level(start, level);
+ if (tmp >= end || (tmp + level_size(level) > end))
+ return;
+
+ while (tmp < end) {
+ pte = dma_addr_level_pte(domain, tmp, level);
+ if (pte) {
+ free_xenheap_page((void *) maddr_to_virt(dma_pte_addr(*pte)));
+ dma_clear_pte(*pte);
+ iommu_flush_cache_entry(iommu, pte);
+ }
+ tmp += level_size(level);
+ }
+ level++;
+ }
+ /* free pgd */
+ if (start == 0 && end == ((((u64)1) << addr_width) - 1)) {
+ free_xenheap_page((void *)hd->pgd);
+ hd->pgd = NULL;
+ }
+}
+
+/* iommu handling */
+static int iommu_set_root_entry(struct iommu *iommu)
+{
+ void *addr;
+ u32 cmd, sts;
+ struct root_entry *root;
+ unsigned long flags;
+
+ if (iommu == NULL)
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "iommu_set_root_entry: iommu == NULL\n");
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (!iommu->root_entry) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ root = (struct root_entry *)alloc_xenheap_page();
+ memset((u8*)root, 0, PAGE_SIZE);
+ iommu_flush_cache_page(iommu, root);
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ if (!root && !iommu->root_entry) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return -ENOMEM;
+ }
+
+ if (!iommu->root_entry)
+ iommu->root_entry = root;
+ else /* somebody is fast */
+ free_xenheap_page((void *)root);
+ }
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ addr = iommu->root_entry;
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writeq(iommu->reg, DMAR_RTADDR_REG, virt_to_maddr(addr));
+ cmd = iommu->gcmd | DMA_GCMD_SRTP;
+ dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
+
+ /* Make sure hardware complete it */
+ while (1) {
+ sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+ if (sts & DMA_GSTS_RTPS)
+ break;
+ cpu_relax();
+ }
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ return 0;
+}
+
+static int iommu_enable_translation(struct iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "iommu_enable_translation: enabling vt-d translation\n");
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ iommu->gcmd |= DMA_GCMD_TE;
+ dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+ /* Make sure hardware complete it */
+ while (1) {
+ sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+ if (sts & DMA_GSTS_TES) {
+ break;
+ }
+ cpu_relax();
+ }
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+ return 0;
+}
+
+int iommu_disable_translation(struct iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ iommu->gcmd &= ~ DMA_GCMD_TE;
+ dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+ /* Make sure hardware complete it */
+ while(1) {
+ sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_TES))
+ break;
+ cpu_relax();
+ }
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+ return 0;
+}
+
+static struct iommu *vector_to_iommu[NR_VECTORS];
+static int iommu_page_fault_do_one(struct iommu *iommu, int type,
+ u8 fault_reason, u16 source_id, u32 addr)
+{
+ dprintk(XENLOG_WARNING VTDPREFIX,
+ "iommu_page_fault:%s: DEVICE %x:%x.%x addr %x REASON %x\n",
+ (type ? "DMA Read" : "DMA Write"),
+ (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), addr, fault_reason);
+
+ print_vtd_entries(current->domain, (source_id >> 8),(source_id & 0xff),
+ (addr >> PAGE_SHIFT));
+ return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+static void iommu_page_fault(int vector, void *dev_id,
+ struct cpu_user_regs *regs)
+{
+ struct iommu *iommu = dev_id;
+ int reg, fault_index;
+ u32 fault_status;
+ unsigned long flags;
+
+ dprintk(XENLOG_WARNING VTDPREFIX,
+ "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ /* FIXME: ignore advanced fault log */
+ if (!(fault_status & DMA_FSTS_PPF))
+ return;
+ fault_index = dma_fsts_fault_record_index(fault_status);
+ reg = cap_fault_reg_offset(iommu->cap);
+ while (1) {
+ u8 fault_reason;
+ u16 source_id;
+ u32 guest_addr;
+ int type;
+ u32 data;
+
+ /* highest 32 bits */
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ data = dmar_readl(iommu->reg, reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+ if (!(data & DMA_FRCD_F)) {
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+ break;
+ }
+
+ fault_reason = dma_frcd_fault_reason(data);
+ type = dma_frcd_type(data);
+
+ data = dmar_readl(iommu->reg, reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 8);
+ source_id = dma_frcd_source_id(data);
+
+ guest_addr = dmar_readq(iommu->reg, reg +
+ fault_index * PRIMARY_FAULT_REG_LEN);
+ guest_addr = dma_frcd_page_addr(guest_addr);
+ /* clear the fault */
+ dmar_writel(iommu->reg, reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ iommu_page_fault_do_one(iommu, type, fault_reason,
+ source_id, guest_addr);
+
+ fault_index++;
+ if (fault_index > cap_num_fault_regs(iommu->cap))
+ fault_index = 0;
+ }
+ /* clear primary fault overflow */
+ if (fault_status & DMA_FSTS_PFO) {
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+ }
+ return;
+}
+
+static void dma_msi_unmask(unsigned int vector)
+{
+ struct iommu *iommu = vector_to_iommu[vector];
+ unsigned long flags;
+
+ /* unmask it */
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_mask(unsigned int vector)
+{
+ unsigned long flags;
+ struct iommu *iommu = vector_to_iommu[vector];
+
+ /* mask it */
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static unsigned int dma_msi_startup(unsigned int vector)
+{
+ dma_msi_unmask(vector);
+ return 0;
+}
+
+static void dma_msi_end(unsigned int vector)
+{
+ dma_msi_unmask(vector);
+ ack_APIC_irq();
+}
+
+static void dma_msi_data_init(struct iommu *iommu, int vector)
+{
+ u32 msi_data = 0;
+ unsigned long flags;
+
+ /* Fixed, edge, assert mode. Follow MSI setting */
+ msi_data |= vector & 0xff;
+ msi_data |= 1 << 14;
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
+{
+ u64 msi_address;
+ unsigned long flags;
+
+ /* Physical, dedicated cpu. Follow MSI setting */
+ msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
+ msi_address |= MSI_PHYSICAL_MODE << 2;
+ msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
+ msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
+ dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
+{
+ struct iommu *iommu = vector_to_iommu[vector];
+ dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
+}
+
+static struct hw_interrupt_type dma_msi_type = {
+ .typename = "DMA_MSI",
+ .startup = dma_msi_startup,
+ .shutdown = dma_msi_mask,
+ .enable = dma_msi_unmask,
+ .disable = dma_msi_mask,
+ .ack = dma_msi_mask,
+ .end = dma_msi_end,
+ .set_affinity = dma_msi_set_affinity,
+};
+
+int iommu_set_interrupt(struct iommu *iommu)
+{
+ int vector, ret;
+ unsigned long flags;
+
+ vector = assign_irq_vector(AUTO_ASSIGN);
+ vector_to_iommu[vector] = iommu;
+
+    /* VT-d fault is an MSI, make irq == vector */
+ irq_vector[vector] = vector;
+ vector_irq[vector] = vector;
+
+ if (!vector) {
+ gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&irq_desc[vector].lock, flags);
+ irq_desc[vector].handler = &dma_msi_type;
+ spin_unlock_irqrestore(&irq_desc[vector].lock, flags);
+ set_intr_gate(vector, interrupt[vector]);
+ ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu);
+ if (ret)
+ gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
+ return vector;
+}
+
+struct iommu *iommu_alloc(void *hw_data)
+{
+ struct acpi_drhd_unit *drhd = (struct acpi_drhd_unit *) hw_data;
+ struct iommu *iommu;
+
+ if (nr_iommus > MAX_IOMMUS) {
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
+ return NULL;
+ }
+
+ iommu = xmalloc(struct iommu);
+ if (!iommu)
+ return NULL;
+ memset(iommu, 0, sizeof(struct iommu));
+
+ set_fixmap_nocache(FIX_IOMMU_REGS_BASE_0 + nr_iommus, drhd->address);
+ iommu->reg = (void *) fix_to_virt(FIX_IOMMU_REGS_BASE_0 + nr_iommus);
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "iommu_alloc: iommu->reg = %p drhd->address = %lx\n",
+ iommu->reg, drhd->address);
+ nr_iommus++;
+
+ if (!iommu->reg) {
+        printk(KERN_ERR VTDPREFIX "IOMMU: can't map the region\n");
+ goto error;
+ }
+
+ iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
+ iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
+
+ spin_lock_init(&iommu->lock);
+ spin_lock_init(&iommu->register_lock);
+
+ drhd->iommu = iommu;
+ return iommu;
+error:
+ xfree(iommu);
+ return NULL;
+}
+
+static void free_iommu(struct iommu *iommu)
+{
+ if (!iommu)
+ return;
+ if (iommu->root_entry)
+ free_xenheap_page((void *)iommu->root_entry);
+ if (iommu->reg)
+ iounmap(iommu->reg);
+ free_irq(iommu->vector);
+ xfree(iommu);
+}
+
+#define guestwidth_to_adjustwidth(gaw) ({ \
+ int agaw; \
+ int r = (gaw - 12) % 9; \
+ if (r == 0) \
+ agaw = gaw; \
+ else \
+ agaw = gaw + 9 - r; \
+ if (agaw > 64) \
+ agaw = 64; \
+ agaw; })
+int iommu_domain_init(struct domain *domain)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(domain);
+ struct iommu *iommu = NULL;
+ int guest_width = DEFAULT_DOMAIN_ADDRESS_WIDTH;
+ int adjust_width, agaw;
+ unsigned long sagaw;
+ struct acpi_drhd_unit *drhd;
+
+ if (list_empty(&acpi_drhd_units))
+ return 0;
+ spin_lock_init(&hd->mapping_lock);
+ spin_lock_init(&hd->iommu_list_lock);
+ INIT_LIST_HEAD(&hd->pdev_list);
+
+ for_each_drhd_unit(drhd) {
+ if (drhd->iommu)
+ iommu = drhd->iommu;
+ else
+ iommu = iommu_alloc(drhd);
+ }
+
+ /* calculate AGAW */
+ if (guest_width > cap_mgaw(iommu->cap))
+ guest_width = cap_mgaw(iommu->cap);
+ adjust_width = guestwidth_to_adjustwidth(guest_width);
+ agaw = width_to_agaw(adjust_width);
+ /* FIXME: hardware doesn't support it, choose a bigger one? */
+ sagaw = cap_sagaw(iommu->cap);
+ if (!test_bit(agaw, &sagaw)) {
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "IOMMU: hardware doesn't support the agaw\n");
+ agaw = find_next_bit(&sagaw, 5, agaw);
+ if (agaw >= 5)
+ return -ENODEV;
+ }
+ hd->agaw = agaw;
+ return 0;
+}
+
+static int domain_context_mapping_one(
+ struct domain *domain,
+ struct iommu *iommu,
+ u8 bus, u8 devfn)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(domain);
+ struct context_entry *context;
+ unsigned long flags;
+ int ret = 0;
+
+ context = device_to_context_entry(iommu, bus, devfn);
+ if (!context) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_mapping_one:context == NULL:bdf = %x:%x:%x \n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ return -ENOMEM;
+ }
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (context_present(*context)) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_mapping_one:context present:bdf=%x:%x:%x\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ return 0;
+ }
+
+#ifdef VTD_DEBUG
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "context_mapping_one_1-%x:%x:%x-*context = %lx %lx\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn), context->hi, context->lo);
+#endif
+
+ /*
+ * domain_id 0 is not valid on Intel's IOMMU, force domain_id to
+ * be 1 based as required by intel's iommu hw.
+ */
+ context_set_domain_id(*context, domain->domain_id);
+ context_set_address_width(*context, hd->agaw);
+
+ if (ecap_pass_thru(iommu->ecap))
+ context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+ else {
+ context_set_address_root(*context, virt_to_maddr(hd->pgd));
+ context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+ }
+
+ context_set_fault_enable(*context);
+ context_set_present(*context);
+ iommu_flush_cache_entry(iommu, context);
+
+#ifdef VTD_DEBUG
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "context_mapping_one_2-%x:%x:%x-*context=%lx %lx hd->pgd = %p\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ context->hi, context->lo, hd->pgd);
+#endif
+
+ if (iommu_flush_context_device(iommu, domain->domain_id,
+ (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
+ iommu_flush_write_buffer(iommu);
+ else
+ iommu_flush_iotlb_dsi(iommu, domain->domain_id, 0);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return ret;
+}
+
+static int __pci_find_next_cap(u8 bus, unsigned int devfn, u8 pos, int cap)
+{
+ u8 id;
+ int ttl = 48;
+
+ while (ttl--) {
+ pos = read_pci_config_byte(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos);
+ if (pos < 0x40)
+ break;
+ pos &= ~3;
+ id = read_pci_config_byte(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ pos + PCI_CAP_LIST_ID);
+
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+ pos += PCI_CAP_LIST_NEXT;
+ }
+ return 0;
+}
+
+#define PCI_BASE_CLASS_BRIDGE 0x06
+#define PCI_CLASS_BRIDGE_PCI 0x0604
+
+#define DEV_TYPE_PCIe_ENDPOINT 1
+#define DEV_TYPE_PCI_BRIDGE 2
+#define DEV_TYPE_PCI 3
+
+int pdev_type(struct pci_dev *dev)
+{
+ u16 class_device;
+ u16 status;
+
+ class_device = read_pci_config_16(dev->bus, PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn), PCI_CLASS_DEVICE);
+ if (class_device == PCI_CLASS_BRIDGE_PCI)
+ return DEV_TYPE_PCI_BRIDGE;
+
+ status = read_pci_config_16(dev->bus, PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn), PCI_STATUS);
+
+ if (!(status & PCI_STATUS_CAP_LIST))
+ return DEV_TYPE_PCI;
+
+    if (__pci_find_next_cap(dev->bus, dev->devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP))
+ return DEV_TYPE_PCIe_ENDPOINT;
+
+ return DEV_TYPE_PCI;
+}
+
+#define MAX_BUSES 256
+struct pci_dev bus2bridge[MAX_BUSES];
+
+static int domain_context_mapping(
+ struct domain *domain,
+ struct iommu *iommu,
+ struct pci_dev *pdev)
+{
+ int ret = 0;
+ int dev, func, sec_bus, sub_bus;
+ u32 type;
+
+ type = pdev_type(pdev);
+ if (type == DEV_TYPE_PCI_BRIDGE) {
+ sec_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), PCI_SECONDARY_BUS);
+
+ if (bus2bridge[sec_bus].bus == 0) {
+ bus2bridge[sec_bus].bus = pdev->bus;
+ bus2bridge[sec_bus].devfn = pdev->devfn;
+ }
+
+ sub_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
+
+ if (sec_bus != sub_bus) {
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "context_mapping: nested PCI bridge not supported\n");
+ dprintk(XENLOG_INFO VTDPREFIX,
+ " bdf = %x:%x:%x sec_bus = %x sub_bus = %x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn),
+ sec_bus, sub_bus);
+ }
+ }
+
+ if (type == DEV_TYPE_PCIe_ENDPOINT) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_mapping:PCIe : bdf = %x:%x:%x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ ret = domain_context_mapping_one(domain, iommu,
+ (u8)(pdev->bus), (u8) (pdev->devfn));
+ }
+
+ /* PCI devices */
+ if (type == DEV_TYPE_PCI) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_mapping:PCI: bdf = %x:%x:%x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+ if (pdev->bus == 0)
+ ret = domain_context_mapping_one(domain, iommu,
+ (u8)(pdev->bus), (u8) (pdev->devfn));
+ else {
+ if (bus2bridge[pdev->bus].bus != 0)
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "domain_context_mapping:bus2bridge[pdev->bus].bus==0\n");
+
+ ret = domain_context_mapping_one(domain, iommu,
+ (u8)(bus2bridge[pdev->bus].bus),
+ (u8)(bus2bridge[pdev->bus].devfn));
+
+ /* now map everything behind the PCI bridge */
+ for (dev = 0; dev < 32; dev++) {
+ for (func = 0; func < 8; func++) {
+ ret = domain_context_mapping_one(domain, iommu,
+ pdev->bus, (u8)PCI_DEVFN(dev, func));
+ if (ret)
+ return ret;
+ }
+ }
+ }
+ }
+ return ret;
+}
+
+static int domain_context_unmap_one(
+ struct domain *domain,
+ struct iommu *iommu,
+ u8 bus, u8 devfn)
+{
+ struct context_entry *context;
+ unsigned long flags;
+
+ context = device_to_context_entry(iommu, bus, devfn);
+ if (!context) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_unmap_one-%x:%x:%x- context == NULL:return\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ return -ENOMEM;
+ }
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (!context_present(*context)) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_unmap_one-%x:%x:%x- context NOT present:return\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ return 0;
+ }
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_unmap_one_1:bdf = %x:%x:%x\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ context_clear_present(*context);
+ context_clear_entry(*context);
+ iommu_flush_cache_entry(iommu, context);
+ iommu_flush_context_global(iommu, 0);
+ iommu_flush_iotlb_global(iommu, 0);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_unmap_one_2:bdf = %x:%x:%x\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ return 0;
+}
+
+static int domain_context_unmap(
+ struct domain *domain,
+ struct iommu *iommu,
+ struct pci_dev *pdev)
+{
+ int ret = 0;
+ int dev, func, sec_bus, sub_bus;
+ u32 type;
+
+ type = pdev_type(pdev);
+ if (type == DEV_TYPE_PCI_BRIDGE) {
+ sec_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), PCI_SECONDARY_BUS);
+ sub_bus = read_pci_config_byte(pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
+
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_unmap:BRIDGE:%x:%x:%x sec_bus=%x sub_bus=%x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), sec_bus, sub_bus);
+ }
+
+ if (type == DEV_TYPE_PCIe_ENDPOINT) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_unmap:PCIe : bdf = %x:%x:%x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ ret = domain_context_unmap_one(domain, iommu,
+ (u8)(pdev->bus), (u8) (pdev->devfn));
+ }
+
+ /* PCI devices */
+ if (type == DEV_TYPE_PCI) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "domain_context_unmap:PCI: bdf = %x:%x:%x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ if (pdev->bus == 0)
+ ret = domain_context_unmap_one(domain, iommu,
+ (u8)(pdev->bus), (u8) (pdev->devfn));
+ else {
+ if (bus2bridge[pdev->bus].bus != 0)
+ gdprintk(XENLOG_INFO VTDPREFIX,
+                    "domain_context_unmap:bus2bridge[pdev->bus].bus==0\n");
+
+ ret = domain_context_unmap_one(domain, iommu,
+ (u8)(bus2bridge[pdev->bus].bus),
+ (u8)(bus2bridge[pdev->bus].devfn));
+
+            /* now unmap everything behind the PCI bridge */
+ for (dev = 0; dev < 32; dev++) {
+ for (func = 0; func < 8; func++) {
+ ret = domain_context_unmap_one(domain, iommu,
+ pdev->bus, (u8)PCI_DEVFN(dev, func));
+ if (ret)
+ return ret;
+ }
+ }
+ }
+ }
+ return ret;
+}
+
+void reassign_device_ownership(
+ struct domain *source,
+ struct domain *target,
+ u8 bus, u8 devfn)
+{
+ struct hvm_iommu *source_hd = domain_hvm_iommu(source);
+ struct hvm_iommu *target_hd = domain_hvm_iommu(target);
+ struct pci_dev *pdev;
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int status;
+ unsigned long flags;
+
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "reassign_device-%x:%x:%x- source = %d target = %d\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ source->domain_id, target->domain_id);
+
+ for_each_pdev(source, pdev) {
+ if ( (pdev->bus != bus) || (pdev->devfn != devfn) )
+ continue;
+
+ pdev->bus = bus;
+ pdev->devfn = devfn;
+ drhd = acpi_find_matched_drhd_unit(pdev);
+ iommu = drhd->iommu;
+ domain_context_unmap(source, iommu, pdev);
+
+ /*
+ * move pci device from the source domain to target domain.
+ */
+ spin_lock_irqsave(&source_hd->iommu_list_lock, flags);
+ spin_lock_irqsave(&target_hd->iommu_list_lock, flags);
+ list_move(&pdev->list, &target_hd->pdev_list);
+ spin_unlock_irqrestore(&target_hd->iommu_list_lock, flags);
+ spin_unlock_irqrestore(&source_hd->iommu_list_lock, flags);
+
+ status = domain_context_mapping(target, iommu, pdev);
+ if (status != 0)
+ gdprintk(XENLOG_ERR VTDPREFIX, "domain_context_mapping failed\n");
+
+ /*
+ * We are done.
+ */
+ break;
+ }
+}
+
+void return_devices_to_dom0(struct domain *d)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(d);
+ struct pci_dev *pdev;
+
+ while (!list_empty(&hd->pdev_list)) {
+ pdev = list_entry(hd->pdev_list.next, typeof(*pdev), list);
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "return_devices_to_dom0: bdf = %x:%x:%x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ reassign_device_ownership(d, dom0, pdev->bus, pdev->devfn);
+ }
+
+#ifdef VTD_DEBUG
+ for_each_pdev(dom0, pdev) {
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "return_devices_to_dom0:%x: bdf = %x:%x:%x\n",
+ dom0->domain_id, pdev->bus,
+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ }
+#endif
+}
+
+void iommu_domain_teardown(struct domain *d)
+{
+ if (list_empty(&acpi_drhd_units))
+ return;
+
+#if CONFIG_PAGING_LEVELS == 3
+ {
+ struct hvm_iommu *hd = domain_hvm_iommu(d);
+ int level = agaw_to_level(hd->agaw);
+ struct dma_pte *pgd = NULL;
+
+ switch (level)
+ {
+ case VTD_PAGE_TABLE_LEVEL_3:
+ if ( hd->pgd )
+ free_xenheap_page((void *)hd->pgd);
+ break;
+ case VTD_PAGE_TABLE_LEVEL_4:
+ if ( hd->pgd )
+ {
+ pgd = hd->pgd;
+ if ( pgd[0].val != 0 )
+ free_xenheap_page((void*)maddr_to_virt(
+ dma_pte_addr(pgd[0])));
+ }
+ break;
+ default:
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "Unsupported p2m table sharing level!\n");
+ break;
+ }
+ }
+#endif
+ return_devices_to_dom0(d);
+}
+
+static int domain_context_mapped(struct domain *domain, struct pci_dev *pdev)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int ret;
+
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ ret = device_context_mapped(iommu, pdev->bus, pdev->devfn);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int iommu_map_page(struct domain *d, paddr_t gfn, paddr_t mfn)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ struct dma_pte *pte = NULL;
+
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+ iommu = drhd->iommu;
+
+ /* do nothing if dom0 and iommu supports pass thru */
+ if (ecap_pass_thru(iommu->ecap) && (d->domain_id == 0))
+ return 0;
+
+ pte = addr_to_dma_pte(d, gfn << PAGE_SHIFT_4K);
+ if (!pte)
+ return -ENOMEM;
+ dma_set_pte_addr(*pte, mfn << PAGE_SHIFT_4K);
+ dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
+ iommu_flush_cache_entry(iommu, pte);
+
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ if (cap_caching_mode(iommu->cap))
+ iommu_flush_iotlb_psi(iommu, d->domain_id,
+ gfn << PAGE_SHIFT_4K, 1, 0);
+ else if (cap_rwbf(iommu->cap))
+ iommu_flush_write_buffer(iommu);
+ }
+ return 0;
+}
+
+int iommu_unmap_page(struct domain *d, dma_addr_t gfn)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ struct dma_pte *pte = NULL;
+
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+ iommu = drhd->iommu;
+
+ /* do nothing if dom0 and iommu supports pass thru */
+ if (ecap_pass_thru(iommu->ecap) && (d->domain_id == 0))
+ return 0;
+
+ /* get last level pte */
+ pte = dma_addr_level_pte(d, gfn << PAGE_SHIFT_4K, 1);
+ dma_pte_clear_one(d, gfn << PAGE_SHIFT_4K);
+
+ return 0;
+}
+
+int iommu_page_mapping(struct domain *domain, dma_addr_t iova,
+ void *hpa, size_t size, int prot)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ unsigned long start_pfn, end_pfn;
+ struct dma_pte *pte = NULL;
+ int index;
+
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+ iommu = drhd->iommu;
+ if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+ return -EINVAL;
+ iova = (iova >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K;
+ start_pfn = (unsigned long)(((unsigned long) hpa) >> PAGE_SHIFT_4K);
+ end_pfn = (unsigned long)
+ ((PAGE_ALIGN_4K(((unsigned long)hpa) + size)) >> PAGE_SHIFT_4K);
+ index = 0;
+ while (start_pfn < end_pfn) {
+ pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
+ if (!pte)
+ return -ENOMEM;
+ dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
+ dma_set_pte_prot(*pte, prot);
+ iommu_flush_cache_entry(iommu, pte);
+ start_pfn++;
+ index++;
+ }
+
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ if (cap_caching_mode(iommu->cap))
+ iommu_flush_iotlb_psi(iommu, domain->domain_id, iova, size, 0);
+ else if (cap_rwbf(iommu->cap))
+ iommu_flush_write_buffer(iommu);
+ }
+ return 0;
+}
+
+int iommu_page_unmapping(struct domain *domain, dma_addr_t addr, size_t size)
+{
+ struct dma_pte *pte = NULL;
+
+ /* get last level pte */
+ pte = dma_addr_level_pte(domain, addr, 1);
+ dma_pte_clear_range(domain, addr, addr + size);
+
+ return 0;
+}
+
+void iommu_flush(struct domain *d, dma_addr_t gfn, u64 *p2m_entry)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu = NULL;
+ struct dma_pte *pte = (struct dma_pte *) p2m_entry;
+
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ if (cap_caching_mode(iommu->cap))
+ iommu_flush_iotlb_psi(iommu, d->domain_id,
+ gfn << PAGE_SHIFT_4K, 1, 0);
+ else if (cap_rwbf(iommu->cap))
+ iommu_flush_write_buffer(iommu);
+ }
+ iommu_flush_cache_entry(iommu, pte);
+}
+
+int
+prepare_device(struct domain *domain, struct pci_dev dev)
+{
+ return 0;
+}
+
+static int iommu_prepare_rmrr_dev(
+ struct domain *d,
+ struct acpi_rmrr_unit *rmrr,
+ struct pci_dev *pdev)
+{
+ struct acpi_drhd_unit *drhd;
+ unsigned long size;
+ int ret;
+
+ /* page table init */
+ size = rmrr->end_address - rmrr->base_address + 1;
+ ret = iommu_page_mapping(d, rmrr->base_address,
+ (void *)rmrr->base_address, size,
+ DMA_PTE_READ|DMA_PTE_WRITE);
+ if (ret)
+ return ret;
+
+ if (domain_context_mapped(d, pdev) == 0) {
+ drhd = acpi_find_matched_drhd_unit(pdev);
+ ret = domain_context_mapping(d, drhd->iommu, pdev);
+ if (!ret)
+ return 0;
+ }
+ return ret;
+}
+
+void __init setup_dom0_devices(void)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(dom0);
+ struct acpi_drhd_unit *drhd;
+ struct pci_dev *pdev;
+ int bus, dev, func;
+ u32 l;
+ u8 hdr_type;
+ int ret;
+
+#ifdef DEBUG_VTD_CONTEXT_ENTRY
+ for (bus = 0; bus < 256; bus++) {
+ for (dev = 0; dev < 32; dev++) {
+ for (func = 0; func < 8; func++) {
+ struct context_entry *context;
+ struct pci_dev device;
+
+ device.bus = bus;
+ device.devfn = PCI_DEVFN(dev, func);
+ drhd = acpi_find_matched_drhd_unit(&device);
+ context = device_to_context_entry(drhd->iommu,
+ bus, PCI_DEVFN(dev, func));
+ if ((context->lo != 0) || (context->hi != 0))
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "setup_dom0_devices-%x:%x:%x- context not 0\n",
+ bus, dev, func);
+ }
+ }
+ }
+#endif
+
+ for (bus = 0; bus < 256; bus++) {
+ for (dev = 0; dev < 32; dev++) {
+ for (func = 0; func < 8; func++) {
+ l = read_pci_config(bus, dev, func, PCI_VENDOR_ID);
+ /* some broken boards return 0 or ~0 if a slot is empty: */
+ if (l == 0xffffffff || l == 0x00000000 ||
+ l == 0x0000ffff || l == 0xffff0000)
+ continue;
+ pdev = xmalloc(struct pci_dev);
+ pdev->bus = bus;
+ pdev->devfn = PCI_DEVFN(dev, func);
+ list_add_tail(&pdev->list, &hd->pdev_list);
+
+ drhd = acpi_find_matched_drhd_unit(pdev);
+ ret = domain_context_mapping(dom0, drhd->iommu, pdev);
+ if (ret != 0)
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "domain_context_mapping failed\n");
+
+ hdr_type = read_pci_config(bus, dev, func, PCI_HEADER_TYPE);
+ // if ((hdr_type & 0x8) == 0)
+ // break;
+ }
+ }
+ }
+ for_each_pdev(dom0, pdev) {
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "setup_dom0_devices: bdf = %x:%x:%x\n",
+ pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ }
+}
+
+void clear_fault_bit(struct iommu *iommu)
+{
+ u64 val;
+
+ val = dmar_readq(
+ iommu->reg,
+ cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
+ dmar_writeq(
+ iommu->reg,
+ cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
+ val);
+ dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
+}
+
+/*
+ * Called from the ACPI discovery code. Once all DMARs and RMRRs have been
+ * scanned, run through and initialize as much of the hardware as necessary.
+ */
+int vtd_enable = 1;
+static void setup_vtd_enable(char *s)
+{
+ if ( !strcmp(s, "0") )
+ vtd_enable = 0;
+ else if ( !strcmp(s, "1") )
+ vtd_enable = 1;
+ else
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "Unknown vtd_enable value specified: '%s'\n", s);
+ dprintk(XENLOG_INFO VTDPREFIX, "vtd_enable = %x\n", vtd_enable);
+}
+custom_param("vtd", setup_vtd_enable);
+
+static int init_vtd_hw(void)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int ret;
+
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ ret = iommu_set_root_entry(iommu);
+ if (ret) {
+ gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+static int enable_vtd_translation(void)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int vector = 0;
+
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ vector = iommu_set_interrupt(iommu);
+ dma_msi_data_init(iommu, vector);
+ dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
+ iommu->vector = vector;
+ clear_fault_bit(iommu);
+ if (vtd_enable && iommu_enable_translation(iommu))
+ return -EIO;
+ }
+ return 0;
+}
+
+static void setup_dom0_rmrr(void)
+{
+ struct acpi_rmrr_unit *rmrr;
+ struct pci_dev *pdev;
+ int ret;
+
+ for_each_rmrr_device(rmrr, pdev)
+ ret = iommu_prepare_rmrr_dev(dom0, rmrr, pdev);
+ if (ret)
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "IOMMU: mapping reserved region failed\n");
+ end_for_each_rmrr_device(rmrr, pdev)
+}
+
+int iommu_setup(void)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(dom0);
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+
+ if (list_empty(&acpi_drhd_units))
+ return 0;
+
+ INIT_LIST_HEAD(&hd->pdev_list);
+
+ /* start from scratch */
+ flush_all();
+
+ /* setup clflush size */
+ x86_clflush_size = ((cpuid_ebx(1) >> 8) & 0xff) * 8;
+
+ /*
+ * allocate IO page directory page for the domain.
+ */
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+ iommu = drhd->iommu;
+
+ hd->pgd = (struct dma_pte *)alloc_xenheap_page();
+ memset((u8*)hd->pgd, 0, PAGE_SIZE);
+
+ if (init_vtd_hw())
+ goto error;
+ setup_dom0_devices();
+ setup_dom0_rmrr();
+ if (enable_vtd_translation())
+ goto error;
+
+ return 0;
+
+error:
+ printk("iommu_setup() failed\n");
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ free_iommu(iommu);
+ }
+ return -EIO;
+}
+
+int assign_device(struct domain *d, u8 bus, u8 devfn)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(d);
+ struct acpi_rmrr_unit *rmrr;
+ struct pci_dev *pdev;
+ int ret = 0;
+
+ if (list_empty(&acpi_drhd_units))
+ return ret;
+
+ dprintk(XENLOG_INFO VTDPREFIX,
+ "assign_device: bus = %x dev = %x func = %x\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ reassign_device_ownership(dom0, d, bus, devfn);
+
+ /* set up RMRR identity mapping just once per domain */
+ if (list_empty(&hd->pdev_list))
+ for_each_rmrr_device(rmrr, pdev)
+ ret = iommu_prepare_rmrr_dev(d, rmrr, pdev);
+ if (ret)
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "IOMMU: mapping reserved region failed\n");
+ end_for_each_rmrr_device(rmrr, pdev)
+ return ret;
+}
+
+void iommu_set_pgd(struct domain *d)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(d);
+ unsigned long p2m_table;
+
+ if (hd->pgd) {
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "iommu_set_pgd_1: hd->pgd = %p\n", hd->pgd);
+ hd->pgd = NULL;
+ }
+ p2m_table = mfn_x(pagetable_get_mfn(d->arch.phys_table));
+
+#if CONFIG_PAGING_LEVELS == 3
+ if ( !hd->pgd )
+ {
+ int level = agaw_to_level(hd->agaw);
+ struct dma_pte *pmd = NULL;
+ struct dma_pte *pgd = NULL;
+ struct dma_pte *pte = NULL;
+ l3_pgentry_t *l3e;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&hd->mapping_lock, flags);
+ if (!hd->pgd) {
+ pgd = (struct dma_pte *)alloc_xenheap_page();
+ memset((u8*)pgd, 0, PAGE_SIZE);
+ if (!hd->pgd)
+ hd->pgd = pgd;
+ else /* somebody is fast */
+ free_xenheap_page((void *) pgd);
+ }
+
+ l3e = map_domain_page(p2m_table);
+ switch(level)
+ {
+ case VTD_PAGE_TABLE_LEVEL_3: /* Weybridge */
+ /* We only support 8 entries for the PAE L3 p2m table */
+ for ( i = 0; i < 8 ; i++ )
+ {
+ /* Don't create new L2 entry, use ones from p2m table */
+ pgd[i].val = l3e[i].l3 | _PAGE_PRESENT | _PAGE_RW;
+ }
+ break;
+
+ case VTD_PAGE_TABLE_LEVEL_4: /* Stoakley */
+ /* We allocate one more page for the top vtd page table. */
+ pmd = (struct dma_pte *)alloc_xenheap_page();
+ memset((u8*)pmd, 0, PAGE_SIZE);
+ pte = &pgd[0];
+ dma_set_pte_addr(*pte, virt_to_maddr(pmd));
+ dma_set_pte_readable(*pte);
+ dma_set_pte_writable(*pte);
+
+ for ( i = 0; i < 8; i++ )
+ {
+ /* Don't create new L2 entry, use ones from p2m table */
+ pmd[i].val = l3e[i].l3 | _PAGE_PRESENT | _PAGE_RW;
+ }
+ break;
+ default:
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "iommu_set_pgd:Unsupported p2m table sharing level!\n");
+ break;
+ }
+ unmap_domain_page(l3e);
+ spin_unlock_irqrestore(&hd->mapping_lock, flags);
+ }
+#elif CONFIG_PAGING_LEVELS == 4
+ if ( !hd->pgd )
+ {
+ int level = agaw_to_level(hd->agaw);
+ l3_pgentry_t *l3e;
+ mfn_t pgd_mfn;
+
+ switch (level)
+ {
+ case VTD_PAGE_TABLE_LEVEL_3:
+ l3e = map_domain_page(p2m_table);
+ if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ {
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "iommu_set_pgd: second level wasn't there\n");
+ unmap_domain_page(l3e);
+ return;
+ }
+ pgd_mfn = _mfn(l3e_get_pfn(*l3e));
+ unmap_domain_page(l3e);
+ hd->pgd = maddr_to_virt(pagetable_get_paddr(
+ pagetable_from_mfn(pgd_mfn)));
+ break;
+
+ case VTD_PAGE_TABLE_LEVEL_4:
+ pgd_mfn = _mfn(p2m_table);
+ hd->pgd = maddr_to_virt(pagetable_get_paddr(
+ pagetable_from_mfn(pgd_mfn)));
+ break;
+ default:
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "iommu_set_pgd:Unsupported p2m table sharing level!\n");
+ break;
+ }
+ }
+#endif
+ gdprintk(XENLOG_INFO VTDPREFIX,
+ "iommu_set_pgd: hd->pgd = %p\n", hd->pgd);
+}
+
+
+u8 iommu_state[MAX_IOMMU_REGS * MAX_IOMMUS];
+int iommu_suspend(void)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int i = 0;
+
+ if (!vtd_enable)
+ return 0;
+
+ flush_all();
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ iommu_state[DMAR_RTADDR_REG * i] =
+ (u64) dmar_readq(iommu->reg, DMAR_RTADDR_REG);
+ iommu_state[DMAR_FECTL_REG * i] =
+ (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
+ iommu_state[DMAR_FEDATA_REG * i] =
+ (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
+ iommu_state[DMAR_FEADDR_REG * i] =
+ (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
+ iommu_state[DMAR_FEUADDR_REG * i] =
+ (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
+ iommu_state[DMAR_PLMBASE_REG * i] =
+ (u32) dmar_readl(iommu->reg, DMAR_PLMBASE_REG);
+ iommu_state[DMAR_PLMLIMIT_REG * i] =
+ (u32) dmar_readl(iommu->reg, DMAR_PLMLIMIT_REG);
+ iommu_state[DMAR_PHMBASE_REG * i] =
+ (u64) dmar_readq(iommu->reg, DMAR_PHMBASE_REG);
+ iommu_state[DMAR_PHMLIMIT_REG * i] =
+ (u64) dmar_readq(iommu->reg, DMAR_PHMLIMIT_REG);
+ i++;
+ }
+
+ return 0;
+}
+
+int iommu_resume(void)
+{
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ int i = 0;
+
+ if (!vtd_enable)
+ return 0;
+
+ flush_all();
+
+ init_vtd_hw();
+ for_each_drhd_unit(drhd) {
+ iommu = drhd->iommu;
+ dmar_writeq( iommu->reg, DMAR_RTADDR_REG,
+ (u64) iommu_state[DMAR_RTADDR_REG * i]);
+ dmar_writel(iommu->reg, DMAR_FECTL_REG,
+ (u32) iommu_state[DMAR_FECTL_REG * i]);
+ dmar_writel(iommu->reg, DMAR_FEDATA_REG,
+ (u32) iommu_state[DMAR_FEDATA_REG * i]);
+ dmar_writel(iommu->reg, DMAR_FEADDR_REG,
+ (u32) iommu_state[DMAR_FEADDR_REG * i]);
+ dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
+ (u32) iommu_state[DMAR_FEUADDR_REG * i]);
+ dmar_writel(iommu->reg, DMAR_PLMBASE_REG,
+ (u32) iommu_state[DMAR_PLMBASE_REG * i]);
+ dmar_writel(iommu->reg, DMAR_PLMLIMIT_REG,
+ (u32) iommu_state[DMAR_PLMLIMIT_REG * i]);
+ dmar_writeq(iommu->reg, DMAR_PHMBASE_REG,
+ (u64) iommu_state[DMAR_PHMBASE_REG * i]);
+ dmar_writeq(iommu->reg, DMAR_PHMLIMIT_REG,
+ (u64) iommu_state[DMAR_PHMLIMIT_REG * i]);
+
+ if (iommu_enable_translation(iommu))
+ return -EIO;
+ i++;
+ }
+ return 0;
+}
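
(For reference, and not part of the patch: the gfn -> IOVA -> per-level index arithmetic that iommu_map_page() and print_vtd_entries() rely on can be exercised with a small standalone C sketch. PAGE_SHIFT_4K and the 9-bit-per-level stride are taken from the code above; everything else is illustrative.)

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT_4K 12      /* 4KB pages, as used by iommu_map_page() */
#define LEVEL_STRIDE  9       /* 512 entries per VT-d page-table level */

static unsigned int level_index(uint64_t gfn, int level)
{
    /* level 1 is the leaf table; each level above consumes 9 more bits */
    return (gfn >> ((level - 1) * LEVEL_STRIDE)) & 0x1ff;
}

int main(void)
{
    uint64_t gfn = 0x12345;                   /* arbitrary example frame */
    uint64_t iova = gfn << PAGE_SHIFT_4K;     /* address handed to addr_to_dma_pte() */

    printf("iova = %#llx\n", (unsigned long long)iova);
    printf("l3/l2/l1 index = %#x/%#x/%#x\n",
           level_index(gfn, 3), level_index(gfn, 2), level_index(gfn, 1));
    return 0;
}
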
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/io.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/io.c Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx>
+ * Copyright (C) Xiaohui Xin <xiaohui.xin@xxxxxxxxx>
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+#include <xen/event.h>
+#include <xen/hypercall.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <asm/paging.h>
+#include <asm/shadow.h>
+#include <asm/p2m.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/support.h>
+#include <asm/hvm/vpt.h>
+#include <asm/hvm/vpic.h>
+#include <asm/hvm/vlapic.h>
+#include <public/sched.h>
+#include <xen/iocap.h>
+#include <public/hvm/ioreq.h>
+
+int hvm_do_IRQ_dpci(struct domain *d, unsigned int mirq)
+{
+ uint32_t device, intx;
+ uint32_t link, isa_irq;
+ struct hvm_irq *hvm_irq;
+
+ if (!vtd_enabled || (d == dom0))
+ return 0;
+
+ if (d->arch.hvm_domain.irq.mirq[mirq].valid)
+ {
+ device = d->arch.hvm_domain.irq.mirq[mirq].device;
+ intx = d->arch.hvm_domain.irq.mirq[mirq].intx;
+ link = hvm_pci_intx_link(device, intx);
+ hvm_irq = &d->arch.hvm_domain.irq;
+ isa_irq = hvm_irq->pci_link.route[link];
+
+ if ( !d->arch.hvm_domain.irq.girq[isa_irq].valid )
+ {
+ d->arch.hvm_domain.irq.girq[isa_irq].valid = 1;
+ d->arch.hvm_domain.irq.girq[isa_irq].device = device;
+ d->arch.hvm_domain.irq.girq[isa_irq].intx = intx;
+ d->arch.hvm_domain.irq.girq[isa_irq].machine_gsi = mirq;
+ }
+
+ if ( !test_and_set_bit(mirq, d->arch.hvm_domain.irq.dirq_mask) )
+ {
+ vcpu_kick(d->vcpu[0]);
+ return 1;
+ }
+ else
+ dprintk(XENLOG_INFO, "Want to pending mirq, but failed\n");
+ }
+ return 0;
+}
+
+void hvm_dpci_eoi(unsigned int guest_gsi, union vioapic_redir_entry *ent)
+{
+ struct domain *d = current->domain;
+ uint32_t device, intx, machine_gsi;
+ irq_desc_t *desc;
+
+ if (d->arch.hvm_domain.irq.girq[guest_gsi].valid)
+ {
+ device = d->arch.hvm_domain.irq.girq[guest_gsi].device;
+ intx = d->arch.hvm_domain.irq.girq[guest_gsi].intx;
+ machine_gsi = d->arch.hvm_domain.irq.girq[guest_gsi].machine_gsi;
+ gdprintk(XENLOG_INFO, "hvm_dpci_eoi:: device %x intx %x\n",
+ device, intx);
+ hvm_pci_intx_deassert(d, device, intx);
+ if ( (ent == NULL) || (ent && ent->fields.mask == 0) ) {
+ desc = &irq_desc[irq_to_vector(machine_gsi)];
+ desc->handler->end(irq_to_vector(machine_gsi));
+ }
+ }
+}
+
+int release_devices(struct domain *d)
+{
+ struct hvm_domain *hd = &d->arch.hvm_domain;
+ uint32_t i;
+ int ret = 0;
+
+ if (!vtd_enabled)
+ return ret;
+
+ /* unbind irq */
+ for (i = 0; i < NR_IRQS; i++) {
+ if (hd->irq.mirq[i].valid)
+ ret = pirq_guest_unbind(d, i);
+ }
+ iommu_domain_teardown(d);
+ return ret;
+}
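
(A toy model, not part of the patch, of the mirq-to-guest-GSI bookkeeping in hvm_do_IRQ_dpci() above. hvm_pci_intx_link() is assumed here to fold device and pin as (device + intx) & 3, and the structures are simplified stand-ins for the hvm_irq fields.)

/* Toy model only -- not part of the patch. */
#include <stdio.h>

struct toy_mirq { int valid, device, intx; };
struct toy_girq { int valid, device, intx, machine_gsi; };

static int toy_pci_intx_link(int device, int intx)
{
    return (device + intx) & 3;            /* assumed link folding, see note above */
}

int main(void)
{
    int route[4] = { 5, 10, 11, 5 };       /* example guest pci_link routing */
    struct toy_mirq mirq = { 1, /*device*/ 3, /*intx*/ 0 };
    struct toy_girq girq[16] = { { 0 } };
    int machine_gsi = 20;                  /* example machine IRQ */

    if (mirq.valid) {
        int link = toy_pci_intx_link(mirq.device, mirq.intx);
        int isa_irq = route[link];
        /* record the reverse mapping so the EOI path can find its way back */
        girq[isa_irq] = (struct toy_girq){ 1, mirq.device, mirq.intx, machine_gsi };
        printf("mirq %d -> link %d -> guest irq %d\n", machine_gsi, link, isa_irq);
    }
    return 0;
}
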
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/msi.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/msi.h Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@xxxxxxxxx)
+ */
+
+#ifndef MSI_H
+#define MSI_H
+
+/*
+ * Assume the maximum number of hot-plug slots supported by the system is
+ * about ten. The worst case is that each of these slots is hot-added with a
+ * device that has two MSI/MSI-X capable functions. To guard against an MSI-X
+ * driver that attempts to request all available vectors, NR_HP_RESERVED_VECTORS
+ * is defined below to ensure at least one message is assigned to each
+ * detected MSI/MSI-X device function.
+ */
+#define NR_HP_RESERVED_VECTORS 20
+
+extern int vector_irq[NR_VECTORS];
+extern void (*interrupt[NR_IRQS])(void);
+extern int pci_vector_resources(int last, int nr_released);
+
+/*
+ * MSI-X Address Register
+ */
+#define PCI_MSIX_FLAGS_QSIZE 0x7FF
+#define PCI_MSIX_FLAGS_ENABLE (1 << 15)
+#define PCI_MSIX_FLAGS_BIRMASK (7 << 0)
+#define PCI_MSIX_FLAGS_BITMASK (1 << 0)
+
+#define PCI_MSIX_ENTRY_SIZE 16
+#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0
+#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4
+#define PCI_MSIX_ENTRY_DATA_OFFSET 8
+#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12
+
+#define msi_control_reg(base) (base + PCI_MSI_FLAGS)
+#define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO)
+#define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI)
+#define msi_data_reg(base, is64bit) \
+ ( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
+#define msi_mask_bits_reg(base, is64bit) \
+ ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
+#define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE
+#define multi_msi_capable(control) \
+ (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
+#define multi_msi_enable(control, num) \
+ control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+#define is_64bit_address(control) (control & PCI_MSI_FLAGS_64BIT)
+#define is_mask_bit_support(control) (control & PCI_MSI_FLAGS_MASKBIT)
+#define msi_enable(control, num) multi_msi_enable(control, num); \
+ control |= PCI_MSI_FLAGS_ENABLE
+
+#define msix_table_offset_reg(base) (base + 0x04)
+#define msix_pba_offset_reg(base) (base + 0x08)
+#define msix_enable(control) control |= PCI_MSIX_FLAGS_ENABLE
+#define msix_disable(control) control &= ~PCI_MSIX_FLAGS_ENABLE
+#define msix_table_size(control) ((control & PCI_MSIX_FLAGS_QSIZE)+1)
+#define multi_msix_capable msix_table_size
+#define msix_unmask(address) (address & ~PCI_MSIX_FLAGS_BITMASK)
+#define msix_mask(address) (address | PCI_MSIX_FLAGS_BITMASK)
+#define msix_is_pending(address) (address & PCI_MSIX_FLAGS_PENDMASK)
+
+/*
+ * MSI Defined Data Structures
+ */
+#define MSI_ADDRESS_HEADER 0xfee
+#define MSI_ADDRESS_HEADER_SHIFT 12
+#define MSI_ADDRESS_HEADER_MASK 0xfff000
+#define MSI_ADDRESS_DEST_ID_MASK 0xfff0000f
+#define MSI_TARGET_CPU_MASK 0xff
+#define MSI_TARGET_CPU_SHIFT 12
+#define MSI_DELIVERY_MODE 0
+#define MSI_LEVEL_MODE 1 /* Edge always assert */
+#define MSI_TRIGGER_MODE 0 /* MSI is edge sensitive */
+#define MSI_PHYSICAL_MODE 0
+#define MSI_LOGICAL_MODE 1
+#define MSI_REDIRECTION_HINT_MODE 0
+
+#define __LITTLE_ENDIAN_BITFIELD 1
+
+struct msg_data {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 vector : 8;
+ __u32 delivery_mode : 3; /* 000b: FIXED | 001b: lowest prior */
+ __u32 reserved_1 : 3;
+ __u32 level : 1; /* 0: deassert | 1: assert */
+ __u32 trigger : 1; /* 0: edge | 1: level */
+ __u32 reserved_2 : 16;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u32 reserved_2 : 16;
+ __u32 trigger : 1; /* 0: edge | 1: level */
+ __u32 level : 1; /* 0: deassert | 1: assert */
+ __u32 reserved_1 : 3;
+ __u32 delivery_mode : 3; /* 000b: FIXED | 001b: lowest prior */
+ __u32 vector : 8;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+} __attribute__ ((packed));
+
+struct msg_address {
+ union {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 reserved_1 : 2;
+ __u32 dest_mode : 1; /* 0: physical | 1: logical */
+ __u32 redirection_hint: 1; /* 0: dedicated CPU, 1: lowest priority */
+ __u32 reserved_2 : 4;
+ __u32 dest_id : 24; /* Destination ID */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u32 dest_id : 24; /* Destination ID */
+ __u32 reserved_2 : 4;
+ __u32 redirection_hint: 1; /* 0: dedicated CPU, 1: lowest priority */
+ __u32 dest_mode : 1; /* 0: physical | 1: logical */
+ __u32 reserved_1 : 2;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+ }u;
+ __u32 value;
+ }lo_address;
+ __u32 hi_address;
+} __attribute__ ((packed));
+
+#endif /* MSI_H */
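
(Minimal sketch, not part of the patch: packing the 32-bit MSI data value according to the little-endian struct msg_data layout above -- vector in bits 0-7, delivery mode in bits 8-10, level in bit 14, trigger in bit 15. The values in main() are arbitrary examples.)

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdint.h>

static uint32_t msi_pack_data(uint8_t vector, uint8_t delivery_mode,
                              uint8_t level, uint8_t trigger)
{
    /* field positions follow the little-endian bitfields of struct msg_data */
    return (uint32_t)vector
         | ((uint32_t)(delivery_mode & 0x7) << 8)
         | ((uint32_t)(level & 1) << 14)
         | ((uint32_t)(trigger & 1) << 15);
}

int main(void)
{
    /* vector 0xd0, fixed delivery, assert, edge-triggered */
    printf("msi data = %#x\n", msi_pack_data(0xd0, 0, 1, 0));
    return 0;
}
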
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/pci-direct.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/pci-direct.h Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,48 @@
+#ifndef ASM_PCI_DIRECT_H
+#define ASM_PCI_DIRECT_H 1
+
+#include <xen/types.h>
+#include <asm/io.h>
+
+/* Direct PCI access. This is used for PCI accesses in early boot before
+ the PCI subsystem works. */
+
+#define PDprintk(x...)
+
+static inline u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
+{
+ u32 v;
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ v = inl(0xcfc);
+ if (v != 0xffffffff)
+ PDprintk("%x reading 4 from %x: %x\n", slot, offset, v);
+ return v;
+}
+
+static inline u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
+{
+ u8 v;
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ v = inb(0xcfc + (offset&3));
+ PDprintk("%x reading 1 from %x: %x\n", slot, offset, v);
+ return v;
+}
+
+static inline u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
+{
+ u16 v;
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ v = inw(0xcfc + (offset&2));
+ PDprintk("%x reading 2 from %x: %x\n", slot, offset, v);
+ return v;
+}
+
+static inline void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
+ u32 val)
+{
+ PDprintk("%x writing to %x: %x\n", slot, offset, val);
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ outl(val, 0xcfc);
+}
+
+#endif
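
(Standalone sketch, not part of the patch, of the CONFIG_ADDRESS encoding that read_pci_config() and friends write to port 0xcf8: enable bit 31, bus in bits 16-23, slot in bits 11-15, function in bits 8-10, register offset in the low byte. The bus/device/function in main() are arbitrary examples.)

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdint.h>

static uint32_t cf8_address(uint8_t bus, uint8_t slot, uint8_t func, uint8_t offset)
{
    /* same dword that read_pci_config() writes to port 0xcf8 */
    return 0x80000000u | ((uint32_t)bus << 16) | ((uint32_t)slot << 11)
                       | ((uint32_t)func << 8) | offset;
}

int main(void)
{
    /* PCI_VENDOR_ID (offset 0x00) of device 00:1f.0, as the dom0 probe
     * loop in setup_dom0_devices() would read it via ports 0xcf8/0xcfc */
    printf("CONFIG_ADDRESS = %#x\n", cf8_address(0, 0x1f, 0, 0x00));
    return 0;
}
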
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/pci_regs.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/pci_regs.h Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,449 @@
+/*
+ * pci_regs.h
+ *
+ * PCI standard defines
+ * Copyright 1994, Drew Eckhardt
+ * Copyright 1997--1999 Martin Mares <mj@xxxxxx>
+ *
+ * For more information, please consult the following manuals (look at
+ * http://www.pcisig.com/ for how to get them):
+ *
+ * PCI BIOS Specification
+ * PCI Local Bus Specification
+ * PCI to PCI Bridge Specification
+ * PCI System Design Guide
+ */
+
+#ifndef LINUX_PCI_REGS_H
+#define LINUX_PCI_REGS_H
+
+/*
+ * Under PCI, each device has 256 bytes of configuration address space,
+ * of which the first 64 bytes are standardized as follows:
+ */
+#define PCI_VENDOR_ID 0x00 /* 16 bits */
+#define PCI_DEVICE_ID 0x02 /* 16 bits */
+#define PCI_COMMAND 0x04 /* 16 bits */
+#define PCI_COMMAND_IO 0x1 /* Enable response in I/O space */
+#define PCI_COMMAND_MEMORY 0x2 /* Enable response in Memory space */
+#define PCI_COMMAND_MASTER 0x4 /* Enable bus mastering */
+#define PCI_COMMAND_SPECIAL 0x8 /* Enable response to special cycles */
+#define PCI_COMMAND_INVALIDATE 0x10 /* Use memory write and invalidate */
+#define PCI_COMMAND_VGA_PALETTE 0x20 /* Enable palette snooping */
+#define PCI_COMMAND_PARITY 0x40 /* Enable parity checking */
+#define PCI_COMMAND_WAIT 0x80 /* Enable address/data stepping */
+#define PCI_COMMAND_SERR 0x100 /* Enable SERR */
+#define PCI_COMMAND_FAST_BACK 0x200 /* Enable back-to-back writes */
+#define PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */
+
+#define PCI_STATUS 0x06 /* 16 bits */
+#define PCI_STATUS_CAP_LIST 0x10 /* Support Capability List */
+#define PCI_STATUS_66MHZ 0x20 /* Support 66 MHz PCI 2.1 bus */
+#define PCI_STATUS_UDF 0x40 /* Support User Definable Features [obsolete] */
+#define PCI_STATUS_FAST_BACK 0x80 /* Accept fast-back to back */
+#define PCI_STATUS_PARITY 0x100 /* Detected parity error */
+#define PCI_STATUS_DEVSEL_MASK 0x600 /* DEVSEL timing */
+#define PCI_STATUS_DEVSEL_FAST 0x000
+#define PCI_STATUS_DEVSEL_MEDIUM 0x200
+#define PCI_STATUS_DEVSEL_SLOW 0x400
+#define PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */
+#define PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of " */
+#define PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */
+#define PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */
+#define PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */
+
+#define PCI_CLASS_REVISION 0x08 /* High 24 bits are class, low 8 revision */
+#define PCI_REVISION_ID 0x08 /* Revision ID */
+#define PCI_CLASS_PROG 0x09 /* Reg. Level Programming Interface */
+#define PCI_CLASS_DEVICE 0x0a /* Device class */
+
+#define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */
+#define PCI_LATENCY_TIMER 0x0d /* 8 bits */
+#define PCI_HEADER_TYPE 0x0e /* 8 bits */
+#define PCI_HEADER_TYPE_NORMAL 0
+#define PCI_HEADER_TYPE_BRIDGE 1
+#define PCI_HEADER_TYPE_CARDBUS 2
+
+#define PCI_BIST 0x0f /* 8 bits */
+#define PCI_BIST_CODE_MASK 0x0f /* Return result */
+#define PCI_BIST_START 0x40 /* 1 to start BIST, 2 secs or less */
+#define PCI_BIST_CAPABLE 0x80 /* 1 if BIST capable */
+
+/*
+ * Base addresses specify locations in memory or I/O space.
+ * Decoded size can be determined by writing a value of
+ * 0xffffffff to the register, and reading it back. Only
+ * 1 bits are decoded.
+ */
+#define PCI_BASE_ADDRESS_0 0x10 /* 32 bits */
+#define PCI_BASE_ADDRESS_1 0x14 /* 32 bits [htype 0,1 only] */
+#define PCI_BASE_ADDRESS_2 0x18 /* 32 bits [htype 0 only] */
+#define PCI_BASE_ADDRESS_3 0x1c /* 32 bits */
+#define PCI_BASE_ADDRESS_4 0x20 /* 32 bits */
+#define PCI_BASE_ADDRESS_5 0x24 /* 32 bits */
+#define PCI_BASE_ADDRESS_SPACE 0x01 /* 0 = memory, 1 = I/O */
+#define PCI_BASE_ADDRESS_SPACE_IO 0x01
+#define PCI_BASE_ADDRESS_SPACE_MEMORY 0x00
+#define PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06
+#define PCI_BASE_ADDRESS_MEM_TYPE_32 0x00 /* 32 bit address */
+#define PCI_BASE_ADDRESS_MEM_TYPE_1M 0x02 /* Below 1M [obsolete] */
+#define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */
+#define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? */
+#define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL)
+#define PCI_BASE_ADDRESS_IO_MASK (~0x03UL)
+/* bit 1 is reserved if address_space = 1 */
+
+/* Header type 0 (normal devices) */
+#define PCI_CARDBUS_CIS 0x28
+#define PCI_SUBSYSTEM_VENDOR_ID 0x2c
+#define PCI_SUBSYSTEM_ID 0x2e
+#define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */
+#define PCI_ROM_ADDRESS_ENABLE 0x01
+#define PCI_ROM_ADDRESS_MASK (~0x7ffUL)
+
+#define PCI_CAPABILITY_LIST 0x34 /* Offset of first capability list entry */
+
+/* 0x35-0x3b are reserved */
+#define PCI_INTERRUPT_LINE 0x3c /* 8 bits */
+#define PCI_INTERRUPT_PIN 0x3d /* 8 bits */
+#define PCI_MIN_GNT 0x3e /* 8 bits */
+#define PCI_MAX_LAT 0x3f /* 8 bits */
+
+/* Header type 1 (PCI-to-PCI bridges) */
+#define PCI_PRIMARY_BUS 0x18 /* Primary bus number */
+#define PCI_SECONDARY_BUS 0x19 /* Secondary bus number */
+#define PCI_SUBORDINATE_BUS 0x1a /* Highest bus number behind the bridge */
+#define PCI_SEC_LATENCY_TIMER 0x1b /* Latency timer for secondary interface */
+#define PCI_IO_BASE 0x1c /* I/O range behind the bridge */
+#define PCI_IO_LIMIT 0x1d
+#define PCI_IO_RANGE_TYPE_MASK 0x0fUL /* I/O bridging type */
+#define PCI_IO_RANGE_TYPE_16 0x00
+#define PCI_IO_RANGE_TYPE_32 0x01
+#define PCI_IO_RANGE_MASK (~0x0fUL)
+#define PCI_SEC_STATUS 0x1e /* Secondary status register, only bit 14 used */
+#define PCI_MEMORY_BASE 0x20 /* Memory range behind */
+#define PCI_MEMORY_LIMIT 0x22
+#define PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL
+#define PCI_MEMORY_RANGE_MASK (~0x0fUL)
+#define PCI_PREF_MEMORY_BASE 0x24 /* Prefetchable memory range behind */
+#define PCI_PREF_MEMORY_LIMIT 0x26
+#define PCI_PREF_RANGE_TYPE_MASK 0x0fUL
+#define PCI_PREF_RANGE_TYPE_32 0x00
+#define PCI_PREF_RANGE_TYPE_64 0x01
+#define PCI_PREF_RANGE_MASK (~0x0fUL)
+#define PCI_PREF_BASE_UPPER32 0x28 /* Upper half of prefetchable memory range */
+#define PCI_PREF_LIMIT_UPPER32 0x2c
+#define PCI_IO_BASE_UPPER16 0x30 /* Upper half of I/O addresses */
+#define PCI_IO_LIMIT_UPPER16 0x32
+/* 0x34 same as for htype 0 */
+/* 0x35-0x3b is reserved */
+#define PCI_ROM_ADDRESS1 0x38 /* Same as PCI_ROM_ADDRESS, but for htype 1 */
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_BRIDGE_CONTROL 0x3e
+#define PCI_BRIDGE_CTL_PARITY 0x01 /* Enable parity detection on secondary interface */
+#define PCI_BRIDGE_CTL_SERR 0x02 /* The same for SERR forwarding */
+#define PCI_BRIDGE_CTL_NO_ISA 0x04 /* Disable bridging of ISA ports */
+#define PCI_BRIDGE_CTL_VGA 0x08 /* Forward VGA addresses */
+#define PCI_BRIDGE_CTL_MASTER_ABORT 0x20 /* Report master aborts */
+#define PCI_BRIDGE_CTL_BUS_RESET 0x40 /* Secondary bus reset */
+#define PCI_BRIDGE_CTL_FAST_BACK 0x80 /* Fast Back2Back enabled on secondary interface */
+
+/* Header type 2 (CardBus bridges) */
+#define PCI_CB_CAPABILITY_LIST 0x14
+/* 0x15 reserved */
+#define PCI_CB_SEC_STATUS 0x16 /* Secondary status */
+#define PCI_CB_PRIMARY_BUS 0x18 /* PCI bus number */
+#define PCI_CB_CARD_BUS 0x19 /* CardBus bus number */
+#define PCI_CB_SUBORDINATE_BUS 0x1a /* Subordinate bus number */
+#define PCI_CB_LATENCY_TIMER 0x1b /* CardBus latency timer */
+#define PCI_CB_MEMORY_BASE_0 0x1c
+#define PCI_CB_MEMORY_LIMIT_0 0x20
+#define PCI_CB_MEMORY_BASE_1 0x24
+#define PCI_CB_MEMORY_LIMIT_1 0x28
+#define PCI_CB_IO_BASE_0 0x2c
+#define PCI_CB_IO_BASE_0_HI 0x2e
+#define PCI_CB_IO_LIMIT_0 0x30
+#define PCI_CB_IO_LIMIT_0_HI 0x32
+#define PCI_CB_IO_BASE_1 0x34
+#define PCI_CB_IO_BASE_1_HI 0x36
+#define PCI_CB_IO_LIMIT_1 0x38
+#define PCI_CB_IO_LIMIT_1_HI 0x3a
+#define PCI_CB_IO_RANGE_MASK (~0x03UL)
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_CB_BRIDGE_CONTROL 0x3e
+#define PCI_CB_BRIDGE_CTL_PARITY 0x01 /* Similar to standard bridge control register */
+#define PCI_CB_BRIDGE_CTL_SERR 0x02
+#define PCI_CB_BRIDGE_CTL_ISA 0x04
+#define PCI_CB_BRIDGE_CTL_VGA 0x08
+#define PCI_CB_BRIDGE_CTL_MASTER_ABORT 0x20
+#define PCI_CB_BRIDGE_CTL_CB_RESET 0x40 /* CardBus reset */
+#define PCI_CB_BRIDGE_CTL_16BIT_INT 0x80 /* Enable interrupt for 16-bit cards */
+#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100 /* Prefetch enable for both memory regions */
+#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200
+#define PCI_CB_BRIDGE_CTL_POST_WRITES 0x400
+#define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40
+#define PCI_CB_SUBSYSTEM_ID 0x42
+#define PCI_CB_LEGACY_MODE_BASE 0x44 /* 16-bit PC Card legacy mode base address (ExCa) */
+/* 0x48-0x7f reserved */
+
+/* Capability lists */
+
+#define PCI_CAP_LIST_ID 0 /* Capability ID */
+#define PCI_CAP_ID_PM 0x01 /* Power Management */
+#define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */
+#define PCI_CAP_ID_VPD 0x03 /* Vital Product Data */
+#define PCI_CAP_ID_SLOTID 0x04 /* Slot Identification */
+#define PCI_CAP_ID_MSI 0x05 /* Message Signalled Interrupts */
+#define PCI_CAP_ID_CHSWP 0x06 /* CompactPCI HotSwap */
+#define PCI_CAP_ID_PCIX 0x07 /* PCI-X */
+#define PCI_CAP_ID_HT_IRQCONF 0x08 /* HyperTransport IRQ Configuration */
+#define PCI_CAP_ID_SHPC 0x0C /* PCI Standard Hot-Plug Controller */
+#define PCI_CAP_ID_EXP 0x10 /* PCI Express */
+#define PCI_CAP_ID_MSIX 0x11 /* MSI-X */
+#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */
+#define PCI_CAP_FLAGS 2 /* Capability defined flags (16 bits) */
+#define PCI_CAP_SIZEOF 4
+
+/* Power Management Registers */
+
+#define PCI_PM_PMC 2 /* PM Capabilities Register */
+#define PCI_PM_CAP_VER_MASK 0x0007 /* Version */
+#define PCI_PM_CAP_PME_CLOCK 0x0008 /* PME clock required */
+#define PCI_PM_CAP_RESERVED 0x0010 /* Reserved field */
+#define PCI_PM_CAP_DSI 0x0020 /* Device specific initialization */
+#define PCI_PM_CAP_AUX_POWER 0x01C0 /* Auxiliary power support mask */
+#define PCI_PM_CAP_D1 0x0200 /* D1 power state support */
+#define PCI_PM_CAP_D2 0x0400 /* D2 power state support */
+#define PCI_PM_CAP_PME 0x0800 /* PME pin supported */
+#define PCI_PM_CAP_PME_MASK 0xF800 /* PME Mask of all supported states */
+#define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */
+#define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */
+#define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */
+#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */
+#define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */
+#define PCI_PM_CTRL 4 /* PM control and status register */
+#define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */
+#define PCI_PM_CTRL_NO_SOFT_RESET 0x0004 /* No reset for D3hot->D0 */
+#define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */
+#define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */
+#define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */
+#define PCI_PM_CTRL_PME_STATUS 0x8000 /* PME pin status */
+#define PCI_PM_PPB_EXTENSIONS 6 /* PPB support extensions (??) */
+#define PCI_PM_PPB_B2_B3 0x40 /* Stop clock when in D3hot (??) */
+#define PCI_PM_BPCC_ENABLE 0x80 /* Bus power/clock control enable (??) */
+#define PCI_PM_DATA_REGISTER 7 /* (??) */
+#define PCI_PM_SIZEOF 8
+
+/* AGP registers */
+
+#define PCI_AGP_VERSION 2 /* BCD version number */
+#define PCI_AGP_RFU 3 /* Rest of capability flags */
+#define PCI_AGP_STATUS 4 /* Status register */
+#define PCI_AGP_STATUS_RQ_MASK 0xff000000 /* Maximum number of requests - 1 */
+#define PCI_AGP_STATUS_SBA 0x0200 /* Sideband addressing supported */
+#define PCI_AGP_STATUS_64BIT 0x0020 /* 64-bit addressing supported */
+#define PCI_AGP_STATUS_FW 0x0010 /* FW transfers supported */
+#define PCI_AGP_STATUS_RATE4 0x0004 /* 4x transfer rate supported */
+#define PCI_AGP_STATUS_RATE2 0x0002 /* 2x transfer rate supported */
+#define PCI_AGP_STATUS_RATE1 0x0001 /* 1x transfer rate supported */
+#define PCI_AGP_COMMAND 8 /* Control register */
+#define PCI_AGP_COMMAND_RQ_MASK 0xff000000 /* Master: Maximum number of requests */
+#define PCI_AGP_COMMAND_SBA 0x0200 /* Sideband addressing enabled */
+#define PCI_AGP_COMMAND_AGP 0x0100 /* Allow processing of AGP transactions */
+#define PCI_AGP_COMMAND_64BIT 0x0020 /* Allow processing of 64-bit addresses */
+#define PCI_AGP_COMMAND_FW 0x0010 /* Force FW transfers */
+#define PCI_AGP_COMMAND_RATE4 0x0004 /* Use 4x rate */
+#define PCI_AGP_COMMAND_RATE2 0x0002 /* Use 2x rate */
+#define PCI_AGP_COMMAND_RATE1 0x0001 /* Use 1x rate */
+#define PCI_AGP_SIZEOF 12
+
+/* Vital Product Data */
+
+#define PCI_VPD_ADDR 2 /* Address to access (15 bits!) */
+#define PCI_VPD_ADDR_MASK 0x7fff /* Address mask */
+#define PCI_VPD_ADDR_F 0x8000 /* Write 0, 1 indicates completion */
+#define PCI_VPD_DATA 4 /* 32-bits of data returned here */
+
+/* Slot Identification */
+
+#define PCI_SID_ESR 2 /* Expansion Slot Register */
+#define PCI_SID_ESR_NSLOTS 0x1f /* Number of expansion slots available */
+#define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */
+#define PCI_SID_CHASSIS_NR 3 /* Chassis Number */
+
+/* Message Signalled Interrupts registers */
+
+#define PCI_MSI_FLAGS 2 /* Various flags */
+#define PCI_MSI_FLAGS_64BIT 0x80 /* 64-bit addresses allowed */
+#define PCI_MSI_FLAGS_QSIZE 0x70 /* Message queue size configured */
+#define PCI_MSI_FLAGS_QMASK 0x0e /* Maximum queue size available */
+#define PCI_MSI_FLAGS_ENABLE 0x01 /* MSI feature enabled */
+#define PCI_MSI_FLAGS_MASKBIT 0x100 /* 64-bit mask bits allowed */
+#define PCI_MSI_RFU 3 /* Rest of capability flags */
+#define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */
+#define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT 16 /* Mask bits register */
+
+/* CompactPCI Hotswap Register */
+
+#define PCI_CHSWP_CSR 2 /* Control and Status Register */
+#define PCI_CHSWP_DHA 0x01 /* Device Hiding Arm */
+#define PCI_CHSWP_EIM 0x02 /* ENUM# Signal Mask */
+#define PCI_CHSWP_PIE 0x04 /* Pending Insert or Extract */
+#define PCI_CHSWP_LOO 0x08 /* LED On / Off */
+#define PCI_CHSWP_PI 0x30 /* Programming Interface */
+#define PCI_CHSWP_EXT 0x40 /* ENUM# status - extraction */
+#define PCI_CHSWP_INS 0x80 /* ENUM# status - insertion */
+
+/* PCI-X registers */
+
+#define PCI_X_CMD 2 /* Modes & Features */
+#define PCI_X_CMD_DPERR_E 0x0001 /* Data Parity Error Recovery Enable */
+#define PCI_X_CMD_ERO 0x0002 /* Enable Relaxed Ordering */
+#define PCI_X_CMD_MAX_READ 0x000c /* Max Memory Read Byte Count */
+#define PCI_X_CMD_MAX_SPLIT 0x0070 /* Max Outstanding Split Transactions */
+#define PCI_X_CMD_VERSION(x) (((x) >> 12) & 3) /* Version */
+#define PCI_X_STATUS 4 /* PCI-X capabilities */
+#define PCI_X_STATUS_DEVFN 0x000000ff /* A copy of devfn */
+#define PCI_X_STATUS_BUS 0x0000ff00 /* A copy of bus nr */
+#define PCI_X_STATUS_64BIT 0x00010000 /* 64-bit device */
+#define PCI_X_STATUS_133MHZ 0x00020000 /* 133 MHz capable */
+#define PCI_X_STATUS_SPL_DISC 0x00040000 /* Split Completion Discarded */
+#define PCI_X_STATUS_UNX_SPL 0x00080000 /* Unexpected Split Completion */
+#define PCI_X_STATUS_COMPLEX 0x00100000 /* Device Complexity */
+#define PCI_X_STATUS_MAX_READ 0x00600000 /* Designed Max Memory Read Count */
+#define PCI_X_STATUS_MAX_SPLIT 0x03800000 /* Designed Max Outstanding Split Transactions */
+#define PCI_X_STATUS_MAX_CUM 0x1c000000 /* Designed Max Cumulative Read Size */
+#define PCI_X_STATUS_SPL_ERR 0x20000000 /* Rcvd Split Completion Error Msg */
+#define PCI_X_STATUS_266MHZ 0x40000000 /* 266 MHz capable */
+#define PCI_X_STATUS_533MHZ 0x80000000 /* 533 MHz capable */
+
+/* PCI Express capability registers */
+
+#define PCI_EXP_FLAGS 2 /* Capabilities register */
+#define PCI_EXP_FLAGS_VERS 0x000f /* Capability version */
+#define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port type */
+#define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */
+#define PCI_EXP_TYPE_LEG_END 0x1 /* Legacy Endpoint */
+#define PCI_EXP_TYPE_ROOT_PORT 0x4 /* Root Port */
+#define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */
+#define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */
+#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */
+#define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */
+#define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */
+#define PCI_EXP_DEVCAP 4 /* Device capabilities */
+#define PCI_EXP_DEVCAP_PAYLOAD 0x07 /* Max_Payload_Size */
+#define PCI_EXP_DEVCAP_PHANTOM 0x18 /* Phantom functions */
+#define PCI_EXP_DEVCAP_EXT_TAG 0x20 /* Extended tags */
+#define PCI_EXP_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */
+#define PCI_EXP_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */
+#define PCI_EXP_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */
+#define PCI_EXP_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */
+#define PCI_EXP_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */
+#define PCI_EXP_DEVCAP_PWR_VAL 0x3fc0000 /* Slot Power Limit Value */
+#define PCI_EXP_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */
+#define PCI_EXP_DEVCTL 8 /* Device Control */
+#define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */
+#define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */
+#define PCI_EXP_DEVCTL_FERE 0x0004 /* Fatal Error Reporting Enable */
+#define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */
+#define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */
+#define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */
+#define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */
+#define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */
+#define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */
+#define PCI_EXP_DEVCTL_NOSNOOP_EN 0x0800 /* Enable No Snoop */
+#define PCI_EXP_DEVCTL_READRQ 0x7000 /* Max_Read_Request_Size */
+#define PCI_EXP_DEVSTA 10 /* Device Status */
+#define PCI_EXP_DEVSTA_CED 0x01 /* Correctable Error Detected */
+#define PCI_EXP_DEVSTA_NFED 0x02 /* Non-Fatal Error Detected */
+#define PCI_EXP_DEVSTA_FED 0x04 /* Fatal Error Detected */
+#define PCI_EXP_DEVSTA_URD 0x08 /* Unsupported Request Detected */
+#define PCI_EXP_DEVSTA_AUXPD 0x10 /* AUX Power Detected */
+#define PCI_EXP_DEVSTA_TRPND 0x20 /* Transactions Pending */
+#define PCI_EXP_LNKCAP 12 /* Link Capabilities */
+#define PCI_EXP_LNKCTL 16 /* Link Control */
+#define PCI_EXP_LNKSTA 18 /* Link Status */
+#define PCI_EXP_SLTCAP 20 /* Slot Capabilities */
+#define PCI_EXP_SLTCTL 24 /* Slot Control */
+#define PCI_EXP_SLTSTA 26 /* Slot Status */
+#define PCI_EXP_RTCTL 28 /* Root Control */
+#define PCI_EXP_RTCTL_SECEE 0x01 /* System Error on Correctable Error */
+#define PCI_EXP_RTCTL_SENFEE 0x02 /* System Error on Non-Fatal Error */
+#define PCI_EXP_RTCTL_SEFEE 0x04 /* System Error on Fatal Error */
+#define PCI_EXP_RTCTL_PMEIE 0x08 /* PME Interrupt Enable */
+#define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */
+#define PCI_EXP_RTCAP 30 /* Root Capabilities */
+#define PCI_EXP_RTSTA 32 /* Root Status */
+
+/* Extended Capabilities (PCI-X 2.0 and Express) */
+#define PCI_EXT_CAP_ID(header) (header & 0x0000ffff)
+#define PCI_EXT_CAP_VER(header) ((header >> 16) & 0xf)
+#define PCI_EXT_CAP_NEXT(header) ((header >> 20) & 0xffc)
+
+#define PCI_EXT_CAP_ID_ERR 1
+#define PCI_EXT_CAP_ID_VC 2
+#define PCI_EXT_CAP_ID_DSN 3
+#define PCI_EXT_CAP_ID_PWR 4
+
+/* Advanced Error Reporting */
+#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */
+#define PCI_ERR_UNC_TRAIN 0x00000001 /* Training */
+#define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */
+#define PCI_ERR_UNC_POISON_TLP 0x00001000 /* Poisoned TLP */
+#define PCI_ERR_UNC_FCP 0x00002000 /* Flow Control Protocol */
+#define PCI_ERR_UNC_COMP_TIME 0x00004000 /* Completion Timeout */
+#define PCI_ERR_UNC_COMP_ABORT 0x00008000 /* Completer Abort */
+#define PCI_ERR_UNC_UNX_COMP 0x00010000 /* Unexpected Completion */
+#define PCI_ERR_UNC_RX_OVER 0x00020000 /* Receiver Overflow */
+#define PCI_ERR_UNC_MALF_TLP 0x00040000 /* Malformed TLP */
+#define PCI_ERR_UNC_ECRC 0x00080000 /* ECRC Error Status */
+#define PCI_ERR_UNC_UNSUP 0x00100000 /* Unsupported Request */
+#define PCI_ERR_UNCOR_MASK 8 /* Uncorrectable Error Mask */
+ /* Same bits as above */
+#define PCI_ERR_UNCOR_SEVER 12 /* Uncorrectable Error Severity */
+ /* Same bits as above */
+#define PCI_ERR_COR_STATUS 16 /* Correctable Error Status */
+#define PCI_ERR_COR_RCVR 0x00000001 /* Receiver Error Status */
+#define PCI_ERR_COR_BAD_TLP 0x00000040 /* Bad TLP Status */
+#define PCI_ERR_COR_BAD_DLLP 0x00000080 /* Bad DLLP Status */
+#define PCI_ERR_COR_REP_ROLL 0x00000100 /* REPLAY_NUM Rollover */
+#define PCI_ERR_COR_REP_TIMER 0x00001000 /* Replay Timer Timeout */
+#define PCI_ERR_COR_MASK 20 /* Correctable Error Mask */
+ /* Same bits as above */
+#define PCI_ERR_CAP 24 /* Advanced Error Capabilities */
+#define PCI_ERR_CAP_FEP(x) ((x) & 31) /* First Error Pointer */
+#define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */
+#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */
+#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */
+#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */
+#define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */
+#define PCI_ERR_ROOT_STATUS 48
+#define PCI_ERR_ROOT_COR_SRC 52
+#define PCI_ERR_ROOT_SRC 54
+
+/* Virtual Channel */
+#define PCI_VC_PORT_REG1 4
+#define PCI_VC_PORT_REG2 8
+#define PCI_VC_PORT_CTRL 12
+#define PCI_VC_PORT_STATUS 14
+#define PCI_VC_RES_CAP 16
+#define PCI_VC_RES_CTRL 20
+#define PCI_VC_RES_STATUS 26
+
+/* Power Budgeting */
+#define PCI_PWR_DSR 4 /* Data Select Register */
+#define PCI_PWR_DATA 8 /* Data Register */
+#define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */
+#define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */
+#define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */
+#define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
+#define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */
+#define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */
+#define PCI_PWR_CAP 12 /* Capability */
+#define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */
+
+#endif /* LINUX_PCI_REGS_H */
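
(Illustrative only, not part of the patch: walking a PCI capability list with the offsets defined above, over a mocked-up 256-byte config space. In the hypervisor the byte reads would go through read_pci_config_byte() from pci-direct.h instead; the capability placed at 0x50 is an arbitrary example.)

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdint.h>

#define PCI_STATUS           0x06
#define PCI_STATUS_CAP_LIST  0x10
#define PCI_CAPABILITY_LIST  0x34
#define PCI_CAP_LIST_ID      0
#define PCI_CAP_LIST_NEXT    1
#define PCI_CAP_ID_MSI       0x05

int main(void)
{
    uint8_t cfg[256] = { 0 };

    cfg[PCI_STATUS] = PCI_STATUS_CAP_LIST;      /* capability list present */
    cfg[PCI_CAPABILITY_LIST] = 0x50;            /* first capability at 0x50 */
    cfg[0x50 + PCI_CAP_LIST_ID] = PCI_CAP_ID_MSI;
    cfg[0x50 + PCI_CAP_LIST_NEXT] = 0;          /* end of list */

    if (cfg[PCI_STATUS] & PCI_STATUS_CAP_LIST) {
        uint8_t pos = cfg[PCI_CAPABILITY_LIST] & ~3;
        while (pos) {
            printf("capability id %#x at %#x\n", cfg[pos + PCI_CAP_LIST_ID], pos);
            pos = cfg[pos + PCI_CAP_LIST_NEXT] & ~3;
        }
    }
    return 0;
}
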
diff -r acfa9290746f -r f4bbd3f327e4 xen/arch/x86/hvm/vmx/vtd/utils.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vtd/utils.c Fri Sep 14 16:40:49 2007 +0100
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx>
+ */
+
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/irq.h>
+#include <xen/spinlock.h>
+#include <xen/sched.h>
+#include <asm/delay.h>
+#include <asm/iommu.h>
+#include <asm/hvm/vmx/intel-iommu.h>
+#include "dmar.h"
+#include "pci-direct.h"
+#include "pci_regs.h"
+#include "msi.h"
+
+#include <xen/mm.h>
+#include <xen/xmalloc.h>
+
+#if defined(__x86_64__)
+void print_iommu_regs(struct acpi_drhd_unit *drhd)
+{
+ struct iommu *iommu = drhd->iommu;
+
+ printk("---- print_iommu_regs ----\n");
+ printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
+ printk("print_iommu_regs: DMAR_VER_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_VER_REG));
+ printk("print_iommu_regs: DMAR_CAP_REG = %lx\n",
+ dmar_readq(iommu->reg,DMAR_CAP_REG));
+ printk("print_iommu_regs: n_fault_reg = %lx\n",
+ cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+ printk("print_iommu_regs: fault_recording_offset_l = %lx\n",
+ cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+ printk("print_iommu_regs: fault_recording_offset_h = %lx\n",
+ cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8);
+ printk("print_iommu_regs: fault_recording_reg_l = %lx\n",
+ dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG))));
+ printk("print_iommu_regs: fault_recording_reg_h = %lx\n",
+ dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8));
+ printk("print_iommu_regs: DMAR_ECAP_REG = %lx\n",
+ dmar_readq(iommu->reg,DMAR_ECAP_REG));
+ printk("print_iommu_regs: DMAR_GCMD_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_GCMD_REG));
+ printk("print_iommu_regs: DMAR_GSTS_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_GSTS_REG));
+ printk("print_iommu_regs: DMAR_RTADDR_REG = %lx\n",
+ dmar_readq(iommu->reg,DMAR_RTADDR_REG));
+ printk("print_iommu_regs: DMAR_CCMD_REG = %lx\n",
+ dmar_readq(iommu->reg,DMAR_CCMD_REG));
+ printk("print_iommu_regs: DMAR_FSTS_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FSTS_REG));
+ printk("print_iommu_regs: DMAR_FECTL_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FECTL_REG));
+ printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FEDATA_REG));
+ printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FEADDR_REG));
+ printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FEUADDR_REG));
+}
+
+void print_vtd_entries(struct domain *d, int bus, int devfn,
+ unsigned long gmfn)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(d);
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ struct context_entry *ctxt_entry;
+ struct root_entry *root_entry;
+ u64 *l3, *l2, *l1;
+ u32 l3_index, l2_index, l1_index;
+ u32 i = 0;
+
+ printk("print_vtd_entries: domain_id = %x bdf = %x:%x:%x devfn = %x, gmfn
= %lx\n", d->domain_id, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), devfn, gmfn);
+
+ for_each_drhd_unit(drhd) {
+ printk("---- print_vtd_entries %d ----\n", i++);
+
+ if (hd->pgd == NULL) {
+ printk(" hg->pgd == NULL\n");
+ return;
+ }
+
+ iommu = drhd->iommu;
+ root_entry = iommu->root_entry;
+ printk(" hd->pgd = %p virt_to_maddr(hd->pgd) = %lx\n",
+ hd->pgd, virt_to_maddr(hd->pgd));
+
+ printk(" root_entry = %p\n", root_entry);
+ if (root_entry == NULL) {
+ printk(" root_entry == NULL\n");
+ return;
+ }
+
+ printk(" root_entry[%x] = %lx\n", bus, root_entry[bus].val);
+ printk(" maddr_to_virt(root_entry[%x]) = %p\n",
+ bus, maddr_to_virt(root_entry[bus].val));
+
+ if (root_entry[bus].val == 0) {
+ printk(" root_entry[%x].lo == 0\n", bus);
+ return;
+ }
+
+ ctxt_entry = maddr_to_virt((root_entry[bus].val >> PAGE_SHIFT) << PAGE_SHIFT);
+ if (ctxt_entry == NULL) {
+ printk(" ctxt_entry == NULL\n");
+ return;
+ }
+
+ if (ctxt_entry[devfn].lo == 0) {
+ printk(" ctxt_entry[%x].lo == 0\n", devfn);
+ return;
+ }
+
+ printk(" context = %p\n", ctxt_entry);
+ printk(" context[%x] = %lx %lx\n",
+ devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo);
+ printk(" maddr_to_virt(context[%x].lo) = %p\n",
+ devfn, maddr_to_virt(ctxt_entry[devfn].lo));
+ printk(" context[%x] = %lx\n", devfn, ctxt_entry[devfn].lo);
+
+ l3 = maddr_to_virt(ctxt_entry[devfn].lo);
+ l3 = (u64*)(((u64) l3 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+ printk(" l3 = %p\n", l3);
+ if (l3 == NULL) return;
+
+ l3_index = (gmfn >> 9 >> 9) & 0x1ff;
+ printk(" l3_index = %x\n", l3_index);
+ printk(" l3[%x] = %lx\n", l3_index, l3[l3_index]);
+
+ l2 = maddr_to_virt(l3[l3_index]);
+ l2 = (u64*)(((u64) l2 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+ printk(" l2 = %p\n", l2);
+ if (l2 == NULL) return;
+
+ l2_index = (gmfn >> 9) & 0x1ff;
+ printk(" gmfn = %lx\n", gmfn);
+ printk(" gmfn >> 9= %lx\n", gmfn >> 9);
+ printk(" l2_index = %x\n", l2_index);
+ printk(" l2[%x] = %lx\n", l2_index, l2[l2_index]);
+
+ l1 = maddr_to_virt(l2[l2_index]);
+ l1 = (u64*)(((u64) l1 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+ if (l1 == NULL) return;
+ l1_index = gmfn & 0x1ff;
+ printk(" l1 = %p\n", l1);
+ printk(" l1_index = %x\n", l1_index);
+ printk(" l1[%x] = %lx\n", l1_index, l1[l1_index]);
+ }
+}
+
+#else // !m64
+
+void print_iommu_regs(struct acpi_drhd_unit *drhd)
+{
+ struct iommu *iommu = drhd->iommu;
+
+ printk("---- print_iommu_regs ----\n");
+ printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
+ printk("print_iommu_regs: DMAR_VER_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_VER_REG));
+ printk("print_iommu_regs: DMAR_CAP_REG = %llx\n",
+ dmar_readq(iommu->reg,DMAR_CAP_REG));
+ printk("print_iommu_regs: n_fault_reg = %llx\n",
+ cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+ printk("print_iommu_regs: fault_recording_offset_l = %llx\n",
+ cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+ printk("print_iommu_regs: fault_recording_offset_h = %llx\n",
+ cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8);
+ printk("print_iommu_regs: fault_recording_reg_l = %llx\n",
+ dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG))));
+ printk("print_iommu_regs: fault_recording_reg_h = %llx\n",
+ dmar_readq(iommu->reg, cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8));
+ printk("print_iommu_regs: DMAR_ECAP_REG = %llx\n",
+ dmar_readq(iommu->reg,DMAR_ECAP_REG));
+ printk("print_iommu_regs: DMAR_GCMD_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_GCMD_REG));
+ printk("print_iommu_regs: DMAR_GSTS_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_GSTS_REG));
+ printk("print_iommu_regs: DMAR_RTADDR_REG = %llx\n",
+ dmar_readq(iommu->reg,DMAR_RTADDR_REG));
+ printk("print_iommu_regs: DMAR_CCMD_REG = %llx\n",
+ dmar_readq(iommu->reg,DMAR_CCMD_REG));
+ printk("print_iommu_regs: DMAR_FSTS_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FSTS_REG));
+ printk("print_iommu_regs: DMAR_FECTL_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FECTL_REG));
+ printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FEDATA_REG));
+ printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FEADDR_REG));
+ printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n",
+ dmar_readl(iommu->reg,DMAR_FEUADDR_REG));
+}
+
+void print_vtd_entries(struct domain *d, int bus, int devfn,
+ unsigned long gmfn)
+{
+ struct hvm_iommu *hd = domain_hvm_iommu(d);
+ struct acpi_drhd_unit *drhd;
+ struct iommu *iommu;
+ struct context_entry *ctxt_entry;
+ struct root_entry *root_entry;
+ u64 *l3, *l2, *l1;
+ u32 l3_index, l2_index, l1_index;
+ u32 i = 0;
+
+ printk("print_vtd_entries: domain_id = %x bdf = %x:%x:%x devfn = %x, gmfn
= %lx\n", d->domain_id, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), devfn, gmfn);
+
+ for_each_drhd_unit(drhd) {
+ printk("---- print_vtd_entries %d ----\n", i++);
+
+ if (hd->pgd == NULL) {
+ printk(" hg->pgd == NULL\n");
+ return;
+ }
+
+ iommu = drhd->iommu;
+ root_entry = iommu->root_entry;
+ printk(" d->pgd = %p virt_to_maddr(hd->pgd) = %lx\n",
+ hd->pgd, virt_to_maddr(hd->pgd));
+
+ printk(" root_entry = %p\n", root_entry);
+ if (root_entry == NULL) {
+ printk(" root_entry == NULL\n");
+ return;
+ }
+
+ printk(" root_entry[%x] = %llx\n", bus, root_entry[bus].val);
+ printk(" maddr_to_virt(root_entry[%x]) = %p\n",
+ bus, maddr_to_virt(root_entry[bus].val));
+
+ if (root_entry[bus].val == 0) {
+ printk(" root_entry[%x].lo == 0\n", bus);
+ return;
+ }
+
+ ctxt_entry = maddr_to_virt((root_entry[bus].val >> PAGE_SHIFT) << PAGE_SHIFT);
+ if (ctxt_entry == NULL) {
+ printk(" ctxt_entry == NULL\n");
+ return;
+ }
+
+ if (ctxt_entry[devfn].lo == 0) {
+ printk(" ctxt_entry[%x].lo == 0\n", devfn);
+ return;
+ }
+
+ printk(" context = %p\n", ctxt_entry);
+ printk(" context[%x] = %llx %llx\n",
+ devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo);
+ printk(" maddr_to_virt(context[%x].lo) = %p\n",
+ devfn, maddr_to_virt(ctxt_entry[devfn].lo));
+ printk(" context[%x] = %llx\n", devfn, ctxt_entry[devfn].lo);
+
+ l3 = maddr_to_virt(ctxt_entry[devfn].lo);
+ l3 = (u64*)(((u32) l3 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+ printk(" l3 = %p\n", l3);
+ if (l3 == NULL) return;
+
+ l3_index = (gmfn >> 9 >> 9) & 0x1ff;
+ printk(" l3_index = %x\n", l3_index);
+ printk(" l3[%x] = %llx\n", l3_index, l3[l3_index]);
+
+ l2 = maddr_to_virt(l3[l3_index]);
+ l2 = (u64*)(((u32) l2 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+ printk(" l2 = %p\n", l2);
+ if (l2 == NULL) return;
+
+ l2_index = (gmfn >> 9) & 0x1ff;
+ printk(" gmfn = %lx\n", gmfn);
+ printk(" gmfn >> 9= %lx\n", gmfn >> 9);
+ printk(" l2_index = %x\n", l2_index);
+ printk(" l2[%x] = %llx\n", l2_index, l2[l2_index]);
+
+ l1 = maddr_to_virt(l2[l2_index]);
+ l1 = (u64*)(((u32) l1 >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+ if (l1 == NULL) return;
+ l1_index = gmfn & 0x1ff;
+ printk(" l1 = %p\n", l1);
+ printk(" l1_index = %x\n", l1_index);
+ printk(" l1[%x] = %llx\n", l1_index, l1[l1_index]);
+ }
+}
+#endif // !m64
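
(Small standalone sketch, not part of the patch, of the devfn encoding behind the "bdf = %x:%x:%x" messages above; PCI_DEVFN/PCI_SLOT/PCI_FUNC are assumed to follow the usual Linux/Xen definitions, and device 1f.2 is an arbitrary example.)

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))
#define PCI_SLOT(devfn)       (((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)       ((devfn) & 0x07)

int main(void)
{
    int bus = 0, devfn = PCI_DEVFN(0x1f, 2);

    /* prints the same bus:slot:func triple the patch logs everywhere */
    printf("bdf = %x:%x:%x (devfn = %#x)\n",
           bus, PCI_SLOT(devfn), PCI_FUNC(devfn), devfn);
    return 0;
}
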
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog