WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH V4 07/10] Introduce Xen PCI Passthrough, qdevice (1/3

To: QEMU-devel <qemu-devel@xxxxxxxxxx>, Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH V4 07/10] Introduce Xen PCI Passthrough, qdevice (1/3)
From: Anthony PERARD <anthony.perard@xxxxxxxxxx>
Date: Thu, 17 Nov 2011 15:17:10 +0000
Cc: Anthony PERARD <anthony.perard@xxxxxxxxxx>, Guy Zana <guy@xxxxxxxxxxxx>, Xen Devel <xen-devel@xxxxxxxxxxxxxxxxxxx>, Allen Kay <allen.m.kay@xxxxxxxxx>
Delivery-date: Thu, 17 Nov 2011 07:50:24 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <1321543033-22090-1-git-send-email-anthony.perard@xxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <1321543033-22090-1-git-send-email-anthony.perard@xxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
From: Allen Kay <allen.m.kay@xxxxxxxxx>

A more complete history can be found here:
git://xenbits.xensource.com/qemu-xen-unstable.git

Signed-off-by: Allen Kay <allen.m.kay@xxxxxxxxx>
Signed-off-by: Guy Zana <guy@xxxxxxxxxxxx>
Signed-off-by: Anthony PERARD <anthony.perard@xxxxxxxxxx>
---
 Makefile.target                      |    2 +
 hw/xen_common.h                      |    3 +
 hw/xen_pci_passthrough.c             |  831 ++++++++++++++++++++++++++++++++++
 hw/xen_pci_passthrough.h             |  282 ++++++++++++
 hw/xen_pci_passthrough_config_init.c |   11 +
 xen-all.c                            |   12 +
 6 files changed, 1141 insertions(+), 0 deletions(-)
 create mode 100644 hw/xen_pci_passthrough.c
 create mode 100644 hw/xen_pci_passthrough.h
 create mode 100644 hw/xen_pci_passthrough_config_init.c

diff --git a/Makefile.target b/Makefile.target
index e527c1b..33435a3 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -221,6 +221,8 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
 
 # Xen PCI Passthrough
 obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_config_init.o
 
 # Inter-VM PCI shared memory
 CONFIG_IVSHMEM =
diff --git a/hw/xen_common.h b/hw/xen_common.h
index 0409ac7..48916fd 100644
--- a/hw/xen_common.h
+++ b/hw/xen_common.h
@@ -135,4 +135,7 @@ static inline int xc_fd(xc_interface *xen_xc)
 
 void destroy_hvm_domain(void);
 
+/* shutdown/destroy current domain because of an error */
+void xen_shutdown_fatal_error(const char *fmt, ...) GCC_FMT_ATTR(1, 2);
+
 #endif /* QEMU_HW_XEN_COMMON_H */
diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
new file mode 100644
index 0000000..998470b
--- /dev/null
+++ b/hw/xen_pci_passthrough.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Alex Novik <alex@xxxxxxxxxxxx>
+ * Allen Kay <allen.m.kay@xxxxxxxxx>
+ * Guy Zana <guy@xxxxxxxxxxxx>
+ *
+ * This file implements direct PCI assignment to a HVM guest
+ */
+
+/*
+ * Interrupt Disable policy:
+ *
+ * INTx interrupt:
+ *   Initialize(register_real_device)
+ *     Map INTx(xc_physdev_map_pirq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Set machine_irq and assigned_device->machine_irq to '0'.
+ *         * Don't bind INTx.
+ *
+ *     Bind INTx(xc_domain_bind_pt_pci_irq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Unmap INTx.
+ *         - Decrement mapped_machine_irq[machine_irq]
+ *         - Set assigned_device->machine_irq to '0'.
+ *
+ *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
+ *     Write '0'
+ *       <ptdev->msi_trans_en is false>
+ *         - Set real bit to '0' if assigned_device->machine_irq isn't '0'.
+ *
+ *     Write '1'
+ *       <ptdev->msi_trans_en is false>
+ *         - Set real bit to '1'.
+ */
+
+#include <sys/ioctl.h>
+
+#include "pci.h"
+#include "xen.h"
+#include "xen_backend.h"
+#include "xen_pci_passthrough.h"
+
+#define PCI_BAR_ENTRIES (6)
+
+#define PT_NR_IRQS          (256)
+char mapped_machine_irq[PT_NR_IRQS] = {0};
+
+void pt_log(const PCIDevice *d, const char *f, ...)
+{
+    va_list ap;
+
+    va_start(ap, f);
+    if (d) {
+        fprintf(stderr, "[%02x:%02x.%x] ", pci_bus_num(d->bus),
+                PCI_SLOT(d->devfn), PCI_FUNC(d->devfn));
+    }
+    vfprintf(stderr, f, ap);
+    va_end(ap);
+}
+
+
+/* Config Space */
+static int pt_pci_config_access_check(PCIDevice *d, uint32_t address, int len)
+{
+    /* check offset range */
+    if (address >= 0xFF) {
+        PT_ERR(d, "Failed to access register with offset exceeding 0xFF. "
+               "(addr: 0x%02x, len: %d)\n", address, len);
+        return -1;
+    }
+
+    /* check read size */
+    if ((len != 1) && (len != 2) && (len != 4)) {
+        PT_ERR(d, "Failed to access register with invalid access length. "
+               "(addr: 0x%02x, len: %d)\n", address, len);
+        return -1;
+    }
+
+    /* check offset alignment */
+    if (address & (len - 1)) {
+        PT_ERR(d, "Failed to access register with invalid access size "
+               "alignment. (addr: 0x%02x, len: %d)\n", address, len);
+        return -1;
+    }
+
+    return 0;
+}
+
+int pt_bar_offset_to_index(uint32_t offset)
+{
+    int index = 0;
+
+    /* check Exp ROM BAR */
+    if (offset == PCI_ROM_ADDRESS) {
+        return PCI_ROM_SLOT;
+    }
+
+    /* calculate BAR index */
+    index = (offset - PCI_BASE_ADDRESS_0) >> 2;
+    if (index >= PCI_NUM_REGIONS) {
+        return -1;
+    }
+
+    return index;
+}
+
+static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    uint32_t val = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    int rc = 0;
+    int emul_len = 0;
+    uint32_t find_addr = address;
+
+    if (pt_pci_config_access_check(d, address, len)) {
+        goto exit;
+    }
+
+    /* check power state transition flags */
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        /* can't accept until previous power state transition is completed.
+         * so finish previous request here.
+         */
+        PT_WARN(d, "Guest want to write during power state transition\n");
+        goto exit;
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, address);
+    if (reg_grp_entry) {
+        /* check 0 Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* no need to emulate, just return 0 */
+            val = 0;
+            goto exit;
+        }
+    }
+
+    /* read I/O device register value */
+    rc = host_pci_get_block(s->real_device, address, (uint8_t *)&val, len);
+    if (rc < 0) {
+        PT_ERR(d, "pci_read_block failed. return value: %d.\n", rc);
+        memset(&val, 0xff, len);
+    }
+
+    /* just return the I/O device register value for
+     * passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto exit;
+    }
+
+    /* adjust the read value to appropriate CFC-CFF window */
+    val <<= (address & 3) << 3;
+    emul_len = len;
+
+    /* loop around the guest requested size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            XenPTRegInfo *reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation based on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.read) {
+                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.read) {
+                    rc = reg->u.w.read(s, reg_entry,
+                                       (uint16_t *)ptr_val, valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.read) {
+                    rc = reg->u.dw.read(s, reg_entry,
+                                        (uint32_t *)ptr_val, valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                xen_shutdown_fatal_error("Internal error: Invalid read "
+                                         "emulation. (%s, rc: %d)\n",
+                                         __func__, rc);
+                return 0;
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before returning them to pci bus emulator */
+    val >>= ((address & 3) << 3);
+
+exit:
+    PT_LOG_CONFIG(d, address, val, len);
+    return val;
+}
+
+static void pt_pci_write_config(PCIDevice *d, uint32_t address,
+                                uint32_t val, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    int index = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    int rc = 0;
+    uint32_t read_val = 0;
+    int emul_len = 0;
+    XenPTReg *reg_entry = NULL;
+    uint32_t find_addr = address;
+    XenPTRegInfo *reg = NULL;
+
+    if (pt_pci_config_access_check(d, address, len)) {
+        return;
+    }
+
+    PT_LOG_CONFIG(d, address, val, len);
+
+    /* check unused BAR register */
+    index = pt_bar_offset_to_index(address);
+    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
+        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
+        PT_WARN(d, "Guest attempt to set address to unused Base Address "
+                "Register. (addr: 0x%02x, len: %d)\n", address, len);
+    }
+
+    /* check power state transition flags */
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        /* can't accept until previous power state transition is completed.
+         * so finish previous request here.
+         */
+        PT_WARN(d, "Guest want to write during power state transition\n");
+        return;
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, address);
+    if (reg_grp_entry) {
+        /* check 0 Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* ignore silently */
+            PT_WARN(d, "Access to 0 Hardwired register. "
+                    "(addr: 0x%02x, len: %d)\n", address, len);
+            return;
+        }
+    }
+
+    /* read I/O device register value */
+    rc = host_pci_get_block(s->real_device, address,
+                             (uint8_t *)&read_val, len);
+    if (rc < 0) {
+        PT_ERR(d, "pci_read_block failed. return value: %d.\n", rc);
+        memset(&read_val, 0xff, len);
+    }
+
+    /* pass directly to the real device for passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto out;
+    }
+
+    /* adjust the read and write value to appropriate CFC-CFF window */
+    read_val <<= (address & 3) << 3;
+    val <<= (address & 3) << 3;
+    emul_len = len;
+
+    /* loop around the guest requested size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation based on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.write) {
+                    rc = reg->u.b.write(s, reg_entry, ptr_val,
+                                        read_val >> ((real_offset & 3) << 3),
+                                        valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.write) {
+                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
+                                        (read_val >> ((real_offset & 3) << 3)),
+                                        valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.write) {
+                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
+                                         (read_val >> ((real_offset & 3) << 
3)),
+                                         valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                xen_shutdown_fatal_error("Internal error: Invalid write"
+                                         " emulation. (%s, rc: %d)\n",
+                                         __func__, rc);
+                return;
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before passing them to host_pci_device */
+    val >>= (address & 3) << 3;
+
+out:
+    if (!(reg && reg->no_wb)) {
+        /* unknown regs are passed through */
+        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
+
+        if (rc < 0) {
+            PT_ERR(d, "pci_write_block failed. return value: %d.\n", rc);
+        }
+    }
+}
+
+/* ioport/iomem space*/
+static void pt_iomem_map(XenPCIPassthroughState *s, int i,
+                         pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    PCIIORegion *r = &s->dev.io_regions[i];
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int ret = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG(&s->dev, "e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%d"
+           " len=%#"PRIx64" index=%d first_map=%d\n",
+           e_phys, s->bases[i].access.maddr, type,
+           e_size, i, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
+        /* Remove old mapping */
+        memory_region_del_subregion(r->address_space,
+                                    r->memory);
+        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
+                               old_ebase >> XC_PAGE_SHIFT,
+                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
+                               DPCI_REMOVE_MAPPING);
+        if (ret != 0) {
+            PT_ERR(&s->dev, "remove old mapping failed!\n");
+            return;
+        }
+    }
+
+    /* map only valid guest address */
+    if (e_phys != PCI_BAR_UNMAPPED) {
+        /* Create new mapping */
+        memory_region_add_subregion_overlap(r->address_space,
+                                            e_phys, r->memory, 1);
+        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
+                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
+                                   DPCI_ADD_MAPPING);
+
+        if (ret != 0) {
+            PT_ERR(&s->dev, "create new mapping failed!\n");
+        }
+    }
+}
+
+static void pt_ioport_map(XenPCIPassthroughState *s, int i,
+                          pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    PCIIORegion *r = &s->dev.io_regions[i];
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int ret = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG(&s->dev, "e_phys=%#04"PRIx64" pio_base=%#04"PRIx64" len=%"PRId64
+           " index=%d first_map=%d\n",
+           e_phys, s->bases[i].access.pio_base, e_size, i, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
+        /* Remove old mapping */
+        memory_region_del_subregion(r->address_space,
+                                    r->memory);
+        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
+                                       s->bases[i].access.pio_base, e_size,
+                                       DPCI_REMOVE_MAPPING);
+        if (ret != 0) {
+            PT_ERR(&s->dev, "remove old mapping failed!\n");
+            return;
+        }
+    }
+
+    /* map only valid guest address (include 0) */
+    if (e_phys != PCI_BAR_UNMAPPED) {
+        /* Create new mapping */
+        memory_region_add_subregion_overlap(r->address_space,
+                                            e_phys, r->memory, 1);
+        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
+                                       s->bases[i].access.pio_base, e_size,
+                                       DPCI_ADD_MAPPING);
+        if (ret != 0) {
+            PT_ERR(&s->dev, "create new mapping failed!\n");
+        }
+    }
+
+}
+
+
+/* mapping BAR */
+
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable)
+{
+    PCIDevice *dev = &s->dev;
+    PCIIORegion *r;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    pcibus_t r_size = 0, r_addr = PCI_BAR_UNMAPPED;
+    int rc = 0;
+
+    r = &dev->io_regions[bar];
+
+    /* check valid region */
+    if (!r->size) {
+        return;
+    }
+
+    base = &s->bases[bar];
+    /* skip unused BAR or upper 64bit BAR */
+    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
+        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
+           return;
+    }
+
+    /* copy region address to temporary */
+    r_addr = r->addr;
+
+    /* need unmapping in case I/O Space or Memory Space disable */
+    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
+        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
+        r_addr = PCI_BAR_UNMAPPED;
+    }
+    if ((bar == PCI_ROM_SLOT) && (r_addr != PCI_BAR_UNMAPPED)) {
+        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
+        if (reg_grp_entry) {
+            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
+            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
+                r_addr = PCI_BAR_UNMAPPED;
+            }
+        }
+    }
+
+    /* prevent guest software mapping memory resource to 00000000h */
+    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
+        r_addr = PCI_BAR_UNMAPPED;
+    }
+
+    r_size = pt_get_emul_size(base->bar_flag, r->size);
+
+    rc = pci_check_bar_overlap(dev, r_addr, r_size, r->type);
+    if (rc) {
+        PT_WARN(dev, "Region: %d (addr: %#"FMT_PCIBUS
+                ", len: %#"FMT_PCIBUS") is overlapped.\n",
+                bar, r_addr, r_size);
+    }
+
+    /* check whether we need to update the mapping or not */
+    if (r_addr != s->bases[bar].e_physbase) {
+        /* mapping BAR */
+        if (base->bar_flag == PT_BAR_FLAG_IO) {
+            pt_ioport_map(s, bar, r_addr, r_size, r->type);
+        } else {
+            pt_iomem_map(s, bar, r_addr, r_size, r->type);
+        }
+    }
+}
+
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        pt_bar_mapping_one(s, i, io_enable, mem_enable);
+    }
+}
+
+static uint64_t bar_read(void *o, target_phys_addr_t addr, unsigned size)
+{
+    PCIDevice *d = o;
+    PT_ERR(d, "Should not read BAR through QEMU. @0x"TARGET_FMT_plx"\n", addr);
+    return 0;
+}
+static void bar_write(void *o, target_phys_addr_t addr,
+                      uint64_t data, unsigned size)
+{
+    PCIDevice* d = o;
+    PT_ERR(d, "Should not write BAR through QEMU. @0x"TARGET_FMT_plx"\n", 
addr);
+}
+
+static const MemoryRegionOps ops = {
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .read = bar_read,
+    .write = bar_write,
+};
+
+/* register regions */
+static int pt_register_regions(XenPCIPassthroughState *s)
+{
+    int i = 0;
+    uint32_t bar_data = 0;
+    HostPCIDevice *d = s->real_device;
+
+    /* Register PIO/MMIO BARs */
+    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
+        HostPCIIORegion *r = &d->io_regions[i];
+
+        if (r->base_addr && r->size) {
+            s->bases[i].e_physbase = r->base_addr;
+            s->bases[i].access.u = r->base_addr;
+
+            /* Register current region */
+            if (r->flags & IORESOURCE_IO) {
+                memory_region_init_io(&s->bar[i], &ops, &s->dev,
+                                      "xen-pci-pt-bar-io", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
+                                 &s->bar[i]);
+            } else if (r->flags & IORESOURCE_PREFETCH) {
+                memory_region_init_io(&s->bar[i], &ops, &s->dev,
+                                      "xen-pci-pt-bar-mem", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
+                                 &s->bar[i]);
+            } else {
+                memory_region_init_io(&s->bar[i], &ops, &s->dev,
+                                      "xen-pci-pt-bar-mem", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
+                                 &s->bar[i]);
+            }
+
+            PT_LOG(&s->dev, "IO region registered (size=0x%08"PRIx64
+                   " base_addr=0x%08"PRIx64")\n",
+                   r->size, r->base_addr);
+        }
+    }
+
+    /* Register expansion ROM address */
+    if (d->rom.base_addr && d->rom.size) {
+        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
+        if (host_pci_get_long(d, PCI_ROM_ADDRESS, &bar_data)) {
+            return 0;
+        }
+        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
+            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
+            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
+        }
+
+        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
+        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
+
+        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
+                                      "xen-pci-pt-rom", d->rom.size);
+        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
+                         &s->rom);
+
+        PT_LOG(&s->dev, "Expansion ROM registered (size=0x%08"PRIx64
+               " base_addr=0x%08"PRIx64")\n",
+               d->rom.size, d->rom.base_addr);
+    }
+
+    return 0;
+}
+
+static void pt_unregister_regions(XenPCIPassthroughState *s)
+{
+    int i, type, rc;
+    uint32_t e_size;
+    PCIDevice *d = &s->dev;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        e_size = s->bases[i].e_size;
+        if ((e_size == 0) || (s->bases[i].e_physbase == PT_PCI_BAR_UNMAPPED)) {
+            continue;
+        }
+
+        type = d->io_regions[i].type;
+
+        if (type == PCI_BASE_ADDRESS_SPACE_MEMORY
+            || type == PCI_BASE_ADDRESS_MEM_PREFETCH) {
+            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                    (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
+                    DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_ERR(d, "remove old mem mapping failed!\n");
+                continue;
+            }
+
+        } else if (type == PCI_BASE_ADDRESS_SPACE_IO) {
+            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
+                        s->bases[i].e_physbase,
+                        s->bases[i].access.pio_base,
+                        e_size,
+                        DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_ERR(d, "remove old io mapping failed!\n");
+                continue;
+            }
+        }
+    }
+}
+
+static int pt_initfn(PCIDevice *pcidev)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
+    int dom, bus;
+    unsigned slot, func;
+    int rc = 0;
+    uint8_t machine_irq = 0;
+    int pirq = PT_UNASSIGNED_PIRQ;
+
+    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
+        PT_ERR(pcidev, "Failed to parse BDF: %s\n", s->hostaddr);
+        return -1;
+    }
+
+    /* register real device */
+    PT_LOG(pcidev, "Assigning real physical device %02x:%02x.%x"
+           " to devfn %#x\n", bus, slot, func, s->dev.devfn);
+
+    s->real_device = host_pci_device_get(bus, slot, func);
+    if (!s->real_device) {
+        return -1;
+    }
+
+    s->is_virtfn = s->real_device->is_virtfn;
+    if (s->is_virtfn) {
+        PT_LOG(pcidev, "%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
+               s->real_device->domain, bus, slot, func);
+    }
+
+    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
+    if (host_pci_get_block(s->real_device, 0, pcidev->config,
+                           PCI_CONFIG_SPACE_SIZE) == -1) {
+        host_pci_device_put(s->real_device);
+        return -1;
+    }
+
+    /* Handle real device's MMIO/PIO BARs */
+    pt_register_regions(s);
+
+    /* Bind interrupt */
+    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
+        PT_LOG(pcidev, "no pin interrupt\n");
+        goto out;
+    }
+
+    host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE, &machine_irq);
+    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
+
+    if (rc < 0) {
+        PT_ERR(pcidev, "Mapping machine irq %u to pirq %i failed, (rc: %d)\n",
+               machine_irq, pirq, rc);
+
+        /* Disable PCI intx assertion (turn on bit10 of devctl) */
+        host_pci_set_word(s->real_device,
+                          PCI_COMMAND,
+                          pci_get_word(s->dev.config + PCI_COMMAND)
+                          | PCI_COMMAND_INTX_DISABLE);
+        machine_irq = 0;
+        s->machine_irq = 0;
+    } else {
+        machine_irq = pirq;
+        s->machine_irq = pirq;
+        mapped_machine_irq[machine_irq]++;
+    }
+
+    /* bind machine_irq to device */
+    if (rc < 0 && machine_irq != 0) {
+        uint8_t e_device = PCI_SLOT(s->dev.devfn);
+        uint8_t e_intx = pci_intx(s);
+
+        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
+                                       e_device, e_intx);
+        if (rc < 0) {
+            PT_ERR(pcidev, "Binding of interrupt %i failed! (rc: %d)\n",
+                   e_intx, rc);
+
+            /* Disable PCI intx assertion (turn on bit10 of devctl) */
+            host_pci_set_word(s->real_device, PCI_COMMAND,
+                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
+                              | PCI_COMMAND_INTX_DISABLE);
+            mapped_machine_irq[machine_irq]--;
+
+            if (mapped_machine_irq[machine_irq] == 0) {
+                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
+                    PT_ERR(pcidev, "Unmapping of machine interrupt %i failed!"
+                           " (rc: %d)\n", machine_irq, rc);
+                }
+            }
+            s->machine_irq = 0;
+        }
+    }
+
+out:
+    PT_LOG(pcidev, "Real physical device %02x:%02x.%x registered successfuly!"
+           "\nIRQ type = %s\n", bus, slot, func, "INTx");
+
+    return 0;
+}
+
+static int pt_unregister_device(PCIDevice *pcidev)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
+    uint8_t e_device, e_intx;
+    uint8_t machine_irq;
+    int rc;
+
+    /* Unbind interrupt */
+    e_device = PCI_SLOT(s->dev.devfn);
+    e_intx = pci_intx(s);
+    machine_irq = s->machine_irq;
+
+    if (machine_irq) {
+        rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
+                                     PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0);
+        if (rc < 0) {
+            PT_ERR(pcidev, "Unbinding of interrupt failed! rc=%d\n", rc);
+        }
+    }
+
+    if (machine_irq) {
+        mapped_machine_irq[machine_irq]--;
+
+        if (mapped_machine_irq[machine_irq] == 0) {
+            rc = xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq);
+
+            if (rc < 0) {
+                PT_ERR(pcidev, "Unmaping of interrupt failed! rc=%d\n", rc);
+            }
+        }
+    }
+
+    /* unregister real device's MMIO/PIO BARs */
+    pt_unregister_regions(s);
+
+    host_pci_device_put(s->real_device);
+
+    return 0;
+}
+
+static PCIDeviceInfo xen_pci_passthrough = {
+    .init = pt_initfn,
+    .exit = pt_unregister_device,
+    .qdev.name = "xen-pci-passthrough",
+    .qdev.desc = "Assign an host pci device with Xen",
+    .qdev.size = sizeof(XenPCIPassthroughState),
+    .config_read = pt_pci_read_config,
+    .config_write = pt_pci_write_config,
+    .is_express = 0,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
+        DEFINE_PROP_BIT("power-mgmt", XenPCIPassthroughState, power_mgmt,
+                        0, false),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void xen_passthrough_register(void)
+{
+    pci_qdev_register(&xen_pci_passthrough);
+}
+
+device_init(xen_passthrough_register);
diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
new file mode 100644
index 0000000..110325c
--- /dev/null
+++ b/hw/xen_pci_passthrough.h
@@ -0,0 +1,282 @@
+#ifndef QEMU_HW_XEN_PCI_PASSTHROUGH_H
+#  define QEMU_HW_XEN_PCI_PASSTHROUGH_H
+
+#include "qemu-common.h"
+#include "xen_common.h"
+#include "pci.h"
+#include "host-pci-device.h"
+
+/* #define PT_LOGGING_ENABLED */
+/* #define PT_DEBUG_PCI_CONFIG_ACCESS */
+
+void pt_log(const PCIDevice *d, const char *f, ...) GCC_FMT_ATTR(2, 3);
+
+#define PT_ERR(d, _f, _a...)  pt_log(d, "%s: Error: " _f, __func__, ##_a)
+
+#ifdef PT_LOGGING_ENABLED
+#  define PT_LOG(d, _f, _a...)  pt_log(d, "%s: " _f, __func__, ##_a)
+#  define PT_WARN(d, _f, _a...) pt_log(d, "%s: Warning: " _f, __func__, ##_a)
+#else
+#  define PT_LOG(d, _f, _a...)
+#  define PT_WARN(d, _f, _a...)
+#endif
+
+#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
+#  define PT_LOG_CONFIG(d, addr, val, len) \
+    pt_log(d, "%s: address=0x%04x val=0x%08x len=%d\n", \
+           __func__, addr, val, len)
+#else
+#  define PT_LOG_CONFIG(d, addr, val, len)
+#endif
+
+
+typedef struct XenPTRegInfo XenPTRegInfo;
+typedef struct XenPTReg XenPTReg;
+
+typedef struct XenPCIPassthroughState XenPCIPassthroughState;
+
+/* function type for config reg */
+typedef int (*conf_reg_init)
+    (XenPCIPassthroughState *, XenPTRegInfo *, uint32_t real_offset,
+     uint32_t *data);
+typedef int (*conf_dword_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t dev_value, uint32_t valid_mask);
+typedef int (*conf_word_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t dev_value, uint16_t valid_mask);
+typedef int (*conf_byte_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t dev_value, uint8_t valid_mask);
+typedef int (*conf_dword_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t valid_mask);
+typedef int (*conf_word_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t valid_mask);
+typedef int (*conf_byte_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t valid_mask);
+typedef int (*conf_dword_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint32_t dev_value, uint32_t *val);
+typedef int (*conf_word_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint16_t dev_value, uint16_t *val);
+typedef int (*conf_byte_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint8_t dev_value, uint8_t *val);
+
+/* power state transition */
+#define PT_FLAG_TRANSITING  0x0001
+
+#define PT_BAR_ALLF         0xFFFFFFFF  /* BAR ALLF value */
+#define PT_PCI_BAR_UNMAPPED (-1)
+#define PT_UNASSIGNED_PIRQ (-1)
+
+
+typedef enum {
+    GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
+    GRP_TYPE_EMU,                               /* emul reg group */
+} RegisterGroupType;
+
+typedef enum {
+    PT_BAR_FLAG_MEM = 0,                        /* Memory type BAR */
+    PT_BAR_FLAG_IO,                             /* I/O type BAR */
+    PT_BAR_FLAG_UPPER,                          /* upper 64bit BAR */
+    PT_BAR_FLAG_UNUSED,                         /* unused BAR */
+} PTBarFlag;
+
+
+typedef struct XenPTRegion {
+    /* Virtual phys base & size */
+    uint32_t e_physbase;
+    uint32_t e_size;
+    /* Index of region in qemu */
+    uint32_t memory_index;
+    /* BAR flag */
+    PTBarFlag bar_flag;
+    /* Translation of the emulated address */
+    union {
+        uint64_t maddr;
+        uint64_t pio_base;
+        uint64_t u;
+    } access;
+} XenPTRegion;
+
+/* XenPTRegInfo declaration
+ * - only for emulated register (either a part or whole bit).
+ * - for passthrough register that need special behavior (like interacting with
+ *   other component), set emu_mask to all 0 and specify r/w func properly.
+ * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
+ */
+
+/* emulated register infomation */
+struct XenPTRegInfo {
+    uint32_t offset;
+    uint32_t size;
+    uint32_t init_val;
+    /* reg read only field mask (ON:RO/ROS, OFF:other) */
+    uint32_t ro_mask;
+    /* reg emulate field mask (ON:emu, OFF:passthrough) */
+    uint32_t emu_mask;
+    /* no write back allowed */
+    uint32_t no_wb;
+    conf_reg_init init;
+    /* read/write/restore function pointer
+     * for double_word/word/byte size */
+    union {
+        struct {
+            conf_dword_write write;
+            conf_dword_read read;
+            conf_dword_restore restore;
+        } dw;
+        struct {
+            conf_word_write write;
+            conf_word_read read;
+            conf_word_restore restore;
+        } w;
+        struct {
+            conf_byte_write write;
+            conf_byte_read read;
+            conf_byte_restore restore;
+        } b;
+    } u;
+};
+
+/* emulated register management */
+struct XenPTReg {
+    QLIST_ENTRY(XenPTReg) entries;
+    XenPTRegInfo *reg;
+    uint32_t data;
+};
+
+typedef struct XenPTRegGroupInfo XenPTRegGroupInfo;
+
+/* emul reg group size initialize method */
+typedef int (*pt_reg_size_init_fn)
+    (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
+     uint32_t base_offset, uint8_t *size);
+
+/* emulated register group infomation */
+struct XenPTRegGroupInfo {
+    uint8_t grp_id;
+    RegisterGroupType grp_type;
+    uint8_t grp_size;
+    pt_reg_size_init_fn size_init;
+    XenPTRegInfo *emu_reg_tbl;
+};
+
+/* emul register group management table */
+typedef struct XenPTRegGroup {
+    QLIST_ENTRY(XenPTRegGroup) entries;
+    const XenPTRegGroupInfo *reg_grp;
+    uint32_t base_offset;
+    uint8_t size;
+    QLIST_HEAD(, XenPTReg) reg_tbl_list;
+} XenPTRegGroup;
+
+
+typedef struct XenPTPM {
+    QEMUTimer *pm_timer;  /* QEMUTimer struct */
+    int no_soft_reset;    /* No Soft Reset flags */
+    uint16_t flags;       /* power state transition flags */
+    uint16_t pmc_field;   /* Power Management Capabilities field */
+    int pm_delay;         /* power state transition delay */
+    uint16_t cur_state;   /* current power state */
+    uint16_t req_state;   /* requested power state */
+    uint32_t pm_base;     /* Power Management Capability reg base offset */
+    uint32_t aer_base;    /* AER Capability reg base offset */
+} XenPTPM;
+
+struct XenPCIPassthroughState {
+    PCIDevice dev;
+
+    char *hostaddr;
+    bool is_virtfn;
+    HostPCIDevice *real_device;
+    XenPTRegion bases[PCI_NUM_REGIONS]; /* Access regions */
+    QLIST_HEAD(, XenPTRegGroup) reg_grp_tbl;
+
+    uint32_t machine_irq;
+
+    uint32_t power_mgmt;
+    XenPTPM *pm_state;
+
+    MemoryRegion bar[PCI_NUM_REGIONS - 1];
+    MemoryRegion rom;
+};
+
+int pt_config_init(XenPCIPassthroughState *s);
+void pt_config_delete(XenPCIPassthroughState *s);
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable);
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable);
+XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address);
+XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address);
+int pt_bar_offset_to_index(uint32_t offset);
+
+static inline pcibus_t pt_get_emul_size(PTBarFlag flag, pcibus_t r_size)
+{
+    /* align resource size (memory type only) */
+    if (flag == PT_BAR_FLAG_MEM) {
+        return (r_size + XC_PAGE_SIZE - 1) & XC_PAGE_MASK;
+    } else {
+        return r_size;
+    }
+}
+
+/* INTx */
+/* The PCI Local Bus Specification, Rev. 3.0,
+ * Section 6.2.4 Miscellaneous Registers, pp 223
+ * outlines 5 valid values for the intertupt pin (intx).
+ *  0: For devices (or device functions) that don't use an interrupt in
+ *  1: INTA#
+ *  2: INTB#
+ *  3: INTC#
+ *  4: INTD#
+ *
+ * Xen uses the following 4 values for intx
+ *  0: INTA#
+ *  1: INTB#
+ *  2: INTC#
+ *  3: INTD#
+ *
+ * Observing that these list of values are not the same, pci_read_intx()
+ * uses the following mapping from hw to xen values.
+ * This seems to reflect the current usage within Xen.
+ *
+ * PCI hardware    | Xen | Notes
+ * ----------------+-----+----------------------------------------------------
+ * 0               | 0   | No interrupt
+ * 1               | 0   | INTA#
+ * 2               | 1   | INTB#
+ * 3               | 2   | INTC#
+ * 4               | 3   | INTD#
+ * any other value | 0   | This should never happen, log error message
+ */
+
+static inline uint8_t pci_read_intx(XenPCIPassthroughState *s)
+{
+    uint8_t v = 0;
+    host_pci_get_byte(s->real_device, PCI_INTERRUPT_PIN, &v);
+    return v;
+}
+
+static inline uint8_t pci_intx(XenPCIPassthroughState *s)
+{
+    uint8_t r_val = pci_read_intx(s);
+
+    PT_LOG(&s->dev, "intx=%i\n", r_val);
+    if (r_val < 1 || r_val > 4) {
+        PT_LOG(&s->dev, "Interrupt pin read from hardware is out of range:"
+               " value=%i, acceptable range is 1 - 4\n", r_val);
+        r_val = 0;
+    } else {
+        r_val -= 1;
+    }
+
+    return r_val;
+}
+
+#endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
diff --git a/hw/xen_pci_passthrough_config_init.c 
b/hw/xen_pci_passthrough_config_init.c
new file mode 100644
index 0000000..1e9de64
--- /dev/null
+++ b/hw/xen_pci_passthrough_config_init.c
@@ -0,0 +1,11 @@
+#include "xen_pci_passthrough.h"
+
+XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
+{
+    return NULL;
+}
+
+XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
+{
+    return NULL;
+}
diff --git a/xen-all.c b/xen-all.c
index b5e28ab..0e3bbcf 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -979,3 +979,15 @@ void destroy_hvm_domain(void)
         xc_interface_close(xc_handle);
     }
 }
+
+void xen_shutdown_fatal_error(const char *fmt, ...)
+{
+    va_list ap;
+
+    va_start(ap, fmt);
+    vfprintf(stderr, fmt, ap);
+    va_end(ap);
+    fprintf(stderr, "Will destroy the domain.\n");
+    /* destroy the domain */
+    qemu_system_shutdown_request();
+}
-- 
Anthony PERARD


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel