WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH] x86: protect MSI-X table and pending bit array from

To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH] x86: protect MSI-X table and pending bit array from guest writes
From: "Jan Beulich" <JBeulich@xxxxxxxxxx>
Date: Mon, 20 Sep 2010 14:23:51 +0100
Cc: yunhong.jiang@xxxxxxxxx
Delivery-date: Mon, 20 Sep 2010 06:27:31 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
These structures are used by Xen, and hence guests must not be able
to fiddle with them.

qemu-dm currently plays with the MSI-X table, requiring Dom0 to
still have write access. This is broken (explicitly allowing the guest
write access to the mask bit) and should be fixed in qemu-dm, at which
time Dom0 won't need any special casing anymore.

The changes are made under the assumption that p2m_mmio_direct will
only ever be used for order 0 pages.

An open question is whether dealing with pv guests (including the
IOMMU-less case) is necessary, as handling mappings a domain may
already have in place at the time the first interrupt gets set up
would require scanning all of the guest's L1 page table pages.
Currently a hole still remains allowing PV guests to map these ranges
before actually setting up any MSI-X vector for a device.

An alternative would be to determine and insert the address ranges
earlier into mmio_ro_ranges, but that would require a hook in the
PCI config space writes, which is particularly problematic in case
MMCONFIG accesses are being used.

A second alternative would be to require Dom0 to report all devices
(or at least all MSI-X capable ones) regardless of whether they would
be used by that domain, and do so after resources got determined/
assigned for them (i.e. a second notification later than the one
currently happening from the PCI bus scan would be needed).

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>
Acked-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

--- 2010-09-20.orig/xen/arch/x86/mm.c   2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm.c        2010-09-20 10:00:07.000000000 +0200
@@ -822,7 +822,13 @@ get_page_from_l1e(
             return 0;
         }
 
-        return 1;
+        if ( !(l1f & _PAGE_RW) || IS_PRIV(pg_owner) ||
+             !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
+            return 1;
+        dprintk(XENLOG_G_WARNING,
+                "d%d: Forcing read-only access to MFN %lx\n",
+                l1e_owner->domain_id, mfn);
+        return -1;
     }
 
     if ( unlikely(real_pg_owner != pg_owner) )
@@ -1178,9 +1184,15 @@ static int alloc_l1_table(struct page_in
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
     {
-        if ( is_guest_l1_slot(i) &&
-             unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
-            goto fail;
+        if ( is_guest_l1_slot(i) )
+            switch ( get_page_from_l1e(pl1e[i], d, d) )
+            {
+            case 0:
+                goto fail;
+            case -1:
+                l1e_remove_flags(pl1e[i], _PAGE_RW);
+                break;
+            }
 
         adjust_guest_l1e(pl1e[i], d);
     }
@@ -1764,8 +1776,14 @@ static int mod_l1_entry(l1_pgentry_t *pl
             return rc;
         }
 
-        if ( unlikely(!get_page_from_l1e(nl1e, pt_dom, pg_dom)) )
+        switch ( get_page_from_l1e(nl1e, pt_dom, pg_dom) )
+        {
+        case 0:
             return 0;
+        case -1:
+            l1e_remove_flags(nl1e, _PAGE_RW);
+            break;
+        }
         
         adjust_guest_l1e(nl1e, pt_dom);
         if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
@@ -4919,8 +4937,9 @@ static int ptwr_emulated_update(
 
     /* Check the new PTE. */
     nl1e = l1e_from_intpte(val);
-    if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
+    switch ( get_page_from_l1e(nl1e, d, d) )
     {
+    case 0:
         if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
              !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
         {
@@ -4939,6 +4958,10 @@ static int ptwr_emulated_update(
             MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
             return X86EMUL_UNHANDLEABLE;
         }
+        break;
+    case -1:
+        l1e_remove_flags(nl1e, _PAGE_RW);
+        break;
     }
 
     adjust_guest_l1e(nl1e, d);
--- 2010-09-20.orig/xen/arch/x86/mm/hap/p2m-ept.c       2010-09-20 
14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/hap/p2m-ept.c    2010-09-20 10:00:07.000000000 
+0200
@@ -72,9 +72,13 @@ static void ept_p2m_type_to_flags(ept_en
             entry->r = entry->w = entry->x = 0;
             return;
         case p2m_ram_rw:
-        case p2m_mmio_direct:
             entry->r = entry->w = entry->x = 1;
             return;
+        case p2m_mmio_direct:
+            entry->r = entry->x = 1;
+            entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
+                                                    entry->mfn);
+            return;
         case p2m_ram_logdirty:
         case p2m_ram_ro:
         case p2m_ram_shared:
@@ -722,6 +726,9 @@ static void ept_change_entry_type_global
     if ( ept_get_asr(d) == 0 )
         return;
 
+    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
     ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
 
     ept_sync_domain(d);
--- 2010-09-20.orig/xen/arch/x86/mm/p2m.c       2010-09-20 14:29:04.000000000 
+0200
+++ 2010-09-20/xen/arch/x86/mm/p2m.c    2010-09-20 10:00:07.000000000 +0200
@@ -72,7 +72,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
 #define SUPERPAGE_PAGES (1UL << 9)
 #define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
 
-static unsigned long p2m_type_to_flags(p2m_type_t t) 
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
 {
     unsigned long flags;
 #ifdef __x86_64__
@@ -101,7 +101,9 @@ static unsigned long p2m_type_to_flags(p
     case p2m_mmio_dm:
         return flags;
     case p2m_mmio_direct:
-        return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
+            flags |= _PAGE_RW;
+        return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     case p2m_populate_on_demand:
         return flags;
     }
@@ -1299,8 +1301,10 @@ p2m_set_entry(struct p2m_domain *p2m, un
             domain_crash(p2m->domain);
             goto out;
         }
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) 
-            ? l3e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt) | _PAGE_PSE)
+            ? l3e_from_pfn(mfn_x(mfn),
+                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
         paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
@@ -1334,7 +1338,8 @@ p2m_set_entry(struct p2m_domain *p2m, un
         ASSERT(p2m_entry);
         
         if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-            entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+            entry_content = l1e_from_pfn(mfn_x(mfn),
+                                         p2m_type_to_flags(p2mt, mfn));
         else
             entry_content = l1e_empty();
         
@@ -1358,9 +1363,11 @@ p2m_set_entry(struct p2m_domain *p2m, un
             goto out;
         }
         
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt) | _PAGE_PSE);
+                                       p2m_type_to_flags(p2mt, mfn) |
+                                       _PAGE_PSE);
         else
             l2e_content = l2e_empty();
         
@@ -2437,6 +2444,7 @@ void p2m_change_type_global(struct p2m_d
 #endif /* CONFIG_PAGING_LEVELS == 4 */
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
 
     if ( !paging_mode_translate(p2m->domain) )
         return;
@@ -2478,7 +2486,7 @@ void p2m_change_type_global(struct p2m_d
                     continue;
                 mfn = l3e_get_pfn(l3e[i3]);
                 gfn = get_gpfn_from_mfn(mfn);
-                flags = p2m_type_to_flags(nt);
+                flags = p2m_type_to_flags(nt, _mfn(mfn));
                 l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
                 paging_write_p2m_entry(p2m->domain, gfn,
                                        (l1_pgentry_t *)&l3e[i3],
@@ -2509,7 +2517,7 @@ void p2m_change_type_global(struct p2m_d
 #endif
                                )
                            * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
-                    flags = p2m_type_to_flags(nt);
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
                     paging_write_p2m_entry(p2m->domain, gfn,
                                            (l1_pgentry_t *)&l2e[i2],
@@ -2533,7 +2541,7 @@ void p2m_change_type_global(struct p2m_d
                                     )
                            * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
                     /* create a new 1le entry with the new type */
-                    flags = p2m_type_to_flags(nt);
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags);
                     paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
                                            l1mfn, l1e_content, 1);
--- 2010-09-20.orig/xen/arch/x86/mm/shadow/multi.c      2010-09-20 
14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/shadow/multi.c   2010-09-20 10:00:07.000000000 
+0200
@@ -674,7 +674,9 @@ _sh_propagate(struct vcpu *v, 
     }
 
     /* Read-only memory */
-    if ( p2m_is_readonly(p2mt) )
+    if ( p2m_is_readonly(p2mt) ||
+         (p2mt == p2m_mmio_direct &&
+          rangeset_contains_singleton(mmio_ro_ranges, mfn_x(target_mfn))) )
         sflags &= ~_PAGE_RW;
     
     // protect guest page tables
@@ -1225,15 +1227,19 @@ static int shadow_set_l1e(struct vcpu *v
         /* About to install a new reference */        
         if ( shadow_mode_refcounts(d) ) {
             TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
-            if ( shadow_get_page_from_l1e(new_sl1e, d, new_type) == 0 ) 
+            switch ( shadow_get_page_from_l1e(new_sl1e, d, new_type) )
             {
+            case 0:
                 /* Doesn't look like a pagetable. */
                 flags |= SHADOW_SET_ERROR;
                 new_sl1e = shadow_l1e_empty();
-            }
-            else
-            {
+                break;
+            case -1:
+                shadow_l1e_remove_flags(new_sl1e, _PAGE_RW);
+                /* fall through */
+            default:
                 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
+                break;
             }
         }
     } 
--- 2010-09-20.orig/xen/arch/x86/msi.c  2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/msi.c       2010-09-20 14:29:34.000000000 +0200
@@ -16,12 +16,14 @@
 #include <xen/errno.h>
 #include <xen/pci.h>
 #include <xen/pci_regs.h>
+#include <xen/iocap.h>
 #include <xen/keyhandler.h>
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
 #include <asm/msi.h>
 #include <asm/fixmap.h>
+#include <asm/p2m.h>
 #include <mach_apic.h>
 #include <io_ports.h>
 #include <public/physdev.h>
@@ -520,6 +522,43 @@ static int msi_capability_init(struct pc
     return 0;
 }
 
+static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir)
+{
+    u8 limit;
+    u32 addr;
+
+    switch ( pci_conf_read8(bus, slot, func, PCI_HEADER_TYPE) )
+    {
+    case PCI_HEADER_TYPE_NORMAL:
+        limit = 6;
+        break;
+    case PCI_HEADER_TYPE_BRIDGE:
+        limit = 2;
+        break;
+    case PCI_HEADER_TYPE_CARDBUS:
+        limit = 1;
+        break;
+    default:
+        return 0;
+    }
+
+    if ( bir >= limit )
+        return 0;
+    addr = pci_conf_read32(bus, slot, func, PCI_BASE_ADDRESS_0 + bir * 4);
+    if ( (addr & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
+        return 0;
+    if ( (addr & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == 
PCI_BASE_ADDRESS_MEM_TYPE_64 )
+    {
+        addr &= ~PCI_BASE_ADDRESS_MEM_MASK;
+        if ( ++bir >= limit )
+            return 0;
+        return addr |
+               ((u64)pci_conf_read32(bus, slot, func,
+                                     PCI_BASE_ADDRESS_0 + bir * 4) << 32);
+    }
+    return addr & ~PCI_BASE_ADDRESS_MEM_MASK;
+}
+
 /**
  * msix_capability_init - configure device's MSI-X capability
  * @dev: pointer to the pci_dev data structure of MSI-X device function
@@ -532,7 +571,8 @@ static int msi_capability_init(struct pc
  **/
 static int msix_capability_init(struct pci_dev *dev,
                                 struct msi_info *msi,
-                                struct msi_desc **desc)
+                                struct msi_desc **desc,
+                                unsigned int nr_entries)
 {
     struct msi_desc *entry;
     int pos;
@@ -587,6 +627,66 @@ static int msix_capability_init(struct p
 
     list_add_tail(&entry->list, &dev->msi_list);
 
+    if ( !dev->msix_nr_entries )
+    {
+        u64 pba_paddr;
+        u32 pba_offset;
+
+        ASSERT(!dev->msix_used_entries);
+        WARN_ON(msi->table_base != read_pci_mem_bar(bus, slot, func, bir));
+
+        dev->msix_nr_entries = nr_entries;
+        dev->msix_table.first = PFN_DOWN(table_paddr);
+        dev->msix_table.last = PFN_DOWN(table_paddr +
+                                        nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
+        WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_table.first,
+                                        dev->msix_table.last));
+
+        pba_offset = pci_conf_read32(bus, slot, func,
+                                     msix_pba_offset_reg(pos));
+        bir = (u8)(pba_offset & PCI_MSIX_BIRMASK);
+        pba_paddr = read_pci_mem_bar(bus, slot, func, bir);
+        WARN_ON(!pba_paddr);
+        pba_paddr += pba_offset & ~PCI_MSIX_BIRMASK;
+
+        dev->msix_pba.first = PFN_DOWN(pba_paddr);
+        dev->msix_pba.last = PFN_DOWN(pba_paddr +
+                                      BITS_TO_LONGS(nr_entries) - 1);
+        WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
+                                        dev->msix_pba.last));
+
+        if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
+                                dev->msix_table.last) )
+            WARN();
+        if ( rangeset_add_range(mmio_ro_ranges, dev->msix_pba.first,
+                                dev->msix_pba.last) )
+            WARN();
+
+        if ( dev->domain )
+            p2m_change_entry_type_global(p2m_get_hostp2m(dev->domain),
+                                         p2m_mmio_direct, p2m_mmio_direct);
+        if ( !dev->domain || !paging_mode_translate(dev->domain) )
+        {
+            struct domain *d = dev->domain;
+
+            if ( !d )
+                for_each_domain(d)
+                    if ( !paging_mode_translate(d) &&
+                         (iomem_access_permitted(d, dev->msix_table.first,
+                                                 dev->msix_table.last) ||
+                          iomem_access_permitted(d, dev->msix_pba.first,
+                                                 dev->msix_pba.last)) )
+                        break;
+            if ( d )
+            {
+                /* XXX How to deal with existing mappings? */
+            }
+        }
+    }
+    WARN_ON(dev->msix_nr_entries != nr_entries);
+    WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
+    ++dev->msix_used_entries;
+
     /* Mask interrupt here */
     writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
@@ -717,7 +817,7 @@ static int __pci_enable_msix(struct msi_
 
     }
 
-    status = msix_capability_init(pdev, msi, desc);
+    status = msix_capability_init(pdev, msi, desc, nr_entries);
     return status;
 }
 
@@ -742,6 +842,16 @@ static void __pci_disable_msix(struct ms
     writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
     pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
+
+    if ( !--dev->msix_used_entries )
+    {
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
+                                  dev->msix_table.last) )
+            WARN();
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
+                                   dev->msix_pba.last) )
+            WARN();
+    }
 }
 
 /*
--- 2010-09-20.orig/xen/drivers/passthrough/io.c        2010-09-20 
14:29:04.000000000 +0200
+++ 2010-09-20/xen/drivers/passthrough/io.c     2010-09-20 10:00:07.000000000 
+0200
@@ -26,6 +26,8 @@
 #include <xen/hvm/irq.h>
 #include <xen/tasklet.h>
 
+struct rangeset *__read_mostly mmio_ro_ranges;
+
 static void hvm_dirq_assist(unsigned long _d);
 
 bool_t pt_irq_need_timer(uint32_t flags)
@@ -565,3 +567,11 @@ void hvm_dpci_eoi(struct domain *d, unsi
 unlock:
     spin_unlock(&d->event_lock);
 }
+
+static int __init setup_mmio_ro_ranges(void)
+{
+    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
+                                  RANGESETF_prettyprint_hex);
+    return 0;
+}
+__initcall(setup_mmio_ro_ranges);
--- 2010-09-20.orig/xen/include/xen/iommu.h     2010-09-20 14:29:04.000000000 
+0200
+++ 2010-09-20/xen/include/xen/iommu.h  2010-09-20 10:00:07.000000000 +0200
@@ -31,6 +31,8 @@ extern bool_t force_iommu, iommu_verbose
 extern bool_t iommu_workaround_bios_bug, iommu_passthrough;
 extern bool_t iommu_snoop, iommu_qinval, iommu_intremap;
 
+extern struct rangeset *mmio_ro_ranges;
+
 #define domain_hvm_iommu(d)     (&d->arch.hvm_domain.hvm_iommu)
 
 #define MAX_IOMMUS 32
--- 2010-09-20.orig/xen/include/xen/pci.h       2010-09-20 14:29:04.000000000 
+0200
+++ 2010-09-20/xen/include/xen/pci.h    2010-09-20 10:00:07.000000000 +0200
@@ -45,6 +45,10 @@ struct pci_dev {
     struct list_head domain_list;
 
     struct list_head msi_list;
+    unsigned int msix_nr_entries, msix_used_entries;
+    struct {
+        unsigned long first, last;
+    } msix_table, msix_pba;
     int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
     int msix_table_idx[MAX_MSIX_TABLE_PAGES];
     spinlock_t msix_table_lock;


Attachment: x86-msix-protect-table-and-pba.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>