To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] [XEN] Prefetch multiple shadow entries per pagefault
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 02 Nov 2006 22:08:16 +0000
Delivery-date: Thu, 02 Nov 2006 21:39:52 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
# Node ID 0b6f49d25d4fbb0c3ab6cd1f2ac9dd32ed3d1ff9
# Parent  77e1baf0a5679a155c1f2ccdf3328bf476bf0c8b
[XEN] Prefetch multiple shadow entries per pagefault
Also, clean up the shadow *_propagate/fault routines.
This allows us to quickly dispatch some guest-not-present faults
and most MMIO accesses without taking the shadow lock.
Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
---
 xen/arch/x86/mm/shadow/common.c  |   12 
 xen/arch/x86/mm/shadow/multi.c   |  675 ++++++++++++++++++++-------------------
 xen/arch/x86/mm/shadow/private.h |  117 ------
 xen/arch/x86/mm/shadow/types.h   |   71 ++++
 xen/include/asm-x86/perfc_defn.h |    3 
 xen/include/asm-x86/shadow.h     |    6 
 6 files changed, 446 insertions(+), 438 deletions(-)
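
For readers unfamiliar with the fast-path encoding this patch introduces: the sketch below is not part of the patch, but shows roughly how the SH_L1E_MMIO_* constants added to types.h pack a guest frame number into a shadow l1e whose reserved high bits are set, so the fault handler can recognise and decode an MMIO fault without walking the guest tables or taking the shadow lock. The shadow_l1e_t stand-in and the PAGE_RW/PAGE_USER values are simplifications for illustration only.

#include <stdio.h>
#include <stdint.h>

typedef struct { uint64_t l1; } shadow_l1e_t;   /* stand-in for Xen's PAE/64-bit type */

#define SH_L1E_MMIO_MAGIC       0xffffffff00000001ULL
#define SH_L1E_MMIO_MAGIC_MASK  0xffffffff00000009ULL
#define SH_L1E_MMIO_GFN_MASK    0x00000000fffffff0ULL
#define SH_L1E_MMIO_GFN_SHIFT   4
#define PAGE_RW                 0x2   /* stand-in for _PAGE_RW */
#define PAGE_USER               0x4   /* stand-in for _PAGE_USER */

/* Encode a gfn into an entry with reserved high bits set: hardware faults
 * on it, and the handler can pull the gfn straight back out. */
static shadow_l1e_t sh_l1e_mmio(uint64_t gfn, uint32_t gflags)
{
    shadow_l1e_t e = { SH_L1E_MMIO_MAGIC
                       | (gfn << SH_L1E_MMIO_GFN_SHIFT)
                       | (gflags & (PAGE_USER | PAGE_RW)) };
    return e;
}

static int sh_l1e_is_mmio(shadow_l1e_t e)
{
    return (e.l1 & SH_L1E_MMIO_MAGIC_MASK) == SH_L1E_MMIO_MAGIC;
}

static uint64_t sh_l1e_mmio_get_gfn(shadow_l1e_t e)
{
    return (e.l1 & SH_L1E_MMIO_GFN_MASK) >> SH_L1E_MMIO_GFN_SHIFT;
}

int main(void)
{
    shadow_l1e_t e = sh_l1e_mmio(0xfee00ULL, PAGE_RW);   /* example gfn */
    printf("entry=%#llx mmio?=%d gfn=%#llx\n",
           (unsigned long long)e.l1, sh_l1e_is_mmio(e),
           (unsigned long long)sh_l1e_mmio_get_gfn(e));
    return 0;
}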

diff -r 77e1baf0a567 -r 0b6f49d25d4f xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Wed Nov 01 10:02:00 2006 +0000
+++ b/xen/arch/x86/mm/shadow/common.c   Wed Nov 01 10:31:11 2006 +0000
@@ -1327,8 +1327,18 @@ static void sh_hash_audit_bucket(struct 
              && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
              && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
         {
+            struct page_info *gpg = mfn_to_page(_mfn(e->n));
             /* Bad shadow flags on guest page? */
-            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
+            BUG_ON( !(gpg->shadow_flags & (1<<e->t)) );
+            /* Bad type count on guest page? */
+            if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
+                 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+            {
+                SHADOW_ERROR("MFN %#"SH_PRI_mfn" shadowed (by %#"SH_PRI_mfn")"
+                             " but has typecount %#lx\n",
+                             e->n, mfn_x(e->smfn), gpg->u.inuse.type_info);
+                BUG();
+            }
         }
         /* That entry was OK; on we go */
         e = e->next;
diff -r 77e1baf0a567 -r 0b6f49d25d4f xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Wed Nov 01 10:02:00 2006 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Wed Nov 01 10:31:11 2006 +0000
@@ -36,10 +36,7 @@
 #include "private.h"
 #include "types.h"
 
-/* The first cut: an absolutely synchronous, trap-and-emulate version,
- * supporting only HVM guests (and so only "external" shadow mode). 
- *
- * THINGS TO DO LATER:
+/* THINGS TO DO LATER:
  * 
  * TEARDOWN HEURISTICS
  * Also: have a heuristic for when to destroy a previous paging-mode's 
@@ -55,14 +52,6 @@
  * map_domain_page() version is OK on PAE, we could maybe allow a lightweight 
  * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
  * to share l2h pages again. 
- *
- * PAE L3 COPYING
- * In this code, we copy all 32 bytes of a PAE L3 every time we change an 
- * entry in it, and every time we change CR3.  We copy it for the linear 
- * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
- * buffer so it fits in CR3.  Maybe we can avoid some of this recopying 
- * by using the shadow directly in some places. 
- * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending.
  *
  * GUEST_WALK_TABLES TLB FLUSH COALESCE
  * guest_walk_tables can do up to three remote TLB flushes as it walks to
@@ -98,9 +87,6 @@ static char *fetch_type_names[] = {
     [ft_demand_write] "demand write",
 };
 #endif
-
-/* XXX forward declarations */
-static inline void sh_update_linear_entries(struct vcpu *v);
 
 /**************************************************************************/
 /* Hash table mapping from guest pagetables to shadows
@@ -460,16 +446,20 @@ static u32 guest_set_ad_bits(struct vcpu
     u32 flags;
     int res = 0;
 
+    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
+    ASSERT(level <= GUEST_PAGING_LEVELS);
+    ASSERT(shadow_lock_is_acquired(v->domain));
+
+    flags = guest_l1e_get_flags(*ep);
+
+    /* Only set A and D bits for guest-initiated accesses */
+    if ( !(ft & FETCH_TYPE_DEMAND) )
+        return flags;
+
     ASSERT(valid_mfn(gmfn)
            && (sh_mfn_is_a_page_table(gmfn)
                || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) 
                    == 0)));
-    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
-    ASSERT(level <= GUEST_PAGING_LEVELS);
-    ASSERT(ft == ft_demand_read || ft == ft_demand_write);
-    ASSERT(shadow_lock_is_acquired(v->domain));
-
-    flags = guest_l1e_get_flags(*ep);
 
     /* PAE l3s do not have A and D bits */
     ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
@@ -496,12 +486,20 @@ static u32 guest_set_ad_bits(struct vcpu
     /* Set the bit(s) */
     sh_mark_dirty(v->domain, gmfn);
     SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
-                  "old flags = %#x, new flags = %#x\n", 
-                  gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep), flags);
+                 "old flags = %#x, new flags = %#x\n", 
+                 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep), 
+                 flags);
     *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
     
-    /* Propagate this change to any existing shadows */
-    res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
+    /* Propagate this change to any other shadows of the page 
+     * (only necessary if there is more than one shadow) */
+    if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
+    {
+        u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
+        /* More than one type bit set in shadow-flags? */
+        if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
+            res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
+    }
 
     /* We should never need to flush the TLB or recopy PAE entries */
     ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
@@ -637,78 +635,69 @@ shadow_l4_index(mfn_t *smfn, u32 guest_i
 
 
 /**************************************************************************/
-/* Functions which compute shadow entries from their corresponding guest
- * entries.
- *
- * These are the "heart" of the shadow code.
- *
- * There are two sets of these: those that are called on demand faults (read
- * faults and write faults), and those that are essentially called to
- * "prefetch" (or propagate) entries from the guest into the shadow.  The read
- * fault and write fault are handled as two separate cases for L1 entries (due
- * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
- * into the respective demand_fault functions.
+/* Function which computes shadow entries from their corresponding guest
+ * entries.  This is the "heart" of the shadow code. It operates using
+ * level-1 shadow types, but handles all levels of entry.
+ * Don't call it directly, but use the four wrappers below.
  */
-// The function below tries to capture all of the flag manipulation for the
-// demand and propagate functions into one place.
-//
-static always_inline u32
-sh_propagate_flags(struct vcpu *v, mfn_t target_mfn, 
-                    u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, 
-                    int mmio, int level, fetch_type_t ft)
-{
-#define CHECK(_cond)                                    \
-do {                                                    \
-    if (unlikely(!(_cond)))                             \
-    {                                                   \
-        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
-               __func__, __FILE__, __LINE__, #_cond);   \
-        domain_crash(d);                                \
-    }                                                   \
-} while (0);
-
+
+static always_inline void
+_sh_propagate(struct vcpu *v, 
+              void *guest_entry_ptr, 
+              mfn_t guest_table_mfn, 
+              mfn_t target_mfn, 
+              void *shadow_entry_ptr,
+              int level,
+              fetch_type_t ft, 
+              int mmio)
+{
+    guest_l1e_t *gp = guest_entry_ptr;
+    shadow_l1e_t *sp = shadow_entry_ptr;
     struct domain *d = v->domain;
     u32 pass_thru_flags;
-    u32 sflags;
+    u32 gflags, sflags;
 
     /* We don't shadow PAE l3s */
     ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
 
-    // XXX -- might want to think about PAT support for HVM guests...
-
-#ifndef NDEBUG
-    // MMIO can only occur from L1e's
-    //
-    if ( mmio )
-        CHECK(level == 1);
-
-    // We should always have a pointer to the guest entry if it's a non-PSE
-    // non-MMIO demand access.
-    if ( ft & FETCH_TYPE_DEMAND )
-        CHECK(guest_entry_ptr || level == 1);
-#endif
-
-    // A not-present guest entry has a special signature in the shadow table,
-    // so that we do not have to consult the guest tables multiple times...
-    //
+    if ( valid_mfn(guest_table_mfn) )
+        /* Handle A and D bit propagation into the guest */
+        gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
+    else 
+    {
+        /* Must be an fl1e or a prefetch */
+        ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
+        gflags = guest_l1e_get_flags(*gp);
+    }
+
     if ( unlikely(!(gflags & _PAGE_PRESENT)) )
-        return _PAGE_SHADOW_GUEST_NOT_PRESENT;
-
-    // Must have a valid target_mfn, unless this is mmio, or unless this is a
-    // prefetch.  In the case of a prefetch, an invalid mfn means that we can
-    // not usefully shadow anything, and so we return early.
+    {
+        /* If a guest l1 entry is not present, shadow with the magic 
+         * guest-not-present entry. */
+        if ( level == 1 )
+            *sp = sh_l1e_gnp();
+        else 
+            *sp = shadow_l1e_empty();
+        goto done;
+    }
+
+    if ( level == 1 && mmio )
+    {
+        /* Guest l1e maps MMIO space */
+        *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
+        goto done;
+    }
+
+    // Must have a valid target_mfn, unless this is a prefetch.  In the
+    // case of a prefetch, an invalid mfn means that we can not usefully
+    // shadow anything, and so we return early.
     //
     if ( !valid_mfn(target_mfn) )
     {
-        CHECK((ft == ft_prefetch) || mmio);
-        if ( !mmio )
-            return 0;
-    }
-
-    // Set the A and D bits in the guest entry, if we need to.
-    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
-        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
-    
+        ASSERT((ft == ft_prefetch));
+        *sp = shadow_l1e_empty();
+        goto done;
+    }
 
     // Propagate bits from the guest to the shadow.
     // Some of these may be overwritten, below.
@@ -719,12 +708,7 @@ do {                                    
                        _PAGE_RW | _PAGE_PRESENT);
     if ( guest_supports_nx(v) )
         pass_thru_flags |= _PAGE_NX_BIT;
-    sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
-
-    // Copy the guest's RW bit into the SHADOW_RW bit.
-    //
-    if ( gflags & _PAGE_RW )
-        sflags |= _PAGE_SHADOW_RW;
+    sflags = gflags & pass_thru_flags;
 
     // Set the A&D bits for higher level shadows.
     // Higher level entries do not, strictly speaking, have dirty bits, but
@@ -750,49 +734,35 @@ do {                                    
                   && !(gflags & _PAGE_DIRTY)) )
         sflags &= ~_PAGE_RW;
 
-    // MMIO caching
+    // shadow_mode_log_dirty support
     //
-    // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
-    // to cache the fact that this entry  is in MMIO space.
+    // Only allow the guest write access to a page a) on a demand fault,
+    // or b) if the page is already marked as dirty.
     //
-    if ( (level == 1) && mmio )
-    {
-        sflags &= ~(_PAGE_PRESENT);
-        sflags |= _PAGE_SHADOW_MMIO;
-    }
-    else 
-    {
-        // shadow_mode_log_dirty support
-        //
-        // Only allow the guest write access to a page a) on a demand fault,
-        // or b) if the page is already marked as dirty.
-        //
-        if ( unlikely((level == 1) &&
-                      !(ft & FETCH_TYPE_WRITE) &&
-                      shadow_mode_log_dirty(d) &&
-                      !sh_mfn_is_dirty(d, target_mfn)) )
-        {
+    if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
+    {
+        if ( ft & FETCH_TYPE_WRITE ) 
+            sh_mark_dirty(d, target_mfn);
+        else if ( !sh_mfn_is_dirty(d, target_mfn) )
             sflags &= ~_PAGE_RW;
-        }
-        
-        // protect guest page tables
-        //
-        if ( unlikely((level == 1) &&
-                      sh_mfn_is_a_page_table(target_mfn)) )
-        {
-            if ( shadow_mode_trap_reads(d) )
-            {
-                // if we are trapping both reads & writes, then mark this page
-                // as not present...
-                //
-                sflags &= ~_PAGE_PRESENT;
-            }
-            else
-            {
-                // otherwise, just prevent any writes...
-                //
-                sflags &= ~_PAGE_RW;
-            }
+    }
+    
+    // protect guest page tables
+    //
+    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+    {
+        if ( shadow_mode_trap_reads(d) )
+        {
+            // if we are trapping both reads & writes, then mark this page
+            // as not present...
+            //
+            sflags &= ~_PAGE_PRESENT;
+        }
+        else
+        {
+            // otherwise, just prevent any writes...
+            //
+            sflags &= ~_PAGE_RW;
         }
     }
 
@@ -804,9 +774,17 @@ do {                                    
         sflags |= _PAGE_USER;
     }
 
-    return sflags;
-#undef CHECK
-}
+    *sp = shadow_l1e_from_mfn(target_mfn, sflags);
+ done:
+    SHADOW_DEBUG(PROPAGATE,
+                 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
+                 fetch_type_names[ft], level, gp->l1, sp->l1);
+}
+
+
+/* These four wrappers give us a little bit of type-safety back around the 
+ * use of void-* pointers in _sh_propagate(), and allow the compiler to 
+ * optimize out some level checks. */
 
 #if GUEST_PAGING_LEVELS >= 4
 static void
@@ -814,19 +792,10 @@ l4e_propagate_from_guest(struct vcpu *v,
                          guest_l4e_t *gl4e,
                          mfn_t gl4mfn,
                          mfn_t sl3mfn,
-                         shadow_l4e_t *sl4p,
+                         shadow_l4e_t *sl4e,
                          fetch_type_t ft)
 {
-    u32 gflags = guest_l4e_get_flags(*gl4e);
-    u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
-                                     gl4mfn, 0, 4, ft);
-
-    *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
-
-    SHADOW_DEBUG(PROPAGATE,
-                  "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n",
-                  fetch_type_names[ft], gl4e->l4, sl4p->l4);
-    ASSERT(sflags != -1);
+    _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
 }
 
 static void
@@ -834,19 +803,10 @@ l3e_propagate_from_guest(struct vcpu *v,
                          guest_l3e_t *gl3e,
                          mfn_t gl3mfn, 
                          mfn_t sl2mfn, 
-                         shadow_l3e_t *sl3p,
+                         shadow_l3e_t *sl3e,
                          fetch_type_t ft)
 {
-    u32 gflags = guest_l3e_get_flags(*gl3e);
-    u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
-                                     gl3mfn, 0, 3, ft);
-
-    *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
-
-    SHADOW_DEBUG(PROPAGATE,
-                  "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n",
-                  fetch_type_names[ft], gl3e->l3, sl3p->l3);
-    ASSERT(sflags != -1);
+    _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
 }
 #endif // GUEST_PAGING_LEVELS >= 4
 
@@ -854,95 +814,23 @@ l2e_propagate_from_guest(struct vcpu *v,
 l2e_propagate_from_guest(struct vcpu *v, 
                          guest_l2e_t *gl2e,
                          mfn_t gl2mfn,
-                         mfn_t sl1mfn, 
-                         shadow_l2e_t *sl2p,
+                         mfn_t sl1mfn,
+                         shadow_l2e_t *sl2e,
                          fetch_type_t ft)
 {
-    u32 gflags = guest_l2e_get_flags(*gl2e);
-    u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, 
-                                     gl2mfn, 0, 2, ft);
-
-    *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
-
-    SHADOW_DEBUG(PROPAGATE,
-                  "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n",
-                  fetch_type_names[ft], gl2e->l2, sl2p->l2);
-    ASSERT(sflags != -1);
-}
-
-static inline int
-l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
-               int mmio)
-/* returns 1 if emulation is required, and 0 otherwise */
-{
-    struct domain *d = v->domain;
-    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
-    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
-                                     mmio, 1, ft_demand_read);
-
-    if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) )
-    {
-        // emulation required!
-        *sl1p = shadow_l1e_empty();
-        return 1;
-    }
-
-    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
-
-    SHADOW_DEBUG(PROPAGATE,
-                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
-                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
-
-    ASSERT(sflags != -1);
-    return 0;
-}
-
-static inline int
-l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
-                int mmio)
-/* returns 1 if emulation is required, and 0 otherwise */
-{
-    struct domain *d = v->domain;
-    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
-    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
-                                     mmio, 1, ft_demand_write);
-
-    sh_mark_dirty(d, gmfn);
-
-    if ( !mmio && sh_mfn_is_a_page_table(gmfn) )
-    {
-        // emulation required!
-        *sl1p = shadow_l1e_empty();
-        return 1;
-    }
-
-    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
-
-    SHADOW_DEBUG(PROPAGATE,
-                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
-                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
-
-    ASSERT(sflags != -1);
-    return 0;
-}
-
-static inline void
-l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
+    _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
+}
+
+static void
+l1e_propagate_from_guest(struct vcpu *v, 
+                         guest_l1e_t *gl1e,
+                         mfn_t gl1mfn,
+                         mfn_t gmfn, 
+                         shadow_l1e_t *sl1e,
+                         fetch_type_t ft, 
                          int mmio)
 {
-    gfn_t gfn = guest_l1e_get_gfn(gl1e);
-    mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
-    u32 gflags = guest_l1e_get_flags(gl1e);
-    u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), 
-                                     mmio, 1, ft_prefetch);
-
-    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
-
-    SHADOW_DEBUG(PROPAGATE,
-                  "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
-                  gl1e.l1, sl1p->l1);
-
-    ASSERT(sflags != -1);
+    _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
 }
 
 
@@ -956,8 +844,6 @@ l1e_propagate_from_guest(struct vcpu *v,
  * SHADOW_SET_FLUSH   -- the caller must cause a TLB flush.
  * SHADOW_SET_ERROR   -- the input is not a valid entry (for example, if
  *                        shadow_get_page_from_l1e() fails).
- * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
- *                             copies of their PAE L3 entries re-copied.
  */
 
 static inline void safe_write_entry(void *dst, void *src) 
@@ -1041,16 +927,13 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
     int res;
     mfn_t mfn;
     struct domain *owner;
-    shadow_l1e_t sanitized_sl1e =
-        shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
-
-    //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
-    //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
+
+    ASSERT(!sh_l1e_is_magic(sl1e));
 
     if ( !shadow_mode_refcounts(d) )
         return 1;
 
-    res = get_page_from_l1e(sanitized_sl1e, d);
+    res = get_page_from_l1e(sl1e, d);
 
     // If a privileged domain is attempting to install a map of a page it does
     // not own, we let it succeed anyway.
@@ -1062,7 +945,7 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
          (owner = page_get_owner(mfn_to_page(mfn))) &&
          (d != owner) )
     {
-        res = get_page_from_l1e(sanitized_sl1e, owner);
+        res = get_page_from_l1e(sl1e, owner);
         SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
                        "which is owned by domain %d: %s\n",
                        d->domain_id, mfn_x(mfn), owner->domain_id,
@@ -1250,7 +1133,8 @@ static int shadow_set_l1e(struct vcpu *v
 
     if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
     
-    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT ) 
+    if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
+         && !sh_l1e_is_magic(new_sl1e) ) 
     {
         /* About to install a new reference */        
         if ( shadow_mode_refcounts(d) ) {
@@ -1267,7 +1151,8 @@ static int shadow_set_l1e(struct vcpu *v
     shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
     flags |= SHADOW_SET_CHANGED;
 
-    if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) 
+    if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT) 
+         && !sh_l1e_is_magic(old_sl1e) )
     {
         /* We lost a reference to an old mfn. */
         /* N.B. Unlike higher-level sets, never need an extra flush 
@@ -2133,7 +2018,8 @@ void sh_destroy_l1_shadow(struct vcpu *v
         /* Decrement refcounts of all the old entries */
         mfn_t sl1mfn = smfn; 
         SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
-            if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) 
+            if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
+                 && !sh_l1e_is_magic(*sl1e) )
                 shadow_put_page_from_l1e(*sl1e, d);
         });
     }
@@ -2399,16 +2285,17 @@ static int validate_gl1e(struct vcpu *v,
     guest_l1e_t *new_gl1e = new_ge;
     shadow_l1e_t *sl1p = se;
     gfn_t gfn;
-    mfn_t mfn;
-    int result = 0;
+    mfn_t gmfn;
+    int result = 0, mmio;
 
     perfc_incrc(shadow_validate_gl1e_calls);
 
     gfn = guest_l1e_get_gfn(*new_gl1e);
-    mfn = vcpu_gfn_to_mfn(v, gfn);
-
-    l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, 
-                             /* mmio? */ !valid_mfn(mfn));
+    gmfn = vcpu_gfn_to_mfn(v, gfn);
+
+    mmio = (hvm_guest(v) && shadow_vcpu_mode_translate(v) && !valid_mfn(gmfn));
+    l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e, 
+                             ft_prefetch, mmio);
     
     result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
     return result;
@@ -2576,6 +2463,80 @@ static inline void reset_early_unshadow(
 #endif
 }
 
+
+
+/**************************************************************************/
+/* Optimization: Prefetch multiple L1 entries.  This is called after we have 
+ * demand-faulted a shadow l1e in the fault handler, to see if it's
+ * worth fetching some more.
+ */
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
+
+/* XXX magic number */
+#define PREFETCH_DISTANCE 32
+
+static void sh_prefetch(struct vcpu *v, walk_t *gw, 
+                        shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
+{
+    int i, dist, mmio;
+    gfn_t gfn;
+    mfn_t gmfn;
+    guest_l1e_t gl1e;
+    shadow_l1e_t sl1e;
+    u32 gflags;
+
+    /* Prefetch no further than the end of the _shadow_ l1 MFN */
+    dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
+    /* And no more than a maximum fetches-per-fault */
+    if ( dist > PREFETCH_DISTANCE )
+        dist = PREFETCH_DISTANCE;
+
+    for ( i = 1; i < dist ; i++ ) 
+    {
+        /* No point in prefetching if there's already a shadow */
+        if ( ptr_sl1e[i].l1 != 0 )
+            break;
+
+        if ( gw->l1e )
+        {
+            /* Normal guest page; grab the next guest entry */
+            gl1e = gw->l1e[i];
+            /* Not worth continuing if we hit an entry that will need another
+             * fault for A/D-bit propagation anyway */
+            gflags = guest_l1e_get_flags(gl1e);
+            if ( (gflags & _PAGE_PRESENT) 
+                 && (!(gflags & _PAGE_ACCESSED)
+                     || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
+                break;
+        } 
+        else 
+        {
+            /* Fragmented superpage, unless we've been called wrongly */
+            ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
+            /* Increment the l1e's GFN by the right number of guest pages */
+            gl1e = guest_l1e_from_gfn(
+                _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i), 
+                guest_l1e_get_flags(gw->eff_l1e));
+        }
+
+        /* Look at the gfn that the l1e is pointing at */
+        gfn = guest_l1e_get_gfn(gl1e);
+        gmfn = vcpu_gfn_to_mfn(v, gfn);
+        mmio = ( hvm_guest(v) 
+                 && shadow_vcpu_mode_translate(v) 
+                 && mmio_space(gfn_to_paddr(gfn)) );
+
+        /* Propagate the entry.  Safe to use a pointer to our local 
+         * gl1e, since this is not a demand-fetch so there will be no 
+         * write-back to the guest. */
+        l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
+                                 gmfn, &sl1e, ft_prefetch, mmio);
+        (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
+    }
+}
+
+#endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
 
 
 /**************************************************************************/
@@ -2602,16 +2563,70 @@ static int sh_page_fault(struct vcpu *v,
     int r, mmio;
     fetch_type_t ft = 0;
 
+    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
+                   v->domain->domain_id, v->vcpu_id, va, regs->error_code);
+
     //
     // XXX: Need to think about eventually mapping superpages directly in the
     //      shadow (when possible), as opposed to splintering them into a
     //      bunch of 4K maps.
     //
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
+    if ( (regs->error_code & PFEC_reserved_bit) )
+    {
+        /* The only reasons for reserved bits to be set in shadow entries 
+         * are the two "magic" shadow_l1e entries. */
+        if ( likely((__copy_from_user(&sl1e, 
+                                      (sh_linear_l1_table(v) 
+                                       + shadow_l1_linear_offset(va)),
+                                      sizeof(sl1e)) == 0)
+                    && sh_l1e_is_magic(sl1e)) )
+        {
+            if ( sh_l1e_is_gnp(sl1e) )
+            {
+                if ( likely(!hvm_guest(v) || shadow_vcpu_mode_translate(v)) )
+                { 
+                    /* Not-present in a guest PT: pass to the guest as
+                     * a not-present fault (by flipping two bits). */
+                    ASSERT(regs->error_code & PFEC_page_present);
+                    regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+                    perfc_incrc(shadow_fault_fast_gnp);
+                    SHADOW_PRINTK("fast path not-present\n");
+                    return 0;
+                }
+                else 
+                {
+                    /* Not-present in the P2M: MMIO */
+                    gpa = va;
+                }
+            }
+            else
+            {
+                /* Magic MMIO marker: extract gfn for MMIO address */
+                ASSERT(sh_l1e_is_mmio(sl1e));
+                gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e)))) 
+                       << PAGE_SHIFT) 
+                    | (va & ~PAGE_MASK);
+            }
+            perfc_incrc(shadow_fault_fast_mmio);
+            SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
+            reset_early_unshadow(v);
+            handle_mmio(gpa);
+            return EXCRET_fault_fixed;
+        }
+        else
+        {
+            /* This should be exceptionally rare: another vcpu has fixed
+             * the tables between the fault and our reading the l1e.
+             * Fall through to the normal fault handing logic */
+            perfc_incrc(shadow_fault_fast_fail);
+            SHADOW_PRINTK("fast path false alarm!\n");
+        }
+    }
+#endif /* SHOPT_FAST_FAULT_PATH */
+
     shadow_lock(d);
-
-    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
-                   v->domain->domain_id, v->vcpu_id, va, regs->error_code);
     
     shadow_audit_tables(v);
                    
@@ -2659,8 +2674,9 @@ static int sh_page_fault(struct vcpu *v,
     }
 
     // Was it a write fault?
-    //
-    if ( regs->error_code & PFEC_write_access )
+    ft = ((regs->error_code & PFEC_write_access)
+          ? ft_demand_write : ft_demand_read);
+    if ( ft == ft_demand_write )
     {
         if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
         {
@@ -2685,26 +2701,19 @@ static int sh_page_fault(struct vcpu *v,
         }
     }
 
-    /* Is this an MMIO access? */
+    /* What mfn is the guest trying to access? */
     gfn = guest_l1e_get_gfn(gw.eff_l1e);
+    gmfn = vcpu_gfn_to_mfn(v, gfn);
     mmio = ( hvm_guest(v) 
              && shadow_vcpu_mode_translate(v) 
              && mmio_space(gfn_to_paddr(gfn)) );
 
-    /* For MMIO, the shadow holds the *gfn*; for normal accesses, it holds 
-     * the equivalent mfn. */
-    if ( mmio ) 
-        gmfn = _mfn(gfn_x(gfn));
-    else
-    {
-        gmfn = vcpu_gfn_to_mfn(v, gfn);
-        if ( !valid_mfn(gmfn) )
-        {
-            perfc_incrc(shadow_fault_bail_bad_gfn);
-            SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", 
-                           gfn_x(gfn), mfn_x(gmfn));
-            goto not_a_shadow_fault;
-        }
+    if ( !mmio && !valid_mfn(gmfn) )
+    {
+        perfc_incrc(shadow_fault_bail_bad_gfn);
+        SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", 
+                      gfn_x(gfn), mfn_x(gmfn));
+        goto not_a_shadow_fault;
     }
 
     /* Make sure there is enough free shadow memory to build a chain of
@@ -2717,44 +2726,39 @@ static int sh_page_fault(struct vcpu *v,
      * for the shadow entry, since we might promote a page here. */
     // XXX -- this code will need to change somewhat if/when the shadow code
     // can directly map superpages...
-    ft = ((regs->error_code & PFEC_write_access) ?
-          ft_demand_write : ft_demand_read);
     ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
     ASSERT(ptr_sl1e);
 
-    /* Calculate the shadow entry */
-    if ( ft == ft_demand_write )
-    {
-        if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
+    /* Calculate the shadow entry and write it */
+    l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn, 
+                             gmfn, &sl1e, ft, mmio);
+    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
+    /* Prefetch some more shadow entries */
+    sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
+#endif
+
+    /* Need to emulate accesses to page tables */
+    if ( sh_mfn_is_a_page_table(gmfn) )
+    {
+        if ( ft == ft_demand_write )
         {
             perfc_incrc(shadow_fault_emulate_write);
             goto emulate;
         }
-    }
-    else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
-    {
-        perfc_incrc(shadow_fault_emulate_read);
-        goto emulate;
-    }
-
-    /* Quick sanity check: we never make an MMIO entry that's got the 
-     * _PAGE_PRESENT flag set in it. */
-    ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
-
-    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+        else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
+        {
+            perfc_incrc(shadow_fault_emulate_read);
+            goto emulate;
+        }
+    }
 
     if ( mmio ) 
     {
         gpa = guest_walk_to_gpa(&gw);
         goto mmio;
     }
-
-#if 0
-    if ( !(r & SHADOW_SET_CHANGED) )
-        debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte
-                          ") did not change anything\n",
-                          __func__, gw.va, l1e_get_intpte(sl1e));
-#endif
 
     perfc_incrc(shadow_fault_fixed);
     d->arch.shadow.fault_count++;
@@ -2769,7 +2773,6 @@ static int sh_page_fault(struct vcpu *v,
     return EXCRET_fault_fixed;
 
  emulate:
-
     /* Take the register set we were called with */
     emul_regs = *regs;
     if ( hvm_guest(v) )
@@ -3932,25 +3935,48 @@ int sh_audit_l1_table(struct vcpu *v, mf
     gfn_t gfn;
     char *s;
     int done = 0;
-
+    
     /* Follow the backpointer */
     gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
     gl1e = gp = sh_map_domain_page(gl1mfn);
     SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
 
-        s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
-                            shadow_l1e_get_flags(*sl1e));
-        if ( s ) AUDIT_FAIL(1, "%s", s);
-
-        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
-        {
-            gfn = guest_l1e_get_gfn(*gl1e);
-            mfn = shadow_l1e_get_mfn(*sl1e);
-            gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
-            if ( mfn_x(gmfn) != mfn_x(mfn) )
-                AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
-                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
-                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+        if ( sh_l1e_is_magic(*sl1e) ) 
+        {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
+            if ( sh_l1e_is_gnp(*sl1e) )
+            {
+                if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
+                    AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
+            } 
+            else 
+            {
+                ASSERT(sh_l1e_is_mmio(*sl1e));
+                gfn = sh_l1e_mmio_get_gfn(*sl1e);
+                if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
+                    AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn 
+                               " but guest gfn is %" SH_PRI_gfn,
+                               gfn_x(gfn),
+                               gfn_x(guest_l1e_get_gfn(*gl1e)));
+            }
+#endif
+        }
+        else 
+        {
+            s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
+                               shadow_l1e_get_flags(*sl1e));
+            if ( s ) AUDIT_FAIL(1, "%s", s);
+            
+            if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
+            {
+                gfn = guest_l1e_get_gfn(*gl1e);
+                mfn = shadow_l1e_get_mfn(*sl1e);
+                gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
+                if ( mfn_x(gmfn) != mfn_x(mfn) )
+                    AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
+                               " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
+                               gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+            }
         }
     });
     sh_unmap_domain_page(gp);
@@ -3973,7 +3999,8 @@ int sh_audit_fl1_table(struct vcpu *v, m
         if ( !(f == 0 
                || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                         _PAGE_ACCESSED|_PAGE_DIRTY) 
-               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
+               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
+               || sh_l1e_is_magic(*sl1e)) )
             AUDIT_FAIL(1, "fl1e has bad flags");
     });
     return 0;
@@ -4011,7 +4038,7 @@ int sh_audit_l2_table(struct vcpu *v, mf
             if ( mfn_x(gmfn) != mfn_x(mfn) )
                 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
                            " (--> %" SH_PRI_mfn ")"
-                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
+                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
                            gfn_x(gfn), 
                            (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
                            : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
@@ -4053,7 +4080,7 @@ int sh_audit_l3_table(struct vcpu *v, mf
                                      : PGC_SH_l2_shadow);
             if ( mfn_x(gmfn) != mfn_x(mfn) )
                 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
-                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
+                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
                            gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
         }
     });
@@ -4088,7 +4115,7 @@ int sh_audit_l4_table(struct vcpu *v, mf
                                      PGC_SH_l3_shadow);
             if ( mfn_x(gmfn) != mfn_x(mfn) )
                 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
-                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
+                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
                            gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
         }
     });
diff -r 77e1baf0a567 -r 0b6f49d25d4f xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Wed Nov 01 10:02:00 2006 +0000
+++ b/xen/arch/x86/mm/shadow/private.h  Wed Nov 01 10:31:11 2006 +0000
@@ -30,111 +30,6 @@
 #include <xen/domain_page.h>
 #include <asm/x86_emulate.h>
 #include <asm/hvm/support.h>
-
-
-/******************************************************************************
- * Definitions for the use of the "available" bits in the shadow PTEs.
- *
- * Review of the low 12 bits of a shadow page table entry:
- *
- *         in a guest:                      in a shadow:
- * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB
- * Bit 10: _PAGE_AVAIL1                     _PAGE_SHADOW_RW ("SW" below)
- * Bit  9: _PAGE_AVAIL0                     _PAGE_SHADOW_PRESENT ("SP" below)
- * Bit  8: _PAGE_GLOBAL                     _PAGE_SHADOW_MMIO ("MMIO" below),
- *                                          aka _PAGE_SHADOW_GUEST_NOT_PRESENT
- * Bit  7: _PAGE_PSE, aka _PAGE_PAT
- * Bit  6: _PAGE_DIRTY
- * Bit  5: _PAGE_ACCESSED
- * Bit  4: _PAGE_PCD
- * Bit  3: _PAGE_PWT
- * Bit  2: _PAGE_USER
- * Bit  1: _PAGE_RW ("GW" below)
- * Bit  0: _PAGE_PRESENT ("GP" below)
- *
- * Given a guest entry, as shown below, we can expect the following in the
- * corresponding shadow entry:
- *
- * Guest entry  Shadow entry      Commentary
- * -----------  ----------------  ---------------------------------------------
- *       Maps     
- * GP GW  IO    GP SP GW SW MMIO 
- * -- -- ----   -- -- -- -- ----
- *  -  -   -     0  0  0  0   0   The guest entry has not yet been shadowed.
- *  0  -   -     0  0  0  0   1   The guest entry is marked not-present.
- *  1  1  no     ?  1  ?  1   0   Writable entry in the guest.
- *  1  0  no     ?  1  0  0   0   Read-only entry in the guest.
- *  1  1  yes    0  1  ?  1   1   Writable MMIO mapping in the guest.
- *  1  0  yes    0  1  0  0   1   Read-only MMIO mapping in the guest.
- *
- * Normally, we would expect that GP=1 in the guest to imply GP=1 in the
- * shadow, and similarly for GW=1.  However, various functionality that may be
- * implemented via the shadow can cause GP or GW to be cleared in such cases.
- * A & D bit emulation is a prime example of such functionality.
- *
- * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same
- * entry will always be zero, too.
-
- * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests.  It is
- * currently available for random (ab)use in shadow entries.
- *
- * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow,
- * but currently there is no benefit, as the guest's TLB is flushed on every
- * transition of CR3 anyway due to the HVM exit/re-entry.
- *
- * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used
- * as the _PAGE_SHADOW_MMIO bit.  In such entries, if _PAGE_SHADOW_MMIO is
- * set, then the entry contains the *gfn* directly from the corresponding
- * guest entry (not an mfn!!).
- *
- * Bit 7 is set in a guest L2 to signify a superpage entry.  The current
- * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the
- * resulting shadow L1 table is called an FL1.  Note that there is no guest
- * page that corresponds to an FL1.
- *
- * Bit 7 in a guest L1 is the PAT2 bit.  Currently we do not support PAT in
- * this shadow code.
- *
- * Bit 6 is the dirty bit.
- *
- * Bit 5 is the accessed bit.
- *
- * Bit 4 is the cache disable bit.  If set in a guest, the hardware is
- * supposed to refuse to cache anything found via this entry.  It can be set
- * in an L4e, L3e, L2e, or L1e.  This shadow code currently does not support
- * cache disable bits.  They are silently ignored.
- *
- * Bit 4 is a guest L1 is also the PAT1 bit.  Currently we do not support PAT
- * in this shadow code.
- *
- * Bit 3 is the cache write-thru bit.  If set in a guest, the hardware is
- * supposed to use write-thru instead of write-back caching for anything found
- * via this entry.  It can be set in an L4e, L3e, L2e, or L1e.  This shadow
- * code currently does not support cache write-thru bits.  They are silently
- * ignored.
- *
- * Bit 3 is a guest L1 is also the PAT0 bit.  Currently we do not support PAT
- * in this shadow code.
- *
- * Bit 2 is the user bit.
- *
- * Bit 1 is the read-write bit.
- *
- * Bit 0 is the present bit.
- */
-
-// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by
-// the appropriate shadow rules.
-#define _PAGE_SHADOW_RW                 _PAGE_AVAIL1
-
-// Copy of the _PAGE_PRESENT bit from the guest's PTE
-#define _PAGE_SHADOW_PRESENT            _PAGE_AVAIL0
-
-// The matching guest entry maps MMIO space
-#define _PAGE_SHADOW_MMIO               _PAGE_GLOBAL
-
-// Shadow flags value used when the guest is not present
-#define _PAGE_SHADOW_GUEST_NOT_PRESENT  _PAGE_GLOBAL
 
 
 /******************************************************************************
@@ -151,13 +46,13 @@
     } while (0)
 
 // The flags for use with SHADOW_DEBUG:
-#define SHADOW_DEBUG_PROPAGATE         0
-#define SHADOW_DEBUG_MAKE_SHADOW       0
-#define SHADOW_DEBUG_DESTROY_SHADOW    0
+#define SHADOW_DEBUG_PROPAGATE         1
+#define SHADOW_DEBUG_MAKE_SHADOW       1
+#define SHADOW_DEBUG_DESTROY_SHADOW    1
 #define SHADOW_DEBUG_P2M               0
-#define SHADOW_DEBUG_A_AND_D           0
-#define SHADOW_DEBUG_EMULATE           0
-#define SHADOW_DEBUG_LOGDIRTY          1
+#define SHADOW_DEBUG_A_AND_D           1
+#define SHADOW_DEBUG_EMULATE           1
+#define SHADOW_DEBUG_LOGDIRTY          0
 
 
 /******************************************************************************
diff -r 77e1baf0a567 -r 0b6f49d25d4f xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h    Wed Nov 01 10:02:00 2006 +0000
+++ b/xen/arch/x86/mm/shadow/types.h    Wed Nov 01 10:31:11 2006 +0000
@@ -591,6 +591,77 @@ accumulate_guest_flags(struct vcpu *v, w
     return accumulated_flags;
 }
 
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
+/******************************************************************************
+ * We implement a "fast path" for two special cases: faults that require
+ * MMIO emulation, and faults where the guest PTE is not present.  We
+ * record these as shadow l1 entries that have reserved bits set in
+ * them, so we can spot them immediately in the fault handler and handle
+ * them without needing to hold the shadow lock or walk the guest
+ * pagetables.
+ *
+ * This is only feasible for PAE and 64bit Xen: 32-bit non-PAE PTEs don't
+ * have reserved bits that we can use for this.
+ */
+
+#define SH_L1E_MAGIC 0xffffffff00000000ULL
+static inline int sh_l1e_is_magic(shadow_l1e_t sl1e)
+{
+    return ((sl1e.l1 & SH_L1E_MAGIC) == SH_L1E_MAGIC);
+}
+
+/* Guest not present: a single magic value */
+static inline shadow_l1e_t sh_l1e_gnp(void) 
+{
+    return (shadow_l1e_t){ -1ULL };
+}
+
+static inline int sh_l1e_is_gnp(shadow_l1e_t sl1e) 
+{
+    return (sl1e.l1 == sh_l1e_gnp().l1);
+}
+
+/* MMIO: an invalid PTE that contains the GFN of the equivalent guest l1e.
+ * We store 28 bits of GFN in bits 4:32 of the entry.
+ * The present bit is set, and the U/S and R/W bits are taken from the guest.
+ * Bit 3 is always 0, to differentiate from gnp above.  */
+#define SH_L1E_MMIO_MAGIC       0xffffffff00000001ULL
+#define SH_L1E_MMIO_MAGIC_MASK  0xffffffff00000009ULL
+#define SH_L1E_MMIO_GFN_MASK    0x00000000fffffff0ULL
+#define SH_L1E_MMIO_GFN_SHIFT   4
+
+static inline shadow_l1e_t sh_l1e_mmio(gfn_t gfn, u32 gflags) 
+{
+    return (shadow_l1e_t) { (SH_L1E_MMIO_MAGIC 
+                             | (gfn_x(gfn) << SH_L1E_MMIO_GFN_SHIFT) 
+                             | (gflags & (_PAGE_USER|_PAGE_RW))) };
+}
+
+static inline int sh_l1e_is_mmio(shadow_l1e_t sl1e) 
+{
+    return ((sl1e.l1 & SH_L1E_MMIO_MAGIC_MASK) == SH_L1E_MMIO_MAGIC);
+}
+
+static inline gfn_t sh_l1e_mmio_get_gfn(shadow_l1e_t sl1e) 
+{
+    return _gfn((sl1e.l1 & SH_L1E_MMIO_GFN_MASK) >> SH_L1E_MMIO_GFN_SHIFT);
+}
+
+static inline u32 sh_l1e_mmio_get_flags(shadow_l1e_t sl1e) 
+{
+    return (u32)((sl1e.l1 & (_PAGE_USER|_PAGE_RW)));
+}
+
+#else
+
+#define sh_l1e_gnp() shadow_l1e_empty()
+#define sh_l1e_mmio(_gfn, _flags) shadow_l1e_empty()
+#define sh_l1e_is_magic(_e) (0)
+
+#endif /* SHOPT_FAST_FAULT_PATH */
+
+
 #endif /* _XEN_SHADOW_TYPES_H */
 
 /*
diff -r 77e1baf0a567 -r 0b6f49d25d4f xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Wed Nov 01 10:02:00 2006 +0000
+++ b/xen/include/asm-x86/perfc_defn.h  Wed Nov 01 10:31:11 2006 +0000
@@ -43,6 +43,9 @@ PERFCOUNTER_CPU(shadow_a_update,       "
 PERFCOUNTER_CPU(shadow_a_update,       "shadow A bit update")
 PERFCOUNTER_CPU(shadow_ad_update,      "shadow A&D bit update")
 PERFCOUNTER_CPU(shadow_fault,          "calls to shadow_fault")
+PERFCOUNTER_CPU(shadow_fault_fast_gnp, "shadow_fault fast path n/p")
+PERFCOUNTER_CPU(shadow_fault_fast_mmio, "shadow_fault fast path mmio")
+PERFCOUNTER_CPU(shadow_fault_fast_fail, "shadow_fault fast path error")
 PERFCOUNTER_CPU(shadow_fault_bail_bad_gfn, "shadow_fault guest bad gfn")
 PERFCOUNTER_CPU(shadow_fault_bail_not_present, 
                                         "shadow_fault guest not-present")
diff -r 77e1baf0a567 -r 0b6f49d25d4f xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h      Wed Nov 01 10:02:00 2006 +0000
+++ b/xen/include/asm-x86/shadow.h      Wed Nov 01 10:31:11 2006 +0000
@@ -161,8 +161,10 @@ extern int shadow_audit_enable;
  */
 #define SHOPT_WRITABLE_HEURISTIC  0x01  /* Guess at RW PTEs via linear maps */
 #define SHOPT_EARLY_UNSHADOW      0x02  /* Unshadow l1s on fork or exit */
-
-#define SHADOW_OPTIMIZATIONS      0x03
+#define SHOPT_FAST_FAULT_PATH     0x04  /* Fast-path MMIO and not-present */
+#define SHOPT_PREFETCH            0x08  /* Shadow multiple entries per fault */
+
+#define SHADOW_OPTIMIZATIONS      0x0f
 
 
 /* With shadow pagetables, the different kinds of address start 
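
As a rough illustration of the prefetch-distance clamp used by sh_prefetch() in the multi.c hunk above (again, not part of the patch), the standalone sketch below assumes 4K pages and 8-byte shadow l1 entries; SL1E_SIZE and prefetch_distance() are made-up names for illustration.

#include <stdio.h>

#define PAGE_SIZE          4096UL
#define PAGE_MASK          (~(PAGE_SIZE - 1))
#define SL1E_SIZE          8UL   /* assumed sizeof(shadow_l1e_t) */
#define PREFETCH_DISTANCE  32UL  /* same "XXX magic number" as the patch */

/* How many entries sh_prefetch() may consider: from the faulting slot to
 * the end of its shadow page, clamped to PREFETCH_DISTANCE. */
static unsigned long prefetch_distance(unsigned long ptr_sl1e)
{
    unsigned long dist = (PAGE_SIZE - (ptr_sl1e & ~PAGE_MASK)) / SL1E_SIZE;
    return dist > PREFETCH_DISTANCE ? PREFETCH_DISTANCE : dist;
}

int main(void)
{
    /* Fault near the end of a shadow L1 page: only 3 slots remain */
    printf("%lu\n", prefetch_distance(0x12345fe8UL));   /* prints 3 */
    /* Fault at the start of a page: clamped to 32 */
    printf("%lu\n", prefetch_distance(0x12345000UL));   /* prints 32 */
    return 0;
}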

