
[Xen-devel] [PATCH v4 8/9] xen/arm: Implement hypercall for dirty page tracing



Add hypercall (shadow op: enable/disable and clean/peek dirtied page bitmap).
It consists of two parts: dirty page detecting and saving.
For detecting, we set up the guest p2m's leaf PTEs as read-only, so whenever
the guest tries to write something, a permission fault occurs and traps into Xen.
The permission-faulted GPA should be saved for the toolstack (when it wants to
see which pages are dirtied). For this purpose, we temporarily save the GPAs
into a linked list using the 'add_mapped_vaddr' function, and when the toolstack
asks for them (via the log_dirty_op function) the GPAs are copied into the
bitmap and the linked list is flushed.
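
For illustration, the bitmap bookkeeping works as in get_dirty_bitmap() below:
each recorded GPA becomes a page-granular bit index relative to the start of
guest RAM, then is split into a bitmap page and a bit within that page. A
minimal sketch only, assuming 4K pages; 'mark_dirty', 'gpa', 'gma_start' and
'bitmap' are illustrative placeholders, not part of this patch:

    /* Illustrative sketch only: how a recorded GPA becomes a bit in the
     * log-dirty bitmap (cf. get_dirty_bitmap() below). */
    static void mark_dirty(uint8_t *bitmap[], vaddr_t gpa, vaddr_t gma_start)
    {
        unsigned long bit = third_linear_offset(gpa - gma_start); /* page index in guest RAM */
        int page   = bit >> (PAGE_SHIFT + 3);               /* which bitmap page (PAGE_SIZE*8 bits each) */
        int offset = bit & ((1UL << (PAGE_SHIFT + 3)) - 1); /* bit within that bitmap page */

        __set_bit(offset, bitmap[page]);
    }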

Additionally, to support parallel migration of multiple domUs, the vlpt area
should be context switched.
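
From the toolstack's point of view, the expected sequence of shadow ops is
roughly the following. This is a minimal sketch only; 'issue_shadow_op()',
'send_dirty_pages()' and the loop condition are hypothetical helpers standing
in for the XEN_DOMCTL_shadow_op domctl and the migration send loop, and are
not part of this patch:

    /* illustrative sketch only, not part of the patch */

    /* make all p2m leaf entries read-only and prepare the vlpt */
    issue_shadow_op(domid, XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, NULL, 0);

    while ( !migration_converged )
    {
        /* read and reset the dirty state: GPAs recorded by handle_page_fault()
         * are folded into 'bitmap' and their PTEs are made read-only again */
        issue_shadow_op(domid, XEN_DOMCTL_SHADOW_OP_CLEAN, bitmap, nr_pages);

        /* resend the guest pages whose bits are set in 'bitmap' */
        send_dirty_pages(domid, bitmap, nr_pages);
    }

    /* restore read-write mappings and tear down the vlpt */
    issue_shadow_op(domid, XEN_DOMCTL_SHADOW_OP_OFF, NULL, 0);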

Signed-off-by: Jaeyong Yoo <jaeyong.yoo@xxxxxxxxxxx>
---
 xen/arch/arm/domain.c           |  15 ++
 xen/arch/arm/domctl.c           |  13 ++
 xen/arch/arm/mm.c               |  23 ++-
 xen/arch/arm/p2m.c              | 300 ++++++++++++++++++++++++++++++++++++++++
 xen/arch/arm/traps.c            |  21 ++-
 xen/include/asm-arm/domain.h    |   5 +
 xen/include/asm-arm/mm.h        |   4 +
 xen/include/asm-arm/p2m.h       |   4 +
 xen/include/asm-arm/processor.h |   2 +
 9 files changed, 382 insertions(+), 5 deletions(-)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 57be341..40ebeed 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -215,6 +215,10 @@ static void ctxt_switch_to(struct vcpu *n)
     WRITE_SYSREG(hcr, HCR_EL2);
     isb();
 
+    /* vlpt area context switching for dirty-page tracing */
+    if ( n->domain->arch.dirty.mode )
+        restore_vlpt(n->domain);
+
     /* This is could trigger an hardware interrupt from the virtual
      * timer. The interrupt needs to be injected into the guest. */
     virt_timer_restore(n);
@@ -512,6 +516,17 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
     spin_lock_init(&d->arch.map_lock);
     d->arch.map_domain.nr_banks = 0;
 
+    /* init for dirty-page tracing */
+    d->arch.dirty.count = 0;
+    d->arch.dirty.mode = 0;
+    d->arch.dirty.head = NULL;
+    d->arch.dirty.mvn_head = NULL;
+    spin_lock_init(&d->arch.dirty.lock);
+
+    d->arch.dirty.second_lvl_start = 0;
+    d->arch.dirty.second_lvl_end = 0;
+    d->arch.dirty.second_lvl = NULL;
+
     clear_page(d->shared_info);
     share_xen_page_with_guest(
         virt_to_page(d->shared_info), d, XENSHARE_writable);
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 78fb322..7edce12 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -92,6 +92,19 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
             xfree(c.data);
     }
     break;
+    case XEN_DOMCTL_shadow_op:
+    {
+        domain_pause(d);
+        ret = dirty_mode_op(d, &domctl->u.shadow_op);
+        domain_unpause(d);
+
+        if ( (&domctl->u.shadow_op)->op == XEN_DOMCTL_SHADOW_OP_CLEAN ||
+             (&domctl->u.shadow_op)->op == XEN_DOMCTL_SHADOW_OP_PEEK )
+        {
+            copyback = 1;
+        }
+    }
+    break;
 
     default:
         return -EINVAL;
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 98edbc9..5b9d26d 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -838,7 +838,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e)
     create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
 }
 
-enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
 static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
 {
     lpae_t pte;
@@ -1460,6 +1459,28 @@ void cleanup_vlpt(struct domain *d)
     unmap_domain_page_global(d->arch.dirty.second_lvl);
 }
 
+/* Routine for dirty-page tracing.
+ *
+ * On the first write the guest takes a permission fault, the entry is
+ * changed to read-write here, and on retry the write succeeds.
+ *
+ * To locate the p2m entry of the faulting address, we use the
+ * virtual-linear page table. */
+void handle_page_fault(struct domain *d, paddr_t addr)
+{
+    lpae_t *vlp2m_pte = get_vlpt_3lvl_pte(addr);
+    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 )
+    {
+        lpae_t pte = *vlp2m_pte;
+        pte.p2m.write = 1;
+        write_pte(vlp2m_pte, pte);
+        flush_tlb_local();
+
+        /* in order to remove mappings in cleanup stage */
+        add_mapped_vaddr(d, addr);
+    }
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 2d09fef..3b2b11a 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -6,6 +6,8 @@
 #include <xen/bitops.h>
 #include <asm/flushtlb.h>
 #include <asm/gic.h>
+#include <xen/guest_access.h>
+#include <xen/pfn.h>
 
 void dump_p2m_lookup(struct domain *d, paddr_t addr)
 {
@@ -408,6 +410,304 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
     return p >> PAGE_SHIFT;
 }
 
+#define MAX_VA_PER_NODE (PAGE_SIZE - 2 * sizeof(void *) - sizeof(int))\
+                        / sizeof(unsigned long)
+
+/* An array-based linked list for storing virtual addresses (i.e., the 3rd
+ * level table mapping) that should be destroyed after live migration */
+struct mapped_va_node
+{
+    struct page_info *next;
+    struct mapped_va_node *mvn_next;
+    int items;
+    unsigned long vaddrs[MAX_VA_PER_NODE];
+};
+
+int add_mapped_vaddr(struct domain *d, unsigned long va)
+{
+    struct page_info *page_head;
+    struct mapped_va_node *mvn_head;
+
+    spin_lock(&d->arch.dirty.lock);
+
+    page_head = d->arch.dirty.head;
+    mvn_head = d->arch.dirty.mvn_head;
+    if ( !mvn_head )
+    {
+        page_head = alloc_domheap_page(NULL, 0);
+        if ( !page_head )
+        {
+            spin_unlock(&d->arch.dirty.lock);
+            return -ENOMEM;
+        }
+
+        mvn_head = map_domain_page_global(__page_to_mfn(page_head));
+        mvn_head->items = 0;
+        mvn_head->next = NULL;
+        mvn_head->mvn_next = NULL;
+        d->arch.dirty.head = page_head;
+        d->arch.dirty.mvn_head = mvn_head;
+    }
+
+    if ( mvn_head->items == MAX_VA_PER_NODE )
+    {
+        struct page_info *page;
+
+        page = alloc_domheap_page(NULL, 0);
+        if ( !page )
+        {
+            spin_unlock(&d->arch.dirty.lock);
+            return -ENOMEM;
+        }
+
+        mvn_head = map_domain_page_global(__page_to_mfn(page));
+        mvn_head->items = 0;
+        mvn_head->next = d->arch.dirty.head;
+        mvn_head->mvn_next = d->arch.dirty.mvn_head;
+
+        d->arch.dirty.mvn_head = mvn_head;
+        d->arch.dirty.head = page;
+    }
+
+    mvn_head->vaddrs[mvn_head->items] = va;
+    mvn_head->items ++;
+
+    spin_unlock(&d->arch.dirty.lock);
+    return 0;
+}
+
+/* get the dirty bitmap from the linked list stored by add_mapped_vaddr
+ * and also clear the mapped vaddrs linked list */
+static void get_dirty_bitmap(struct domain *d, uint8_t *bitmap[])
+{
+    struct page_info *page_head;
+    struct mapped_va_node *mvn_head;
+    struct page_info *tmp_page;
+    struct mapped_va_node *tmp_mvn;
+    vaddr_t gma_start = 0;
+    vaddr_t gma_end = 0;
+
+    /* this hypercall is called from domain 0, and we don't know which guest's
+     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
+    restore_vlpt(d);
+
+    spin_lock(&d->arch.dirty.lock);
+
+    page_head = d->arch.dirty.head;
+    mvn_head = d->arch.dirty.mvn_head;
+    get_gma_start_end(d, &gma_start, &gma_end);
+
+    while ( page_head )
+    {
+        int i;
+
+        for ( i = 0; i < mvn_head->items; ++i )
+        {
+            unsigned int bit_offset;
+            int bitmap_index, bitmap_offset;
+            lpae_t *vlp2m_pte;
+
+            bit_offset = third_linear_offset(mvn_head->vaddrs[i] - gma_start);
+            bitmap_index = bit_offset >> (PAGE_SHIFT + 3);
+            bitmap_offset = bit_offset & ((1ul << (PAGE_SHIFT + 3)) - 1);
+
+            __set_bit(bitmap_offset, bitmap[bitmap_index]);
+
+            vlp2m_pte = get_vlpt_3lvl_pte(mvn_head->vaddrs[i]);
+            vlp2m_pte->p2m.write = 0;
+        }
+
+        tmp_page = page_head;
+        page_head = mvn_head->next;
+
+        tmp_mvn = mvn_head;
+        mvn_head = mvn_head->mvn_next;
+
+        unmap_domain_page_global(tmp_mvn);
+        free_domheap_page(tmp_page);
+    }
+
+    d->arch.dirty.head = NULL;
+    d->arch.dirty.mvn_head = NULL;
+
+    spin_unlock(&d->arch.dirty.lock);
+}
+
+
+/* Change types across all p2m entries in a domain */
+static void p2m_change_entry_type_global(struct domain *d, enum mg nt)
+{
+    struct p2m_domain *p2m = &d->arch.p2m;
+    vaddr_t ram_base;
+    int i1, i2, i3;
+    int first_index, second_index, third_index;
+    lpae_t *first = __map_domain_page(p2m->first_level);
+    lpae_t pte, *second = NULL, *third = NULL;
+
+    get_gma_start_end(d, &ram_base, NULL);
+
+    first_index = first_table_offset((uint64_t)ram_base);
+    second_index = second_table_offset((uint64_t)ram_base);
+    third_index = third_table_offset((uint64_t)ram_base);
+
+    BUG_ON( !first && "Can't map first level p2m." );
+
+    spin_lock(&p2m->lock);
+
+    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
+    {
+        lpae_walk_t first_pte = first[i1].walk;
+        if ( !first_pte.valid || !first_pte.table )
+            goto out;
+
+        second = map_domain_page(first_pte.base);
+        BUG_ON( !second && "Can't map second level p2m.");
+        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
+        {
+            lpae_walk_t second_pte = second[i2].walk;
+
+            /* a nasty hack for vlpt due to the difference
+             * of page table entry layout between p2m and pt */
+            second[i2].pt.ro = 0;
+
+            if ( !second_pte.valid || !second_pte.table )
+                goto out;
+
+            third = map_domain_page(second_pte.base);
+            BUG_ON( !third && "Can't map third level p2m.");
+
+            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
+            {
+                lpae_walk_t third_pte = third[i3].walk;
+                int write = 0;
+                if ( !third_pte.valid )
+                    goto out;
+
+                pte = third[i3];
+                if ( pte.p2m.write == 1 && nt == mg_ro )
+                {
+                    pte.p2m.write = 0;
+                    write = 1;
+                }
+                else if ( pte.p2m.write == 0 && nt == mg_rw )
+                {
+                    pte.p2m.write = 1;
+                    write = 1;
+                }
+                if ( write )
+                    write_pte(&third[i3], pte);
+            }
+            unmap_domain_page(third);
+
+            third = NULL;
+            third_index = 0;
+        }
+        unmap_domain_page(second);
+
+        second = NULL;
+        second_index = 0;
+        third_index = 0;
+    }
+
+out:
+    flush_tlb_all_local();
+    if ( third ) unmap_domain_page(third);
+    if ( second ) unmap_domain_page(second);
+    if ( first ) unmap_domain_page(first);
+
+    spin_unlock(&p2m->lock);
+}
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats. */
+int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    unsigned long gmfn_start;
+    unsigned long gmfn_end;
+    unsigned long gmfns;
+    unsigned int bitmap_pages;
+    int rc = 0, clean = 0, peek = 1;
+    uint8_t *bitmap[256]; /* bitmap[256] covers 32GB ram */
+    int i;
+
+    BUG_ON( !d->arch.map_domain.nr_banks );
+
+    gmfn_start = d->arch.map_domain.bank[0].start >> PAGE_SHIFT;
+    gmfn_end = domain_get_maximum_gpfn(d);
+    gmfns = gmfn_end - gmfn_start;
+    bitmap_pages = PFN_UP((gmfns + 7) / 8);
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) )
+    {
+        peek = 0;
+    }
+    else
+    {
+        /* mapping to the bitmap from guest param */
+        vaddr_t to = (vaddr_t)sc->dirty_bitmap.p;
+
+        BUG_ON( to & ~PAGE_MASK && "offset not aligned to PAGE SIZE");
+
+        for ( i = 0; i < bitmap_pages; ++i )
+        {
+            paddr_t g;
+            rc = gvirt_to_maddr(to, &g);
+            if ( rc )
+                return rc;
+            bitmap[i] = map_domain_page(g>>PAGE_SHIFT);
+            memset(bitmap[i], 0x00, PAGE_SIZE);
+            to += PAGE_SIZE;
+        }
+    }
+
+    clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+    sc->stats.dirty_count = d->arch.dirty.count;
+
+    get_dirty_bitmap(d, bitmap);
+    if ( peek )
+    {
+        for ( i = 0; i < bitmap_pages; ++i )
+        {
+            unmap_domain_page(bitmap[i]);
+        }
+    }
+
+    return 0;
+}
+
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    long ret = 0;
+    switch (sc->op)
+    {
+        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+        case XEN_DOMCTL_SHADOW_OP_OFF:
+        {
+            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
+
+            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
+            p2m_change_entry_type_global(d, nt);
+
+            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
+                cleanup_vlpt(d);
+            else
+                prepare_vlpt(d);
+        }
+        break;
+
+        case XEN_DOMCTL_SHADOW_OP_CLEAN:
+        case XEN_DOMCTL_SHADOW_OP_PEEK:
+        {
+            ret = log_dirty_op(d, sc);
+        }
+        break;
+
+        default:
+            return -ENOSYS;
+    }
+    return ret;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index 4c0fc32..3b78ed2 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1313,6 +1313,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
     const char *msg;
     int rc, level = -1;
     mmio_info_t info;
+    int page_fault = ((dabt.dfsc & FSC_MASK) ==
+                      (FSC_FLT_PERM + FSC_3D_LEVEL) && dabt.write);
 
     if ( !check_conditional_instr(regs, hsr) )
     {
@@ -1327,22 +1329,23 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
     info.gva = READ_SYSREG64(FAR_EL2);
 #endif
 
-    if (dabt.s1ptw)
+    if ( dabt.s1ptw && !page_fault )
         goto bad_data_abort;
 
     rc = gva_to_ipa(info.gva, &info.gpa);
-    if ( rc == -EFAULT )
+    if ( rc == -EFAULT && !page_fault )
         goto bad_data_abort;
 
     /* XXX: Decode the instruction if ISS is not valid */
-    if ( !dabt.valid )
+    if ( !dabt.valid && !page_fault )
         goto bad_data_abort;
 
     /*
      * Erratum 766422: Thumb store translation fault to Hypervisor may
      * not have correct HSR Rt value.
      */
-    if ( cpu_has_erratum_766422() && (regs->cpsr & PSR_THUMB) && dabt.write )
+    if ( cpu_has_erratum_766422() && (regs->cpsr & PSR_THUMB) && dabt.write
+            && !page_fault)
     {
         rc = decode_instruction(regs, &info.dabt);
         if ( rc )
@@ -1358,6 +1361,16 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
         return;
     }
 
+    /* handle permission fault on write */
+    if ( page_fault )
+    {
+        if ( current->domain->arch.dirty.mode == 0 )
+           goto bad_data_abort;
+
+        handle_page_fault(current->domain, info.gpa);
+        return;
+    }
+
 bad_data_abort:
 
     msg = decode_fsc( dabt.dfsc, &level);
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 7e7743a..de349e9 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -114,6 +114,11 @@ struct arch_domain
 
     /* dirty-page tracing */
     struct {
+        spinlock_t lock;
+        int mode;
+        unsigned int count;
+        struct page_info *head;     /* maintain the mapped vaddrs */
+        struct mapped_va_node *mvn_head;  /* va of the head */
         int second_lvl_start;            /* for context switch */
         int second_lvl_end;
         lpae_t *second_lvl;
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index b9cbcf2..c0eade0 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -366,6 +366,10 @@ static inline lpae_t * get_vlpt_3lvl_pte(vaddr_t addr)
         (ipa_third << 3) );
 }
 
+/* dirty-page tracing */
+enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
+void handle_page_fault(struct domain *d, paddr_t addr);
+
 #endif /*  __ARCH_ARM_MM__ */
 /*
  * Local variables:
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index c660820..0bac332 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -2,6 +2,7 @@
 #define _XEN_P2M_H
 
 #include <xen/mm.h>
+#include <public/domctl.h>
 
 struct domain;
 
@@ -110,6 +111,9 @@ static inline int get_page_and_type(struct page_info *page,
     return rc;
 }
 
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
+int add_mapped_vaddr(struct domain *d, unsigned long va);
+
 #endif /* _XEN_P2M_H */
 
 /*
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index 5294421..fced6ad 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -399,6 +399,8 @@ union hsr {
 #define FSC_CPR        (0x3a) /* Coprocossor Abort */
 
 #define FSC_LL_MASK    (0x03<<0)
+#define FSC_MASK       (0x3f) /* Fault status mask */
+#define FSC_3D_LEVEL   (0x03) /* Third level fault*/
 
 /* Time counter hypervisor control register */
 #define CNTHCTL_PA      (1u<<0)  /* Kernel/user access to physical counter */
-- 
1.8.1.2

