[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 5/8] x86/vm-event/monitor: don't compromise monitor_write_data on domain cleanup



The arch_vm_event structure is dynamically allocated and freed @
vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
disables domain monitoring (xc_monitor_disable), which in turn effectively
discards any information that was in arch_vm_event.write_data.

But this can yield unexpected behavior: if a CR write was still waiting to be
committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data) when
xc_monitor_disable is called, then the domain's CR write is wrongly discarded,
which can easily lead to a domain crash.

To fix the issue, this patch makes only arch_vm_event.emul_read_data dynamically
allocated instead of the whole arch_vm_event structure. With this we can avoid
invalidation of an awaiting arch_vm_event.write_data by selectively cleaning up
only emul_read_data and emulate_flags @ vm_event_cleanup_domain.

Small note: the arch_vm_event structure definition had to be moved from
asm-x86/vm_event.h to asm-x86/domain.h in the process.

Signed-off-by: Corneliu ZUZU <czuzu@xxxxxxxxxxxxxxx>
---
 xen/arch/x86/domain.c          |  5 ++--
 xen/arch/x86/hvm/emulate.c     |  8 +++---
 xen/arch/x86/hvm/hvm.c         | 62 ++++++++++++++++++------------------------
 xen/arch/x86/mm/p2m.c          |  4 +--
 xen/arch/x86/monitor.c         |  7 +----
 xen/arch/x86/vm_event.c        | 16 +++++------
 xen/include/asm-x86/domain.h   | 42 +++++++++++++++++-----------
 xen/include/asm-x86/monitor.h  |  3 +-
 xen/include/asm-x86/vm_event.h | 10 -------
 9 files changed, 73 insertions(+), 84 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index bb59247..06e68ae 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v)
 
 void vcpu_destroy(struct vcpu *v)
 {
-    xfree(v->arch.vm_event);
-    v->arch.vm_event = NULL;
+    v->arch.vm_event.emulate_flags = 0;
+    xfree(v->arch.vm_event.emul_read_data);
+    v->arch.vm_event.emul_read_data = NULL;
 
     if ( is_pv_32bit_vcpu(v) )
     {
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 855af4d..68f5515 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -73,12 +73,12 @@ static int set_context_data(void *buffer, unsigned int size)
 {
     struct vcpu *curr = current;
 
-    if ( curr->arch.vm_event )
+    if ( curr->arch.vm_event.emul_read_data )
     {
         unsigned int safe_size =
-            min(size, curr->arch.vm_event->emul_read_data.size);
+            min(size, curr->arch.vm_event.emul_read_data->size);
 
-        memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size);
+        memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size);
         memset(buffer + safe_size, 0, size - safe_size);
         return X86EMUL_OKAY;
     }
@@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear(
      * vm_event being triggered for repeated writes to a whole page.
      */
     if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) &&
-         current->arch.vm_event->emulate_flags != 0 )
+         current->arch.vm_event.emulate_flags != 0 )
        max_reps = 1;
 
     /*
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 884ae40..03dffb8 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
     if ( !handle_hvm_io_completion(v) )
         return;
 
-    if ( unlikely(v->arch.vm_event) )
+    if ( unlikely(v->arch.vm_event.emulate_flags) )
     {
-        if ( v->arch.vm_event->emulate_flags )
-        {
-            enum emul_kind kind = EMUL_KIND_NORMAL;
+        enum emul_kind kind;
 
-            if ( v->arch.vm_event->emulate_flags &
-                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
-                kind = EMUL_KIND_SET_CONTEXT;
-            else if ( v->arch.vm_event->emulate_flags &
-                      VM_EVENT_FLAG_EMULATE_NOWRITE )
-                kind = EMUL_KIND_NOWRITE;
+        ASSERT(v->arch.vm_event.emul_read_data);
 
-            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
-                                       HVM_DELIVER_NO_ERROR_CODE);
+        kind = EMUL_KIND_NORMAL;
 
-            v->arch.vm_event->emulate_flags = 0;
-        }
+        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )
+            kind = EMUL_KIND_SET_CONTEXT;
+        else if ( v->arch.vm_event.emulate_flags &
+                  VM_EVENT_FLAG_EMULATE_NOWRITE )
+            kind = EMUL_KIND_NOWRITE;
+
+        hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
+                                   HVM_DELIVER_NO_ERROR_CODE);
+
+        v->arch.vm_event.emulate_flags = 0;
     }
 
     arch_monitor_write_data(v);
@@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t may_defer)
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
     {
-        ASSERT(v->arch.vm_event);
-
         if ( hvm_monitor_crX(CR0, value, old_value) )
         {
             /*
              * The actual write will occur in arch_monitor_write_data(), if
              * permitted.
              */
-            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-            v->arch.vm_event->write_data.status = MWS_CR0;
-            v->arch.vm_event->write_data.value = value;
+            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+            v->arch.vm_event.write_data.status = MWS_CR0;
+            v->arch.vm_event.write_data.value = value;
 
             return X86EMUL_OKAY;
         }
@@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
     {
-        ASSERT(v->arch.vm_event);
-
         if ( hvm_monitor_crX(CR3, value, old) )
         {
             /*
              * The actual write will occur in arch_monitor_write_data(), if
              * permitted.
              */
-            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-            v->arch.vm_event->write_data.status = MWS_CR3;
-            v->arch.vm_event->write_data.value = value;
+            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+            v->arch.vm_event.write_data.status = MWS_CR3;
+            v->arch.vm_event.write_data.value = value;
 
             return X86EMUL_OKAY;
         }
@@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t may_defer)
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
     {
-        ASSERT(v->arch.vm_event);
-
         if ( hvm_monitor_crX(CR4, value, old_cr) )
         {
             /*
              * The actual write will occur in arch_monitor_write_data(), if
              * permitted.
              */
-            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-            v->arch.vm_event->write_data.status = MWS_CR4;
-            v->arch.vm_event->write_data.value = value;
+            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+            v->arch.vm_event.write_data.status = MWS_CR4;
+            v->arch.vm_event.write_data.value = value;
 
             return X86EMUL_OKAY;
         }
@@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
 
     if ( may_defer && unlikely(monitored_msr(v->domain, msr)) )
     {
-        ASSERT(v->arch.vm_event);
-
         /*
          * The actual write will occur in arch_monitor_write_data(), if
          * permitted.
          */
-        ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-        v->arch.vm_event->write_data.status = MWS_MSR;
-        v->arch.vm_event->write_data.msr = msr;
-        v->arch.vm_event->write_data.value = msr_content;
+        ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+        v->arch.vm_event.write_data.status = MWS_MSR;
+        v->arch.vm_event.write_data.msr = msr;
+        v->arch.vm_event.write_data.value = msr_content;
 
         hvm_monitor_msr(msr, msr_content);
         return X86EMUL_OKAY;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 16733a4..9bcaa8a 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu *v,
             }
         }
 
-        v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
+        v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0;
 
         if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
-            v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
+            *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data;
     }
 }
 
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index 5c8d4da..88d14ae 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d)
 
 void arch_monitor_write_data(struct vcpu *v)
 {
-    struct monitor_write_data *w;
-
-    if ( likely(!v->arch.vm_event) )
-        return;
-
-    w = &v->arch.vm_event->write_data;
+    struct monitor_write_data *w = &v->arch.vm_event.write_data;
 
     if ( likely(MWS_NOWRITE == w->status) )
         return;
diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
index 825da48..f21ff10 100644
--- a/xen/arch/x86/vm_event.c
+++ b/xen/arch/x86/vm_event.c
@@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        if ( v->arch.vm_event )
+        if ( v->arch.vm_event.emul_read_data )
             continue;
 
-        v->arch.vm_event = xzalloc(struct arch_vm_event);
+        v->arch.vm_event.emul_read_data =
+                xzalloc(struct vm_event_emul_read_data);
 
-        if ( !v->arch.vm_event )
+        if ( !v->arch.vm_event.emul_read_data )
             return -ENOMEM;
     }
 
@@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        xfree(v->arch.vm_event);
-        v->arch.vm_event = NULL;
+        v->arch.vm_event.emulate_flags = 0;
+        xfree(v->arch.vm_event.emul_read_data);
+        v->arch.vm_event.emul_read_data = NULL;
     }
 
     d->arch.mem_access_emulate_each_rep = 0;
@@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp)
 {
     if ( rsp->flags & VM_EVENT_FLAG_DENY )
     {
-        ASSERT(v->arch.vm_event);
-
         /* deny flag requires the vCPU to be paused */
         if ( !atomic_read(&v->vm_event_pause_count) )
             return;
 
-        v->arch.vm_event->write_data.status = MWS_NOWRITE;
+        v->arch.vm_event.write_data.status = MWS_NOWRITE;
     }
 }
 
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index a22ee6b..7ea5c8f 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -259,21 +259,6 @@ struct pv_domain
     struct cpuidmasks *cpuidmasks;
 };
 
-enum monitor_write_status
-{
-    MWS_NOWRITE = 0,
-    MWS_MSR,
-    MWS_CR0,
-    MWS_CR3,
-    MWS_CR4,
-};
-
-struct monitor_write_data {
-    enum monitor_write_status status;
-    uint32_t msr;
-    uint64_t value;
-};
-
 struct arch_domain
 {
     struct page_info *perdomain_l3_pg;
@@ -496,6 +481,31 @@ typedef enum __packed {
     SMAP_CHECK_DISABLED,        /* disable the check */
 } smap_check_policy_t;
 
+enum monitor_write_status
+{
+    MWS_NOWRITE = 0,
+    MWS_MSR,
+    MWS_CR0,
+    MWS_CR3,
+    MWS_CR4,
+};
+
+struct monitor_write_data {
+    enum monitor_write_status status;
+    uint32_t msr;
+    uint64_t value;
+};
+
+/*
+ * Should we emulate the next matching instruction on VCPU resume
+ * after a vm_event?
+ */
+struct arch_vm_event {
+    uint32_t emulate_flags;
+    struct vm_event_emul_read_data *emul_read_data;
+    struct monitor_write_data write_data;
+};
+
 struct arch_vcpu
 {
     /*
@@ -569,7 +579,7 @@ struct arch_vcpu
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
 
-    struct arch_vm_event *vm_event;
+    struct arch_vm_event vm_event;
 };
 
 smap_check_policy_t smap_policy_change(struct vcpu *v,
diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
index 1068376..984ac4c 100644
--- a/xen/include/asm-x86/monitor.h
+++ b/xen/include/asm-x86/monitor.h
@@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop)
          * Enabling mem_access_emulate_each_rep without a vm_event subscriber
          * is meaningless.
          */
-        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
+        if ( d->max_vcpus && d->vcpu[0] &&
+             d->vcpu[0]->arch.vm_event.emul_read_data )
             d->arch.mem_access_emulate_each_rep = !!mop->event;
         else
             rc = -EINVAL;
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
index 026f42e..c83583d 100644
--- a/xen/include/asm-x86/vm_event.h
+++ b/xen/include/asm-x86/vm_event.h
@@ -22,16 +22,6 @@
 #include <xen/sched.h>
 #include <xen/vm_event.h>
 
-/*
- * Should we emulate the next matching instruction on VCPU resume
- * after a vm_event?
- */
-struct arch_vm_event {
-    uint32_t emulate_flags;
-    struct vm_event_emul_read_data emul_read_data;
-    struct monitor_write_data write_data;
-};
-
 int vm_event_init_domain(struct domain *d);
 
 void vm_event_cleanup_domain(struct domain *d);
-- 
2.5.0


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.