|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH 5/8] x86/vm-event/monitor: don't compromise monitor_write_data on domain cleanup
On 06/30/16 21:45, Corneliu ZUZU wrote:
> The arch_vm_event structure is dynamically allocated and freed @
> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack
> user
> disables domain monitoring (xc_monitor_disable), which in turn effectively
> discards any information that was in arch_vm_event.write_data.
>
> But this can yield unexpected behavior: if a CR-write was waiting to be
> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
> before xc_monitor_disable is called, then the domain CR write is wrongfully
> ignored, which of course, in these cases, can easily lead to a domain crash.
>
> To fix the issue, this patch makes only arch_vm_event.emul_read_data
> dynamically
> allocated instead of the whole arch_vm_event structure. With this we can avoid
> invalidation of a pending arch_vm_event.write_data by selectively cleaning
> up
> only emul_read_data and emulate_flags @ vm_event_cleanup_domain.
>
> Small note: arch_vm_event structure definition needed to be moved from
> asm-x86/vm_event.h to asm-x86/domain.h in the process.
>
> Signed-off-by: Corneliu ZUZU <czuzu@xxxxxxxxxxxxxxx>
> ---
> xen/arch/x86/domain.c | 5 ++--
> xen/arch/x86/hvm/emulate.c | 8 +++---
> xen/arch/x86/hvm/hvm.c | 62
> ++++++++++++++++++------------------------
> xen/arch/x86/mm/p2m.c | 4 +--
> xen/arch/x86/monitor.c | 7 +----
> xen/arch/x86/vm_event.c | 16 +++++------
> xen/include/asm-x86/domain.h | 42 +++++++++++++++++-----------
> xen/include/asm-x86/monitor.h | 3 +-
> xen/include/asm-x86/vm_event.h | 10 -------
> 9 files changed, 73 insertions(+), 84 deletions(-)
>
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index bb59247..06e68ae 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v)
>
> void vcpu_destroy(struct vcpu *v)
> {
> - xfree(v->arch.vm_event);
> - v->arch.vm_event = NULL;
> + v->arch.vm_event.emulate_flags = 0;
> + xfree(v->arch.vm_event.emul_read_data);
> + v->arch.vm_event.emul_read_data = NULL;
>
> if ( is_pv_32bit_vcpu(v) )
> {
> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
> index 855af4d..68f5515 100644
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -73,12 +73,12 @@ static int set_context_data(void *buffer, unsigned int
> size)
> {
> struct vcpu *curr = current;
>
> - if ( curr->arch.vm_event )
> + if ( curr->arch.vm_event.emul_read_data )
> {
> unsigned int safe_size =
> - min(size, curr->arch.vm_event->emul_read_data.size);
> + min(size, curr->arch.vm_event.emul_read_data->size);
>
> - memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size);
> + memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size);
> memset(buffer + safe_size, 0, size - safe_size);
> return X86EMUL_OKAY;
> }
> @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear(
> * vm_event being triggered for repeated writes to a whole page.
> */
> if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) &&
> - current->arch.vm_event->emulate_flags != 0 )
> + current->arch.vm_event.emulate_flags != 0 )
> max_reps = 1;
>
> /*
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 884ae40..03dffb8 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
> if ( !handle_hvm_io_completion(v) )
> return;
>
> - if ( unlikely(v->arch.vm_event) )
> + if ( unlikely(v->arch.vm_event.emulate_flags) )
> {
> - if ( v->arch.vm_event->emulate_flags )
> - {
> - enum emul_kind kind = EMUL_KIND_NORMAL;
> + enum emul_kind kind;
>
> - if ( v->arch.vm_event->emulate_flags &
> - VM_EVENT_FLAG_SET_EMUL_READ_DATA )
> - kind = EMUL_KIND_SET_CONTEXT;
> - else if ( v->arch.vm_event->emulate_flags &
> - VM_EVENT_FLAG_EMULATE_NOWRITE )
> - kind = EMUL_KIND_NOWRITE;
> + ASSERT(v->arch.vm_event.emul_read_data);
>
> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
> - HVM_DELIVER_NO_ERROR_CODE);
> + kind = EMUL_KIND_NORMAL;
Why is the "kind = EMUL_KIND_NORMAL;" assignment done separately after the
ASSERT()? Could it not be left the same as before ("enum emul_kind
kind = EMUL_KIND_NORMAL;") above the ASSERT()?
It's not a big change and I won't hold the patch over it, but small
changes add up in the review process so unnecessary changes are best
either avoided, or done in a standalone cleanup patch.
> - v->arch.vm_event->emulate_flags = 0;
> - }
> + if ( v->arch.vm_event.emulate_flags &
> VM_EVENT_FLAG_SET_EMUL_READ_DATA )
> + kind = EMUL_KIND_SET_CONTEXT;
> + else if ( v->arch.vm_event.emulate_flags &
> + VM_EVENT_FLAG_EMULATE_NOWRITE )
> + kind = EMUL_KIND_NOWRITE;
> +
> + hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
> + HVM_DELIVER_NO_ERROR_CODE);
> +
> + v->arch.vm_event.emulate_flags = 0;
> }
>
> arch_monitor_write_data(v);
> @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t may_defer)
> if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled
> &
> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
> {
> - ASSERT(v->arch.vm_event);
> -
> if ( hvm_monitor_crX(CR0, value, old_value) )
> {
> /*
> * The actual write will occur in arch_monitor_write_data(), if
> * permitted.
> */
> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> - v->arch.vm_event->write_data.status = MWS_CR0;
> - v->arch.vm_event->write_data.value = value;
> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> + v->arch.vm_event.write_data.status = MWS_CR0;
> + v->arch.vm_event.write_data.value = value;
>
> return X86EMUL_OKAY;
> }
> @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
> if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled
> &
> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> {
> - ASSERT(v->arch.vm_event);
> -
> if ( hvm_monitor_crX(CR3, value, old) )
> {
> /*
> * The actual write will occur in arch_monitor_write_data(), if
> * permitted.
> */
> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> - v->arch.vm_event->write_data.status = MWS_CR3;
> - v->arch.vm_event->write_data.value = value;
> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> + v->arch.vm_event.write_data.status = MWS_CR3;
> + v->arch.vm_event.write_data.value = value;
>
> return X86EMUL_OKAY;
> }
> @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t may_defer)
> if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled
> &
> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
> {
> - ASSERT(v->arch.vm_event);
> -
> if ( hvm_monitor_crX(CR4, value, old_cr) )
> {
> /*
> * The actual write will occur in arch_monitor_write_data(), if
> * permitted.
> */
> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> - v->arch.vm_event->write_data.status = MWS_CR4;
> - v->arch.vm_event->write_data.value = value;
> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> + v->arch.vm_event.write_data.status = MWS_CR4;
> + v->arch.vm_event.write_data.value = value;
>
> return X86EMUL_OKAY;
> }
> @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr,
> uint64_t msr_content,
>
> if ( may_defer && unlikely(monitored_msr(v->domain, msr)) )
> {
> - ASSERT(v->arch.vm_event);
> -
> /*
> * The actual write will occur in arch_monitor_write_data(), if
> * permitted.
> */
> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> - v->arch.vm_event->write_data.status = MWS_MSR;
> - v->arch.vm_event->write_data.msr = msr;
> - v->arch.vm_event->write_data.value = msr_content;
> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> + v->arch.vm_event.write_data.status = MWS_MSR;
> + v->arch.vm_event.write_data.msr = msr;
> + v->arch.vm_event.write_data.value = msr_content;
>
> hvm_monitor_msr(msr, msr_content);
> return X86EMUL_OKAY;
> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 16733a4..9bcaa8a 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu *v,
> }
> }
>
> - v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
> + v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0;
>
> if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
> - v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
> + *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data;
> }
> }
>
> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
> index 5c8d4da..88d14ae 100644
> --- a/xen/arch/x86/monitor.c
> +++ b/xen/arch/x86/monitor.c
> @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d)
>
> void arch_monitor_write_data(struct vcpu *v)
> {
> - struct monitor_write_data *w;
> -
> - if ( likely(!v->arch.vm_event) )
> - return;
> -
> - w = &v->arch.vm_event->write_data;
> + struct monitor_write_data *w = &v->arch.vm_event.write_data;
>
> if ( likely(MWS_NOWRITE == w->status) )
> return;
> diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
> index 825da48..f21ff10 100644
> --- a/xen/arch/x86/vm_event.c
> +++ b/xen/arch/x86/vm_event.c
> @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d)
>
> for_each_vcpu ( d, v )
> {
> - if ( v->arch.vm_event )
> + if ( v->arch.vm_event.emul_read_data )
> continue;
>
> - v->arch.vm_event = xzalloc(struct arch_vm_event);
> + v->arch.vm_event.emul_read_data =
> + xzalloc(struct vm_event_emul_read_data);
>
> - if ( !v->arch.vm_event )
> + if ( !v->arch.vm_event.emul_read_data )
> return -ENOMEM;
> }
>
> @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d)
>
> for_each_vcpu ( d, v )
> {
> - xfree(v->arch.vm_event);
> - v->arch.vm_event = NULL;
> + v->arch.vm_event.emulate_flags = 0;
> + xfree(v->arch.vm_event.emul_read_data);
> + v->arch.vm_event.emul_read_data = NULL;
> }
>
> d->arch.mem_access_emulate_each_rep = 0;
> @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu *v,
> vm_event_response_t *rsp)
> {
> if ( rsp->flags & VM_EVENT_FLAG_DENY )
> {
> - ASSERT(v->arch.vm_event);
> -
> /* deny flag requires the vCPU to be paused */
> if ( !atomic_read(&v->vm_event_pause_count) )
> return;
>
> - v->arch.vm_event->write_data.status = MWS_NOWRITE;
> + v->arch.vm_event.write_data.status = MWS_NOWRITE;
> }
> }
>
> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
> index a22ee6b..7ea5c8f 100644
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -259,21 +259,6 @@ struct pv_domain
> struct cpuidmasks *cpuidmasks;
> };
>
> -enum monitor_write_status
> -{
> - MWS_NOWRITE = 0,
> - MWS_MSR,
> - MWS_CR0,
> - MWS_CR3,
> - MWS_CR4,
> -};
> -
> -struct monitor_write_data {
> - enum monitor_write_status status;
> - uint32_t msr;
> - uint64_t value;
> -};
> -
> struct arch_domain
> {
> struct page_info *perdomain_l3_pg;
> @@ -496,6 +481,31 @@ typedef enum __packed {
> SMAP_CHECK_DISABLED, /* disable the check */
> } smap_check_policy_t;
>
> +enum monitor_write_status
> +{
> + MWS_NOWRITE = 0,
> + MWS_MSR,
> + MWS_CR0,
> + MWS_CR3,
> + MWS_CR4,
> +};
> +
> +struct monitor_write_data {
> + enum monitor_write_status status;
> + uint32_t msr;
> + uint64_t value;
> +};
> +
> +/*
> + * Should we emulate the next matching instruction on VCPU resume
> + * after a vm_event?
> + */
> +struct arch_vm_event {
> + uint32_t emulate_flags;
> + struct vm_event_emul_read_data *emul_read_data;
> + struct monitor_write_data write_data;
> +};
> +
> struct arch_vcpu
> {
> /*
> @@ -569,7 +579,7 @@ struct arch_vcpu
> /* A secondary copy of the vcpu time info. */
> XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
>
> - struct arch_vm_event *vm_event;
> + struct arch_vm_event vm_event;
> };
>
> smap_check_policy_t smap_policy_change(struct vcpu *v,
> diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
> index 1068376..984ac4c 100644
> --- a/xen/include/asm-x86/monitor.h
> +++ b/xen/include/asm-x86/monitor.h
> @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct
> xen_domctl_monitor_op *mop)
> * Enabling mem_access_emulate_each_rep without a vm_event subscriber
> * is meaningless.
> */
> - if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
> + if ( d->max_vcpus && d->vcpu[0] &&
> + d->vcpu[0]->arch.vm_event.emul_read_data )
Again, I won't hold the patch over this, but if there are additional
reviews that require changes and cause another version of it, please add
a small line to the comment above the if, stating that emul_read_data
only gets allocated when vm_event gets enabled, otherwise (especially
for newcomers) that check might look confusing.
Otherwise:
Acked-by: Razvan Cojocaru <rcojocaru@xxxxxxxxxxxxxxx>
Thanks,
Razvan
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |