
Re: [Xen-devel] [PATCH 5/6] vpci: fix execution of long running operations



> -----Original Message-----
> From: Roger Pau Monne
> Sent: 10 October 2018 10:10
> To: xen-devel@xxxxxxxxxxxxxxxxxxxx
> Cc: Paul Durrant <Paul.Durrant@xxxxxxxxxx>; Jan Beulich
> <jbeulich@xxxxxxxx>; Andrew Cooper <Andrew.Cooper3@xxxxxxxxxx>; Wei Liu
> <wei.liu2@xxxxxxxxxx>; George Dunlap <George.Dunlap@xxxxxxxxxx>; Ian
> Jackson <Ian.Jackson@xxxxxxxxxx>; Julien Grall <julien.grall@xxxxxxx>;
> Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>; Stefano Stabellini
> <sstabellini@xxxxxxxxxx>; Tim (Xen.org) <tim@xxxxxxx>
> Subject: Re: [PATCH 5/6] vpci: fix execution of long running operations
> 
> On Tue, Oct 09, 2018 at 11:42:35AM +0200, Roger Pau Monne wrote:
> > BAR map/unmap is a long running operation that needs to be preempted
> > in order to avoid overrunning the assigned vCPU time (or even
> > triggering the watchdog).
> >
> > Current logic for this preemption is wrong, and won't work at all for
> > AMD since only Intel makes use of hvm_io_pending (and even in that
> > case the current code is wrong).
> >
> > Instead move the code that performs the mapping/unmapping to
> > handle_hvm_io_completion and use do_softirq in order to execute the
> > pending softirqs while the {un}mapping takes place. Note that
> > do_softirq can also trigger a context switch to another vCPU, so
> > having pending vpci operations shouldn't prevent fair scheduling.
> >
> > When the {un}map operation is queued (as a result of a trapped PCI
> > access) a schedule softirq is raised in order to force a context
> > switch and the execution of handle_hvm_io_completion.
> 
> The logic of this patch is wrong: calling do_softirq in order to
> preempt can lead to a stack overflow due to recursion, because the
> vCPU that calls do_softirq is not blocked and can be scheduled again,
> leading to a recursive execution of vpci_process_pending if the MMIO
> area to modify is big enough.
> 
> Instead of running those operations in vCPU context I think it's
> easier to switch to a tasklet and pause the guest vCPU until the
> BARs are mapped and memory decoding is enabled.
> 
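> In short, the intended flow is the following (a condensed sketch for
> illustration only; the full patch below is authoritative, and the
> sketch only uses the same existing helpers the patch does):
> 
>     /* At vCPU creation, set up a per-vCPU tasklet for deferred vPCI work. */
>     tasklet_init(&v->vpci.task, vpci_process_pending, (unsigned long)v);
> 
>     /*
>      * When a trapped PCI access needs a BAR {un}map, defer it: pause the
>      * vCPU without waiting for it to be descheduled and kick the tasklet.
>      */
>     vcpu_pause_nosync(curr);
>     tasklet_schedule(&curr->vpci.task);
> 
>     /*
>      * In the tasklet handler: consume the rangeset, rescheduling the
>      * tasklet on -ERESTART, and unpause the vCPU only once the {un}map
>      * has finished and memory decoding has been updated.
>      */
>     vcpu_unpause(v);
> 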
> Below is the updated patch.
> 
> ---8<---
> From 5c9f8802c46594a39de776d2414210fb21127b78 Mon Sep 17 00:00:00 2001
> From: Roger Pau Monne <roger.pau@xxxxxxxxxx>
> Date: Wed, 10 Oct 2018 11:04:45 +0200
> Subject: [PATCH 5/6] vpci: fix execution of long running operations
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> BAR map/unmap is a long running operation that needs to be preempted
> in order to avoid overrunning the assigned vCPU time (or even
> triggering the watchdog).
> 
> Current logic for this preemption is wrong, and won't work at all for
> AMD since only Intel makes use of hvm_io_pending (and even in that
> case the current code is wrong).
> 
> Instead move the code that performs the mapping/unmapping to
> a tasklet and pause the guest vCPU until the {un}mapping is done and
> the memory decoding bit has been toggled.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>

ioreq modification...

Reviewed-by: Paul Durrant <paul.durrant@xxxxxxxxxx>

> ---
> Cc: Paul Durrant <paul.durrant@xxxxxxxxxx>
> Cc: Jan Beulich <jbeulich@xxxxxxxx>
> Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Cc: Wei Liu <wei.liu2@xxxxxxxxxx>
> Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>
> Cc: Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
> Cc: Julien Grall <julien.grall@xxxxxxx>
> Cc: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
> Cc: Stefano Stabellini <sstabellini@xxxxxxxxxx>
> Cc: Tim Deegan <tim@xxxxxxx>
> ---
>  xen/arch/x86/hvm/ioreq.c  |  3 --
>  xen/common/domain.c       |  2 ++
>  xen/drivers/vpci/header.c | 70 +++++++++++++++++++++++----------------
>  xen/include/xen/vpci.h    | 15 +++------
>  4 files changed, 48 insertions(+), 42 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
> index 3569beaad5..31429265f8 100644
> --- a/xen/arch/x86/hvm/ioreq.c
> +++ b/xen/arch/x86/hvm/ioreq.c
> @@ -85,9 +85,6 @@ bool hvm_io_pending(struct vcpu *v)
>      struct hvm_ioreq_server *s;
>      unsigned int id;
> 
> -    if ( has_vpci(d) && vpci_process_pending(v) )
> -        return true;
> -
>      FOR_EACH_IOREQ_SERVER(d, id, s)
>      {
>          struct hvm_ioreq_vcpu *sv;
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 65151e2ac4..54d2c26bd9 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -153,6 +153,8 @@ struct vcpu *vcpu_create(
> 
>      grant_table_init_vcpu(v);
> 
> +    vpci_init_vcpu(v);
> +
>      if ( !zalloc_cpumask_var(&v->cpu_hard_affinity) ||
>           !zalloc_cpumask_var(&v->cpu_hard_affinity_tmp) ||
>           !zalloc_cpumask_var(&v->cpu_hard_affinity_saved) ||
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index 9234de9b26..4af85d3c02 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -118,39 +118,48 @@ static void modify_decoding(const struct pci_dev *pdev, bool map, bool rom_only)
>                       cmd);
>  }
> 
> -bool vpci_process_pending(struct vcpu *v)
> +static void vpci_process_pending(unsigned long data)
>  {
> -    if ( v->vpci.mem )
> +    struct vcpu *v = (void *)data;
> +    struct map_data map_data = {
> +        .d = v->domain,
> +        .map = v->vpci.map,
> +    };
> +    int rc;
> +
> +    ASSERT(v->vpci.mem && atomic_read(&v->pause_count));
> +
> +    while ( (rc = rangeset_consume_ranges(v->vpci.mem, map_range,
> +                                          &map_data)) == -ERESTART )
>      {
> -        struct map_data data = {
> -            .d = v->domain,
> -            .map = v->vpci.map,
> -        };
> -        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
> -
> -        if ( rc == -ERESTART )
> -            return true;
> -
> -        spin_lock(&v->vpci.pdev->vpci->lock);
> -        /* Disable memory decoding unconditionally on failure. */
> -        modify_decoding(v->vpci.pdev, !rc && v->vpci.map,
> -                        !rc && v->vpci.rom_only);
> -        spin_unlock(&v->vpci.pdev->vpci->lock);
> -
> -        rangeset_destroy(v->vpci.mem);
> -        v->vpci.mem = NULL;
> -        if ( rc )
> -            /*
> -             * FIXME: in case of failure remove the device from the domain.
> -             * Note that there might still be leftover mappings. While this is
> -             * safe for Dom0, for DomUs the domain will likely need to be
> -             * killed in order to avoid leaking stale p2m mappings on
> -             * failure.
> -             */
> -            vpci_remove_device(v->vpci.pdev);
> +        tasklet_schedule(&v->vpci.task);
> +        return;
>      }
> 
> -    return false;
> +    spin_lock(&v->vpci.pdev->vpci->lock);
> +    /* Disable memory decoding unconditionally on failure. */
> +    modify_decoding(v->vpci.pdev, !rc && v->vpci.map,
> +                    !rc && v->vpci.rom_only);
> +    spin_unlock(&v->vpci.pdev->vpci->lock);
> +
> +    rangeset_destroy(v->vpci.mem);
> +    v->vpci.mem = NULL;
> +    if ( rc )
> +        /*
> +         * FIXME: in case of failure remove the device from the domain.
> +         * Note that there might still be leftover mappings. While this is
> +         * safe for Dom0, for DomUs the domain will likely need to be
> +         * killed in order to avoid leaking stale p2m mappings on
> +         * failure.
> +         */
> +        vpci_remove_device(v->vpci.pdev);
> +
> +    vcpu_unpause(v);
> +}
> +
> +void vpci_init_vcpu(struct vcpu *v)
> +{
> +    tasklet_init(&v->vpci.task, vpci_process_pending, (unsigned long)v);
>  }
> 
>  static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> @@ -183,6 +192,9 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>      curr->vpci.mem = mem;
>      curr->vpci.map = map;
>      curr->vpci.rom_only = rom_only;
> +    /* Pause the vCPU and schedule the tasklet that will perform the mapping. */
> +    vcpu_pause_nosync(curr);
> +    tasklet_schedule(&curr->vpci.task);
>  }
> 
>  static int modify_bars(const struct pci_dev *pdev, bool map, bool rom_only)
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index af2b8580ee..f97c48a8f1 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -49,11 +49,8 @@ uint32_t vpci_hw_read16(const struct pci_dev *pdev, unsigned int reg,
>  uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
>                          void *data);
> 
> -/*
> - * Check for pending vPCI operations on this vcpu. Returns true if the
> vcpu
> - * should not run.
> - */
> -bool __must_check vpci_process_pending(struct vcpu *v);
> +/* Init per-vCPU vPCI data. */
> +void vpci_init_vcpu(struct vcpu *v);
> 
>  struct vpci {
>      /* List of vPCI handlers for a device. */
> @@ -142,6 +139,8 @@ struct vpci {
>  };
> 
>  struct vpci_vcpu {
> +    /* Per-vCPU tasklet to enqueue pending operations. */
> +    struct tasklet task;
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
>      struct rangeset *mem;
>      struct pci_dev *pdev;
> @@ -233,11 +232,7 @@ static inline void vpci_write(pci_sbdf_t sbdf, unsigned int reg,
>      ASSERT_UNREACHABLE();
>  }
> 
> -static inline bool vpci_process_pending(struct vcpu *v)
> -{
> -    ASSERT_UNREACHABLE();
> -    return false;
> -}
> +static inline void vpci_init_vcpu(struct vcpu *v) { }
>  #endif
> 
>  #endif
> --
> 2.19.0

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 

