
Re: [Xen-devel] [PATCH v9 3/5] xen/mem_sharing: VM forking



On Fri, Feb 21, 2020 at 10:49:21AM -0800, Tamas K Lengyel wrote:
> VM forking is the process of creating a domain with an empty memory space
> and a parent domain specified from which to populate the memory when
> necessary. For the new domain to be functional the VM state is copied over
> as part of the fork operation (HVM params, hap allocation, etc).
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@xxxxxxxxx>
> ---
> v9: remove stale header
>     fix tsc incarnation being bumped on set
> ---
>  xen/arch/x86/domain.c             |  11 ++
>  xen/arch/x86/hvm/hvm.c            |   2 +-
>  xen/arch/x86/mm/mem_sharing.c     | 222 ++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/p2m.c             |  11 +-
>  xen/include/asm-x86/mem_sharing.h |  17 +++
>  xen/include/public/memory.h       |   5 +
>  xen/include/xen/sched.h           |   2 +
>  7 files changed, 268 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index fe63c23676..1ab0ca0942 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2203,6 +2203,17 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count
> +             * and release the domain.
> +             */
> +            if ( d->parent )
> +            {
> +                domain_unpause(d->parent);
> +                put_domain(d->parent);
> +                d->parent = NULL;
> +            }
>          }
>  #endif
>  
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index a339b36a0d..9bfa603f8c 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1915,7 +1915,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>      }
>  #endif
>  
> -    /* Spurious fault? PoD and log-dirty also take this path. */
> +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
>      if ( p2m_is_ram(p2mt) )
>      {
>          rc = 1;
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 3835bc928f..ad5db9d8d5 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -22,6 +22,7 @@
>  
>  #include <xen/types.h>
>  #include <xen/domain_page.h>
> +#include <xen/event.h>
>  #include <xen/spinlock.h>
>  #include <xen/rwlock.h>
>  #include <xen/mm.h>
> @@ -36,6 +37,9 @@
>  #include <asm/altp2m.h>
>  #include <asm/atomic.h>
>  #include <asm/event.h>
> +#include <asm/hap.h>
> +#include <asm/hvm/hvm.h>
> +#include <asm/hvm/save.h>
>  #include <xsm/xsm.h>
>  
>  #include "mm-locks.h"
> @@ -1444,6 +1448,194 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
>      return 0;
>  }
>  
> +/*
> + * Forking a page only gets called when the VM faults due to no entry being
> + * in the EPT for the access. Depending on the type of access we either
> + * populate the physmap with a shared entry for read-only access or
> + * fork the page if its a write access.
> + *
> + * The client p2m is already locked so we only need to lock
> + * the parent's here.
> + */
> +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> +{
> +    int rc = -ENOENT;
> +    shr_handle_t handle;
> +    struct domain *parent;

Can you constify parent? I assume no changes are made to the parent
domain, just to the forked one.

> +    struct p2m_domain *p2m;
> +    unsigned long gfn_l = gfn_x(gfn);
> +    mfn_t mfn, new_mfn;
> +    p2m_type_t p2mt;
> +    struct page_info *page;
> +
> +    if ( !mem_sharing_is_fork(d) )
> +        return -ENOENT;
> +
> +    parent = d->parent;

You can initialize at declaration time.
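Ie. (constified per the comment above; assumes the helpers called on the
parent can take a const pointer):

    const struct domain *parent = d->parent;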

> +
> +    if ( !unsharing )
> +    {
> +        /* For read-only accesses we just add a shared entry to the physmap */
> +        while ( parent )
> +        {
> +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> +                break;
> +
> +            parent = parent->parent;
> +        }
> +
> +        if ( !rc )
> +        {
> +            /* The client's p2m is already locked */
> +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
> +
> +            p2m_lock(pp2m);
> +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> +            p2m_unlock(pp2m);
> +
> +            if ( !rc )
> +                return 0;
> +        }

Don't you need to return here, or else you will fall back into the
unsharing path?
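If the fallthrough to the fork path below is not intended, something like
this right after the if block would do (untested):

    if ( rc )
        return rc;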

> +    }
> +
> +    /*
> +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> +     * the physmap failed we'll fork the page directly.
> +     */
> +    p2m = p2m_get_hostp2m(d);
> +    parent = d->parent;
> +
> +    while ( parent )
> +    {
> +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> +
> +        if ( mfn_valid(mfn) && p2m_is_any_ram(p2mt) )

This would also cover grants, but I'm not sure how those are handled
by forking, as access to those is granted on a per-domain basis. Ie:
the parent will have access to the grant, but not the child.

> +            break;
> +
> +        put_gfn(parent, gfn_l);
> +        parent = parent->parent;
> +    }
> +
> +    if ( !parent )
> +        return -ENOENT;
> +
> +    if ( !(page = alloc_domheap_page(d, 0)) )
> +    {
> +        put_gfn(parent, gfn_l);
> +        return -ENOMEM;
> +    }
> +
> +    new_mfn = page_to_mfn(page);
> +    copy_domain_page(new_mfn, mfn);
> +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +    put_gfn(parent, gfn_l);
> +
> +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,

So the child p2m is going to be populated using 4K pages exclusively?
Maybe it would make sense to try to use 2M if the parent domain page
is also a 2M page or larger?
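Something along these lines, maybe (a completely untested sketch; it would
also need gfn/mfn alignment checks and a set_gpfn_from_mfn() call for each
4K frame):

    unsigned int page_order;
    p2m_access_t p2ma;
    struct p2m_domain *pp2m = p2m_get_hostp2m(parent);

    mfn = pp2m->get_entry(pp2m, gfn, &p2mt, &p2ma, 0, &page_order, NULL);

    if ( page_order >= PAGE_ORDER_2M &&
         (page = alloc_domheap_pages(d, PAGE_ORDER_2M, 0)) != NULL )
    {
        unsigned int i;

        new_mfn = page_to_mfn(page);
        /* Copy each 4K frame backing the parent's 2M mapping. */
        for ( i = 0; i < (1U << PAGE_ORDER_2M); i++ )
            copy_domain_page(mfn_add(new_mfn, i), mfn_add(mfn, i));

        return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_2M, p2m_ram_rw,
                              p2m->default_access, -1);
    }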

> +                          p2m->default_access, -1);
> +}
> +
> +static int bring_up_vcpus(struct domain *cd, struct cpupool *cpupool)
> +{
> +    int ret;
> +    unsigned int i;
> +
> +    if ( (ret = cpupool_move_domain(cd, cpupool)) )
> +        return ret;
> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        if ( cd->vcpu[i] )
> +            continue;
> +
> +        if ( !vcpu_create(cd, i) )
> +            return -EINVAL;
> +    }
> +
> +    domain_update_node_affinity(cd);
> +    return 0;
> +}
> +
> +static int fork_hap_allocation(struct domain *cd, struct domain *d)

Can you constify the parent domain?

Also, cd and d are not very helpful names; I would just use child and
parent.

> +{
> +    int rc;
> +    bool preempted;
> +    unsigned long mb = hap_get_allocation(d);
> +
> +    if ( mb == hap_get_allocation(cd) )
> +        return 0;
> +
> +    paging_lock(cd);
> +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> +    paging_unlock(cd);
> +
> +    if ( rc )
> +        return rc;
> +
> +    if ( preempted )
> +        return -ERESTART;
> +
> +    return 0;

You can join all the checks into a single return:

return preempted ? -ERESTART : rc;

> +}
> +
> +static void fork_tsc(struct domain *cd, struct domain *d)
> +{
> +    uint32_t tsc_mode;
> +    uint32_t gtsc_khz;
> +    uint32_t incarnation;
> +    uint64_t elapsed_nsec;
> +
> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> +    /* Don't bump incarnation on set */
> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);

Why is the incarnation not bumped?

> +}
> +
> +static int mem_sharing_fork(struct domain *d, struct domain *cd)
> +{
> +    int rc = -EINVAL;
> +
> +    if ( !cd->controller_pause_count )
> +        return rc;

-EBUSY might be better here.

> +
> +    /*
> +     * We only want to get and pause the parent once, not each time this
> +     * operation is restarted due to preemption.
> +     */
> +    if ( !cd->parent_paused )
> +    {
> +        ASSERT(get_domain(d));

We are trying to avoid such constructs; instead I suggest:

if ( !get_domain(parent) )
{
    ASSERT_UNREACHABLE();
    return -EBUSY;
}

> +        domain_pause(d);
> +
> +        cd->parent_paused = true;
> +        cd->max_pages = d->max_pages;
> +        cd->max_vcpus = d->max_vcpus;
> +    }
> +
> +    /* this is preemptible so it's the first to get done */
> +    if ( (rc = fork_hap_allocation(cd, d)) )
> +        goto done;
> +
> +    if ( (rc = bring_up_vcpus(cd, d->cpupool)) )
> +        goto done;
> +
> +    if ( (rc = hvm_copy_context_and_params(cd, d)) )
> +        goto done;
> +
> +    fork_tsc(cd, d);
> +
> +    cd->parent = d;
> +
> + done:
> +    if ( rc && rc != -ERESTART )
> +    {
> +        domain_unpause(d);
> +        put_domain(d);
> +        cd->parent_paused = false;
> +    }
> +
> +    return rc;
> +}
> +
>  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
>  {
>      int rc;
> @@ -1698,6 +1890,36 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
>          rc = debug_gref(d, mso.u.debug.u.gref);
>          break;
>  
> +    case XENMEM_sharing_op_fork:
> +    {
> +        struct domain *pd;
> +
> +        rc = -EINVAL;
> +        if ( mso.u.fork._pad[0] || mso.u.fork._pad[1] ||
> +             mso.u.fork._pad[2] )
> +            goto out;
> +
> +        rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
> +                                               &pd);
> +        if ( rc )
> +            goto out;
> +
> +        if ( !mem_sharing_enabled(pd) )
> +        {
> +            if ( (rc = mem_sharing_control(pd, true)) )
> +                goto out;

Please join both conditions:

if ( !mem_sharing_enabled(pd) &&
     (rc = mem_sharing_control(pd, true)) )
    goto out;

> +        }
> +
> +        rc = mem_sharing_fork(pd, d);
> +
> +        if ( rc == -ERESTART )
> +            rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
> +                                               "lh", XENMEM_sharing_op,
> +                                               arg);
> +        rcu_unlock_domain(pd);
> +        break;
> +    }
> +
>      default:
>          rc = -ENOSYS;
>          break;
> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index c5f428d67c..7c4d2fd7a0 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -509,6 +509,14 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, 
> unsigned long gfn_l,
>  
>      mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>  
> +    /* Check if we need to fork the page */
> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> +         !mem_sharing_fork_page(p2m->domain, gfn, !!(q & P2M_UNSHARE)) )
> +    {
> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> +    }

No need for the braces.

Thanks, Roger.
