
Re: [Xen-devel] [RFC PATCH 3/8]: PVH: memory manager and paging related changes



On Thu, 2012-08-16 at 02:02 +0100, Mukesh Rathor wrote:
> 
> ---
>  arch/x86/xen/mmu.c              |  179 ++++++++++++++++++++++++++++++++++++---
>  arch/x86/xen/mmu.h              |    2 +
>  include/xen/interface/memory.h  |   27 ++++++-
>  include/xen/interface/physdev.h |   10 ++
>  include/xen/xen-ops.h           |    7 ++
>  5 files changed, 211 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index b65a761..44a6477 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -330,6 +330,38 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval)
>         __xen_set_pte(ptep, pteval);
>  }
> 
> +/* This for PV guest in hvm container */

Shall I stop mentioning uses of "HVM" in the context of PVH? ;-)

> +void xen_set_clr_mmio_pvh_pte(unsigned long pfn, unsigned long mfn,
> +                             int nr_mfns, int add_mapping)
> +{
> +       int rc;
> +       struct physdev_map_iomem iomem;
> +
> +       iomem.first_gfn = pfn;
> +       iomem.first_mfn = mfn;
> +       iomem.nr_mfns = nr_mfns;
> +       iomem.add_mapping = add_mapping;
> +
> +       rc = HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem);
> +       BUG_ON(rc);

It's purely a matter of taste but we would tend to write
        if (HYPERVISOR_foo(...))
                BUG();
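
With the helper in this patch that would look something like (untested,
just to illustrate the shape):

        struct physdev_map_iomem iomem = {
                .first_gfn   = pfn,
                .first_mfn   = mfn,
                .nr_mfns     = nr_mfns,
                .add_mapping = add_mapping,
        };

        if (HYPERVISOR_physdev_op(PHYSDEVOP_pvh_map_iomem, &iomem))
                BUG();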

> +}
> +
> +/* This for PV guest in hvm container.
> + * We need this because during boot early_ioremap path eventually calls
> + * set_pte that maps io space. Also, ACPI pages are not mapped into to the
> + * EPT during dom0 creation. The pages are mapped initially here from
> + * kernel_physical_mapping_init() then later the memtype is changed.  */
> +static void xen_dom0pvh_set_pte(pte_t *ptep, pte_t pteval)
> +{
> +       native_set_pte(ptep, pteval);
> +}
> +
> +static void xen_dom0pvh_set_pte_at(struct mm_struct *mm, unsigned long addr,
> +                                  pte_t *ptep, pte_t pteval)
> +{
> +       native_set_pte(ptep, pteval);
> +}

I think Stefano already asked why not just use native_* as the hook.
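
If they really are pure pass-throughs then presumably the hooks could just
be wired up directly, e.g. (untested):

        pv_mmu_ops.set_pte = native_set_pte;
        pv_mmu_ops.set_pte_at = native_set_pte_at;

and the two xen_dom0pvh_* wrappers could be dropped.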

> +
>  static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
>                     pte_t *ptep, pte_t pteval)
>  {
> @@ -1197,6 +1229,10 @@ static void xen_post_allocator_init(void);
>  static void __init xen_pagetable_setup_done(pgd_t *base)
>  {
>         xen_setup_shared_info();
> +
> +       if (xen_pvh_domain())
> +               return;
> +
>         xen_post_allocator_init();
>  }
> 
> @@ -1652,6 +1688,10 @@ static void set_page_prot(void *addr, pgprot_t prot)
>         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
>         pte_t pte = pfn_pte(pfn, prot);
> 
> +       /* for PVH, page tables are native. */
> +       if (xen_pvh_domain())
> +               return;
> +
>         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
>                 BUG();
>  }
> @@ -1745,6 +1785,7 @@ static void convert_pfn_mfn(void *v)
>   * but that's enough to get __va working.  We need to fill in the rest
>   * of the physical mapping once some sort of allocator has been set
>   * up.
> + * NOTE: for PVH, the page tables are native with HAP required.

OOI does this mean shadow doesn't work?

> @@ -1761,10 +1802,12 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
>         /* Zap identity mapping */
>         init_level4_pgt[0] = __pgd(0);
> 
> -       /* Pre-constructed entries are in pfn, so convert to mfn */
> -       convert_pfn_mfn(init_level4_pgt);
> -       convert_pfn_mfn(level3_ident_pgt);
> -       convert_pfn_mfn(level3_kernel_pgt);
> +       if (!xen_pvh_domain()) {

Does this test really mean if (XENFEAT_auto_translated_physmap)?

And maybe it fits more logically in convert_pfn_mfn itself?
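
i.e. something like (untested, and assuming the rest of convert_pfn_mfn is
unchanged):

        static void convert_pfn_mfn(void *v)
        {
                pte_t *pte = v;
                int i;

                /* Auto-translated guests keep pfns in their page tables. */
                if (xen_feature(XENFEAT_auto_translated_physmap))
                        return;

                /* All levels are converted the same way, so just treat them
                 * as ptes. */
                for (i = 0; i < PTRS_PER_PTE; i++)
                        pte[i] = xen_make_pte(pte[i].pte);
        }

which would let the callers stay unconditional.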

> +               /* Pre-constructed entries are in pfn, so convert to mfn */
> +               convert_pfn_mfn(init_level4_pgt);
> +               convert_pfn_mfn(level3_ident_pgt);
> +               convert_pfn_mfn(level3_kernel_pgt);
> +       }
> 
>         l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
>         l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
> @@ -1787,12 +1830,14 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
>         set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
>         set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
> 
> -       /* Pin down new L4 */
> -       pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
> -                         PFN_DOWN(__pa_symbol(init_level4_pgt)));
> +       if (!xen_pvh_domain()) {

XENFEAT_writable_page_tables?

And inside pin_pagetable_pfn perhaps?
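
Roughly (untested, and with whichever feature flag turns out to be the
right test):

        static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
        {
                struct mmuext_op op;

                /* PVH/auto-translated guests never pin their page tables. */
                if (xen_feature(XENFEAT_auto_translated_physmap))
                        return;

                op.cmd = cmd;
                op.arg1.mfn = pfn_to_mfn(pfn);
                if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
                        BUG();
        }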


> +               /* Pin down new L4 */
> +               pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
> +                               PFN_DOWN(__pa_symbol(init_level4_pgt)));
> 
> -       /* Unpin Xen-provided one */
> -       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
> +               /* Unpin Xen-provided one */
> +               pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
> +       }
> 
>         /* Switch over */
>         pgd = init_level4_pgt;
> @@ -1802,9 +1847,13 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
>          * structure to attach it to, so make sure we just set kernel
>          * pgd.
>          */
> -       xen_mc_batch();
> -       __xen_write_cr3(true, __pa(pgd));
> -       xen_mc_issue(PARAVIRT_LAZY_CPU);

Not your issue but I wonder why we do this weird looking singleton
batch?

> +       if (xen_pvh_domain()) {
> +               native_write_cr3(__pa(pgd));
> +       } else {
> +               xen_mc_batch();
> +               __xen_write_cr3(true, __pa(pgd));
> +               xen_mc_issue(PARAVIRT_LAZY_CPU);
> +       }
> 
>         memblock_reserve(__pa(xen_start_info->pt_base),
>                          xen_start_info->nr_pt_frames * PAGE_SIZE);
> @@ -2067,9 +2116,21 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
> 
>  void __init xen_init_mmu_ops(void)
>  {
> +       x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;

I'd be tempted to leave this below, where it fits in alongside the
others and duplicate it in the if as necessary.

Given that the two cases are basically non-overlapping perhaps you want
xen_init_(pv,pvh)_mmu_ops as separate hooks?

Looking back the same might apply to the pagetable_setup_done hook?
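
Just to sketch what I mean (untested, the caller would pick one or the
other based on xen_pvh_domain()):

        void __init xen_init_pvh_mmu_ops(void)
        {
                x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
                pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;

                /* set_pte* for PCI devices to map iomem. */
                if (xen_initial_domain()) {
                        pv_mmu_ops.set_pte = xen_dom0pvh_set_pte;
                        pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;
                }
        }

        void __init xen_init_pv_mmu_ops(void)
        {
                x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
                x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
                x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
                pv_mmu_ops = xen_mmu_ops;

                memset(dummy_mapping, 0xff, PAGE_SIZE);
        }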

> +
> +       if (xen_pvh_domain()) {
> +               pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
> +
> +               /* set_pte* for PCI devices to map iomem. */
> +               if (xen_initial_domain()) {
> +                       pv_mmu_ops.set_pte = xen_dom0pvh_set_pte;
> +                       pv_mmu_ops.set_pte_at = xen_dom0pvh_set_pte_at;

Is this wrong for domU or have you just not tried/implemented
passthrough support yet?

> +               }
> +               return;
> +       }
> +
>         x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
>         x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
> -       x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
>         pv_mmu_ops = xen_mmu_ops;
> 
>         memset(dummy_mapping, 0xff, PAGE_SIZE);
> @@ -2305,6 +2366,93 @@ void __init xen_hvm_init_mmu_ops(void)
>  }
>  #endif
> 
> +/* Map foreign gmfn, fgmfn, to local pfn, lpfn. This for the user space
> + * creating new guest on PVH dom0 and needs to map domU pages. Called from
> + * exported function, so no need to export this.
> + */
> +static int pvh_add_to_xen_p2m(unsigned long lpfn, unsigned long fgmfn,
> +                             unsigned int domid)
> +{
> +       int rc;
> +       struct xen_add_to_physmap pmb = {.foreign_domid = domid};

I'm not sure but I think CodingStyle would want spaces inside the {}s.

What is the b in pmb? Phys Map B??? (every other user of this interface
says xatp, FWIW)
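
i.e.

        struct xen_add_to_physmap xatp = { .foreign_domid = domid };

would cover both points.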

> +
> +       pmb.gpfn = lpfn;
> +       pmb.idx = fgmfn;
> +       pmb.space = XENMAPSPACE_gmfn_foreign;
> +       rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &pmb);
> +       if (rc) {
> +               pr_warn("Failed to map pfn to mfn rc:%d pfn:%lx mfn:%lx\n",
> +                       rc, lpfn, fgmfn);
> +               return 1;
> +       }
> +       return 0;
> +}
> +
> +/* Unmap an entry from xen p2m table */

Actually it is @count entries, starting at spfn. That seems obvious
enough from the prototype not to be worth saying though.

> +int pvh_rem_xen_p2m(unsigned long spfn, int count)
> +{
> +       struct xen_remove_from_physmap xrp;
> +       int i, rc;
> +
> +       for (i=0; i < count; i++) {
> +               xrp.domid = DOMID_SELF;
> +               xrp.gpfn = spfn+i;
> +               rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
> +               if (rc) {
> +                       pr_warn("Failed to unmap pfn:%lx rc:%d done:%d\n",
> +                               spfn+i, rc, i);
> +                       return 1;
> +               }
> +       }
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(pvh_rem_xen_p2m);

What is the external/modular user of this?

I guess this is why you noted that pvh_add_to_xen_p2m didn't need
exporting, which struck me as unnecessary at the time.

pvh_add_to_xen_p2m seems to have an exported wrapper, why not rem too?
e.g. xen_unmap_domain_mfn_range?
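
Something like the following perhaps (just sketching the shape, name and
signature to taste):

        int xen_unmap_domain_mfn_range(unsigned long spfn, int count)
        {
                return pvh_rem_xen_p2m(spfn, count);
        }
        EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);

with the EXPORT moving off pvh_rem_xen_p2m itself.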

> +
> +struct pvh_remap_data {
> +       unsigned long fgmfn;            /* foreign domain's gmfn */
> +       pgprot_t prot;
> +       domid_t  domid;
> +       struct vm_area_struct *vma;
> +};
> +
> +static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
> +                       void *data)
> +{
> +       struct pvh_remap_data *pvhp = data;
> +       struct xen_pvh_sav_pfn_info *savp = pvhp->vma->vm_private_data;
> +       unsigned long pfn = page_to_pfn(savp->sp_paga[savp->sp_next_todo++]);
> +       pte_t pteval = pte_mkspecial(pfn_pte(pfn, pvhp->prot));
> +
> +       native_set_pte(ptep, pteval);
> +       if (pvh_add_to_xen_p2m(pfn, pvhp->fgmfn, pvhp->domid))
> +               return -EFAULT;

Is there a little window here where we've set up the page table entry but
the p2m entry behind it is uninitialised?

I suppose even if an interrupt occurred we can rely on the virtual
address not having been "exposed" anywhere yet and therefore there is no
chance of anyone dereferencing it. But is there any harm in just
flipping the ordering here?

Why EFAULT? Seems a bit random, I think HYPERVISOR_memory_op returns a
-ve errno which you could propagate.
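
i.e. roughly (untested, and assuming pvh_add_to_xen_p2m is changed to
return the hypercall's error code rather than 1):

        static int pvh_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
                                  void *data)
        {
                struct pvh_remap_data *pvhp = data;
                struct xen_pvh_sav_pfn_info *savp = pvhp->vma->vm_private_data;
                unsigned long pfn = page_to_pfn(savp->sp_paga[savp->sp_next_todo++]);
                int rc;

                /* Populate the p2m first so the pte never points at an
                 * unbacked gfn, and propagate any hypercall error. */
                rc = pvh_add_to_xen_p2m(pfn, pvhp->fgmfn, pvhp->domid);
                if (rc)
                        return rc;

                native_set_pte(ptep, pte_mkspecial(pfn_pte(pfn, pvhp->prot)));
                return 0;
        }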

> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index eac3ce1..1b213b1 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -163,10 +163,19 @@ struct xen_add_to_physmap {
>      /* Which domain to change the mapping for. */
>      domid_t domid;
> 
> +    /* Number of pages to go through for gmfn_range */
> +    uint16_t    size;
> +
>      /* Source mapping space. */
>  #define XENMAPSPACE_shared_info 0 /* shared info page */
>  #define XENMAPSPACE_grant_table 1 /* grant table page */
> -    unsigned int space;
> +#define XENMAPSPACE_gmfn        2 /* GMFN */
> +#define XENMAPSPACE_gmfn_range  3 /* GMFN range */
> +#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another guest */
> +    uint16_t space;
> +    domid_t foreign_domid;         /* IFF XENMAPSPACE_gmfn_foreign */
> +
> +#define XENMAPIDX_grant_table_status 0x80000000

Even if we don't get the full PVH implementation in the hypervisor right
away I think we should at least update the canonical interface
definition in the Xen tree (xen/include/public) first. (same goes for
the other interface definitions here).

Ian.


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

