[Xen-devel] [PATCH v6 8/9] x86/mm: move PV code to pv/mm.c
The following code is moved:
1. PV MMU hypercall handlers
2. PV memory management code such as:
2.1 {get,put}_page_from_l{2,3,4}e
2.2 pv_{alloc,free}_page_type
3. All helper functions for the above
The l1e functions can't be moved because they are needed by shadow
code as well.
Pure code movement.
Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
---
xen/arch/x86/mm.c | 2620 ++------------------------------------------------
xen/arch/x86/pv/mm.c | 2452 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 2540 insertions(+), 2532 deletions(-)
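A quick orientation note for reviewers, since the hunks below are large: the pv_* helpers listed under item 2.2 keep their existing signatures when they move, and the generic type-handling code that stays in mm.c keeps calling them. The sketch below is assembled purely from the signatures visible in this diff; collecting them as a stand-alone forward-declaration block (and the implied PV-specific header) is an illustration only, not something this mail shows the patch doing.

/* Illustrative sketch only: prototypes copied from the hunks in this patch.
 * Where these declarations actually live is an assumption, not part of the
 * patch itself. */
struct page_info;

int pv_alloc_page_type(struct page_info *page, unsigned long type,
                       bool preemptible);
int pv_free_page_type(struct page_info *page, unsigned long type,
                      bool preemptible);
int pv_put_final_page_type(struct page_info *page, unsigned long type,
                           bool preemptible, struct page_info *ptpg);
void pv_dec_linear_pt(struct page_info *ptpg, struct page_info *page,
                      unsigned long type);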
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index e004350e83..0b5fd199a4 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -539,218 +539,6 @@ const char __section(".bss.page_aligned.const") __aligned(PAGE_SIZE)
zero_page[PAGE_SIZE];
-static int alloc_segdesc_page(struct page_info *page)
-{
- const struct domain *owner = page_get_owner(page);
- struct desc_struct *descs = __map_domain_page(page);
- unsigned i;
-
- for ( i = 0; i < 512; i++ )
- if ( unlikely(!check_descriptor(owner, &descs[i])) )
- break;
-
- unmap_domain_page(descs);
-
- return i == 512 ? 0 : -EINVAL;
-}
-
-static int get_page_and_type_from_mfn(
- mfn_t mfn, unsigned long type, struct domain *d,
- int partial, int preemptible)
-{
- struct page_info *page = mfn_to_page(mfn);
- int rc;
-
- if ( likely(partial >= 0) &&
- unlikely(!get_page_from_mfn(mfn, d)) )
- return -EINVAL;
-
- rc = (preemptible ?
- get_page_type_preemptible(page, type) :
- (get_page_type(page, type) ? 0 : -EINVAL));
-
- if ( unlikely(rc) && partial >= 0 &&
- (!preemptible || page != current->arch.old_guest_table) )
- put_page(page);
-
- return rc;
-}
-
-static void put_data_page(
- struct page_info *page, int writeable)
-{
- if ( writeable )
- put_page_and_type(page);
- else
- put_page(page);
-}
-
-#ifdef CONFIG_PV_LINEAR_PT
-
-static bool inc_linear_entries(struct page_info *pg)
-{
- typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
-
- do {
- /*
- * The check below checks for the "linear use" count being non-zero
- * as well as overflow. Signed integer overflow is undefined behavior
- * according to the C spec. However, as long as linear_pt_count is
- * smaller in size than 'int', the arithmetic operation of the
- * increment below won't overflow; rather the result will be truncated
- * when stored. Ensure that this is always true.
- */
- BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
- oc = nc++;
- if ( nc <= 0 )
- return false;
- nc = cmpxchg(&pg->linear_pt_count, oc, nc);
- } while ( oc != nc );
-
- return true;
-}
-
-static void dec_linear_entries(struct page_info *pg)
-{
- typeof(pg->linear_pt_count) oc;
-
- oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
- ASSERT(oc > 0);
-}
-
-static bool inc_linear_uses(struct page_info *pg)
-{
- typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
-
- do {
- /* See the respective comment in inc_linear_entries(). */
- BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
- oc = nc--;
- if ( nc >= 0 )
- return false;
- nc = cmpxchg(&pg->linear_pt_count, oc, nc);
- } while ( oc != nc );
-
- return true;
-}
-
-static void dec_linear_uses(struct page_info *pg)
-{
- typeof(pg->linear_pt_count) oc;
-
- oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
- ASSERT(oc < 0);
-}
-
-/*
- * We allow root tables to map each other (a.k.a. linear page tables). It
- * needs some special care with reference counts and access permissions:
- * 1. The mapping entry must be read-only, or the guest may get write access
- * to its own PTEs.
- * 2. We must only bump the reference counts for an *already validated*
- * L2 table, or we can end up in a deadlock in get_page_type() by waiting
- * on a validation that is required to complete that validation.
- * 3. We only need to increment the reference counts for the mapped page
- * frame if it is mapped by a different root table. This is sufficient and
- * also necessary to allow validation of a root table mapping itself.
- */
-static bool __read_mostly opt_pv_linear_pt = true;
-boolean_param("pv-linear-pt", opt_pv_linear_pt);
-
-#define define_get_linear_pagetable(level) \
-static int \
-get_##level##_linear_pagetable( \
- level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
-{ \
- unsigned long x, y; \
- unsigned long pfn; \
- \
- if ( !opt_pv_linear_pt ) \
- { \
- gdprintk(XENLOG_WARNING, \
- "Attempt to create linear p.t. (feature disabled)\n"); \
- return 0; \
- } \
- \
- if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
- { \
- gdprintk(XENLOG_WARNING, \
- "Attempt to create linear p.t. with write perms\n"); \
- return 0; \
- } \
- \
- if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
- { \
- struct page_info *page, *ptpg = mfn_to_page(_mfn(pde_pfn)); \
- \
- /* Make sure the page table belongs to the correct domain. */ \
- if ( unlikely(page_get_owner(ptpg) != d) ) \
- return 0; \
- \
- /* Make sure the mapped frame belongs to the correct domain. */ \
- page = get_page_from_mfn(_mfn(pfn), d); \
- if ( unlikely(!page) ) \
- return 0; \
- \
- /* \
- * Ensure that the mapped frame is an already-validated page table \
- * and is not itself having linear entries, as well as that the \
- * containing page table is not iself in use as a linear page table \
- * elsewhere. \
- * If so, atomically increment the count (checking for overflow). \
- */ \
- if ( !inc_linear_entries(ptpg) ) \
- { \
- put_page(page); \
- return 0; \
- } \
- if ( !inc_linear_uses(page) ) \
- { \
- dec_linear_entries(ptpg); \
- put_page(page); \
- return 0; \
- } \
- y = page->u.inuse.type_info; \
- do { \
- x = y; \
- if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
- unlikely((x & (PGT_type_mask|PGT_validated)) != \
- (PGT_##level##_page_table|PGT_validated)) ) \
- { \
- dec_linear_uses(page); \
- dec_linear_entries(ptpg); \
- put_page(page); \
- return 0; \
- } \
- } \
- while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
- } \
- \
- return 1; \
-}
-
-#else /* CONFIG_PV_LINEAR_PT */
-
-#define define_get_linear_pagetable(level) \
-static int \
-get_##level##_linear_pagetable( \
- level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
-{ \
- return 0; \
-}
-
-static void dec_linear_uses(struct page_info *pg)
-{
- ASSERT(pg->linear_pt_count == 0);
-}
-
-static void dec_linear_entries(struct page_info *pg)
-{
- ASSERT(pg->linear_pt_count == 0);
-}
-
-#endif /* CONFIG_PV_LINEAR_PT */
-
bool is_iomem_page(mfn_t mfn)
{
struct page_info *page;
@@ -1039,104 +827,6 @@ get_page_from_l1e(
return -EBUSY;
}
-
-/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
-/*
- * get_page_from_l2e returns:
- * 1 => page not present
- * 0 => success
- * <0 => error code
- */
-define_get_linear_pagetable(l2);
-static int
-get_page_from_l2e(
- l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
-{
- unsigned long mfn = l2e_get_pfn(l2e);
- int rc;
-
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return 1;
-
- if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
- {
- gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
- l2e_get_flags(l2e) & L2_DISALLOW_MASK);
- return -EINVAL;
- }
-
- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
- if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
- rc = 0;
-
- return rc;
-}
-
-
-/*
- * get_page_from_l3e returns:
- * 1 => page not present
- * 0 => success
- * <0 => error code
- */
-define_get_linear_pagetable(l3);
-static int
-get_page_from_l3e(
- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
-{
- int rc;
-
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- return 1;
-
- if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
- {
- gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
- l3e_get_flags(l3e) & l3_disallow_mask(d));
- return -EINVAL;
- }
-
- rc = get_page_and_type_from_mfn(
- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
- if ( unlikely(rc == -EINVAL) &&
- !is_pv_32bit_domain(d) &&
- get_l3_linear_pagetable(l3e, pfn, d) )
- rc = 0;
-
- return rc;
-}
-
-/*
- * get_page_from_l4e returns:
- * 1 => page not present
- * 0 => success
- * <0 => error code
- */
-define_get_linear_pagetable(l4);
-static int
-get_page_from_l4e(
- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
-{
- int rc;
-
- if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
- return 1;
-
- if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
- {
- gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
- l4e_get_flags(l4e) & L4_DISALLOW_MASK);
- return -EINVAL;
- }
-
- rc = get_page_and_type_from_mfn(
- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
- if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
- rc = 0;
-
- return rc;
-}
-
void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
{
unsigned long pfn = l1e_get_pfn(l1e);
@@ -1199,306 +889,6 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
}
}
-
-/*
- * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
- * Note also that this automatically deals correctly with linear p.t.'s.
- */
-static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
-{
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
- return 1;
-
- if ( l2e_get_flags(l2e) & _PAGE_PSE )
- {
- struct page_info *page = l2e_get_page(l2e);
- unsigned int i;
-
- for ( i = 0; i < (1u << PAGETABLE_ORDER); i++, page++ )
- put_page_and_type(page);
- }
- else
- {
- struct page_info *pg = l2e_get_page(l2e);
- int rc = put_page_type_ptpg(pg, mfn_to_page(_mfn(pfn)));
-
- ASSERT(!rc);
- put_page(pg);
- }
-
- return 0;
-}
-
-static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- int partial, bool defer)
-{
- struct page_info *pg;
- int rc;
-
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
- return 1;
-
- if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
- {
- unsigned long mfn = l3e_get_pfn(l3e);
- int writeable = l3e_get_flags(l3e) & _PAGE_RW;
-
- ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
- do {
- put_data_page(mfn_to_page(_mfn(mfn)), writeable);
- } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
-
- return 0;
- }
-
- pg = l3e_get_page(l3e);
-
- if ( unlikely(partial > 0) )
- {
- ASSERT(!defer);
- return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
- }
-
- if ( defer )
- {
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
- current->arch.old_guest_table = pg;
- return 0;
- }
-
- rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
- if ( likely(!rc) )
- put_page(pg);
-
- return rc;
-}
-
-static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- int partial, bool defer)
-{
- int rc = 1;
-
- if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
- (l4e_get_pfn(l4e) != pfn) )
- {
- struct page_info *pg = l4e_get_page(l4e);
-
- if ( unlikely(partial > 0) )
- {
- ASSERT(!defer);
- return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
- }
-
- if ( defer )
- {
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
- current->arch.old_guest_table = pg;
- return 0;
- }
-
- rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
- if ( likely(!rc) )
- put_page(pg);
- }
-
- return rc;
-}
-
-static int alloc_l1_table(struct page_info *page)
-{
- struct domain *d = page_get_owner(page);
- l1_pgentry_t *pl1e;
- unsigned int i;
- int ret = 0;
-
- pl1e = __map_domain_page(page);
-
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- {
- switch ( ret = get_page_from_l1e(pl1e[i], d, d, l1_disallow_mask(d)) )
- {
- default:
- goto fail;
- case 0:
- break;
- case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
- ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
- l1e_flip_flags(pl1e[i], ret);
- break;
- }
-
- pl1e[i] = adjust_guest_l1e(pl1e[i], d);
- }
-
- unmap_domain_page(pl1e);
- return 0;
-
- fail:
- gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
- while ( i-- > 0 )
- put_page_from_l1e(pl1e[i], d);
-
- unmap_domain_page(pl1e);
- return ret;
-}
-
-static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
-{
- struct page_info *page;
- l3_pgentry_t l3e3;
-
- if ( !is_pv_32bit_domain(d) )
- return 1;
-
- pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
-
- /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
- l3e3 = pl3e[3];
- if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
- {
- gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is empty\n");
- return 0;
- }
-
- /*
- * The Xen-private mappings include linear mappings. The L2 thus cannot
- * be shared by multiple L3 tables. The test here is adequate because:
- * 1. Cannot appear in slots != 3 because get_page_type() checks the
- * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
- * 2. Cannot appear in another page table's L3:
- * a. alloc_l3_table() calls this function and this check will fail
- * b. mod_l3_entry() disallows updates to slot 3 in an existing table
- */
- page = l3e_get_page(l3e3);
- BUG_ON(page->u.inuse.type_info & PGT_pinned);
- BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
- BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
- if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
- {
- gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n");
- return 0;
- }
-
- return 1;
-}
-
-static int alloc_l2_table(struct page_info *page, unsigned long type,
- int preemptible)
-{
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l2_pgentry_t *pl2e;
- unsigned int i;
- int rc = 0;
-
- pl2e = map_domain_page(_mfn(pfn));
-
- for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- if ( preemptible && i > page->nr_validated_ptes
- && hypercall_preempt_check() )
- {
- page->nr_validated_ptes = i;
- rc = -ERESTART;
- break;
- }
-
- if ( !is_guest_l2_slot(d, type, i) ||
- (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
- continue;
-
- if ( rc < 0 )
- {
- gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n",
i);
- while ( i-- > 0 )
- if ( is_guest_l2_slot(d, type, i) )
- put_page_from_l2e(pl2e[i], pfn);
- break;
- }
-
- pl2e[i] = adjust_guest_l2e(pl2e[i], d);
- }
-
- if ( rc >= 0 && (type & PGT_pae_xen_l2) )
- init_xen_pae_l2_slots(pl2e, d);
-
- unmap_domain_page(pl2e);
- return rc > 0 ? 0 : rc;
-}
-
-static int alloc_l3_table(struct page_info *page)
-{
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l3_pgentry_t *pl3e;
- unsigned int i;
- int rc = 0, partial = page->partial_pte;
-
- pl3e = map_domain_page(_mfn(pfn));
-
- /*
- * PAE guests allocate full pages, but aren't required to initialize
- * more than the first four entries; when running in compatibility
- * mode, however, the full page is visible to the MMU, and hence all
- * 512 entries must be valid/verified, which is most easily achieved
- * by clearing them out.
- */
- if ( is_pv_32bit_domain(d) )
- memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
-
- for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
- i++, partial = 0 )
- {
- if ( is_pv_32bit_domain(d) && (i == 3) )
- {
- if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
- (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
- rc = -EINVAL;
- else
- rc = get_page_and_type_from_mfn(
- l3e_get_mfn(pl3e[i]),
- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
- }
- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
- continue;
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
- }
- else if ( rc == -EINTR && i )
- {
- page->nr_validated_ptes = i;
- page->partial_pte = 0;
- rc = -ERESTART;
- }
- if ( rc < 0 )
- break;
-
- pl3e[i] = adjust_guest_l3e(pl3e[i], d);
- }
-
- if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
- rc = -EINVAL;
- if ( rc < 0 && rc != -ERESTART && rc != -EINTR )
- {
- gdprintk(XENLOG_WARNING, "Failure in alloc_l3_table: slot %#x\n", i);
- if ( i )
- {
- page->nr_validated_ptes = i;
- page->partial_pte = 0;
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
- while ( i-- > 0 )
- pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
- }
-
- unmap_domain_page(pl3e);
- return rc > 0 ? 0 : rc;
-}
-
void init_xen_pae_l2_slots(l2_pgentry_t *l2t, const struct domain *d)
{
memcpy(&l2t[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
@@ -1613,186 +1003,6 @@ void zap_ro_mpt(mfn_t mfn)
unmap_domain_page(l4tab);
}
-static int alloc_l4_table(struct page_info *page)
-{
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
- unsigned int i;
- int rc = 0, partial = page->partial_pte;
-
- for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
- i++, partial = 0 )
- {
- if ( !is_guest_l4_slot(d, i) ||
- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
- continue;
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
- }
- else if ( rc < 0 )
- {
- if ( rc != -EINTR )
- gdprintk(XENLOG_WARNING,
- "Failure in alloc_l4_table: slot %#x\n", i);
- if ( i )
- {
- page->nr_validated_ptes = i;
- page->partial_pte = 0;
- if ( rc == -EINTR )
- rc = -ERESTART;
- else
- {
- if ( current->arch.old_guest_table )
- page->nr_validated_ptes++;
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
- }
- }
- if ( rc < 0 )
- {
- unmap_domain_page(pl4e);
- return rc;
- }
-
- pl4e[i] = adjust_guest_l4e(pl4e[i], d);
- }
-
- if ( rc >= 0 )
- {
- init_xen_l4_slots(pl4e, _mfn(pfn),
- d, INVALID_MFN, VM_ASSIST(d, m2p_strict));
- atomic_inc(&d->arch.pv_domain.nr_l4_pages);
- rc = 0;
- }
- unmap_domain_page(pl4e);
-
- return rc;
-}
-
-static void free_l1_table(struct page_info *page)
-{
- struct domain *d = page_get_owner(page);
- l1_pgentry_t *pl1e;
- unsigned int i;
-
- pl1e = __map_domain_page(page);
-
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- put_page_from_l1e(pl1e[i], d);
-
- unmap_domain_page(pl1e);
-}
-
-
-static int free_l2_table(struct page_info *page, int preemptible)
-{
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l2_pgentry_t *pl2e;
- unsigned int i = page->nr_validated_ptes - 1;
- int err = 0;
-
- pl2e = map_domain_page(_mfn(pfn));
-
- ASSERT(page->nr_validated_ptes);
- do {
- if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
- put_page_from_l2e(pl2e[i], pfn) == 0 &&
- preemptible && i && hypercall_preempt_check() )
- {
- page->nr_validated_ptes = i;
- err = -ERESTART;
- }
- } while ( !err && i-- );
-
- unmap_domain_page(pl2e);
-
- if ( !err )
- page->u.inuse.type_info &= ~PGT_pae_xen_l2;
-
- return err;
-}
-
-static int free_l3_table(struct page_info *page)
-{
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l3_pgentry_t *pl3e;
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
-
- pl3e = map_domain_page(_mfn(pfn));
-
- do {
- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
- if ( rc < 0 )
- break;
- partial = 0;
- if ( rc > 0 )
- continue;
- pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
- } while ( i-- );
-
- unmap_domain_page(pl3e);
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
- }
- else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
- {
- page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
- rc = -ERESTART;
- }
- return rc > 0 ? 0 : rc;
-}
-
-static int free_l4_table(struct page_info *page)
-{
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
-
- do {
- if ( is_guest_l4_slot(d, i) )
- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
- if ( rc < 0 )
- break;
- partial = 0;
- } while ( i-- );
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
- }
- else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
- {
- page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
- rc = -ERESTART;
- }
-
- unmap_domain_page(pl4e);
-
- if ( rc >= 0 )
- {
- atomic_dec(&d->arch.pv_domain.nr_l4_pages);
- rc = 0;
- }
-
- return rc;
-}
-
#ifndef NDEBUG
/*
* We must never call _put_page_type() while holding a page_lock() for
@@ -1876,309 +1086,6 @@ void page_unlock(struct page_info *page)
current_locked_page_set(NULL);
}
-/*
- * PTE flags that a guest may change without re-validating the PTE.
- * All other bits affect translation, caching, or Xen's safety.
- */
-#define FASTPATH_FLAG_WHITELIST \
- (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \
- _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER)
-
-/* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
- unsigned long gl1mfn, int preserve_ad,
- struct vcpu *pt_vcpu, struct domain *pg_dom)
-{
- l1_pgentry_t ol1e;
- struct domain *pt_dom = pt_vcpu->domain;
- int rc = 0;
-
- if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
- return -EFAULT;
-
- ASSERT(!paging_mode_refcounts(pt_dom));
-
- if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
- {
- struct page_info *page = NULL;
-
- if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
- {
- gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
- l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
- return -EINVAL;
- }
-
- /* Translate foreign guest address. */
- if ( paging_mode_translate(pg_dom) )
- {
- p2m_type_t p2mt;
- p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ?
- P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC;
-
- page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), &p2mt, q);
-
- if ( p2m_is_paged(p2mt) )
- {
- if ( page )
- put_page(page);
- p2m_mem_paging_populate(pg_dom, l1e_get_pfn(nl1e));
- return -ENOENT;
- }
-
- if ( p2mt == p2m_ram_paging_in && !page )
- return -ENOENT;
-
- /* Did our attempt to unshare fail? */
- if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
- {
- /* We could not have obtained a page ref. */
- ASSERT(!page);
- /* And mem_sharing_notify has already been called. */
- return -ENOMEM;
- }
-
- if ( !page )
- return -EINVAL;
- nl1e = l1e_from_page(page, l1e_get_flags(nl1e));
- }
-
- /* Fast path for sufficiently-similar mappings. */
- if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) )
- {
- nl1e = adjust_guest_l1e(nl1e, pt_dom);
- rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
- preserve_ad);
- if ( page )
- put_page(page);
- return rc ? 0 : -EBUSY;
- }
-
- switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom,
- l1_disallow_mask(pt_dom)) )
- {
- default:
- if ( page )
- put_page(page);
- return rc;
- case 0:
- break;
- case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
- ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
- l1e_flip_flags(nl1e, rc);
- rc = 0;
- break;
- }
- if ( page )
- put_page(page);
-
- nl1e = adjust_guest_l1e(nl1e, pt_dom);
- if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
- preserve_ad)) )
- {
- ol1e = nl1e;
- rc = -EBUSY;
- }
- }
- else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
- preserve_ad)) )
- {
- return -EBUSY;
- }
-
- put_page_from_l1e(ol1e, pt_dom);
- return rc;
-}
-
-
-/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
-static int mod_l2_entry(l2_pgentry_t *pl2e,
- l2_pgentry_t nl2e,
- unsigned long pfn,
- int preserve_ad,
- struct vcpu *vcpu)
-{
- l2_pgentry_t ol2e;
- struct domain *d = vcpu->domain;
- struct page_info *l2pg = mfn_to_page(_mfn(pfn));
- unsigned long type = l2pg->u.inuse.type_info;
- int rc = 0;
-
- if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
- {
- gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n",
- pgentry_ptr_to_slot(pl2e));
- return -EPERM;
- }
-
- if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
- return -EFAULT;
-
- if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
- {
- if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
- {
- gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
- l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
- return -EINVAL;
- }
-
- /* Fast path for sufficiently-similar mappings. */
- if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_FLAG_WHITELIST) )
- {
- nl2e = adjust_guest_l2e(nl2e, d);
- if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) )
- return 0;
- return -EBUSY;
- }
-
- if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
- return rc;
-
- nl2e = adjust_guest_l2e(nl2e, d);
- if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
- preserve_ad)) )
- {
- ol2e = nl2e;
- rc = -EBUSY;
- }
- }
- else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
- preserve_ad)) )
- {
- return -EBUSY;
- }
-
- put_page_from_l2e(ol2e, pfn);
- return rc;
-}
-
-/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
-static int mod_l3_entry(l3_pgentry_t *pl3e,
- l3_pgentry_t nl3e,
- unsigned long pfn,
- int preserve_ad,
- struct vcpu *vcpu)
-{
- l3_pgentry_t ol3e;
- struct domain *d = vcpu->domain;
- int rc = 0;
-
- /*
- * Disallow updates to final L3 slot. It contains Xen mappings, and it
- * would be a pain to ensure they remain continuously valid throughout.
- */
- if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
- return -EINVAL;
-
- if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
- return -EFAULT;
-
- if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
- {
- if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
- {
- gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
- l3e_get_flags(nl3e) & l3_disallow_mask(d));
- return -EINVAL;
- }
-
- /* Fast path for sufficiently-similar mappings. */
- if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_FLAG_WHITELIST) )
- {
- nl3e = adjust_guest_l3e(nl3e, d);
- rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
- return rc ? 0 : -EFAULT;
- }
-
- rc = get_page_from_l3e(nl3e, pfn, d, 0);
- if ( unlikely(rc < 0) )
- return rc;
- rc = 0;
-
- nl3e = adjust_guest_l3e(nl3e, d);
- if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
- preserve_ad)) )
- {
- ol3e = nl3e;
- rc = -EFAULT;
- }
- }
- else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
- preserve_ad)) )
- {
- return -EFAULT;
- }
-
- if ( likely(rc == 0) )
- if ( !create_pae_xen_mappings(d, pl3e) )
- BUG();
-
- put_page_from_l3e(ol3e, pfn, 0, 1);
- return rc;
-}
-
-/* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
-static int mod_l4_entry(l4_pgentry_t *pl4e,
- l4_pgentry_t nl4e,
- unsigned long pfn,
- int preserve_ad,
- struct vcpu *vcpu)
-{
- struct domain *d = vcpu->domain;
- l4_pgentry_t ol4e;
- int rc = 0;
-
- if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
- {
- gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n",
- pgentry_ptr_to_slot(pl4e));
- return -EINVAL;
- }
-
- if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
- return -EFAULT;
-
- if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
- {
- if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
- {
- gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
- l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
- return -EINVAL;
- }
-
- /* Fast path for sufficiently-similar mappings. */
- if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_FLAG_WHITELIST) )
- {
- nl4e = adjust_guest_l4e(nl4e, d);
- rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
- return rc ? 0 : -EFAULT;
- }
-
- rc = get_page_from_l4e(nl4e, pfn, d, 0);
- if ( unlikely(rc < 0) )
- return rc;
- rc = 0;
-
- nl4e = adjust_guest_l4e(nl4e, d);
- if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
- preserve_ad)) )
- {
- ol4e = nl4e;
- rc = -EFAULT;
- }
- }
- else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
- preserve_ad)) )
- {
- return -EFAULT;
- }
-
- put_page_from_l4e(ol4e, pfn, 0, 1);
- return rc;
-}
-
static int cleanup_page_cacheattr(struct page_info *page)
{
unsigned int cacheattr =
@@ -2260,222 +1167,27 @@ int get_page(struct page_info *page, struct domain *domain)
return 0;
}
-/*
- * Special version of get_page() to be used exclusively when
- * - a page is known to already have a non-zero reference count
- * - the page does not need its owner to be checked
- * - it will not be called more than once without dropping the thus
- * acquired reference again.
- * Due to get_page() reserving one reference, this call cannot fail.
- */
-static void get_page_light(struct page_info *page)
-{
- unsigned long x, nx, y = page->count_info;
-
- do {
- x = y;
- nx = x + 1;
- BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
- BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
- y = cmpxchg(&page->count_info, x, nx);
- }
- while ( unlikely(y != x) );
-}
-
-int pv_alloc_page_type(struct page_info *page, unsigned long type,
- bool preemptible)
+static int _put_page_type(struct page_info *page, bool preemptible,
+ struct page_info *ptpg)
{
- struct domain *owner = page_get_owner(page);
- int rc;
+ unsigned long nx, x, y = page->u.inuse.type_info;
- /* A page table is dirtied when its type count becomes non-zero. */
- if ( likely(owner != NULL) )
- paging_mark_dirty(owner, page_to_mfn(page));
+ ASSERT(current_locked_page_ne_check(page));
- switch ( type & PGT_type_mask )
+ for ( ; ; )
{
- case PGT_l1_page_table:
- rc = alloc_l1_table(page);
- break;
- case PGT_l2_page_table:
- rc = alloc_l2_table(page, type, preemptible);
- break;
- case PGT_l3_page_table:
- ASSERT(preemptible);
- rc = alloc_l3_table(page);
- break;
- case PGT_l4_page_table:
- ASSERT(preemptible);
- rc = alloc_l4_table(page);
- break;
- case PGT_seg_desc_page:
- rc = alloc_segdesc_page(page);
- break;
- default:
- printk("Bad type in %s %lx t=%" PRtype_info " c=%lx\n", __func__,
- type, page->u.inuse.type_info,
- page->count_info);
- rc = -EINVAL;
- BUG();
- }
+ x = y;
+ nx = x - 1;
- /* No need for atomic update of type_info here: noone else updates it. */
- smp_wmb();
- switch ( rc )
- {
- case 0:
- page->u.inuse.type_info |= PGT_validated;
- break;
- case -EINTR:
- ASSERT((page->u.inuse.type_info &
- (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
- page->u.inuse.type_info &= ~PGT_count_mask;
- break;
- default:
- ASSERT(rc < 0);
- gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn
- " (pfn %" PRI_pfn ") for type %" PRtype_info
- ": caf=%08lx taf=%" PRtype_info "\n",
- mfn_x(page_to_mfn(page)),
- get_gpfn_from_mfn(mfn_x(page_to_mfn(page))),
- type, page->count_info, page->u.inuse.type_info);
- if ( page != current->arch.old_guest_table )
- page->u.inuse.type_info = 0;
- else
+ ASSERT((x & PGT_count_mask) != 0);
+
+ switch ( nx & (PGT_locked | PGT_count_mask) )
{
- ASSERT((page->u.inuse.type_info &
- (PGT_count_mask | PGT_validated)) == 1);
- case -ERESTART:
- get_page_light(page);
- page->u.inuse.type_info |= PGT_partial;
- }
- break;
- }
-
- return rc;
-}
-
-
-int pv_free_page_type(struct page_info *page, unsigned long type,
- bool preemptible)
-{
- struct domain *owner = page_get_owner(page);
- unsigned long gmfn;
- int rc;
-
- if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
- {
- /* A page table is dirtied when its type count becomes zero. */
- paging_mark_dirty(owner, page_to_mfn(page));
-
- ASSERT(!shadow_mode_refcounts(owner));
-
- gmfn = mfn_to_gmfn(owner, mfn_x(page_to_mfn(page)));
- ASSERT(VALID_M2P(gmfn));
- /* Page sharing not supported for shadowed domains */
- if(!SHARED_M2P(gmfn))
- shadow_remove_all_shadows(owner, _mfn(gmfn));
- }
-
- if ( !(type & PGT_partial) )
- {
- page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
- page->partial_pte = 0;
- }
-
- switch ( type & PGT_type_mask )
- {
- case PGT_l1_page_table:
- free_l1_table(page);
- rc = 0;
- break;
- case PGT_l2_page_table:
- rc = free_l2_table(page, preemptible);
- break;
- case PGT_l3_page_table:
- ASSERT(preemptible);
- rc = free_l3_table(page);
- break;
- case PGT_l4_page_table:
- ASSERT(preemptible);
- rc = free_l4_table(page);
- break;
- default:
- gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
- type, mfn_x(page_to_mfn(page)));
- rc = -EINVAL;
- BUG();
- }
-
- return rc;
-}
-
-void pv_dec_linear_pt(struct page_info *ptpg, struct page_info *page,
- unsigned long type)
-{
- if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
- {
- ASSERT(is_pv_domain(page_get_owner(page)));
- ASSERT(is_pv_domain(page_get_owner(ptpg)));
-
- dec_linear_uses(page);
- dec_linear_entries(ptpg);
- }
-}
-
-int pv_put_final_page_type(struct page_info *page, unsigned long type,
- bool preemptible, struct page_info *ptpg)
-{
- int rc = pv_free_page_type(page, type, preemptible);
-
- /* No need for atomic update of type_info here: noone else updates it. */
- if ( rc == 0 )
- {
- pv_dec_linear_pt(ptpg, page, type);
- ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
- set_tlbflush_timestamp(page);
- smp_wmb();
- page->u.inuse.type_info--;
- }
- else if ( rc == -EINTR )
- {
- ASSERT((page->u.inuse.type_info &
- (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
- smp_wmb();
- page->u.inuse.type_info |= PGT_validated;
- }
- else
- {
- BUG_ON(rc != -ERESTART);
- smp_wmb();
- get_page_light(page);
- page->u.inuse.type_info |= PGT_partial;
- }
-
- return rc;
-}
-
-static int _put_page_type(struct page_info *page, bool preemptible,
- struct page_info *ptpg)
-{
- unsigned long nx, x, y = page->u.inuse.type_info;
-
- ASSERT(current_locked_page_ne_check(page));
-
- for ( ; ; )
- {
- x = y;
- nx = x - 1;
-
- ASSERT((x & PGT_count_mask) != 0);
-
- switch ( nx & (PGT_locked | PGT_count_mask) )
- {
- case 0:
- if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
- likely(nx & (PGT_validated|PGT_partial)) )
- {
- int rc;
+ case 0:
+ if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
+ likely(nx & (PGT_validated|PGT_partial)) )
+ {
+ int rc;
/*
* Only PV guests can enter this branch. HAP guests
@@ -2817,1141 +1529,101 @@ int vcpu_destroy_pagetables(struct vcpu *v)
return rc != -EINTR ? rc : -ERESTART;
}
-int new_guest_cr3(mfn_t mfn)
+int donate_page(
+ struct domain *d, struct page_info *page, unsigned int memflags)
{
- struct vcpu *curr = current;
- struct domain *d = curr->domain;
- int rc;
- mfn_t old_base_mfn;
-
- if ( is_pv_32bit_domain(d) )
- {
- mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table);
- l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
-
- rc = mod_l4_entry(pl4e,
- l4e_from_mfn(mfn,
- (_PAGE_PRESENT | _PAGE_RW |
- _PAGE_USER | _PAGE_ACCESSED)),
- mfn_x(gt_mfn), 0, curr);
- unmap_domain_page(pl4e);
- switch ( rc )
- {
- case 0:
- break;
- case -EINTR:
- case -ERESTART:
- return -ERESTART;
- default:
- gdprintk(XENLOG_WARNING,
- "Error while installing new compat baseptr %" PRI_mfn
"\n",
- mfn_x(mfn));
- return rc;
- }
-
- pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
- write_ptbase(curr);
-
- return 0;
- }
-
- rc = put_old_guest_table(curr);
- if ( unlikely(rc) )
- return rc;
-
- old_base_mfn = pagetable_get_mfn(curr->arch.guest_table);
- /*
- * This is particularly important when getting restarted after the
- * previous attempt got preempted in the put-old-MFN phase.
- */
- if ( mfn_eq(old_base_mfn, mfn) )
- {
- write_ptbase(curr);
- return 0;
- }
-
- rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
- switch ( rc )
- {
- case 0:
- break;
- case -EINTR:
- case -ERESTART:
- return -ERESTART;
- default:
- gdprintk(XENLOG_WARNING,
- "Error while installing new baseptr %" PRI_mfn "\n",
- mfn_x(mfn));
- return rc;
- }
-
- pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
-
- if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
- fill_ro_mpt(mfn);
- curr->arch.guest_table = pagetable_from_mfn(mfn);
- update_cr3(curr);
-
- write_ptbase(curr);
-
- if ( likely(mfn_x(old_base_mfn) != 0) )
- {
- struct page_info *page = mfn_to_page(old_base_mfn);
-
- if ( paging_mode_refcounts(d) )
- put_page(page);
- else
- switch ( rc = put_page_and_type_preemptible(page) )
- {
- case -EINTR:
- rc = -ERESTART;
- /* fallthrough */
- case -ERESTART:
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
- break;
- default:
- BUG_ON(rc);
- break;
- }
- }
+ const struct domain *owner = dom_xen;
- return rc;
-}
+ spin_lock(&d->page_alloc_lock);
-static struct domain *get_pg_owner(domid_t domid)
-{
- struct domain *pg_owner = NULL, *curr = current->domain;
+ if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
+ goto fail;
- if ( likely(domid == DOMID_SELF) )
- {
- pg_owner = rcu_lock_current_domain();
- goto out;
- }
+ if ( d->is_dying )
+ goto fail;
- if ( unlikely(domid == curr->domain_id) )
- {
- gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
- goto out;
- }
+ if ( page->count_info & ~(PGC_allocated | 1) )
+ goto fail;
- switch ( domid )
+ if ( !(memflags & MEMF_no_refcount) )
{
- case DOMID_IO:
- pg_owner = rcu_lock_domain(dom_io);
- break;
- case DOMID_XEN:
- pg_owner = rcu_lock_domain(dom_xen);
- break;
- default:
- if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
- {
- gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
- break;
- }
- break;
+ if ( d->tot_pages >= d->max_pages )
+ goto fail;
+ domain_adjust_tot_pages(d, 1);
}
- out:
- return pg_owner;
-}
-
-static void put_pg_owner(struct domain *pg_owner)
-{
- rcu_unlock_domain(pg_owner);
-}
-
-static inline int vcpumask_to_pcpumask(
- struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask)
-{
- unsigned int vcpu_id, vcpu_bias, offs;
- unsigned long vmask;
- struct vcpu *v;
- bool is_native = !is_pv_32bit_domain(d);
-
- cpumask_clear(pmask);
- for ( vmask = 0, offs = 0; ; ++offs )
- {
- vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
- if ( vcpu_bias >= d->max_vcpus )
- return 0;
+ page->count_info = PGC_allocated | 1;
+ page_set_owner(page, d);
+ page_list_add_tail(page,&d->page_list);
- if ( unlikely(is_native ?
- copy_from_guest_offset(&vmask, bmap, offs, 1) :
- copy_from_guest_offset((unsigned int *)&vmask, bmap,
- offs, 1)) )
- {
- cpumask_clear(pmask);
- return -EFAULT;
- }
+ spin_unlock(&d->page_alloc_lock);
+ return 0;
- while ( vmask )
- {
- vcpu_id = find_first_set_bit(vmask);
- vmask &= ~(1UL << vcpu_id);
- vcpu_id += vcpu_bias;
- if ( (vcpu_id >= d->max_vcpus) )
- return 0;
- if ( ((v = d->vcpu[vcpu_id]) != NULL) && vcpu_cpu_dirty(v) )
- __cpumask_set_cpu(v->dirty_cpu, pmask);
- }
- }
+ fail:
+ spin_unlock(&d->page_alloc_lock);
+ gdprintk(XENLOG_WARNING, "Bad donate mfn %" PRI_mfn
+ " to d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
+ mfn_x(page_to_mfn(page)), d->domain_id,
+ owner ? owner->domain_id : DOMID_INVALID,
+ page->count_info, page->u.inuse.type_info);
+ return -EINVAL;
}
-long do_mmuext_op(
- XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
- unsigned int count,
- XEN_GUEST_HANDLE_PARAM(uint) pdone,
- unsigned int foreigndom)
+int steal_page(
+ struct domain *d, struct page_info *page, unsigned int memflags)
{
- struct mmuext_op op;
- unsigned long type;
- unsigned int i, done = 0;
- struct vcpu *curr = current;
- struct domain *currd = curr->domain;
- struct domain *pg_owner;
- int rc = put_old_guest_table(curr);
-
- if ( unlikely(rc) )
- {
- if ( likely(rc == -ERESTART) )
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
- foreigndom);
- return rc;
- }
-
- if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
- likely(guest_handle_is_null(uops)) )
- {
- /*
- * See the curr->arch.old_guest_table related
- * hypercall_create_continuation() below.
- */
- return (int)foreigndom;
- }
-
- if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
- {
- count &= ~MMU_UPDATE_PREEMPTED;
- if ( unlikely(!guest_handle_is_null(pdone)) )
- (void)copy_from_guest(&done, pdone, 1);
- }
- else
- perfc_incr(calls_to_mmuext_op);
-
- if ( unlikely(!guest_handle_okay(uops, count)) )
- return -EFAULT;
-
- if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
- return -ESRCH;
-
- if ( !is_pv_domain(pg_owner) )
- {
- put_pg_owner(pg_owner);
- return -EINVAL;
- }
-
- rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner);
- if ( rc )
- {
- put_pg_owner(pg_owner);
- return rc;
- }
-
- for ( i = 0; i < count; i++ )
- {
- if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
- {
- rc = -ERESTART;
- break;
- }
+ unsigned long x, y;
+ bool drop_dom_ref = false;
+ const struct domain *owner = dom_xen;
- if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
- {
- rc = -EFAULT;
- break;
- }
+ if ( paging_mode_external(d) )
+ return -EOPNOTSUPP;
- if ( is_hvm_domain(currd) )
- {
- switch ( op.cmd )
- {
- case MMUEXT_PIN_L1_TABLE:
- case MMUEXT_PIN_L2_TABLE:
- case MMUEXT_PIN_L3_TABLE:
- case MMUEXT_PIN_L4_TABLE:
- case MMUEXT_UNPIN_TABLE:
- break;
- default:
- rc = -EOPNOTSUPP;
- goto done;
- }
- }
+ spin_lock(&d->page_alloc_lock);
- rc = 0;
+ if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
+ goto fail;
- switch ( op.cmd )
- {
- struct page_info *page;
- p2m_type_t p2mt;
+ /*
+ * We require there is just one reference (PGC_allocated). We temporarily
+ * drop this reference now so that we can safely swizzle the owner.
+ */
+ y = page->count_info;
+ do {
+ x = y;
+ if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
+ goto fail;
+ y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
+ } while ( y != x );
- case MMUEXT_PIN_L1_TABLE:
- type = PGT_l1_page_table;
- goto pin_page;
+ /*
+ * With the sole reference dropped temporarily, no-one can update type
+ * information. Type count also needs to be zero in this case, but e.g.
+ * PGT_seg_desc_page may still have PGT_validated set, which we need to
+ * clear before transferring ownership (as validation criteria vary
+ * depending on domain type).
+ */
+ BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
+ PGT_pinned));
+ page->u.inuse.type_info = 0;
- case MMUEXT_PIN_L2_TABLE:
- type = PGT_l2_page_table;
- goto pin_page;
+ /* Swizzle the owner then reinstate the PGC_allocated reference. */
+ page_set_owner(page, NULL);
+ y = page->count_info;
+ do {
+ x = y;
+ BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
+ } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
- case MMUEXT_PIN_L3_TABLE:
- type = PGT_l3_page_table;
- goto pin_page;
+ /* Unlink from original owner. */
+ if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
+ drop_dom_ref = true;
+ page_list_del(page, &d->page_list);
- case MMUEXT_PIN_L4_TABLE:
- if ( is_pv_32bit_domain(pg_owner) )
- break;
- type = PGT_l4_page_table;
-
- pin_page:
- /* Ignore pinning of invalid paging levels. */
- if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
- break;
-
- if ( paging_mode_refcounts(pg_owner) )
- break;
-
- page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
- if ( unlikely(!page) )
- {
- rc = -EINVAL;
- break;
- }
-
- rc = get_page_type_preemptible(page, type);
- if ( unlikely(rc) )
- {
- if ( rc == -EINTR )
- rc = -ERESTART;
- else if ( rc != -ERESTART )
- gdprintk(XENLOG_WARNING,
- "Error %d while pinning mfn %" PRI_mfn "\n",
- rc, mfn_x(page_to_mfn(page)));
- if ( page != curr->arch.old_guest_table )
- put_page(page);
- break;
- }
-
- rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page);
- if ( !rc && unlikely(test_and_set_bit(_PGT_pinned,
- &page->u.inuse.type_info)) )
- {
- gdprintk(XENLOG_WARNING,
- "mfn %" PRI_mfn " already pinned\n",
- mfn_x(page_to_mfn(page)));
- rc = -EINVAL;
- }
-
- if ( unlikely(rc) )
- goto pin_drop;
-
- /* A page is dirtied when its pin status is set. */
- paging_mark_dirty(pg_owner, page_to_mfn(page));
-
- /* We can race domain destruction (domain_relinquish_resources). */
- if ( unlikely(pg_owner != currd) )
- {
- bool drop_ref;
-
- spin_lock(&pg_owner->page_alloc_lock);
- drop_ref = (pg_owner->is_dying &&
- test_and_clear_bit(_PGT_pinned,
- &page->u.inuse.type_info));
- spin_unlock(&pg_owner->page_alloc_lock);
- if ( drop_ref )
- {
- pin_drop:
- if ( type == PGT_l1_page_table )
- put_page_and_type(page);
- else
- {
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
- }
- }
- }
- break;
-
- case MMUEXT_UNPIN_TABLE:
- if ( paging_mode_refcounts(pg_owner) )
- break;
-
- page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
- if ( unlikely(!page) )
- {
- gdprintk(XENLOG_WARNING,
- "mfn %" PRI_mfn " bad, or bad owner d%d\n",
- op.arg1.mfn, pg_owner->domain_id);
- rc = -EINVAL;
- break;
- }
-
- if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
- {
- put_page(page);
- gdprintk(XENLOG_WARNING,
- "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn);
- rc = -EINVAL;
- break;
- }
-
- switch ( rc = put_page_and_type_preemptible(page) )
- {
- case -EINTR:
- case -ERESTART:
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
- rc = 0;
- break;
- default:
- BUG_ON(rc);
- break;
- }
- put_page(page);
-
- /* A page is dirtied when its pin status is cleared. */
- paging_mark_dirty(pg_owner, page_to_mfn(page));
- break;
-
- case MMUEXT_NEW_BASEPTR:
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else if ( unlikely(paging_mode_translate(currd)) )
- rc = -EINVAL;
- else
- rc = new_guest_cr3(_mfn(op.arg1.mfn));
- break;
-
- case MMUEXT_NEW_USER_BASEPTR: {
- unsigned long old_mfn;
-
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else if ( unlikely(paging_mode_translate(currd)) )
- rc = -EINVAL;
- if ( unlikely(rc) )
- break;
-
- old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
- /*
- * This is particularly important when getting restarted after the
- * previous attempt got preempted in the put-old-MFN phase.
- */
- if ( old_mfn == op.arg1.mfn )
- break;
-
- if ( op.arg1.mfn != 0 )
- {
- rc = get_page_and_type_from_mfn(
- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
-
- if ( unlikely(rc) )
- {
- if ( rc == -EINTR )
- rc = -ERESTART;
- else if ( rc != -ERESTART )
- gdprintk(XENLOG_WARNING,
- "Error %d installing new mfn %" PRI_mfn "\n",
- rc, op.arg1.mfn);
- break;
- }
-
- if ( VM_ASSIST(currd, m2p_strict) )
- zap_ro_mpt(_mfn(op.arg1.mfn));
- }
-
- curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
-
- if ( old_mfn != 0 )
- {
- page = mfn_to_page(_mfn(old_mfn));
-
- switch ( rc = put_page_and_type_preemptible(page) )
- {
- case -EINTR:
- rc = -ERESTART;
- /* fallthrough */
- case -ERESTART:
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
- break;
- default:
- BUG_ON(rc);
- break;
- }
- }
-
- break;
- }
-
- case MMUEXT_TLB_FLUSH_LOCAL:
- if ( likely(currd == pg_owner) )
- flush_tlb_local();
- else
- rc = -EPERM;
- break;
-
- case MMUEXT_INVLPG_LOCAL:
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else
- paging_invlpg(curr, op.arg1.linear_addr);
- break;
-
- case MMUEXT_TLB_FLUSH_MULTI:
- case MMUEXT_INVLPG_MULTI:
- {
- cpumask_t *mask = this_cpu(scratch_cpumask);
-
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else if ( unlikely(vcpumask_to_pcpumask(currd,
- guest_handle_to_param(op.arg2.vcpumask,
- const_void),
- mask)) )
- rc = -EINVAL;
- if ( unlikely(rc) )
- break;
-
- if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
- flush_tlb_mask(mask);
- else if ( __addr_ok(op.arg1.linear_addr) )
- flush_tlb_one_mask(mask, op.arg1.linear_addr);
- break;
- }
-
- case MMUEXT_TLB_FLUSH_ALL:
- if ( likely(currd == pg_owner) )
- flush_tlb_mask(currd->dirty_cpumask);
- else
- rc = -EPERM;
- break;
-
- case MMUEXT_INVLPG_ALL:
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else if ( __addr_ok(op.arg1.linear_addr) )
- flush_tlb_one_mask(currd->dirty_cpumask, op.arg1.linear_addr);
- break;
-
- case MMUEXT_FLUSH_CACHE:
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else if ( unlikely(!cache_flush_permitted(currd)) )
- rc = -EACCES;
- else
- wbinvd();
- break;
-
- case MMUEXT_FLUSH_CACHE_GLOBAL:
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else if ( likely(cache_flush_permitted(currd)) )
- {
- unsigned int cpu;
- cpumask_t *mask = this_cpu(scratch_cpumask);
-
- cpumask_clear(mask);
- for_each_online_cpu(cpu)
- if ( !cpumask_intersects(mask,
- per_cpu(cpu_sibling_mask, cpu)) )
- __cpumask_set_cpu(cpu, mask);
- flush_mask(mask, FLUSH_CACHE);
- }
- else
- rc = -EINVAL;
- break;
-
- case MMUEXT_SET_LDT:
- {
- unsigned int ents = op.arg2.nr_ents;
- unsigned long ptr = ents ? op.arg1.linear_addr : 0;
-
- if ( unlikely(currd != pg_owner) )
- rc = -EPERM;
- else if ( paging_mode_external(currd) )
- rc = -EINVAL;
- else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
- (ents > 8192) )
- {
- gdprintk(XENLOG_WARNING,
- "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents);
- rc = -EINVAL;
- }
- else if ( (curr->arch.pv_vcpu.ldt_ents != ents) ||
- (curr->arch.pv_vcpu.ldt_base != ptr) )
- {
- if ( pv_destroy_ldt(curr) )
- flush_tlb_local();
-
- curr->arch.pv_vcpu.ldt_base = ptr;
- curr->arch.pv_vcpu.ldt_ents = ents;
- load_LDT(curr);
- }
- break;
- }
-
- case MMUEXT_CLEAR_PAGE:
- page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC);
- if ( unlikely(p2mt != p2m_ram_rw) && page )
- {
- put_page(page);
- page = NULL;
- }
- if ( !page || !get_page_type(page, PGT_writable_page) )
- {
- if ( page )
- put_page(page);
- gdprintk(XENLOG_WARNING,
- "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn);
- rc = -EINVAL;
- break;
- }
-
- /* A page is dirtied when it's being cleared. */
- paging_mark_dirty(pg_owner, page_to_mfn(page));
-
- clear_domain_page(page_to_mfn(page));
-
- put_page_and_type(page);
- break;
-
- case MMUEXT_COPY_PAGE:
- {
- struct page_info *src_page, *dst_page;
-
- src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt,
- P2M_ALLOC);
- if ( unlikely(p2mt != p2m_ram_rw) && src_page )
- {
- put_page(src_page);
- src_page = NULL;
- }
- if ( unlikely(!src_page) )
- {
- gdprintk(XENLOG_WARNING,
- "Error copying from mfn %" PRI_mfn "\n",
- op.arg2.src_mfn);
- rc = -EINVAL;
- break;
- }
-
- dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt,
- P2M_ALLOC);
- if ( unlikely(p2mt != p2m_ram_rw) && dst_page )
- {
- put_page(dst_page);
- dst_page = NULL;
- }
- rc = (dst_page &&
- get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL;
- if ( unlikely(rc) )
- {
- put_page(src_page);
- if ( dst_page )
- put_page(dst_page);
- gdprintk(XENLOG_WARNING,
- "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn);
- break;
- }
-
- /* A page is dirtied when it's being copied to. */
- paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
-
- copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
-
- put_page_and_type(dst_page);
- put_page(src_page);
- break;
- }
-
- case MMUEXT_MARK_SUPER:
- case MMUEXT_UNMARK_SUPER:
- rc = -EOPNOTSUPP;
- break;
-
- default:
- rc = -ENOSYS;
- break;
- }
-
- done:
- if ( unlikely(rc) )
- break;
-
- guest_handle_add_offset(uops, 1);
- }
-
- if ( rc == -ERESTART )
- {
- ASSERT(i < count);
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmuext_op, "hihi",
- uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
- }
- else if ( curr->arch.old_guest_table )
- {
- XEN_GUEST_HANDLE_PARAM(void) null;
-
- ASSERT(rc || i == count);
- set_xen_guest_handle(null, NULL);
- /*
- * In order to have a way to communicate the final return value to
- * our continuation, we pass this in place of "foreigndom", building
- * on the fact that this argument isn't needed anymore.
- */
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmuext_op, "hihi", null,
- MMU_UPDATE_PREEMPTED, null, rc);
- }
-
- put_pg_owner(pg_owner);
-
- perfc_add(num_mmuext_ops, i);
-
- /* Add incremental work we have done to the @done output parameter. */
- if ( unlikely(!guest_handle_is_null(pdone)) )
- {
- done += i;
- copy_to_guest(pdone, &done, 1);
- }
-
- return rc;
-}
-
-long do_mmu_update(
- XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs,
- unsigned int count,
- XEN_GUEST_HANDLE_PARAM(uint) pdone,
- unsigned int foreigndom)
-{
- struct mmu_update req;
- void *va = NULL;
- unsigned long gpfn, gmfn, mfn;
- struct page_info *page;
- unsigned int cmd, i = 0, done = 0, pt_dom;
- struct vcpu *curr = current, *v = curr;
- struct domain *d = v->domain, *pt_owner = d, *pg_owner;
- mfn_t map_mfn = INVALID_MFN;
- bool sync_guest = false;
- uint32_t xsm_needed = 0;
- uint32_t xsm_checked = 0;
- int rc = put_old_guest_table(curr);
-
- if ( unlikely(rc) )
- {
- if ( likely(rc == -ERESTART) )
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
- foreigndom);
- return rc;
- }
-
- if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
- likely(guest_handle_is_null(ureqs)) )
- {
- /*
- * See the curr->arch.old_guest_table related
- * hypercall_create_continuation() below.
- */
- return (int)foreigndom;
- }
-
- if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
- {
- count &= ~MMU_UPDATE_PREEMPTED;
- if ( unlikely(!guest_handle_is_null(pdone)) )
- (void)copy_from_guest(&done, pdone, 1);
- }
- else
- perfc_incr(calls_to_mmu_update);
-
- if ( unlikely(!guest_handle_okay(ureqs, count)) )
- return -EFAULT;
-
- if ( (pt_dom = foreigndom >> 16) != 0 )
- {
- /* Pagetables belong to a foreign domain (PFD). */
- if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
- return -ESRCH;
-
- if ( pt_owner == d )
- rcu_unlock_domain(pt_owner);
- else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL )
- {
- rc = -EINVAL;
- goto out;
- }
- }
-
- if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
- {
- rc = -ESRCH;
- goto out;
- }
-
- for ( i = 0; i < count; i++ )
- {
- if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
- {
- rc = -ERESTART;
- break;
- }
-
- if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
- {
- rc = -EFAULT;
- break;
- }
-
- cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
-
- switch ( cmd )
- {
- /*
- * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
- * MMU_UPDATE_PT_PRESERVE_AD: As above but also preserve (OR)
- * current A/D bits.
- */
- case MMU_NORMAL_PT_UPDATE:
- case MMU_PT_UPDATE_PRESERVE_AD:
- {
- p2m_type_t p2mt;
-
- rc = -EOPNOTSUPP;
- if ( unlikely(paging_mode_refcounts(pt_owner)) )
- break;
-
- xsm_needed |= XSM_MMU_NORMAL_UPDATE;
- if ( get_pte_flags(req.val) & _PAGE_PRESENT )
- {
- xsm_needed |= XSM_MMU_UPDATE_READ;
- if ( get_pte_flags(req.val) & _PAGE_RW )
- xsm_needed |= XSM_MMU_UPDATE_WRITE;
- }
- if ( xsm_needed != xsm_checked )
- {
- rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed);
- if ( rc )
- break;
- xsm_checked = xsm_needed;
- }
- rc = -EINVAL;
-
- req.ptr -= cmd;
- gmfn = req.ptr >> PAGE_SHIFT;
- page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC);
-
- if ( unlikely(!page) || p2mt != p2m_ram_rw )
- {
- if ( page )
- put_page(page);
- if ( p2m_is_paged(p2mt) )
- {
- p2m_mem_paging_populate(pt_owner, gmfn);
- rc = -ENOENT;
- }
- else
- gdprintk(XENLOG_WARNING,
- "Could not get page for normal update\n");
- break;
- }
-
- mfn = mfn_x(page_to_mfn(page));
-
- if ( !mfn_eq(_mfn(mfn), map_mfn) )
- {
- if ( va )
- unmap_domain_page(va);
- va = map_domain_page(_mfn(mfn));
- map_mfn = _mfn(mfn);
- }
- va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK));
-
- if ( page_lock(page) )
- {
- switch ( page->u.inuse.type_info & PGT_type_mask )
- {
- case PGT_l1_page_table:
- rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
- pg_owner);
- break;
-
- case PGT_l2_page_table:
- if ( unlikely(pg_owner != pt_owner) )
- break;
- rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
- break;
-
- case PGT_l3_page_table:
- if ( unlikely(pg_owner != pt_owner) )
- break;
- rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
- break;
-
- case PGT_l4_page_table:
- if ( unlikely(pg_owner != pt_owner) )
- break;
- rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
- /*
- * No need to sync if all uses of the page can be accounted
- * to the page lock we hold, its pinned status, and uses on
- * this (v)CPU.
- */
- if ( !rc && this_cpu(root_pgt) &&
- ((page->u.inuse.type_info & PGT_count_mask) >
- (1 + !!(page->u.inuse.type_info & PGT_pinned) +
- (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
- (pagetable_get_pfn(curr->arch.guest_table_user) ==
- mfn))) )
- sync_guest = true;
- break;
-
- case PGT_writable_page:
- perfc_incr(writable_mmu_updates);
- if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
- rc = 0;
- break;
- }
- page_unlock(page);
- if ( rc == -EINTR )
- rc = -ERESTART;
- }
- else if ( get_page_type(page, PGT_writable_page) )
- {
- perfc_incr(writable_mmu_updates);
- if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
- rc = 0;
- put_page_type(page);
- }
-
- put_page(page);
- }
- break;
-
- case MMU_MACHPHYS_UPDATE:
- if ( unlikely(d != pt_owner) )
- {
- rc = -EPERM;
- break;
- }
-
- if ( unlikely(paging_mode_translate(pg_owner)) )
- {
- rc = -EINVAL;
- break;
- }
-
- mfn = req.ptr >> PAGE_SHIFT;
- gpfn = req.val;
-
- xsm_needed |= XSM_MMU_MACHPHYS_UPDATE;
- if ( xsm_needed != xsm_checked )
- {
- rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed);
- if ( rc )
- break;
- xsm_checked = xsm_needed;
- }
-
- page = get_page_from_mfn(_mfn(mfn), pg_owner);
- if ( unlikely(!page) )
- {
- gdprintk(XENLOG_WARNING,
- "Could not get page for mach->phys update\n");
- rc = -EINVAL;
- break;
- }
-
- set_gpfn_from_mfn(mfn, gpfn);
-
- paging_mark_dirty(pg_owner, _mfn(mfn));
-
- put_page(page);
- break;
-
- default:
- rc = -ENOSYS;
- break;
- }
-
- if ( unlikely(rc) )
- break;
-
- guest_handle_add_offset(ureqs, 1);
- }
-
- if ( rc == -ERESTART )
- {
- ASSERT(i < count);
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmu_update, "hihi",
- ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
- }
- else if ( curr->arch.old_guest_table )
- {
- XEN_GUEST_HANDLE_PARAM(void) null;
-
- ASSERT(rc || i == count);
- set_xen_guest_handle(null, NULL);
- /*
- * In order to have a way to communicate the final return value to
- * our continuation, we pass this in place of "foreigndom", building
- * on the fact that this argument isn't needed anymore.
- */
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmu_update, "hihi", null,
- MMU_UPDATE_PREEMPTED, null, rc);
- }
-
- put_pg_owner(pg_owner);
-
- if ( va )
- unmap_domain_page(va);
-
- if ( sync_guest )
- {
- /*
- * Force other vCPU-s of the affected guest to pick up L4 entry
- * changes (if any). Issue a flush IPI with empty operation mask to
- * facilitate this (including ourselves waiting for the IPI to
- * actually have arrived). Utilize the fact that FLUSH_VA_VALID is
- * meaningless without FLUSH_CACHE, but will allow to pass the no-op
- * check in flush_area_mask().
- */
- unsigned int cpu = smp_processor_id();
- cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
-
- cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
- if ( !cpumask_empty(mask) )
- flush_area_mask(mask, ZERO_BLOCK_PTR, FLUSH_VA_VALID);
- }
-
- perfc_add(num_page_updates, i);
-
- out:
- if ( pt_owner != d )
- rcu_unlock_domain(pt_owner);
-
- /* Add incremental work we have done to the @done output parameter. */
- if ( unlikely(!guest_handle_is_null(pdone)) )
- {
- done += i;
- copy_to_guest(pdone, &done, 1);
- }
-
- return rc;
-}
-
-int donate_page(
- struct domain *d, struct page_info *page, unsigned int memflags)
-{
- const struct domain *owner = dom_xen;
-
- spin_lock(&d->page_alloc_lock);
-
- if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
- goto fail;
-
- if ( d->is_dying )
- goto fail;
-
- if ( page->count_info & ~(PGC_allocated | 1) )
- goto fail;
-
- if ( !(memflags & MEMF_no_refcount) )
- {
- if ( d->tot_pages >= d->max_pages )
- goto fail;
- domain_adjust_tot_pages(d, 1);
- }
-
- page->count_info = PGC_allocated | 1;
- page_set_owner(page, d);
- page_list_add_tail(page,&d->page_list);
-
- spin_unlock(&d->page_alloc_lock);
- return 0;
-
- fail:
- spin_unlock(&d->page_alloc_lock);
- gdprintk(XENLOG_WARNING, "Bad donate mfn %" PRI_mfn
- " to d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
- mfn_x(page_to_mfn(page)), d->domain_id,
- owner ? owner->domain_id : DOMID_INVALID,
- page->count_info, page->u.inuse.type_info);
- return -EINVAL;
-}
-
-int steal_page(
- struct domain *d, struct page_info *page, unsigned int memflags)
-{
- unsigned long x, y;
- bool drop_dom_ref = false;
- const struct domain *owner = dom_xen;
-
- if ( paging_mode_external(d) )
- return -EOPNOTSUPP;
-
- spin_lock(&d->page_alloc_lock);
-
- if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
- goto fail;
-
- /*
- * We require there is just one reference (PGC_allocated). We temporarily
- * drop this reference now so that we can safely swizzle the owner.
- */
- y = page->count_info;
- do {
- x = y;
- if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
- goto fail;
- y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
- } while ( y != x );
-
- /*
- * With the sole reference dropped temporarily, no-one can update type
- * information. Type count also needs to be zero in this case, but e.g.
- * PGT_seg_desc_page may still have PGT_validated set, which we need to
- * clear before transferring ownership (as validation criteria vary
- * depending on domain type).
- */
- BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
- PGT_pinned));
- page->u.inuse.type_info = 0;
-
- /* Swizzle the owner then reinstate the PGC_allocated reference. */
- page_set_owner(page, NULL);
- y = page->count_info;
- do {
- x = y;
- BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
- } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
-
- /* Unlink from original owner. */
- if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
- drop_dom_ref = true;
- page_list_del(page, &d->page_list);
-
- spin_unlock(&d->page_alloc_lock);
- if ( unlikely(drop_dom_ref) )
- put_domain(d);
- return 0;
+ spin_unlock(&d->page_alloc_lock);
+ if ( unlikely(drop_dom_ref) )
+ put_domain(d);
+ return 0;
fail:
spin_unlock(&d->page_alloc_lock);
@@ -3963,122 +1635,6 @@ int steal_page(
return -EINVAL;
}
-static int __do_update_va_mapping(
- unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
-{
- l1_pgentry_t val = l1e_from_intpte(val64);
- struct vcpu *v = current;
- struct domain *d = v->domain;
- struct page_info *gl1pg;
- l1_pgentry_t *pl1e;
- unsigned long bmap_ptr;
- mfn_t gl1mfn;
- cpumask_t *mask = NULL;
- int rc;
-
- perfc_incr(calls_to_update_va);
-
- rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val);
- if ( rc )
- return rc;
-
- rc = -EINVAL;
- pl1e = map_guest_l1e(va, &gl1mfn);
- gl1pg = pl1e ? get_page_from_mfn(gl1mfn, d) : NULL;
- if ( unlikely(!gl1pg) )
- goto out;
-
- if ( !page_lock(gl1pg) )
- {
- put_page(gl1pg);
- goto out;
- }
-
- if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
- {
- page_unlock(gl1pg);
- put_page(gl1pg);
- goto out;
- }
-
- rc = mod_l1_entry(pl1e, val, mfn_x(gl1mfn), 0, v, pg_owner);
-
- page_unlock(gl1pg);
- put_page(gl1pg);
-
- out:
- if ( pl1e )
- unmap_domain_page(pl1e);
-
- switch ( flags & UVMF_FLUSHTYPE_MASK )
- {
- case UVMF_TLB_FLUSH:
- switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
- {
- case UVMF_LOCAL:
- flush_tlb_local();
- break;
- case UVMF_ALL:
- mask = d->dirty_cpumask;
- break;
- default:
- mask = this_cpu(scratch_cpumask);
- rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
- void),
- mask);
- break;
- }
- if ( mask )
- flush_tlb_mask(mask);
- break;
-
- case UVMF_INVLPG:
- switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
- {
- case UVMF_LOCAL:
- paging_invlpg(v, va);
- break;
- case UVMF_ALL:
- mask = d->dirty_cpumask;
- break;
- default:
- mask = this_cpu(scratch_cpumask);
- rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
- void),
- mask);
- break;
- }
- if ( mask )
- flush_tlb_one_mask(mask, va);
- break;
- }
-
- return rc;
-}
-
-long do_update_va_mapping(unsigned long va, u64 val64,
- unsigned long flags)
-{
- return __do_update_va_mapping(va, val64, flags, current->domain);
-}
-
-long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
- unsigned long flags,
- domid_t domid)
-{
- struct domain *pg_owner;
- int rc;
-
- if ( (pg_owner = get_pg_owner(domid)) == NULL )
- return -ESRCH;
-
- rc = __do_update_va_mapping(va, val64, flags, pg_owner);
-
- put_pg_owner(pg_owner);
-
- return rc;
-}
-
typedef struct e820entry e820entry_t;
DEFINE_XEN_GUEST_HANDLE(e820entry_t);
diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 8d7a4fd85f..6504422ad1 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -20,9 +20,18 @@
*/
#include <xen/guest_access.h>
+#include <xen/hypercall.h>
+#include <xen/lib.h>
+#include <xen/mm.h>
+#include <xsm/xsm.h>
#include <asm/current.h>
+#include <asm/event.h>
+#include <asm/iocap.h>
+#include <asm/ldt.h>
#include <asm/p2m.h>
+#include <asm/pv/mm.h>
+#include <asm/shadow.h>
#include "mm.h"
@@ -133,6 +142,2449 @@ bool pv_map_ldt_shadow_page(unsigned int offset)
return true;
}
+/*
+ * PTE flags that a guest may change without re-validating the PTE.
+ * All other bits affect translation, caching, or Xen's safety.
+ */
+#define FASTPATH_FLAG_WHITELIST \
+ (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \
+ _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER)
+
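+
(Aside: the whitelist above feeds the "fast path for sufficiently-similar
mappings" checks in mod_l{1,2,3,4}_entry() further down: an update whose old
and new entries differ only in whitelisted flag bits is written with
UPDATE_ENTRY() directly, while any other change goes back through
get_page_from_l?e().  A minimal standalone illustration of that test follows;
names and the architectural bit values are local to the sketch, not part of
the patch, and the available-to-software bits are left out for brevity.

    #include <stdbool.h>
    #include <stdint.h>

    #define F_USER      (1ULL <<  2)
    #define F_ACCESSED  (1ULL <<  5)
    #define F_DIRTY     (1ULL <<  6)
    #define F_GLOBAL    (1ULL <<  8)
    #define F_NX        (1ULL << 63)

    #define SKETCH_WHITELIST (F_NX | F_GLOBAL | F_DIRTY | F_ACCESSED | F_USER)

    static bool takes_fast_path(uint64_t old_pte, uint64_t new_pte)
    {
        /* Any difference outside the whitelist (frame number, _PAGE_RW,
         * _PAGE_PRESENT, cache attributes, ...) forces full re-validation. */
        return !((old_pte ^ new_pte) & ~SKETCH_WHITELIST);
    }
)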
+static int get_page_and_type_from_mfn(
+ mfn_t mfn, unsigned long type, struct domain *d,
+ int partial, int preemptible)
+{
+ struct page_info *page = mfn_to_page(mfn);
+ int rc;
+
+ if ( likely(partial >= 0) &&
+ unlikely(!get_page_from_mfn(mfn, d)) )
+ return -EINVAL;
+
+ rc = (preemptible ?
+ get_page_type_preemptible(page, type) :
+ (get_page_type(page, type) ? 0 : -EINVAL));
+
+ if ( unlikely(rc) && partial >= 0 &&
+ (!preemptible || page != current->arch.old_guest_table) )
+ put_page(page);
+
+ return rc;
+}
+
+static void put_data_page(
+ struct page_info *page, int writeable)
+{
+ if ( writeable )
+ put_page_and_type(page);
+ else
+ put_page(page);
+}
+
+static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
+{
+ struct page_info *page;
+ l3_pgentry_t l3e3;
+
+ if ( !is_pv_32bit_domain(d) )
+ return 1;
+
+ pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
+
+ /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
+ l3e3 = pl3e[3];
+ if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
+ {
+ gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is empty\n");
+ return 0;
+ }
+
+ /*
+ * The Xen-private mappings include linear mappings. The L2 thus cannot
+ * be shared by multiple L3 tables. The test here is adequate because:
+ * 1. Cannot appear in slots != 3 because get_page_type() checks the
+ * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
+ * 2. Cannot appear in another page table's L3:
+ * a. alloc_l3_table() calls this function and this check will fail
+ * b. mod_l3_entry() disallows updates to slot 3 in an existing table
+ */
+ page = l3e_get_page(l3e3);
+ BUG_ON(page->u.inuse.type_info & PGT_pinned);
+ BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
+ BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
+ if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
+ {
+ gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+#ifdef CONFIG_PV_LINEAR_PT
+
+static bool inc_linear_entries(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+ do {
+ /*
+ * The check below checks for the "linear use" count being non-zero
+ * as well as overflow. Signed integer overflow is undefined behavior
+ * according to the C spec. However, as long as linear_pt_count is
+ * smaller in size than 'int', the arithmetic operation of the
+ * increment below won't overflow; rather the result will be truncated
+ * when stored. Ensure that this is always true.
+ */
+ BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+ oc = nc++;
+ if ( nc <= 0 )
+ return false;
+ nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+ } while ( oc != nc );
+
+ return true;
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) oc;
+
+ oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+ ASSERT(oc > 0);
+}
+
+static bool inc_linear_uses(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+ do {
+ /* See the respective comment in inc_linear_entries(). */
+ BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+ oc = nc--;
+ if ( nc >= 0 )
+ return false;
+ nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+ } while ( oc != nc );
+
+ return true;
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) oc;
+
+ oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
+ ASSERT(oc < 0);
+}
+
+/*
+ * We allow root tables to map each other (a.k.a. linear page tables). It
+ * needs some special care with reference counts and access permissions:
+ * 1. The mapping entry must be read-only, or the guest may get write access
+ * to its own PTEs.
+ * 2. We must only bump the reference counts for an *already validated*
+ * L2 table, or we can end up in a deadlock in get_page_type() by waiting
+ * on a validation that is required to complete that validation.
+ * 3. We only need to increment the reference counts for the mapped page
+ * frame if it is mapped by a different root table. This is sufficient and
+ * also necessary to allow validation of a root table mapping itself.
+ */
+static bool __read_mostly opt_pv_linear_pt = true;
+boolean_param("pv-linear-pt", opt_pv_linear_pt);
+
+#define define_get_linear_pagetable(level) \
+static int \
+get_##level##_linear_pagetable( \
+ level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
+{ \
+ unsigned long x, y; \
+ unsigned long pfn; \
+ \
+ if ( !opt_pv_linear_pt ) \
+ { \
+ gdprintk(XENLOG_WARNING, \
+ "Attempt to create linear p.t. (feature disabled)\n"); \
+ return 0; \
+ } \
+ \
+ if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
+ { \
+ gdprintk(XENLOG_WARNING, \
+ "Attempt to create linear p.t. with write perms\n"); \
+ return 0; \
+ } \
+ \
+ if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
+ { \
+ struct page_info *page, *ptpg = mfn_to_page(_mfn(pde_pfn)); \
+ \
+ /* Make sure the page table belongs to the correct domain. */ \
+ if ( unlikely(page_get_owner(ptpg) != d) ) \
+ return 0; \
+ \
+ /* Make sure the mapped frame belongs to the correct domain. */ \
+ page = get_page_from_mfn(_mfn(pfn), d); \
+ if ( unlikely(!page) ) \
+ return 0; \
+ \
+ /* \
+ * Ensure that the mapped frame is an already-validated page table \
+ * and is not itself having linear entries, as well as that the \
+ * containing page table is not itself in use as a linear page table \
+ * elsewhere. \
+ * If so, atomically increment the count (checking for overflow). \
+ */ \
+ if ( !inc_linear_entries(ptpg) ) \
+ { \
+ put_page(page); \
+ return 0; \
+ } \
+ if ( !inc_linear_uses(page) ) \
+ { \
+ dec_linear_entries(ptpg); \
+ put_page(page); \
+ return 0; \
+ } \
+ y = page->u.inuse.type_info; \
+ do { \
+ x = y; \
+ if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
+ unlikely((x & (PGT_type_mask|PGT_validated)) != \
+ (PGT_##level##_page_table|PGT_validated)) ) \
+ { \
+ dec_linear_uses(page); \
+ dec_linear_entries(ptpg); \
+ put_page(page); \
+ return 0; \
+ } \
+ } \
+ while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
+ } \
+ \
+ return 1; \
+}
+
+#else /* CONFIG_PV_LINEAR_PT */
+
+#define define_get_linear_pagetable(level) \
+static int \
+get_##level##_linear_pagetable( \
+ level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
+{ \
+ return 0; \
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+ ASSERT(pg->linear_pt_count == 0);
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+ ASSERT(pg->linear_pt_count == 0);
+}
+
+#endif /* CONFIG_PV_LINEAR_PT */
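+
(Aside: to make the rules in the comment above concrete, here is a small
standalone model of the acceptance check that get_l{2,3,4}_linear_pagetable()
performs before bumping any counts; types and names are local to this sketch,
not part of the patch.

    #include <stdbool.h>

    struct sk_table {
        unsigned long pfn;
        int owner_id;
        bool validated;        /* has completed get_page_type() validation */
        int linear_entries;    /* number of linear entries this table holds */
        int linear_uses;       /* number of linear entries pointing at it */
    };

    static bool may_create_linear(const struct sk_table *pt,
                                  const struct sk_table *target,
                                  bool entry_is_writable, int caller_id)
    {
        if ( entry_is_writable )              /* rule 1: read-only only */
            return false;
        if ( pt->pfn == target->pfn )         /* rule 3: self-map, no extra refs */
            return true;
        if ( pt->owner_id != caller_id || target->owner_id != caller_id )
            return false;
        if ( !target->validated )             /* rule 2: already validated */
            return false;
        /* A table may not both contain linear entries and be the target of
         * one; the real code enforces this via the signed linear_pt_count. */
        return target->linear_entries == 0 && pt->linear_uses == 0;
    }
)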
+
+/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
+/*
+ * get_page_from_l2e returns:
+ * 1 => page not present
+ * 0 => success
+ * <0 => error code
+ */
+define_get_linear_pagetable(l2);
+static int
+get_page_from_l2e(
+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
+{
+ unsigned long mfn = l2e_get_pfn(l2e);
+ int rc;
+
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+ return 1;
+
+ if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
+ {
+ gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
+ l2e_get_flags(l2e) & L2_DISALLOW_MASK);
+ return -EINVAL;
+ }
+
+ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
+ if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+ rc = 0;
+
+ return rc;
+}
+
+
+/*
+ * get_page_from_l3e returns:
+ * 1 => page not present
+ * 0 => success
+ * <0 => error code
+ */
+define_get_linear_pagetable(l3);
+static int
+get_page_from_l3e(
+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
+{
+ int rc;
+
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+ return 1;
+
+ if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
+ {
+ gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
+ l3e_get_flags(l3e) & l3_disallow_mask(d));
+ return -EINVAL;
+ }
+
+ rc = get_page_and_type_from_mfn(
+ l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
+ if ( unlikely(rc == -EINVAL) &&
+ !is_pv_32bit_domain(d) &&
+ get_l3_linear_pagetable(l3e, pfn, d) )
+ rc = 0;
+
+ return rc;
+}
+
+/*
+ * get_page_from_l4e returns:
+ * 1 => page not present
+ * 0 => success
+ * <0 => error code
+ */
+define_get_linear_pagetable(l4);
+static int
+get_page_from_l4e(
+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
+{
+ int rc;
+
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+ return 1;
+
+ if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
+ {
+ gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
+ l4e_get_flags(l4e) & L4_DISALLOW_MASK);
+ return -EINVAL;
+ }
+
+ rc = get_page_and_type_from_mfn(
+ l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
+ if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
+ rc = 0;
+
+ return rc;
+}
+
+/*
+ * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
+ * Note also that this automatically deals correctly with linear p.t.'s.
+ */
+static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+{
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
+ return 1;
+
+ if ( l2e_get_flags(l2e) & _PAGE_PSE )
+ {
+ struct page_info *page = l2e_get_page(l2e);
+ unsigned int i;
+
+ for ( i = 0; i < (1u << PAGETABLE_ORDER); i++, page++ )
+ put_page_and_type(page);
+ }
+ else
+ {
+ struct page_info *pg = l2e_get_page(l2e);
+ int rc = put_page_type_ptpg(pg, mfn_to_page(_mfn(pfn)));
+
+ ASSERT(!rc);
+ put_page(pg);
+ }
+
+ return 0;
+}
+
+static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ int partial, bool defer)
+{
+ struct page_info *pg;
+ int rc;
+
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
+ return 1;
+
+ if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
+ {
+ unsigned long mfn = l3e_get_pfn(l3e);
+ int writeable = l3e_get_flags(l3e) & _PAGE_RW;
+
+ ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+ do {
+ put_data_page(mfn_to_page(_mfn(mfn)), writeable);
+ } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
+
+ return 0;
+ }
+
+ pg = l3e_get_page(l3e);
+
+ if ( unlikely(partial > 0) )
+ {
+ ASSERT(!defer);
+ return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+ }
+
+ if ( defer )
+ {
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
+ return 0;
+ }
+
+ rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+
+ return rc;
+}
+
+static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ int partial, bool defer)
+{
+ int rc = 1;
+
+ if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
+ (l4e_get_pfn(l4e) != pfn) )
+ {
+ struct page_info *pg = l4e_get_page(l4e);
+
+ if ( unlikely(partial > 0) )
+ {
+ ASSERT(!defer);
+ return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+ }
+
+ if ( defer )
+ {
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
+ return 0;
+ }
+
+ rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+ }
+
+ return rc;
+}
+
+/* Update the L1 entry at pl1e to new value nl1e. */
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
+ unsigned long gl1mfn, int preserve_ad,
+ struct vcpu *pt_vcpu, struct domain *pg_dom)
+{
+ l1_pgentry_t ol1e;
+ struct domain *pt_dom = pt_vcpu->domain;
+ int rc = 0;
+
+ if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
+ return -EFAULT;
+
+ ASSERT(!paging_mode_refcounts(pt_dom));
+
+ if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
+ {
+ struct page_info *page = NULL;
+
+ if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
+ {
+ gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
+ l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
+ return -EINVAL;
+ }
+
+ /* Translate foreign guest address. */
+ if ( paging_mode_translate(pg_dom) )
+ {
+ p2m_type_t p2mt;
+ p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ?
+ P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC;
+
+ page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), &p2mt, q);
+
+ if ( p2m_is_paged(p2mt) )
+ {
+ if ( page )
+ put_page(page);
+ p2m_mem_paging_populate(pg_dom, l1e_get_pfn(nl1e));
+ return -ENOENT;
+ }
+
+ if ( p2mt == p2m_ram_paging_in && !page )
+ return -ENOENT;
+
+ /* Did our attempt to unshare fail? */
+ if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
+ {
+ /* We could not have obtained a page ref. */
+ ASSERT(!page);
+ /* And mem_sharing_notify has already been called. */
+ return -ENOMEM;
+ }
+
+ if ( !page )
+ return -EINVAL;
+ nl1e = l1e_from_page(page, l1e_get_flags(nl1e));
+ }
+
+ /* Fast path for sufficiently-similar mappings. */
+ if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) )
+ {
+ nl1e = adjust_guest_l1e(nl1e, pt_dom);
+ rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+ preserve_ad);
+ if ( page )
+ put_page(page);
+ return rc ? 0 : -EBUSY;
+ }
+
+ switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom,
+ l1_disallow_mask(pt_dom)) )
+ {
+ default:
+ if ( page )
+ put_page(page);
+ return rc;
+ case 0:
+ break;
+ case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
+ ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
+ l1e_flip_flags(nl1e, rc);
+ rc = 0;
+ break;
+ }
+ if ( page )
+ put_page(page);
+
+ nl1e = adjust_guest_l1e(nl1e, pt_dom);
+ if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+ preserve_ad)) )
+ {
+ ol1e = nl1e;
+ rc = -EBUSY;
+ }
+ }
+ else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+ preserve_ad)) )
+ {
+ return -EBUSY;
+ }
+
+ put_page_from_l1e(ol1e, pt_dom);
+ return rc;
+}
+
+
+/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
+static int mod_l2_entry(l2_pgentry_t *pl2e,
+ l2_pgentry_t nl2e,
+ unsigned long pfn,
+ int preserve_ad,
+ struct vcpu *vcpu)
+{
+ l2_pgentry_t ol2e;
+ struct domain *d = vcpu->domain;
+ struct page_info *l2pg = mfn_to_page(_mfn(pfn));
+ unsigned long type = l2pg->u.inuse.type_info;
+ int rc = 0;
+
+ if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
+ {
+ gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n",
+ pgentry_ptr_to_slot(pl2e));
+ return -EPERM;
+ }
+
+ if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
+ return -EFAULT;
+
+ if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
+ {
+ if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
+ {
+ gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
+ l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
+ return -EINVAL;
+ }
+
+ /* Fast path for sufficiently-similar mappings. */
+ if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_FLAG_WHITELIST) )
+ {
+ nl2e = adjust_guest_l2e(nl2e, d);
+ if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) )
+ return 0;
+ return -EBUSY;
+ }
+
+ if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
+ return rc;
+
+ nl2e = adjust_guest_l2e(nl2e, d);
+ if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
+ preserve_ad)) )
+ {
+ ol2e = nl2e;
+ rc = -EBUSY;
+ }
+ }
+ else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
+ preserve_ad)) )
+ {
+ return -EBUSY;
+ }
+
+ put_page_from_l2e(ol2e, pfn);
+ return rc;
+}
+
+/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
+static int mod_l3_entry(l3_pgentry_t *pl3e,
+ l3_pgentry_t nl3e,
+ unsigned long pfn,
+ int preserve_ad,
+ struct vcpu *vcpu)
+{
+ l3_pgentry_t ol3e;
+ struct domain *d = vcpu->domain;
+ int rc = 0;
+
+ /*
+ * Disallow updates to final L3 slot. It contains Xen mappings, and it
+ * would be a pain to ensure they remain continuously valid throughout.
+ */
+ if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
+ return -EINVAL;
+
+ if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
+ return -EFAULT;
+
+ if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
+ {
+ if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
+ {
+ gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
+ l3e_get_flags(nl3e) & l3_disallow_mask(d));
+ return -EINVAL;
+ }
+
+ /* Fast path for sufficiently-similar mappings. */
+ if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_FLAG_WHITELIST) )
+ {
+ nl3e = adjust_guest_l3e(nl3e, d);
+ rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
+ return rc ? 0 : -EFAULT;
+ }
+
+ rc = get_page_from_l3e(nl3e, pfn, d, 0);
+ if ( unlikely(rc < 0) )
+ return rc;
+ rc = 0;
+
+ nl3e = adjust_guest_l3e(nl3e, d);
+ if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
+ preserve_ad)) )
+ {
+ ol3e = nl3e;
+ rc = -EFAULT;
+ }
+ }
+ else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
+ preserve_ad)) )
+ {
+ return -EFAULT;
+ }
+
+ if ( likely(rc == 0) )
+ if ( !create_pae_xen_mappings(d, pl3e) )
+ BUG();
+
+ put_page_from_l3e(ol3e, pfn, 0, 1);
+ return rc;
+}
+
+/* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
+static int mod_l4_entry(l4_pgentry_t *pl4e,
+ l4_pgentry_t nl4e,
+ unsigned long pfn,
+ int preserve_ad,
+ struct vcpu *vcpu)
+{
+ struct domain *d = vcpu->domain;
+ l4_pgentry_t ol4e;
+ int rc = 0;
+
+ if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
+ {
+ gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n",
+ pgentry_ptr_to_slot(pl4e));
+ return -EINVAL;
+ }
+
+ if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
+ return -EFAULT;
+
+ if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
+ {
+ if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
+ {
+ gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
+ l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
+ return -EINVAL;
+ }
+
+ /* Fast path for sufficiently-similar mappings. */
+ if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_FLAG_WHITELIST) )
+ {
+ nl4e = adjust_guest_l4e(nl4e, d);
+ rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
+ return rc ? 0 : -EFAULT;
+ }
+
+ rc = get_page_from_l4e(nl4e, pfn, d, 0);
+ if ( unlikely(rc < 0) )
+ return rc;
+ rc = 0;
+
+ nl4e = adjust_guest_l4e(nl4e, d);
+ if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
+ preserve_ad)) )
+ {
+ ol4e = nl4e;
+ rc = -EFAULT;
+ }
+ }
+ else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
+ preserve_ad)) )
+ {
+ return -EFAULT;
+ }
+
+ put_page_from_l4e(ol4e, pfn, 0, 1);
+ return rc;
+}
+
+static int alloc_l1_table(struct page_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ l1_pgentry_t *pl1e;
+ unsigned int i;
+ int ret = 0;
+
+ pl1e = __map_domain_page(page);
+
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ {
+ switch ( ret = get_page_from_l1e(pl1e[i], d, d, l1_disallow_mask(d)) )
+ {
+ default:
+ goto fail;
+ case 0:
+ break;
+ case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
+ ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
+ l1e_flip_flags(pl1e[i], ret);
+ break;
+ }
+
+ pl1e[i] = adjust_guest_l1e(pl1e[i], d);
+ }
+
+ unmap_domain_page(pl1e);
+ return 0;
+
+ fail:
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
+ while ( i-- > 0 )
+ put_page_from_l1e(pl1e[i], d);
+
+ unmap_domain_page(pl1e);
+ return ret;
+}
+
+static int alloc_l2_table(struct page_info *page, unsigned long type,
+ int preemptible)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l2_pgentry_t *pl2e;
+ unsigned int i;
+ int rc = 0;
+
+ pl2e = map_domain_page(_mfn(pfn));
+
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
+ {
+ if ( preemptible && i > page->nr_validated_ptes
+ && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ rc = -ERESTART;
+ break;
+ }
+
+ if ( !is_guest_l2_slot(d, type, i) ||
+ (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
+ continue;
+
+ if ( rc < 0 )
+ {
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
+ while ( i-- > 0 )
+ if ( is_guest_l2_slot(d, type, i) )
+ put_page_from_l2e(pl2e[i], pfn);
+ break;
+ }
+
+ pl2e[i] = adjust_guest_l2e(pl2e[i], d);
+ }
+
+ if ( rc >= 0 && (type & PGT_pae_xen_l2) )
+ init_xen_pae_l2_slots(pl2e, d);
+
+ unmap_domain_page(pl2e);
+ return rc > 0 ? 0 : rc;
+}
+
+static int alloc_l3_table(struct page_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l3_pgentry_t *pl3e;
+ unsigned int i;
+ int rc = 0, partial = page->partial_pte;
+
+ pl3e = map_domain_page(_mfn(pfn));
+
+ /*
+ * PAE guests allocate full pages, but aren't required to initialize
+ * more than the first four entries; when running in compatibility
+ * mode, however, the full page is visible to the MMU, and hence all
+ * 512 entries must be valid/verified, which is most easily achieved
+ * by clearing them out.
+ */
+ if ( is_pv_32bit_domain(d) )
+ memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
+
+ for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+ i++, partial = 0 )
+ {
+ if ( is_pv_32bit_domain(d) && (i == 3) )
+ {
+ if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
+ (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
+ rc = -EINVAL;
+ else
+ rc = get_page_and_type_from_mfn(
+ l3e_get_mfn(pl3e[i]),
+ PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
+ }
+ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
+ continue;
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = partial ?: 1;
+ }
+ else if ( rc == -EINTR && i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ rc = -ERESTART;
+ }
+ if ( rc < 0 )
+ break;
+
+ pl3e[i] = adjust_guest_l3e(pl3e[i], d);
+ }
+
+ if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
+ rc = -EINVAL;
+ if ( rc < 0 && rc != -ERESTART && rc != -EINTR )
+ {
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l3_table: slot %#x\n", i);
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+ while ( i-- > 0 )
+ pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+ }
+
+ unmap_domain_page(pl3e);
+ return rc > 0 ? 0 : rc;
+}
+
+static int alloc_l4_table(struct page_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
+ unsigned int i;
+ int rc = 0, partial = page->partial_pte;
+
+ for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+ i++, partial = 0 )
+ {
+ if ( !is_guest_l4_slot(d, i) ||
+ (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
+ continue;
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = partial ?: 1;
+ }
+ else if ( rc < 0 )
+ {
+ if ( rc != -EINTR )
+ gdprintk(XENLOG_WARNING,
+ "Failure in alloc_l4_table: slot %#x\n", i);
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ if ( rc == -EINTR )
+ rc = -ERESTART;
+ else
+ {
+ if ( current->arch.old_guest_table )
+ page->nr_validated_ptes++;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+ }
+ }
+ if ( rc < 0 )
+ {
+ unmap_domain_page(pl4e);
+ return rc;
+ }
+
+ pl4e[i] = adjust_guest_l4e(pl4e[i], d);
+ }
+
+ if ( rc >= 0 )
+ {
+ init_xen_l4_slots(pl4e, _mfn(pfn),
+ d, INVALID_MFN, VM_ASSIST(d, m2p_strict));
+ atomic_inc(&d->arch.pv_domain.nr_l4_pages);
+ rc = 0;
+ }
+ unmap_domain_page(pl4e);
+
+ return rc;
+}
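+
(Aside: alloc_l2_table() and alloc_l{3,4}_table() above all follow the same
preemption pattern: progress is parked in page->nr_validated_ptes (plus
partial_pte for the preemptible levels) and -ERESTART asks the caller to set
up a hypercall continuation, so the next invocation resumes at the recorded
slot.  A standalone sketch of just that pattern, with stand-in helpers whose
names are local to the sketch:

    #include <stdbool.h>

    #define SK_ENTRIES   512
    #define SK_ERESTART  (-85)   /* stand-in for -ERESTART */

    struct sk_page {
        unsigned int nr_validated_ptes;   /* resume point kept in the page */
    };

    /* Stand-ins for hypercall_preempt_check() / get_page_from_l?e(). */
    static bool sk_preempt_check(void) { return false; }
    static int  sk_validate_slot(unsigned int slot) { (void)slot; return 0; }

    static int sk_validate_table(struct sk_page *pg)
    {
        unsigned int i;

        for ( i = pg->nr_validated_ptes; i < SK_ENTRIES; i++ )
        {
            if ( i > pg->nr_validated_ptes && sk_preempt_check() )
            {
                pg->nr_validated_ptes = i;  /* park progress in the page... */
                return SK_ERESTART;         /* ...and request a continuation */
            }
            if ( sk_validate_slot(i) < 0 )
                return -1;                  /* caller unwinds slots [0, i) */
        }
        return 0;
    }
)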
+
+static void free_l1_table(struct page_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ l1_pgentry_t *pl1e;
+ unsigned int i;
+
+ pl1e = __map_domain_page(page);
+
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ put_page_from_l1e(pl1e[i], d);
+
+ unmap_domain_page(pl1e);
+}
+
+
+static int free_l2_table(struct page_info *page, int preemptible)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l2_pgentry_t *pl2e;
+ unsigned int i = page->nr_validated_ptes - 1;
+ int err = 0;
+
+ pl2e = map_domain_page(_mfn(pfn));
+
+ ASSERT(page->nr_validated_ptes);
+ do {
+ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+ put_page_from_l2e(pl2e[i], pfn) == 0 &&
+ preemptible && i && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ err = -ERESTART;
+ }
+ } while ( !err && i-- );
+
+ unmap_domain_page(pl2e);
+
+ if ( !err )
+ page->u.inuse.type_info &= ~PGT_pae_xen_l2;
+
+ return err;
+}
+
+static int free_l3_table(struct page_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l3_pgentry_t *pl3e;
+ int rc = 0, partial = page->partial_pte;
+ unsigned int i = page->nr_validated_ptes - !partial;
+
+ pl3e = map_domain_page(_mfn(pfn));
+
+ do {
+ rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
+ if ( rc < 0 )
+ break;
+ partial = 0;
+ if ( rc > 0 )
+ continue;
+ pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+ } while ( i-- );
+
+ unmap_domain_page(pl3e);
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = partial ?: -1;
+ }
+ else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+ page->partial_pte = 0;
+ rc = -ERESTART;
+ }
+ return rc > 0 ? 0 : rc;
+}
+
+static int free_l4_table(struct page_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
+ int rc = 0, partial = page->partial_pte;
+ unsigned int i = page->nr_validated_ptes - !partial;
+
+ do {
+ if ( is_guest_l4_slot(d, i) )
+ rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
+ if ( rc < 0 )
+ break;
+ partial = 0;
+ } while ( i-- );
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = partial ?: -1;
+ }
+ else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+ page->partial_pte = 0;
+ rc = -ERESTART;
+ }
+
+ unmap_domain_page(pl4e);
+
+ if ( rc >= 0 )
+ {
+ atomic_dec(&d->arch.pv_domain.nr_l4_pages);
+ rc = 0;
+ }
+
+ return rc;
+}
+
+
+void pv_dec_linear_pt(struct page_info *ptpg, struct page_info *page,
+ unsigned long type)
+{
+ if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
+ {
+ ASSERT(is_pv_domain(page_get_owner(page)));
+ ASSERT(is_pv_domain(page_get_owner(ptpg)));
+
+ dec_linear_uses(page);
+ dec_linear_entries(ptpg);
+ }
+}
+
+/*
+ * Special version of get_page() to be used exclusively when
+ * - a page is known to already have a non-zero reference count
+ * - the page does not need its owner to be checked
+ * - it will not be called more than once without dropping the thus
+ * acquired reference again.
+ * Due to get_page() reserving one reference, this call cannot fail.
+ */
+static void get_page_light(struct page_info *page)
+{
+ unsigned long x, nx, y = page->count_info;
+
+ do {
+ x = y;
+ nx = x + 1;
+ BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
+ BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
+ y = cmpxchg(&page->count_info, x, nx);
+ }
+ while ( unlikely(y != x) );
+}
+
+int pv_put_final_page_type(struct page_info *page, unsigned long type,
+ bool preemptible, struct page_info *ptpg)
+{
+ int rc = pv_free_page_type(page, type, preemptible);
+
+ /* No need for atomic update of type_info here: noone else updates it. */
+ if ( rc == 0 )
+ {
+ pv_dec_linear_pt(ptpg, page, type);
+ ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
+ set_tlbflush_timestamp(page);
+ smp_wmb();
+ page->u.inuse.type_info--;
+ }
+ else if ( rc == -EINTR )
+ {
+ ASSERT((page->u.inuse.type_info &
+ (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+ smp_wmb();
+ page->u.inuse.type_info |= PGT_validated;
+ }
+ else
+ {
+ BUG_ON(rc != -ERESTART);
+ smp_wmb();
+ get_page_light(page);
+ page->u.inuse.type_info |= PGT_partial;
+ }
+
+ return rc;
+}
+
+static int alloc_segdesc_page(struct page_info *page)
+{
+ const struct domain *owner = page_get_owner(page);
+ struct desc_struct *descs = __map_domain_page(page);
+ unsigned i;
+
+ for ( i = 0; i < 512; i++ )
+ if ( unlikely(!check_descriptor(owner, &descs[i])) )
+ break;
+
+ unmap_domain_page(descs);
+
+ return i == 512 ? 0 : -EINVAL;
+}
+
+int pv_alloc_page_type(struct page_info *page, unsigned long type,
+ bool preemptible)
+{
+ struct domain *owner = page_get_owner(page);
+ int rc;
+
+ /* A page table is dirtied when its type count becomes non-zero. */
+ if ( likely(owner != NULL) )
+ paging_mark_dirty(owner, page_to_mfn(page));
+
+ switch ( type & PGT_type_mask )
+ {
+ case PGT_l1_page_table:
+ rc = alloc_l1_table(page);
+ break;
+ case PGT_l2_page_table:
+ rc = alloc_l2_table(page, type, preemptible);
+ break;
+ case PGT_l3_page_table:
+ ASSERT(preemptible);
+ rc = alloc_l3_table(page);
+ break;
+ case PGT_l4_page_table:
+ ASSERT(preemptible);
+ rc = alloc_l4_table(page);
+ break;
+ case PGT_seg_desc_page:
+ rc = alloc_segdesc_page(page);
+ break;
+ default:
+ printk("Bad type in %s %lx t=%" PRtype_info " c=%lx\n", __func__,
+ type, page->u.inuse.type_info,
+ page->count_info);
+ rc = -EINVAL;
+ BUG();
+ }
+
+ /* No need for atomic update of type_info here: noone else updates it. */
+ smp_wmb();
+ switch ( rc )
+ {
+ case 0:
+ page->u.inuse.type_info |= PGT_validated;
+ break;
+ case -EINTR:
+ ASSERT((page->u.inuse.type_info &
+ (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+ page->u.inuse.type_info &= ~PGT_count_mask;
+ break;
+ default:
+ ASSERT(rc < 0);
+ gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn
+ " (pfn %" PRI_pfn ") for type %" PRtype_info
+ ": caf=%08lx taf=%" PRtype_info "\n",
+ mfn_x(page_to_mfn(page)),
+ get_gpfn_from_mfn(mfn_x(page_to_mfn(page))),
+ type, page->count_info, page->u.inuse.type_info);
+ if ( page != current->arch.old_guest_table )
+ page->u.inuse.type_info = 0;
+ else
+ {
+ ASSERT((page->u.inuse.type_info &
+ (PGT_count_mask | PGT_validated)) == 1);
+ case -ERESTART:
+ get_page_light(page);
+ page->u.inuse.type_info |= PGT_partial;
+ }
+ break;
+ }
+
+ return rc;
+}
+
+
+int pv_free_page_type(struct page_info *page, unsigned long type,
+ bool preemptible)
+{
+ struct domain *owner = page_get_owner(page);
+ unsigned long gmfn;
+ int rc;
+
+ if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
+ {
+ /* A page table is dirtied when its type count becomes zero. */
+ paging_mark_dirty(owner, page_to_mfn(page));
+
+ ASSERT(!shadow_mode_refcounts(owner));
+
+ gmfn = mfn_to_gmfn(owner, mfn_x(page_to_mfn(page)));
+ ASSERT(VALID_M2P(gmfn));
+ /* Page sharing not supported for shadowed domains */
+ if(!SHARED_M2P(gmfn))
+ shadow_remove_all_shadows(owner, _mfn(gmfn));
+ }
+
+ if ( !(type & PGT_partial) )
+ {
+ page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
+ page->partial_pte = 0;
+ }
+
+ switch ( type & PGT_type_mask )
+ {
+ case PGT_l1_page_table:
+ free_l1_table(page);
+ rc = 0;
+ break;
+ case PGT_l2_page_table:
+ rc = free_l2_table(page, preemptible);
+ break;
+ case PGT_l3_page_table:
+ ASSERT(preemptible);
+ rc = free_l3_table(page);
+ break;
+ case PGT_l4_page_table:
+ ASSERT(preemptible);
+ rc = free_l4_table(page);
+ break;
+ default:
+ gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
+ type, mfn_x(page_to_mfn(page)));
+ rc = -EINVAL;
+ BUG();
+ }
+
+ return rc;
+}
+
+int new_guest_cr3(mfn_t mfn)
+{
+ struct vcpu *curr = current;
+ struct domain *d = curr->domain;
+ int rc;
+ mfn_t old_base_mfn;
+
+ if ( is_pv_32bit_domain(d) )
+ {
+ mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table);
+ l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
+
+ rc = mod_l4_entry(pl4e,
+ l4e_from_mfn(mfn,
+ (_PAGE_PRESENT | _PAGE_RW |
+ _PAGE_USER | _PAGE_ACCESSED)),
+ mfn_x(gt_mfn), 0, curr);
+ unmap_domain_page(pl4e);
+ switch ( rc )
+ {
+ case 0:
+ break;
+ case -EINTR:
+ case -ERESTART:
+ return -ERESTART;
+ default:
+ gdprintk(XENLOG_WARNING,
+ "Error while installing new compat baseptr %" PRI_mfn
"\n",
+ mfn_x(mfn));
+ return rc;
+ }
+
+ pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
+ write_ptbase(curr);
+
+ return 0;
+ }
+
+ rc = put_old_guest_table(curr);
+ if ( unlikely(rc) )
+ return rc;
+
+ old_base_mfn = pagetable_get_mfn(curr->arch.guest_table);
+ /*
+ * This is particularly important when getting restarted after the
+ * previous attempt got preempted in the put-old-MFN phase.
+ */
+ if ( mfn_eq(old_base_mfn, mfn) )
+ {
+ write_ptbase(curr);
+ return 0;
+ }
+
+ rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
+ switch ( rc )
+ {
+ case 0:
+ break;
+ case -EINTR:
+ case -ERESTART:
+ return -ERESTART;
+ default:
+ gdprintk(XENLOG_WARNING,
+ "Error while installing new baseptr %" PRI_mfn "\n",
+ mfn_x(mfn));
+ return rc;
+ }
+
+ pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
+
+ if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
+ fill_ro_mpt(mfn);
+ curr->arch.guest_table = pagetable_from_mfn(mfn);
+ update_cr3(curr);
+
+ write_ptbase(curr);
+
+ if ( likely(mfn_x(old_base_mfn) != 0) )
+ {
+ struct page_info *page = mfn_to_page(old_base_mfn);
+
+ if ( paging_mode_refcounts(d) )
+ put_page(page);
+ else
+ switch ( rc = put_page_and_type_preemptible(page) )
+ {
+ case -EINTR:
+ rc = -ERESTART;
+ /* fallthrough */
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
+ break;
+ default:
+ BUG_ON(rc);
+ break;
+ }
+ }
+
+ return rc;
+}
+
+static struct domain *get_pg_owner(domid_t domid)
+{
+ struct domain *pg_owner = NULL, *curr = current->domain;
+
+ if ( likely(domid == DOMID_SELF) )
+ {
+ pg_owner = rcu_lock_current_domain();
+ goto out;
+ }
+
+ if ( unlikely(domid == curr->domain_id) )
+ {
+ gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
+ goto out;
+ }
+
+ switch ( domid )
+ {
+ case DOMID_IO:
+ pg_owner = rcu_lock_domain(dom_io);
+ break;
+ case DOMID_XEN:
+ pg_owner = rcu_lock_domain(dom_xen);
+ break;
+ default:
+ if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
+ {
+ gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
+ break;
+ }
+ break;
+ }
+
+ out:
+ return pg_owner;
+}
+
+static void put_pg_owner(struct domain *pg_owner)
+{
+ rcu_unlock_domain(pg_owner);
+}
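+
(Aside: get_pg_owner() above only consumes the low 16 bits of the hypercall's
foreigndom argument; do_mmu_update() below additionally treats the high 16
bits, when non-zero, as the page-table owner's domid biased by one (the PFD).
A tiny sketch of how a caller would pack the two; the helper name is
illustrative, not part of any interface.

    #include <stdint.h>

    typedef uint16_t domid_t;

    static inline uint32_t make_foreigndom(domid_t pg_owner, domid_t pt_owner,
                                           int pt_is_foreign)
    {
        /* pt_is_foreign == 0 keeps the caller's own page tables (PFD == 0);
         * otherwise the PFD is stored biased by one in the upper half. */
        return ((uint32_t)(pt_is_foreign ? pt_owner + 1 : 0) << 16) | pg_owner;
    }
)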
+
+static inline int vcpumask_to_pcpumask(
+ struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask)
+{
+ unsigned int vcpu_id, vcpu_bias, offs;
+ unsigned long vmask;
+ struct vcpu *v;
+ bool is_native = !is_pv_32bit_domain(d);
+
+ cpumask_clear(pmask);
+ for ( vmask = 0, offs = 0; ; ++offs )
+ {
+ vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
+ if ( vcpu_bias >= d->max_vcpus )
+ return 0;
+
+ if ( unlikely(is_native ?
+ copy_from_guest_offset(&vmask, bmap, offs, 1) :
+ copy_from_guest_offset((unsigned int *)&vmask, bmap,
+ offs, 1)) )
+ {
+ cpumask_clear(pmask);
+ return -EFAULT;
+ }
+
+ while ( vmask )
+ {
+ vcpu_id = find_first_set_bit(vmask);
+ vmask &= ~(1UL << vcpu_id);
+ vcpu_id += vcpu_bias;
+ if ( (vcpu_id >= d->max_vcpus) )
+ return 0;
+ if ( ((v = d->vcpu[vcpu_id]) != NULL) && vcpu_cpu_dirty(v) )
+ __cpumask_set_cpu(v->dirty_cpu, pmask);
+ }
+ }
+}
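+
(Aside: the bitmap consumed by vcpumask_to_pcpumask() above is a plain bit
array indexed by vCPU id, read from the guest in unsigned-long sized chunks
(32-bit chunks for compat guests).  A sketch of how a 64-bit guest might build
one, e.g. for MMUEXT_INVLPG_MULTI below; names and the vCPU limit are local to
the sketch.

    #include <limits.h>
    #include <string.h>

    #define SK_MAX_VCPUS 128

    /* Set the bits for the vCPUs in vcpus[0..n); 'bitmap' must hold at
     * least SK_MAX_VCPUS bits. */
    static void sk_build_vcpumask(unsigned long *bitmap,
                                  const unsigned int *vcpus, unsigned int n)
    {
        unsigned int i, bits = sizeof(unsigned long) * CHAR_BIT;

        memset(bitmap, 0, SK_MAX_VCPUS / CHAR_BIT);
        for ( i = 0; i < n; i++ )
            bitmap[vcpus[i] / bits] |= 1UL << (vcpus[i] % bits);
    }
)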
+
+long do_mmuext_op(
+ XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
+ unsigned int count,
+ XEN_GUEST_HANDLE_PARAM(uint) pdone,
+ unsigned int foreigndom)
+{
+ struct mmuext_op op;
+ unsigned long type;
+ unsigned int i, done = 0;
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+ struct domain *pg_owner;
+ int rc = put_old_guest_table(curr);
+
+ if ( unlikely(rc) )
+ {
+ if ( likely(rc == -ERESTART) )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
+ foreigndom);
+ return rc;
+ }
+
+ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+ likely(guest_handle_is_null(uops)) )
+ {
+ /*
+ * See the curr->arch.old_guest_table related
+ * hypercall_create_continuation() below.
+ */
+ return (int)foreigndom;
+ }
+
+ if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
+ {
+ count &= ~MMU_UPDATE_PREEMPTED;
+ if ( unlikely(!guest_handle_is_null(pdone)) )
+ (void)copy_from_guest(&done, pdone, 1);
+ }
+ else
+ perfc_incr(calls_to_mmuext_op);
+
+ if ( unlikely(!guest_handle_okay(uops, count)) )
+ return -EFAULT;
+
+ if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
+ return -ESRCH;
+
+ if ( !is_pv_domain(pg_owner) )
+ {
+ put_pg_owner(pg_owner);
+ return -EINVAL;
+ }
+
+ rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner);
+ if ( rc )
+ {
+ put_pg_owner(pg_owner);
+ return rc;
+ }
+
+ for ( i = 0; i < count; i++ )
+ {
+ if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
+ {
+ rc = -ERESTART;
+ break;
+ }
+
+ if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+
+ if ( is_hvm_domain(currd) )
+ {
+ switch ( op.cmd )
+ {
+ case MMUEXT_PIN_L1_TABLE:
+ case MMUEXT_PIN_L2_TABLE:
+ case MMUEXT_PIN_L3_TABLE:
+ case MMUEXT_PIN_L4_TABLE:
+ case MMUEXT_UNPIN_TABLE:
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ goto done;
+ }
+ }
+
+ rc = 0;
+
+ switch ( op.cmd )
+ {
+ struct page_info *page;
+ p2m_type_t p2mt;
+
+ case MMUEXT_PIN_L1_TABLE:
+ type = PGT_l1_page_table;
+ goto pin_page;
+
+ case MMUEXT_PIN_L2_TABLE:
+ type = PGT_l2_page_table;
+ goto pin_page;
+
+ case MMUEXT_PIN_L3_TABLE:
+ type = PGT_l3_page_table;
+ goto pin_page;
+
+ case MMUEXT_PIN_L4_TABLE:
+ if ( is_pv_32bit_domain(pg_owner) )
+ break;
+ type = PGT_l4_page_table;
+
+ pin_page:
+ /* Ignore pinning of invalid paging levels. */
+ if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
+ break;
+
+ if ( paging_mode_refcounts(pg_owner) )
+ break;
+
+ page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
+ if ( unlikely(!page) )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ rc = get_page_type_preemptible(page, type);
+ if ( unlikely(rc) )
+ {
+ if ( rc == -EINTR )
+ rc = -ERESTART;
+ else if ( rc != -ERESTART )
+ gdprintk(XENLOG_WARNING,
+ "Error %d while pinning mfn %" PRI_mfn "\n",
+ rc, mfn_x(page_to_mfn(page)));
+ if ( page != curr->arch.old_guest_table )
+ put_page(page);
+ break;
+ }
+
+ rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page);
+ if ( !rc && unlikely(test_and_set_bit(_PGT_pinned,
+ &page->u.inuse.type_info)) )
+ {
+ gdprintk(XENLOG_WARNING,
+ "mfn %" PRI_mfn " already pinned\n",
+ mfn_x(page_to_mfn(page)));
+ rc = -EINVAL;
+ }
+
+ if ( unlikely(rc) )
+ goto pin_drop;
+
+ /* A page is dirtied when its pin status is set. */
+ paging_mark_dirty(pg_owner, page_to_mfn(page));
+
+ /* We can race domain destruction (domain_relinquish_resources). */
+ if ( unlikely(pg_owner != currd) )
+ {
+ bool drop_ref;
+
+ spin_lock(&pg_owner->page_alloc_lock);
+ drop_ref = (pg_owner->is_dying &&
+ test_and_clear_bit(_PGT_pinned,
+ &page->u.inuse.type_info));
+ spin_unlock(&pg_owner->page_alloc_lock);
+ if ( drop_ref )
+ {
+ pin_drop:
+ if ( type == PGT_l1_page_table )
+ put_page_and_type(page);
+ else
+ {
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
+ }
+ }
+ }
+ break;
+
+ case MMUEXT_UNPIN_TABLE:
+ if ( paging_mode_refcounts(pg_owner) )
+ break;
+
+ page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
+ if ( unlikely(!page) )
+ {
+ gdprintk(XENLOG_WARNING,
+ "mfn %" PRI_mfn " bad, or bad owner d%d\n",
+ op.arg1.mfn, pg_owner->domain_id);
+ rc = -EINVAL;
+ break;
+ }
+
+ if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
+ {
+ put_page(page);
+ gdprintk(XENLOG_WARNING,
+ "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn);
+ rc = -EINVAL;
+ break;
+ }
+
+ switch ( rc = put_page_and_type_preemptible(page) )
+ {
+ case -EINTR:
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
+ rc = 0;
+ break;
+ default:
+ BUG_ON(rc);
+ break;
+ }
+ put_page(page);
+
+ /* A page is dirtied when its pin status is cleared. */
+ paging_mark_dirty(pg_owner, page_to_mfn(page));
+ break;
+
+ case MMUEXT_NEW_BASEPTR:
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else if ( unlikely(paging_mode_translate(currd)) )
+ rc = -EINVAL;
+ else
+ rc = new_guest_cr3(_mfn(op.arg1.mfn));
+ break;
+
+ case MMUEXT_NEW_USER_BASEPTR: {
+ unsigned long old_mfn;
+
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else if ( unlikely(paging_mode_translate(currd)) )
+ rc = -EINVAL;
+ if ( unlikely(rc) )
+ break;
+
+ old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
+ /*
+ * This is particularly important when getting restarted after the
+ * previous attempt got preempted in the put-old-MFN phase.
+ */
+ if ( old_mfn == op.arg1.mfn )
+ break;
+
+ if ( op.arg1.mfn != 0 )
+ {
+ rc = get_page_and_type_from_mfn(
+ _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
+
+ if ( unlikely(rc) )
+ {
+ if ( rc == -EINTR )
+ rc = -ERESTART;
+ else if ( rc != -ERESTART )
+ gdprintk(XENLOG_WARNING,
+ "Error %d installing new mfn %" PRI_mfn "\n",
+ rc, op.arg1.mfn);
+ break;
+ }
+
+ if ( VM_ASSIST(currd, m2p_strict) )
+ zap_ro_mpt(_mfn(op.arg1.mfn));
+ }
+
+ curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
+
+ if ( old_mfn != 0 )
+ {
+ page = mfn_to_page(_mfn(old_mfn));
+
+ switch ( rc = put_page_and_type_preemptible(page) )
+ {
+ case -EINTR:
+ rc = -ERESTART;
+ /* fallthrough */
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
+ break;
+ default:
+ BUG_ON(rc);
+ break;
+ }
+ }
+
+ break;
+ }
+
+ case MMUEXT_TLB_FLUSH_LOCAL:
+ if ( likely(currd == pg_owner) )
+ flush_tlb_local();
+ else
+ rc = -EPERM;
+ break;
+
+ case MMUEXT_INVLPG_LOCAL:
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else
+ paging_invlpg(curr, op.arg1.linear_addr);
+ break;
+
+ case MMUEXT_TLB_FLUSH_MULTI:
+ case MMUEXT_INVLPG_MULTI:
+ {
+ cpumask_t *mask = this_cpu(scratch_cpumask);
+
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else if ( unlikely(vcpumask_to_pcpumask(currd,
+ guest_handle_to_param(op.arg2.vcpumask,
+ const_void),
+ mask)) )
+ rc = -EINVAL;
+ if ( unlikely(rc) )
+ break;
+
+ if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
+ flush_tlb_mask(mask);
+ else if ( __addr_ok(op.arg1.linear_addr) )
+ flush_tlb_one_mask(mask, op.arg1.linear_addr);
+ break;
+ }
+
+ case MMUEXT_TLB_FLUSH_ALL:
+ if ( likely(currd == pg_owner) )
+ flush_tlb_mask(currd->dirty_cpumask);
+ else
+ rc = -EPERM;
+ break;
+
+ case MMUEXT_INVLPG_ALL:
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else if ( __addr_ok(op.arg1.linear_addr) )
+ flush_tlb_one_mask(currd->dirty_cpumask, op.arg1.linear_addr);
+ break;
+
+ case MMUEXT_FLUSH_CACHE:
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else if ( unlikely(!cache_flush_permitted(currd)) )
+ rc = -EACCES;
+ else
+ wbinvd();
+ break;
+
+ case MMUEXT_FLUSH_CACHE_GLOBAL:
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else if ( likely(cache_flush_permitted(currd)) )
+ {
+ unsigned int cpu;
+ cpumask_t *mask = this_cpu(scratch_cpumask);
+
+ cpumask_clear(mask);
+ for_each_online_cpu(cpu)
+ if ( !cpumask_intersects(mask,
+ per_cpu(cpu_sibling_mask, cpu)) )
+ __cpumask_set_cpu(cpu, mask);
+ flush_mask(mask, FLUSH_CACHE);
+ }
+ else
+ rc = -EINVAL;
+ break;
+
+ case MMUEXT_SET_LDT:
+ {
+ unsigned int ents = op.arg2.nr_ents;
+ unsigned long ptr = ents ? op.arg1.linear_addr : 0;
+
+ if ( unlikely(currd != pg_owner) )
+ rc = -EPERM;
+ else if ( paging_mode_external(currd) )
+ rc = -EINVAL;
+ else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
+ (ents > 8192) )
+ {
+ gdprintk(XENLOG_WARNING,
+ "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents);
+ rc = -EINVAL;
+ }
+ else if ( (curr->arch.pv_vcpu.ldt_ents != ents) ||
+ (curr->arch.pv_vcpu.ldt_base != ptr) )
+ {
+ if ( pv_destroy_ldt(curr) )
+ flush_tlb_local();
+
+ curr->arch.pv_vcpu.ldt_base = ptr;
+ curr->arch.pv_vcpu.ldt_ents = ents;
+ load_LDT(curr);
+ }
+ break;
+ }
+
+ case MMUEXT_CLEAR_PAGE:
+ page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC);
+ if ( unlikely(p2mt != p2m_ram_rw) && page )
+ {
+ put_page(page);
+ page = NULL;
+ }
+ if ( !page || !get_page_type(page, PGT_writable_page) )
+ {
+ if ( page )
+ put_page(page);
+ gdprintk(XENLOG_WARNING,
+ "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn);
+ rc = -EINVAL;
+ break;
+ }
+
+ /* A page is dirtied when it's being cleared. */
+ paging_mark_dirty(pg_owner, page_to_mfn(page));
+
+ clear_domain_page(page_to_mfn(page));
+
+ put_page_and_type(page);
+ break;
+
+ case MMUEXT_COPY_PAGE:
+ {
+ struct page_info *src_page, *dst_page;
+
+ src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt,
+ P2M_ALLOC);
+ if ( unlikely(p2mt != p2m_ram_rw) && src_page )
+ {
+ put_page(src_page);
+ src_page = NULL;
+ }
+ if ( unlikely(!src_page) )
+ {
+ gdprintk(XENLOG_WARNING,
+ "Error copying from mfn %" PRI_mfn "\n",
+ op.arg2.src_mfn);
+ rc = -EINVAL;
+ break;
+ }
+
+ dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt,
+ P2M_ALLOC);
+ if ( unlikely(p2mt != p2m_ram_rw) && dst_page )
+ {
+ put_page(dst_page);
+ dst_page = NULL;
+ }
+ rc = (dst_page &&
+ get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL;
+ if ( unlikely(rc) )
+ {
+ put_page(src_page);
+ if ( dst_page )
+ put_page(dst_page);
+ gdprintk(XENLOG_WARNING,
+ "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn);
+ break;
+ }
+
+ /* A page is dirtied when it's being copied to. */
+ paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
+
+ copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
+
+ put_page_and_type(dst_page);
+ put_page(src_page);
+ break;
+ }
+
+ case MMUEXT_MARK_SUPER:
+ case MMUEXT_UNMARK_SUPER:
+ rc = -EOPNOTSUPP;
+ break;
+
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+
+ done:
+ if ( unlikely(rc) )
+ break;
+
+ guest_handle_add_offset(uops, 1);
+ }
+
+ if ( rc == -ERESTART )
+ {
+ ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi",
+ uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE_PARAM(void) null;
+
+ ASSERT(rc || i == count);
+ set_xen_guest_handle(null, NULL);
+ /*
+ * In order to have a way to communicate the final return value to
+ * our continuation, we pass this in place of "foreigndom", building
+ * on the fact that this argument isn't needed anymore.
+ */
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi", null,
+ MMU_UPDATE_PREEMPTED, null, rc);
+ }
+
+ put_pg_owner(pg_owner);
+
+ perfc_add(num_mmuext_ops, i);
+
+ /* Add incremental work we have done to the @done output parameter. */
+ if ( unlikely(!guest_handle_is_null(pdone)) )
+ {
+ done += i;
+ copy_to_guest(pdone, &done, 1);
+ }
+
+ return rc;
+}
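+
(Aside: a hedged sketch of the guest side of the pin path handled above,
assuming the usual public mmuext_op interface -- struct mmuext_op plus an
HYPERVISOR_mmuext_op() wrapper; the headers providing these vary by guest
environment, and l4_mfn is illustrative.

    /* Pin a fully written, otherwise unused L4 table and switch to it. */
    static int sk_pin_and_switch(xen_pfn_t l4_mfn)
    {
        struct mmuext_op ops[2];
        unsigned int done = 0;

        ops[0].cmd = MMUEXT_PIN_L4_TABLE;   /* validate and pin in one go */
        ops[0].arg1.mfn = l4_mfn;
        ops[1].cmd = MMUEXT_NEW_BASEPTR;    /* then make it the active CR3 */
        ops[1].arg1.mfn = l4_mfn;

        /* 'done' reports how many of the ops were completed. */
        return HYPERVISOR_mmuext_op(ops, 2, &done, DOMID_SELF);
    }
)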
+
+long do_mmu_update(
+ XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs,
+ unsigned int count,
+ XEN_GUEST_HANDLE_PARAM(uint) pdone,
+ unsigned int foreigndom)
+{
+ struct mmu_update req;
+ void *va = NULL;
+ unsigned long gpfn, gmfn, mfn;
+ struct page_info *page;
+ unsigned int cmd, i = 0, done = 0, pt_dom;
+ struct vcpu *curr = current, *v = curr;
+ struct domain *d = v->domain, *pt_owner = d, *pg_owner;
+ mfn_t map_mfn = INVALID_MFN;
+ bool sync_guest = false;
+ uint32_t xsm_needed = 0;
+ uint32_t xsm_checked = 0;
+ int rc = put_old_guest_table(curr);
+
+ if ( unlikely(rc) )
+ {
+ if ( likely(rc == -ERESTART) )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
+ foreigndom);
+ return rc;
+ }
+
+ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+ likely(guest_handle_is_null(ureqs)) )
+ {
+ /*
+ * See the curr->arch.old_guest_table related
+ * hypercall_create_continuation() below.
+ */
+ return (int)foreigndom;
+ }
+
+ if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
+ {
+ count &= ~MMU_UPDATE_PREEMPTED;
+ if ( unlikely(!guest_handle_is_null(pdone)) )
+ (void)copy_from_guest(&done, pdone, 1);
+ }
+ else
+ perfc_incr(calls_to_mmu_update);
+
+ if ( unlikely(!guest_handle_okay(ureqs, count)) )
+ return -EFAULT;
+
+ if ( (pt_dom = foreigndom >> 16) != 0 )
+ {
+ /* Pagetables belong to a foreign domain (PFD). */
+ if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
+ return -ESRCH;
+
+ if ( pt_owner == d )
+ rcu_unlock_domain(pt_owner);
+ else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL )
+ {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
+ if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
+ {
+ rc = -ESRCH;
+ goto out;
+ }
+
+ for ( i = 0; i < count; i++ )
+ {
+ if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
+ {
+ rc = -ERESTART;
+ break;
+ }
+
+ if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+
+ cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
+
+ switch ( cmd )
+ {
+ /*
+ * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
+ * MMU_UPDATE_PT_PRESERVE_AD: As above but also preserve (OR)
+ * current A/D bits.
+ */
+ case MMU_NORMAL_PT_UPDATE:
+ case MMU_PT_UPDATE_PRESERVE_AD:
+ {
+ p2m_type_t p2mt;
+
+ rc = -EOPNOTSUPP;
+ if ( unlikely(paging_mode_refcounts(pt_owner)) )
+ break;
+
+ xsm_needed |= XSM_MMU_NORMAL_UPDATE;
+ if ( get_pte_flags(req.val) & _PAGE_PRESENT )
+ {
+ xsm_needed |= XSM_MMU_UPDATE_READ;
+ if ( get_pte_flags(req.val) & _PAGE_RW )
+ xsm_needed |= XSM_MMU_UPDATE_WRITE;
+ }
+ if ( xsm_needed != xsm_checked )
+ {
+ rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner,
+ xsm_needed);
+ if ( rc )
+ break;
+ xsm_checked = xsm_needed;
+ }
+ rc = -EINVAL;
+
+ req.ptr -= cmd;
+ gmfn = req.ptr >> PAGE_SHIFT;
+ page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC);
+
+ if ( unlikely(!page) || p2mt != p2m_ram_rw )
+ {
+ if ( page )
+ put_page(page);
+ if ( p2m_is_paged(p2mt) )
+ {
+ p2m_mem_paging_populate(pt_owner, gmfn);
+ rc = -ENOENT;
+ }
+ else
+ gdprintk(XENLOG_WARNING,
+ "Could not get page for normal update\n");
+ break;
+ }
+
+ mfn = mfn_x(page_to_mfn(page));
+
+ if ( !mfn_eq(_mfn(mfn), map_mfn) )
+ {
+ if ( va )
+ unmap_domain_page(va);
+ va = map_domain_page(_mfn(mfn));
+ map_mfn = _mfn(mfn);
+ }
+ va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK));
+
+ if ( page_lock(page) )
+ {
+ switch ( page->u.inuse.type_info & PGT_type_mask )
+ {
+ case PGT_l1_page_table:
+ rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
+ pg_owner);
+ break;
+
+ case PGT_l2_page_table:
+ if ( unlikely(pg_owner != pt_owner) )
+ break;
+ rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ break;
+
+ case PGT_l3_page_table:
+ if ( unlikely(pg_owner != pt_owner) )
+ break;
+ rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ break;
+
+ case PGT_l4_page_table:
+ if ( unlikely(pg_owner != pt_owner) )
+ break;
+ rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ /*
+ * No need to sync if all uses of the page can be accounted
+ * to the page lock we hold, its pinned status, and uses on
+ * this (v)CPU.
+ */
+ if ( !rc && this_cpu(root_pgt) &&
+ ((page->u.inuse.type_info & PGT_count_mask) >
+ (1 + !!(page->u.inuse.type_info & PGT_pinned) +
+ (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
+ (pagetable_get_pfn(curr->arch.guest_table_user) ==
+ mfn))) )
+ sync_guest = true;
+ break;
+
+ case PGT_writable_page:
+ perfc_incr(writable_mmu_updates);
+ if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
+ rc = 0;
+ break;
+ }
+ page_unlock(page);
+ if ( rc == -EINTR )
+ rc = -ERESTART;
+ }
+ else if ( get_page_type(page, PGT_writable_page) )
+ {
+ perfc_incr(writable_mmu_updates);
+ if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
+ rc = 0;
+ put_page_type(page);
+ }
+
+ put_page(page);
+ }
+ break;
+
+ case MMU_MACHPHYS_UPDATE:
+ if ( unlikely(d != pt_owner) )
+ {
+ rc = -EPERM;
+ break;
+ }
+
+ if ( unlikely(paging_mode_translate(pg_owner)) )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ mfn = req.ptr >> PAGE_SHIFT;
+ gpfn = req.val;
+
+ xsm_needed |= XSM_MMU_MACHPHYS_UPDATE;
+ if ( xsm_needed != xsm_checked )
+ {
+ rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed);
+ if ( rc )
+ break;
+ xsm_checked = xsm_needed;
+ }
+
+ page = get_page_from_mfn(_mfn(mfn), pg_owner);
+ if ( unlikely(!page) )
+ {
+ gdprintk(XENLOG_WARNING,
+ "Could not get page for mach->phys update\n");
+ rc = -EINVAL;
+ break;
+ }
+
+ set_gpfn_from_mfn(mfn, gpfn);
+
+ paging_mark_dirty(pg_owner, _mfn(mfn));
+
+ put_page(page);
+ break;
+
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+
+ if ( unlikely(rc) )
+ break;
+
+ guest_handle_add_offset(ureqs, 1);
+ }
+
+ if ( rc == -ERESTART )
+ {
+ ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi",
+ ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE_PARAM(void) null;
+
+ ASSERT(rc || i == count);
+ set_xen_guest_handle(null, NULL);
+ /*
+ * In order to have a way to communicate the final return value to
+ * our continuation, we pass this in place of "foreigndom", building
+ * on the fact that this argument isn't needed anymore.
+ */
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi", null,
+ MMU_UPDATE_PREEMPTED, null, rc);
+ }
+
+ put_pg_owner(pg_owner);
+
+ if ( va )
+ unmap_domain_page(va);
+
+ if ( sync_guest )
+ {
+ /*
+ * Force other vCPU-s of the affected guest to pick up L4 entry
+ * changes (if any). Issue a flush IPI with empty operation mask to
+ * facilitate this (including ourselves waiting for the IPI to
+ * actually have arrived). Utilize the fact that FLUSH_VA_VALID is
+ * meaningless without FLUSH_CACHE, but still allows the call to pass
+ * the no-op check in flush_area_mask().
+ */
+ unsigned int cpu = smp_processor_id();
+ cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
+
+ cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
+ if ( !cpumask_empty(mask) )
+ flush_area_mask(mask, ZERO_BLOCK_PTR, FLUSH_VA_VALID);
+ }
+
+ perfc_add(num_page_updates, i);
+
+ out:
+ if ( pt_owner != d )
+ rcu_unlock_domain(pt_owner);
+
+ /* Add incremental work we have done to the @done output parameter. */
+ if ( unlikely(!guest_handle_is_null(pdone)) )
+ {
+ done += i;
+ copy_to_guest(pdone, &done, 1);
+ }
+
+ return rc;
+}
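+
+/*
+ * Illustrative editor's sketch, not part of the code movement above: the
+ * request decoded by do_mmu_update() packs the command into the low bits of
+ * the machine address of the entry to rewrite (hence the
+ * "req.ptr & (sizeof(l1_pgentry_t)-1)" masking). Assuming a Linux-style
+ * HYPERVISOR_mmu_update() wrapper and pte_maddr/new_pte placeholders, a
+ * single-entry update looks like:
+ *
+ *     struct mmu_update req = {
+ *         .ptr = pte_maddr | MMU_NORMAL_PT_UPDATE,
+ *         .val = new_pte,
+ *     };
+ *     HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
+ *
+ * On preemption the continuation re-enters with the remaining count OR'ed
+ * with MMU_UPDATE_PREEMPTED, which the early checks above strip again.
+ */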
+
+static int __do_update_va_mapping(
+ unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
+{
+ l1_pgentry_t val = l1e_from_intpte(val64);
+ struct vcpu *v = current;
+ struct domain *d = v->domain;
+ struct page_info *gl1pg;
+ l1_pgentry_t *pl1e;
+ unsigned long bmap_ptr;
+ mfn_t gl1mfn;
+ cpumask_t *mask = NULL;
+ int rc;
+
+ perfc_incr(calls_to_update_va);
+
+ rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val);
+ if ( rc )
+ return rc;
+
+ rc = -EINVAL;
+ pl1e = map_guest_l1e(va, &gl1mfn);
+ gl1pg = pl1e ? get_page_from_mfn(gl1mfn, d) : NULL;
+ if ( unlikely(!gl1pg) )
+ goto out;
+
+ if ( !page_lock(gl1pg) )
+ {
+ put_page(gl1pg);
+ goto out;
+ }
+
+ if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+ {
+ page_unlock(gl1pg);
+ put_page(gl1pg);
+ goto out;
+ }
+
+ rc = mod_l1_entry(pl1e, val, mfn_x(gl1mfn), 0, v, pg_owner);
+
+ page_unlock(gl1pg);
+ put_page(gl1pg);
+
+ out:
+ if ( pl1e )
+ unmap_domain_page(pl1e);
+
+ switch ( flags & UVMF_FLUSHTYPE_MASK )
+ {
+ case UVMF_TLB_FLUSH:
+ switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
+ {
+ case UVMF_LOCAL:
+ flush_tlb_local();
+ break;
+ case UVMF_ALL:
+ mask = d->dirty_cpumask;
+ break;
+ default:
+ mask = this_cpu(scratch_cpumask);
+ rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
+ void),
+ mask);
+ break;
+ }
+ if ( mask )
+ flush_tlb_mask(mask);
+ break;
+
+ case UVMF_INVLPG:
+ switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
+ {
+ case UVMF_LOCAL:
+ paging_invlpg(v, va);
+ break;
+ case UVMF_ALL:
+ mask = d->dirty_cpumask;
+ break;
+ default:
+ mask = this_cpu(scratch_cpumask);
+ rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
+ void),
+ mask);
+ break;
+ }
+ if ( mask )
+ flush_tlb_one_mask(mask, va);
+ break;
+ }
+
+ return rc;
+}
+
+long do_update_va_mapping(unsigned long va, u64 val64,
+ unsigned long flags)
+{
+ return __do_update_va_mapping(va, val64, flags, current->domain);
+}
+
+long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
+ unsigned long flags,
+ domid_t domid)
+{
+ struct domain *pg_owner;
+ int rc;
+
+ if ( (pg_owner = get_pg_owner(domid)) == NULL )
+ return -ESRCH;
+
+ rc = __do_update_va_mapping(va, val64, flags, pg_owner);
+
+ put_pg_owner(pg_owner);
+
+ return rc;
+}
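+
+/*
+ * Illustrative editor's sketch, not part of the code movement above: the
+ * "flags" argument combines a flush type (UVMF_NONE, UVMF_TLB_FLUSH or
+ * UVMF_INVLPG) with a scope (UVMF_LOCAL, UVMF_ALL, or a guest pointer to a
+ * vCPU bitmap in the remaining bits, consumed via bmap_ptr above). Assuming
+ * a Linux-style HYPERVISOR_update_va_mapping() wrapper, remapping one VA
+ * and flushing only the local TLB entry might look like:
+ *
+ *     HYPERVISOR_update_va_mapping(va, __pte(new_pte_val),
+ *                                  UVMF_INVLPG | UVMF_LOCAL);
+ *
+ * "va" and "new_pte_val" are placeholders; __pte() is the guest's own PTE
+ * constructor, not a Xen interface.
+ */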
+
/*
* Local variables:
* mode: C
--
2.11.0
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel