Avoid backing M2P table holes with memory when those holes are large enough
to cover an exact multiple of large pages. For the sake of saving and
migrating guests, XENMEM_machphys_mfn_list fills the holes in the array it
returns with the MFN of the previously returned range (thanks to Keir for
pointing out that it really doesn't matter *what* MFN gets returned for
invalid ranges). Using the most recently encountered MFN (rather than e.g.
always the first one) is an attempt to cut down on the number of references
these pages get when they are mapped into a privileged domain's address
space.

This also allows saving a couple of 2M pages even on certain "normal"
systems.

Signed-off-by: Jan Beulich

--- 2009-09-10.orig/xen/arch/x86/x86_32/mm.c	2009-09-17 15:51:43.000000000 +0200
+++ 2009-09-10/xen/arch/x86/x86_32/mm.c	2009-09-17 12:06:40.000000000 +0200
@@ -72,7 +72,7 @@ void __init paging_init(void)
 {
     unsigned long v;
     struct page_info *pg;
-    int i;
+    unsigned int i, n;
 
     if ( cpu_has_pge )
     {
@@ -96,8 +96,18 @@ void __init paging_init(void)
      */
     mpt_size  = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
     mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
+#define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
+#define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
+             sizeof(*machine_to_phys_mapping))
+    BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
+                 sizeof(*machine_to_phys_mapping));
     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
     {
+        for ( n = 0; n < CNT; ++n)
+            if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
+                break;
+        if ( n == CNT )
+            continue;
         if ( (pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
             panic("Not enough memory to bootstrap Xen.\n");
         l2e_write(&idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i],
@@ -106,11 +116,12 @@ void __init paging_init(void)
         l2e_write(&idle_pg_table_l2[l2_linear_offset(RO_MPT_VIRT_START) + i],
                   l2e_from_page(
                       pg, (__PAGE_HYPERVISOR | _PAGE_PSE) & ~_PAGE_RW));
+        /* Fill with an obvious debug pattern. */
+        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
+               1UL << L2_PAGETABLE_SHIFT);
     }
-
-    /* Fill with an obvious debug pattern. */
-    for ( i = 0; i < (mpt_size / BYTES_PER_LONG); i++)
-        set_gpfn_from_mfn(i, 0x55555555);
+#undef CNT
+#undef MFN
 
     /* Create page tables for ioremap()/map_domain_page_global(). */
     for ( i = 0; i < (IOREMAP_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
@@ -163,14 +174,17 @@ void __init subarch_init_memory(void)
 {
     unsigned long m2p_start_mfn;
     unsigned int i, j;
+    l2_pgentry_t l2e;
 
     BUILD_BUG_ON(sizeof(struct page_info) != 24);
 
     /* M2P table is mappable read-only by privileged domains. */
     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
     {
-        m2p_start_mfn = l2e_get_pfn(
-            idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i]);
+        l2e = idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i];
+        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+            continue;
+        m2p_start_mfn = l2e_get_pfn(l2e);
         for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ )
         {
             struct page_info *page = mfn_to_page(m2p_start_mfn + j);
@@ -191,8 +205,9 @@ void __init subarch_init_memory(void)
 long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
 {
     struct xen_machphys_mfn_list xmml;
-    unsigned long mfn;
+    unsigned long mfn, last_mfn;
     unsigned int i, max;
+    l2_pgentry_t l2e;
     long rc = 0;
     switch ( op )
     {
@@ -203,12 +218,18 @@ long subarch_memory_op(int op, XEN_GUEST
 
         max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21);
 
-        for ( i = 0; i < max; i++ )
+        for ( i = 0, last_mfn = 0; i < max; i++ )
         {
-            mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset(
-                RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21);
+            l2e = idle_pg_table_l2[l2_linear_offset(
+                RDWR_MPT_VIRT_START + (i << 21))];
+            if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
+                mfn = l2e_get_pfn(l2e);
+            else
+                mfn = last_mfn;
+            ASSERT(mfn);
             if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
+            last_mfn = mfn;
         }
 
         xmml.nr_extents = i;
--- 2009-09-10.orig/xen/arch/x86/x86_64/compat/mm.c	2009-09-17 15:51:43.000000000 +0200
+++ 2009-09-10/xen/arch/x86/x86_64/compat/mm.c	2009-09-17 09:08:28.000000000 +0200
@@ -153,19 +153,31 @@ int compat_arch_memory_op(int op, XEN_GU
     }
 
     case XENMEM_machphys_mfn_list:
+    {
+        unsigned long limit;
+        compat_pfn_t last_mfn;
+
         if ( copy_from_guest(&xmml, arg, 1) )
             return -EFAULT;
 
-        for ( i = 0, v = RDWR_COMPAT_MPT_VIRT_START;
-              (i != xmml.max_extents) && (v != RDWR_COMPAT_MPT_VIRT_END);
+        limit = (unsigned long)(compat_machine_to_phys_mapping +
+                min_t(unsigned long, max_page,
+                      MACH2PHYS_COMPAT_NR_ENTRIES(current->domain)));
+        if ( limit > RDWR_COMPAT_MPT_VIRT_END )
+            limit = RDWR_COMPAT_MPT_VIRT_END;
+        for ( i = 0, v = RDWR_COMPAT_MPT_VIRT_START, last_mfn = 0;
+              (i != xmml.max_extents) && (v < limit);
               i++, v += 1 << L2_PAGETABLE_SHIFT )
         {
             l2e = compat_idle_pg_table_l2[l2_table_offset(v)];
-            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-                break;
-            mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
+            if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
+                mfn = l2e_get_pfn(l2e);
+            else
+                mfn = last_mfn;
+            ASSERT(mfn);
             if ( copy_to_compat_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
+            last_mfn = mfn;
         }
 
         xmml.nr_extents = i;
@@ -173,6 +185,7 @@ int compat_arch_memory_op(int op, XEN_GU
             rc = -EFAULT;
 
         break;
+    }
 
     default:
         rc = -ENOSYS;
--- 2009-09-10.orig/xen/arch/x86/x86_64/mm.c	2009-09-17 15:53:39.000000000 +0200
+++ 2009-09-10/xen/arch/x86/x86_64/mm.c	2009-09-17 15:53:46.000000000 +0200
@@ -194,7 +194,7 @@ void __init pfn_pdx_hole_setup(unsigned
 void __init paging_init(void)
 {
     unsigned long i, mpt_size, va;
-    unsigned int memflags;
+    unsigned int n, memflags;
     l3_pgentry_t *l3_ro_mpt;
     l2_pgentry_t *l2_ro_mpt = NULL;
     struct page_info *l1_pg, *l2_pg, *l3_pg;
@@ -213,6 +213,11 @@ void __init paging_init(void)
      */
     mpt_size  = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
     mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
+#define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
+#define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
+             sizeof(*machine_to_phys_mapping))
+    BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
+                 sizeof(*machine_to_phys_mapping));
     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
     {
         BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
@@ -222,37 +227,63 @@ void __init paging_init(void)
 
         if ( cpu_has_page1gb &&
              !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
-             (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) &&
-             (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
-                                          memflags)) != NULL )
+             (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
+        {
+            unsigned int k, holes;
+
+            for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
+            {
+                for ( n = 0; n < CNT; ++n)
+                    if ( mfn_valid(MFN(i + k) + n * PDX_GROUP_COUNT) )
+                        break;
+                if ( n == CNT )
+                    ++holes;
+            }
+            if ( k == holes )
+            {
+                i += (1UL << PAGETABLE_ORDER) - 1;
+                continue;
+            }
+            if ( holes == 0 &&
+                 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
+                                              memflags)) != NULL )
+            {
+                map_pages_to_xen(
+                    RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
+                    page_to_mfn(l1_pg),
+                    1UL << (2 * PAGETABLE_ORDER),
+                    PAGE_HYPERVISOR);
+                memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
+                       0x77, 1UL << L3_PAGETABLE_SHIFT);
+
+                ASSERT(!l2_table_offset(va));
+                /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+                l3e_write(&l3_ro_mpt[l3_table_offset(va)],
+                          l3e_from_page(l1_pg,
+                              /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
+                i += (1UL << PAGETABLE_ORDER) - 1;
+                continue;
+            }
+        }
+
+        for ( n = 0; n < CNT; ++n)
+            if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
+                break;
+        if ( n == CNT )
+            l1_pg = NULL;
+        else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
+                                               memflags)) == NULL )
+            goto nomem;
+        else
         {
             map_pages_to_xen(
                 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
                 page_to_mfn(l1_pg),
-                1UL << (2 * PAGETABLE_ORDER),
+                1UL << PAGETABLE_ORDER,
                 PAGE_HYPERVISOR);
             memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
-                   0x77, 1UL << L3_PAGETABLE_SHIFT);
-
-            ASSERT(!l2_table_offset(va));
-            /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
-            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
-                      l3e_from_page(l1_pg,
-                          /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
-            i += (1UL << PAGETABLE_ORDER) - 1;
-            continue;
+                   0x55, 1UL << L2_PAGETABLE_SHIFT);
         }
-
-        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
-                                          memflags)) == NULL )
-            goto nomem;
-        map_pages_to_xen(
-            RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
-            page_to_mfn(l1_pg),
-            1UL << PAGETABLE_ORDER,
-            PAGE_HYPERVISOR);
-        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
-               1UL << L2_PAGETABLE_SHIFT);
         if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
         {
             if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
@@ -264,10 +295,13 @@ void __init paging_init(void)
             ASSERT(!l2_table_offset(va));
         }
         /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
-        l2e_write(l2_ro_mpt, l2e_from_page(
-            l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
+        if ( l1_pg )
+            l2e_write(l2_ro_mpt, l2e_from_page(
+                l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
         l2_ro_mpt++;
     }
+#undef CNT
+#undef MFN
 
     /* Create user-accessible L2 directory to map the MPT for compat guests. */
     BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
@@ -288,12 +322,22 @@ void __init paging_init(void)
     mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
     if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
         m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
-    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
+#define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
+#define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
+             sizeof(*compat_machine_to_phys_mapping))
+    BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
+                 sizeof(*compat_machine_to_phys_mapping));
+    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
     {
         memflags = MEMF_node(phys_to_nid(i <<
             (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
+        for ( n = 0; n < CNT; ++n)
+            if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
+                break;
+        if ( n == CNT )
+            continue;
         if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
-                                      memflags)) == NULL )
+                                          memflags)) == NULL )
             goto nomem;
         map_pages_to_xen(
             RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
@@ -306,8 +350,9 @@ void __init paging_init(void)
                1UL << L2_PAGETABLE_SHIFT);
         /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
         l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
-        l2_ro_mpt++;
     }
+#undef CNT
+#undef MFN
 
     /* Set up linear page table mapping. */
     l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
@@ -428,7 +473,7 @@ long subarch_memory_op(int op, XEN_GUEST
     l3_pgentry_t l3e;
     l2_pgentry_t l2e;
    unsigned long v;
-    xen_pfn_t mfn;
+    xen_pfn_t mfn, last_mfn;
     unsigned int i;
     long rc = 0;
 
@@ -440,29 +485,32 @@ long subarch_memory_op(int op, XEN_GUEST
         BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
         BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
 
-        for ( i = 0, v = RDWR_MPT_VIRT_START;
-              (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
+        for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
+              (i != xmml.max_extents) &&
+              (v < (unsigned long)(machine_to_phys_mapping + max_page));
               i++, v += 1UL << L2_PAGETABLE_SHIFT )
         {
             l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                 l3_table_offset(v)];
             if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-                break;
-            if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
+                mfn = last_mfn;
+            else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
            {
                 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
-                if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-                    break;
-                mfn = l2e_get_pfn(l2e);
+                if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
+                    mfn = l2e_get_pfn(l2e);
+                else
+                    mfn = last_mfn;
             }
             else
             {
                 mfn = l3e_get_pfn(l3e) +
                     (l2_table_offset(v) << PAGETABLE_ORDER);
             }
-            ASSERT(!l1_table_offset(v));
+            ASSERT(mfn);
             if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
+            last_mfn = mfn;
         }
 
         xmml.nr_extents = i;
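
For reference only, and not part of the patch: a minimal, self-contained C
sketch of the hole-filling scheme the description refers to, i.e. reporting
the most recently seen valid MFN for ranges that have no backing. All names
in it (fill_m2p_list, range_present, the toy backing[] array) are made up for
illustration and do not exist in the Xen tree; the real logic is the last_mfn
handling added to subarch_memory_op() and compat_arch_memory_op() above.

/* Illustrative userspace sketch only -- not Xen code. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for "is this 2Mb-aligned M2P range backed?". */
static int range_present(const unsigned long *backing, size_t idx)
{
    return backing[idx] != 0;
}

/* Copy out one entry per range, substituting the last valid value for holes. */
static size_t fill_m2p_list(const unsigned long *backing, size_t nr,
                            unsigned long *out, size_t max)
{
    unsigned long last_mfn = 0;
    size_t i;

    for ( i = 0; i < nr && i < max; i++ )
    {
        unsigned long mfn = range_present(backing, i) ? backing[i] : last_mfn;

        /* Mirrors the ASSERT(mfn) in the patch; assumes the first range is
         * backed, as is the case for the real M2P table. */
        assert(mfn != 0);
        out[i] = mfn;
        last_mfn = mfn;
    }
    return i;
}

int main(void)
{
    /* 0 marks an unbacked ("hole") range in this toy model. */
    const unsigned long backing[] = { 0x100, 0x300, 0, 0, 0x900 };
    unsigned long out[5];
    size_t n = fill_m2p_list(backing, 5, out, 5);

    for ( size_t i = 0; i < n; i++ )
        printf("extent %zu -> mfn %#lx\n", i, out[i]);
    return 0;
}

In this toy model the two hole ranges are reported as 0x300, the most
recently encountered valid value, so a consumer walking the returned list
(e.g. a save/migrate tool) never reads an uninitialised or invalid slot.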