# HG changeset patch
# User emellor@xxxxxxxxxxxxxxxxxxxxxx
# Node ID 3f39f030fa894d29d04b748513bf48000d6a17f5
# Parent cbf6f95e9c62ab2fcb7c430a51b5444f5139945e
# Parent e4e1674a747d4b69f194e8ccbc4dd72c481da5f0
Merged.
diff -r cbf6f95e9c62 -r 3f39f030fa89
linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Wed Nov 16 19:33:12 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Wed Nov 16 19:33:23 2005
@@ -136,21 +136,19 @@
}
EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
-/* FIXME: This is horribly broken on PAE */
static int lookup_pte_fn(
pte_t *pte, struct page *pte_page, unsigned long addr, void *data)
{
- unsigned long *ptep = (unsigned long *)data;
+ uint64_t *ptep = (uint64_t *)data;
if (ptep)
- *ptep = (pfn_to_mfn(page_to_pfn(pte_page)) <<
- PAGE_SHIFT) |
- ((unsigned long)pte & ~PAGE_MASK);
+ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pte_page)) <<
+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
return 0;
}
int create_lookup_pte_addr(struct mm_struct *mm,
unsigned long address,
- unsigned long *ptep)
+ uint64_t *ptep)
{
return generic_page_range(mm, address, PAGE_SIZE, lookup_pte_fn, ptep);
}
diff -r cbf6f95e9c62 -r 3f39f030fa89
linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c Wed Nov 16
19:33:12 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c Wed Nov 16
19:33:23 2005
@@ -770,9 +770,9 @@
pfn_to_mfn_frame_list_list = alloc_bootmem(PAGE_SIZE);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(pfn_to_mfn_frame_list_list);
-
+
fpp = PAGE_SIZE/sizeof(unsigned long);
- for ( i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++ )
+ for ( i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++ )
{
if ( (j % fpp) == 0 )
{
@@ -786,8 +786,11 @@
pfn_to_mfn_frame_list[k][j] =
virt_to_mfn(&phys_to_machine_mapping[i]);
}
- HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
- }
+ HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
+
+ }
+
+
if ( ! (xen_start_info->flags & SIF_INITDOMAIN))
{
diff -r cbf6f95e9c62 -r 3f39f030fa89
linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Wed Nov 16 19:33:12 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Wed Nov 16 19:33:23 2005
@@ -412,7 +412,7 @@
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
unsigned int i, op = 0;
struct grant_handle_pair *handle;
- unsigned long ptep;
+ uint64_t ptep;
int ret;
for ( i = 0; i < nr_pages; i++)
@@ -427,9 +427,9 @@
op++;
if (create_lookup_pte_addr(
- blktap_vma->vm_mm,
- MMAP_VADDR(user_vstart, idx, i),
- &ptep) !=0) {
+ blktap_vma->vm_mm,
+ MMAP_VADDR(user_vstart, idx, i),
+ &ptep) !=0) {
DPRINTK("Couldn't get a pte addr!\n");
return;
}
@@ -705,7 +705,7 @@
unsigned long uvaddr;
unsigned long kvaddr;
- unsigned long ptep;
+ uint64_t ptep;
uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
diff -r cbf6f95e9c62 -r 3f39f030fa89
linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c
--- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Wed Nov 16
19:33:12 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Wed Nov 16
19:33:23 2005
@@ -152,7 +152,8 @@
privcmd_mmapbatch_t m;
struct vm_area_struct *vma = NULL;
unsigned long *p, addr;
- unsigned long mfn, ptep;
+ unsigned long mfn;
+ uint64_t ptep;
int i;
if (copy_from_user(&m, (void *)data, sizeof(m))) {
@@ -217,15 +218,39 @@
#endif
#ifndef __ia64__
- case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: {
- unsigned long m2pv = (unsigned long)machine_to_phys_mapping;
- pgd_t *pgd = pgd_offset_k(m2pv);
- pud_t *pud = pud_offset(pgd, m2pv);
- pmd_t *pmd = pmd_offset(pud, m2pv);
- unsigned long m2p_start_mfn =
- (*(unsigned long *)pmd) >> PAGE_SHIFT;
- ret = put_user(m2p_start_mfn, (unsigned long *)data) ?
- -EFAULT: 0;
+ case IOCTL_PRIVCMD_GET_MACH2PHYS_MFNS: {
+
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned long m2pv, m2p_mfn;
+ privcmd_m2pmfns_t m;
+ unsigned long *p;
+ int i;
+
+ if (copy_from_user(&m, (void *)data, sizeof(m)))
+ return -EFAULT;
+
+ m2pv = (unsigned long)machine_to_phys_mapping;
+
+ p = m.arr;
+
+ for(i=0; i < m.num; i++) {
+
+ pgd = pgd_offset_k(m2pv);
+ pud = pud_offset(pgd, m2pv);
+ pmd = pmd_offset(pud, m2pv);
+ m2p_mfn = (*(uint64_t *)pmd >> PAGE_SHIFT)&0xFFFFFFFF;
+
+ if (put_user(m2p_mfn, p + i))
+ return -EFAULT;
+
+ m2pv += (1 << 21);
+ }
+
+ ret = 0;
+ break;
+
}
break;
#endif
diff -r cbf6f95e9c62 -r 3f39f030fa89
linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Wed Nov 16
19:33:12 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Wed Nov 16
19:33:23 2005
@@ -450,11 +450,11 @@
#endif /* !CONFIG_DISCONTIGMEM */
int direct_remap_pfn_range(struct vm_area_struct *vma,
- unsigned long address,
- unsigned long mfn,
- unsigned long size,
- pgprot_t prot,
- domid_t domid);
+ unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid);
int direct_kernel_remap_pfn_range(unsigned long address,
unsigned long mfn,
unsigned long size,
@@ -462,7 +462,7 @@
domid_t domid);
int create_lookup_pte_addr(struct mm_struct *mm,
unsigned long address,
- unsigned long *ptep);
+ uint64_t *ptep);
int touch_pte_range(struct mm_struct *mm,
unsigned long address,
unsigned long size);
diff -r cbf6f95e9c62 -r 3f39f030fa89
linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Wed Nov 16
19:33:12 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Wed Nov 16
19:33:23 2005
@@ -541,7 +541,7 @@
int create_lookup_pte_addr(struct mm_struct *mm,
unsigned long address,
- unsigned long *ptep);
+ uint64_t *ptep);
int touch_pte_range(struct mm_struct *mm,
unsigned long address,
diff -r cbf6f95e9c62 -r 3f39f030fa89
linux-2.6-xen-sparse/include/asm-xen/linux-public/privcmd.h
--- a/linux-2.6-xen-sparse/include/asm-xen/linux-public/privcmd.h Wed Nov
16 19:33:12 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/linux-public/privcmd.h Wed Nov
16 19:33:23 2005
@@ -55,6 +55,11 @@
unsigned long *arr; /* array of mfns - top nibble set on err */
} privcmd_mmapbatch_t;
+typedef struct privcmd_m2pmfns {
+ int num; /* max number of mfns to return */
+ unsigned long *arr; /* array of mfns */
+} privcmd_m2pmfns_t;
+
typedef struct privcmd_blkmsg
{
unsigned long op;
@@ -69,12 +74,11 @@
*/
#define IOCTL_PRIVCMD_HYPERCALL \
_IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
-
#define IOCTL_PRIVCMD_MMAP \
_IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
#define IOCTL_PRIVCMD_MMAPBATCH \
_IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
-#define IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN \
+#define IOCTL_PRIVCMD_GET_MACH2PHYS_MFNS \
_IOC(_IOC_READ, 'P', 4, sizeof(unsigned long))
#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/examples/vif-common.sh
--- a/tools/examples/vif-common.sh Wed Nov 16 19:33:12 2005
+++ b/tools/examples/vif-common.sh Wed Nov 16 19:33:23 2005
@@ -63,7 +63,9 @@
fi
iptables "$c" FORWARD -m physdev --physdev-in "$vif" "$@" -j ACCEPT ||
- fatal "iptables $c FORWARD -m physdev --physdev-in $vif $@ -j ACCEPT
failed"
+ log err \
+ "iptables $c FORWARD -m physdev --physdev-in $vif $@ -j ACCEPT failed.
+If you are using iptables, this may affect networking for guest domains."
}
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c Wed Nov 16 19:33:12 2005
+++ b/tools/libxc/xc_linux_build.c Wed Nov 16 19:33:23 2005
@@ -629,7 +629,7 @@
memset(start_info, 0, sizeof(*start_info));
rc = xc_version(xc_handle, XENVER_version, NULL);
sprintf(start_info->magic, "xen-%i.%i-x86_%d%s",
- rc >> 16, rc & (0xFFFF), sizeof(long)*8,
+ rc >> 16, rc & (0xFFFF), (unsigned int)sizeof(long)*8,
dsi.pae_kernel ? "p" : "");
start_info->nr_pages = nr_pages;
start_info->shared_info = shared_info_frame << PAGE_SHIFT;
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/libxc/xc_linux_restore.c
--- a/tools/libxc/xc_linux_restore.c Wed Nov 16 19:33:12 2005
+++ b/tools/libxc/xc_linux_restore.c Wed Nov 16 19:33:23 2005
@@ -13,13 +13,13 @@
#include "xg_save_restore.h"
/* max mfn of the whole machine */
-static uint32_t max_mfn;
+static unsigned long max_mfn;
/* virtual starting address of the hypervisor */
-static uint32_t hvirt_start;
+static unsigned long hvirt_start;
/* #levels of page tables used by the current guest */
-static uint32_t pt_levels;
+static unsigned int pt_levels;
/* total number of pages used by the current guest */
static unsigned long max_pfn;
@@ -49,7 +49,6 @@
return (r == count) ? 1 : 0;
}
-
/*
** In the state file (or during transfer), all page-table pages are
@@ -60,23 +59,11 @@
*/
int uncanonicalize_pagetable(unsigned long type, void *page)
{
- int i, pte_last, xen_start, xen_end;
+ int i, pte_last;
unsigned long pfn;
uint64_t pte;
- /*
- ** We need to determine which entries in this page table hold
- ** reserved hypervisor mappings. This depends on the current
- ** page table type as well as the number of paging levels.
- */
- xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
-
- if (pt_levels == 2 && type == L2TAB)
- xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
-
- if (pt_levels == 3 && type == L3TAB)
- xen_start = L3_PAGETABLE_ENTRIES_PAE;
-
+ pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
/* Now iterate through the page table, uncanonicalizing each PTE */
for(i = 0; i < pte_last; i++) {
@@ -85,13 +72,10 @@
pte = ((uint32_t *)page)[i];
else
pte = ((uint64_t *)page)[i];
-
- if(i >= xen_start && i < xen_end)
- pte = 0;
-
+
if(pte & _PAGE_PRESENT) {
-
- pfn = pte >> PAGE_SHIFT;
+
+ pfn = (pte >> PAGE_SHIFT) & 0xffffffff;
if(pfn >= max_pfn) {
ERR("Frame number in type %lu page table is out of range: "
@@ -101,17 +85,16 @@
}
- if(type == L1TAB)
- pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT);
- else
- pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);
-
- pte |= p2m[pfn] << PAGE_SHIFT;
-
+ pte &= 0xffffff0000000fffULL;
+ pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
+
if(pt_levels == 2)
((uint32_t *)page)[i] = (uint32_t)pte;
else
((uint64_t *)page)[i] = (uint64_t)pte;
+
+
+
}
}
@@ -143,6 +126,9 @@
/* A table of MFNs to map in the current region */
unsigned long *region_mfn = NULL;
+ /* Types of the pfns in the current region */
+ unsigned long region_pfn_type[MAX_BATCH_SIZE];
+
/* A temporary mapping, and a copy, of one frame of guest memory. */
unsigned long *page = NULL;
@@ -233,10 +219,12 @@
if(xc_domain_memory_increase_reservation(
xc_handle, dom, max_pfn, 0, 0, NULL) != 0) {
- ERR("Failed to increase reservation by %lx KB\n", max_pfn);
+ ERR("Failed to increase reservation by %lx KB\n", PFN_TO_KB(max_pfn));
errno = ENOMEM;
goto out;
}
+
+ DPRINTF("Increased domain reservation by %lx KB\n", PFN_TO_KB(max_pfn));
/* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
@@ -248,6 +236,7 @@
ERR("Could not initialise for MMU updates");
goto out;
}
+
DPRINTF("Reloading memory pages: 0%%\n");
@@ -261,7 +250,6 @@
while (1) {
int j;
- unsigned long region_pfn_type[MAX_BATCH_SIZE];
this_pc = (n * 100) / max_pfn;
if ( (this_pc - prev_pc) >= 5 )
@@ -322,7 +310,7 @@
if (pagetype == XTAB)
/* a bogus/unmapped page: skip it */
continue;
-
+
if (pfn > max_pfn) {
ERR("pfn out of range");
goto out;
@@ -348,10 +336,20 @@
** A page table page - need to 'uncanonicalize' it, i.e.
** replace all the references to pfns with the corresponding
** mfns for the new domain.
- */
- if(!uncanonicalize_pagetable(pagetype, page))
- goto out;
-
+ **
+ ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+ ** so we may need to update the p2m after the main loop.
+ ** Hence we defer canonicalization of L1s until then.
+ */
+ if(pt_levels != 3 || pagetype != L1TAB) {
+
+ if(!uncanonicalize_pagetable(pagetype, page)) {
+ ERR("failed uncanonicalize pt!\n");
+ goto out;
+ }
+
+ }
+
} else if(pagetype != NOTAB) {
ERR("Bogus page type %lx page table is out of range: "
@@ -359,7 +357,6 @@
goto out;
}
-
if (verify) {
@@ -386,9 +383,9 @@
}
if (xc_add_mmu_update(xc_handle, mmu,
- (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
- pfn)) {
- ERR("machpys mfn=%ld pfn=%ld", mfn, pfn);
+ (((unsigned long long)mfn) << PAGE_SHIFT)
+ | MMU_MACHPHYS_UPDATE, pfn)) {
+ ERR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
goto out;
}
} /* end of 'batch' for loop */
@@ -399,14 +396,39 @@
DPRINTF("Received all pages\n");
- if (pt_levels == 3) {
-
- /* Get all PGDs below 4GB. */
+ if(pt_levels == 3) {
+
+ /*
+ ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
+ ** is a little awkward and involves (a) finding all such PGDs and
+ ** replacing them with 'lowmem' versions; (b) updating the p2m[]
+ ** with the new info; and (c) canonicalizing all the L1s using the
+ ** (potentially updated) p2m[].
+ **
+ ** This is relatively slow (and currently involves two passes through
+ ** the pfn_type[] array), but at least seems to be correct. May wish
+ ** to consider more complex approaches to optimize this later.
+ */
+
+ int j, k;
+
+ /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
for (i = 0; i < max_pfn; i++) {
if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
unsigned long new_mfn;
+ uint64_t l3ptes[4];
+ uint64_t *l3tab;
+
+ l3tab = (uint64_t *)
+ xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+ PROT_READ, p2m[i]);
+
+ for(j = 0; j < 4; j++)
+ l3ptes[j] = l3tab[j];
+
+ munmap(l3tab, PAGE_SIZE);
if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
ERR("Couldn't get a page below 4GB :-(");
@@ -414,15 +436,58 @@
}
p2m[i] = new_mfn;
- if (xc_add_mmu_update(
- xc_handle, mmu,
- (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i)) {
+ if (xc_add_mmu_update(xc_handle, mmu,
+ (((unsigned long long)new_mfn)
+ << PAGE_SHIFT) |
+ MMU_MACHPHYS_UPDATE, i)) {
ERR("Couldn't m2p on PAE root pgdir");
goto out;
}
+
+ l3tab = (uint64_t *)
+ xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+ PROT_READ | PROT_WRITE, p2m[i]);
+
+ for(j = 0; j < 4; j++)
+ l3tab[j] = l3ptes[j];
+
+ munmap(l3tab, PAGE_SIZE);
+
}
}
-
+
+ /* Second pass: find all L1TABs and uncanonicalize them */
+ j = 0;
+
+ for(i = 0; i < max_pfn; i++) {
+
+ if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
+ region_mfn[j] = p2m[i];
+ j++;
+ }
+
+ if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
+
+ if (!(region_base = xc_map_foreign_batch(
+ xc_handle, dom, PROT_READ | PROT_WRITE,
+ region_mfn, j))) {
+ ERR("map batch failed");
+ goto out;
+ }
+
+ for(k = 0; k < j; k++) {
+ if(!uncanonicalize_pagetable(L1TAB,
+ region_base + k*PAGE_SIZE)) {
+ ERR("failed uncanonicalize pt!\n");
+ goto out;
+ }
+ }
+
+ munmap(region_base, j*PAGE_SIZE);
+ j = 0;
+ }
+ }
+
}
@@ -430,6 +495,7 @@
ERR("Error doing finish_mmu_updates()");
goto out;
}
+
/*
* Pin page tables. Do this after writing to them as otherwise Xen
@@ -439,7 +505,7 @@
if ( (pfn_type[i] & LPINTAB) == 0 )
continue;
-
+
switch(pfn_type[i]) {
case (L1TAB|LPINTAB):
@@ -463,22 +529,15 @@
}
pin[nr_pins].arg1.mfn = p2m[i];
+
+ nr_pins ++;
- if (++nr_pins == MAX_PIN_BATCH) {
+ if (i == (max_pfn-1) || nr_pins == MAX_PIN_BATCH) {
if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) {
ERR("Failed to pin batch of %d page tables", nr_pins);
goto out;
}
- DPRINTF("successfully pinned batch of %d page tables", nr_pins);
nr_pins = 0;
- }
- }
-
- if (nr_pins != 0) {
- if((rc = xc_mmuext_op(xc_handle, pin, nr_pins, dom)) < 0) {
- ERR("Failed (2) to pin batch of %d page tables", nr_pins);
- DPRINTF("rc is %d\n", rc);
- goto out;
}
}
@@ -579,23 +638,20 @@
pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
if (pfn >= max_pfn) {
- DPRINTF("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx\n",
- pfn, max_pfn, pfn_type[pfn]);
- ERR("PT base is bad.");
+ ERR("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx",
+ pfn, max_pfn, pfn_type[pfn]);
goto out;
}
if ((pt_levels == 2) && ((pfn_type[pfn]<ABTYPE_MASK) != L2TAB)) {
- DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
- pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
- ERR("PT base is bad.");
+ ERR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
+ pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
goto out;
}
if ((pt_levels == 3) && ((pfn_type[pfn]<ABTYPE_MASK) != L3TAB)) {
- DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
- pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
- ERR("PT base is bad.");
+ ERR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
+ pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
goto out;
}
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c Wed Nov 16 19:33:12 2005
+++ b/tools/libxc/xc_linux_save.c Wed Nov 16 19:33:23 2005
@@ -27,13 +27,13 @@
/* max mfn of the whole machine */
-static uint32_t max_mfn;
+static unsigned long max_mfn;
/* virtual starting address of the hypervisor */
-static uint32_t hvirt_start;
+static unsigned long hvirt_start;
/* #levels of page tables used by the current guest */
-static uint32_t pt_levels;
+static unsigned int pt_levels;
/* total number of pages used by the current guest */
static unsigned long max_pfn;
@@ -73,7 +73,7 @@
*/
#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITMAP_SIZE ((max_pfn + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define BITMAP_SIZE ((max_pfn + BITS_PER_LONG - 1) / 8)
#define BITMAP_ENTRY(_nr,_bmap) \
((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
@@ -500,6 +500,70 @@
+static unsigned long *xc_map_m2p(int xc_handle,
+ unsigned long max_mfn,
+ int prot)
+{
+ privcmd_m2pmfns_t m2p_mfns;
+ privcmd_mmap_t ioctlx;
+ privcmd_mmap_entry_t *entries;
+ unsigned long m2p_chunks, m2p_size;
+ unsigned long *m2p;
+ int i, rc;
+
+ m2p_size = M2P_SIZE(max_mfn);
+ m2p_chunks = M2P_CHUNKS(max_mfn);
+
+
+ m2p_mfns.num = m2p_chunks;
+
+ if(!(m2p_mfns.arr = malloc(m2p_chunks * sizeof(unsigned long)))) {
+ ERR("failed to allocate space for m2p mfns!\n");
+ return NULL;
+ }
+
+ if (ioctl(xc_handle, IOCTL_PRIVCMD_GET_MACH2PHYS_MFNS, &m2p_mfns) < 0) {
+ ERR("xc_get_m2p_mfns:");
+ return NULL;
+ }
+
+ if((m2p = mmap(NULL, m2p_size, prot,
+ MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
+ ERR("failed to mmap m2p");
+ return NULL;
+ }
+
+
+ if(!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
+ ERR("failed to allocate space for mmap entries!\n");
+ return NULL;
+ }
+
+
+ ioctlx.num = m2p_chunks;
+ ioctlx.dom = DOMID_XEN;
+ ioctlx.entry = entries;
+
+ for(i=0; i < m2p_chunks; i++) {
+
+ entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
+ entries[i].mfn = m2p_mfns.arr[i];
+ entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
+
+ }
+
+ if((rc = ioctl(xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx)) < 0) {
+ ERR("ioctl_mmap failed (rc = %d)", rc);
+ return NULL;
+ }
+
+ free(m2p_mfns.arr);
+ free(entries);
+
+ return m2p;
+}
+
+
int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
uint32_t max_factor, uint32_t flags)
@@ -531,16 +595,12 @@
/* A copy of the pfn-to-mfn table frame list. */
unsigned long *p2m_frame_list = NULL;
- unsigned long m2p_start_mfn;
-
/* Live mapping of shared info structure */
shared_info_t *live_shinfo = NULL;
/* base of the region in which domain memory is mapped */
unsigned char *region_base = NULL;
-
-
/* power of 2 order of max_pfn */
int order_nr;
@@ -563,9 +623,6 @@
max_factor = DEF_MAX_FACTOR;
initialize_mbit_rate();
-
- DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live ?
- "true" : "false");
if(!get_platform_info(xc_handle, dom,
&max_mfn, &hvirt_start, &pt_levels)) {
@@ -647,11 +704,13 @@
}
/* Setup the mfn_to_pfn table mapping */
- m2p_start_mfn = xc_get_m2p_start_mfn(xc_handle);
- live_m2p = xc_map_foreign_range(xc_handle, DOMID_XEN, M2P_SIZE,
- PROT_READ, m2p_start_mfn);
-
- /* Get a local copy fo the live_P2M_frame_list */
+ if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
+ ERR("Failed to map live M2P table");
+ goto out;
+ }
+
+
+ /* Get a local copy of the live_P2M_frame_list */
if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
ERR("Couldn't allocate p2m_frame_list array");
goto out;
@@ -662,6 +721,8 @@
for (i = 0; i < max_pfn; i += ulpp) {
if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) {
ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
+ ERR("entry %d: p2m_frame_list[%ld] is 0x%lx", i, i/ulpp,
+ p2m_frame_list[i/ulpp]);
goto out;
}
}
@@ -693,20 +754,14 @@
}
-#if 0
- sent_last_iter = 0xFFFFFFFF; /* Pretend we sent a /lot/ last time */
-#else
- sent_last_iter = 1 << 20;
-#endif
+ /* pretend we sent all the pages last iteration */
+ sent_last_iter = max_pfn;
/* calculate the power of 2 order of max_pfn, e.g.
15->4 16->4 17->5 */
for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
continue;
-
-#undef BITMAP_SIZE
-#define BITMAP_SIZE ((1<<20)/8)
/* Setup to_send / to_fix and to_skip bitmaps */
to_send = malloc(BITMAP_SIZE);
@@ -922,10 +977,8 @@
/* write out pages in batch */
- if (pagetype == XTAB) {
- DPRINTF("SKIP BOGUS page %i mfn %08lx\n", j, pfn_type[j]);
+ if (pagetype == XTAB)
continue;
- }
pagetype &= LTABTYPE_MASK;
@@ -950,10 +1003,10 @@
} /* end of the write out for this batch */
sent_this_iter += batch;
-
+
+ munmap(region_base, batch*PAGE_SIZE);
+
} /* end of this while loop for this iteration */
-
- munmap(region_base, batch*PAGE_SIZE);
skip:
@@ -1027,13 +1080,9 @@
DPRINTF("All memory is saved\n");
- /* Success! */
- rc = 0;
-
- /* ^^^^^^ XXX SMH: hmm.. not sure that's really success! */
-
/* Zero terminate */
- if (!write_exact(io_fd, &rc, sizeof(int))) {
+ i = 0;
+ if (!write_exact(io_fd, &i, sizeof(int))) {
ERR("Error when writing to state file (6)");
goto out;
}
@@ -1043,17 +1092,17 @@
unsigned int i,j;
unsigned long pfntab[1024];
- for ( i = 0, j = 0; i < max_pfn; i++ ) {
- if ( ! is_mapped(live_p2m[i]) )
+ for (i = 0, j = 0; i < max_pfn; i++) {
+ if (!is_mapped(live_p2m[i]))
j++;
}
-
+
if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
ERR("Error when writing to state file (6a)");
goto out;
}
- for ( i = 0, j = 0; i < max_pfn; ) {
+ for (i = 0, j = 0; i < max_pfn; ) {
if (!is_mapped(live_p2m[i]))
pfntab[j++] = i;
@@ -1097,7 +1146,10 @@
ERR("Error when writing to state file (1)");
goto out;
}
-
+
+ /* Success! */
+ rc = 0;
+
out:
if (live_shinfo)
@@ -1110,7 +1162,7 @@
munmap(live_p2m, P2M_SIZE);
if(live_m2p)
- munmap(live_m2p, M2P_SIZE);
+ munmap(live_m2p, M2P_SIZE(max_mfn));
free(pfn_type);
free(pfn_batch);
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/libxc/xc_private.c
--- a/tools/libxc/xc_private.c Wed Nov 16 19:33:12 2005
+++ b/tools/libxc/xc_private.c Wed Nov 16 19:33:23 2005
@@ -260,18 +260,6 @@
}
-unsigned long xc_get_m2p_start_mfn ( int xc_handle )
-{
- unsigned long mfn;
-
- if ( ioctl( xc_handle, IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN, &mfn ) < 0 )
- {
- perror("xc_get_m2p_start_mfn:");
- return 0;
- }
- return mfn;
-}
-
int xc_get_pfn_list(int xc_handle,
uint32_t domid,
unsigned long *pfn_buf,
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/libxc/xg_private.h
--- a/tools/libxc/xg_private.h Wed Nov 16 19:33:12 2005
+++ b/tools/libxc/xg_private.h Wed Nov 16 19:33:23 2005
@@ -153,8 +153,6 @@
} mfn_mapper_t;
-unsigned long xc_get_m2p_start_mfn (int xc_handle);
-
int xc_copy_to_domain_page(int xc_handle, uint32_t domid,
unsigned long dst_pfn, void *src_page);
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h Wed Nov 16 19:33:12 2005
+++ b/tools/libxc/xg_save_restore.h Wed Nov 16 19:33:23 2005
@@ -3,6 +3,8 @@
**
** Definitions and utilities for save / restore.
*/
+
+#include "xc_private.h"
#define DEBUG 1
#define PROGRESS 0
@@ -55,25 +57,24 @@
** Returns 1 on success, 0 on failure.
*/
static int get_platform_info(int xc_handle, uint32_t dom,
- /* OUT */ uint32_t *max_mfn,
- /* OUT */ uint32_t *hvirt_start,
- /* OUT */ uint32_t *pt_levels)
+ /* OUT */ unsigned long *max_mfn,
+ /* OUT */ unsigned long *hvirt_start,
+ /* OUT */ unsigned int *pt_levels)
{
xen_capabilities_info_t xen_caps = "";
xen_platform_parameters_t xen_params;
- xc_physinfo_t physinfo;
- if (xc_physinfo(xc_handle, &physinfo) != 0)
- return 0;
-
+
if (xc_version(xc_handle, XENVER_platform_parameters, &xen_params) != 0)
return 0;
if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
return 0;
- *max_mfn = physinfo.total_pages;
+ if (xc_memory_op(xc_handle, XENMEM_maximum_ram_page, max_mfn) != 0)
+ return 0;
+
*hvirt_start = xen_params.virt_start;
if (strstr(xen_caps, "xen-3.0-x86_64"))
@@ -95,13 +96,22 @@
** entry tell us whether or not the PFN is currently mapped.
*/
-#define PFN_TO_KB(_pfn) ((_pfn) * PAGE_SIZE / 1024)
+#define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10))
#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
-/* Size in bytes of the M2P and P2M (both rounded up to nearest PAGE_SIZE) */
-#define M2P_SIZE ROUNDUP((max_mfn * sizeof(unsigned long)), PAGE_SHIFT)
-#define P2M_SIZE ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT)
+/*
+** The M2P is made up of some number of 'chunks' of at least 2MB in size.
+** The below definitions and utility function(s) deal with mapping the M2P
+** regardless of the underlying machine memory size or architecture.
+*/
+#define M2P_SHIFT L2_PAGETABLE_SHIFT_PAE
+#define M2P_CHUNK_SIZE (1 << M2P_SHIFT)
+#define M2P_SIZE(_m) ROUNDUP(((_m) * sizeof(unsigned long)), M2P_SHIFT)
+#define M2P_CHUNKS(_m) (M2P_SIZE((_m)) >> M2P_SHIFT)
+
+/* Size in bytes of the P2M (rounded up to the nearest PAGE_SIZE bytes) */
+#define P2M_SIZE ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT)
/* Number of unsigned longs in a page */
#define ulpp (PAGE_SIZE/sizeof(unsigned long))
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py Wed Nov 16 19:33:12 2005
+++ b/tools/python/xen/xend/XendCheckpoint.py Wed Nov 16 19:33:23 2005
@@ -129,7 +129,7 @@
l = read_exact(fd, sizeof_unsigned_long,
"not a valid guest state file: pfn count read")
nr_pfns = unpack("=L", l)[0] # XXX endianess
- if nr_pfns > 1024*1024: # XXX
+ if nr_pfns > 16*1024*1024: # XXX
raise XendError(
"not a valid guest state file: pfn count out of range")
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Wed Nov 16 19:33:12 2005
+++ b/tools/python/xen/xend/XendDomain.py Wed Nov 16 19:33:23 2005
@@ -63,14 +63,19 @@
self.domains = {}
self.domains_lock = threading.RLock()
- xswatch("@releaseDomain", self.onReleaseDomain)
-
self.domains_lock.acquire()
try:
self._add_domain(
XendDomainInfo.recreate(self.xen_domains()[PRIV_DOMAIN],
True))
self.dom0_setup()
+
+ # This watch registration needs to be before the refresh call, so
+ # that we're sure that we haven't missed any releases, but inside
+ # the domains_lock, as we don't want the watch to fire until after
+ # the refresh call has completed.
+ xswatch("@releaseDomain", self.onReleaseDomain)
+
self.refresh(True)
finally:
self.domains_lock.release()
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Wed Nov 16 19:33:12 2005
+++ b/tools/python/xen/xend/XendDomainInfo.py Wed Nov 16 19:33:23 2005
@@ -45,6 +45,8 @@
from xen.xend.xenstore.xstransact import xstransact
from xen.xend.xenstore.xsutil import GetDomainPath, IntroduceDomain
+from xen.xend.xenstore.xswatch import xswatch
+
"""Shutdown code for poweroff."""
DOMAIN_POWEROFF = 0
@@ -82,7 +84,6 @@
SHUTDOWN_TIMEOUT = 30
-DOMROOT = '/local/domain/'
VMROOT = '/vm/'
ZOMBIE_PREFIX = 'Zombie-'
@@ -100,26 +101,52 @@
#log.setLevel(logging.TRACE)
-## Configuration entries that we expect to round-trip -- be read from the
+##
+# All parameters of VMs that may be configured on-the-fly, or at start-up.
+#
+VM_CONFIG_PARAMS = [
+ ('name', str),
+ ('on_poweroff', str),
+ ('on_reboot', str),
+ ('on_crash', str),
+ ]
+
+
+##
+# Configuration entries that we expect to round-trip -- be read from the
# config file or xc, written to save-files (i.e. through sxpr), and reused as
# config on restart or restore, all without munging. Some configuration
# entries are munged for backwards compatibility reasons, or because they
# don't come out of xc in the same form as they are specified in the config
# file, so those are handled separately.
ROUNDTRIPPING_CONFIG_ENTRIES = [
- ('name', str),
- ('uuid', str),
- ('ssidref', int),
- ('vcpus', int),
- ('vcpu_avail', int),
- ('cpu_weight', float),
- ('memory', int),
- ('maxmem', int),
- ('bootloader', str),
- ('on_poweroff', str),
- ('on_reboot', str),
- ('on_crash', str)
+ ('uuid', str),
+ ('ssidref', int),
+ ('vcpus', int),
+ ('vcpu_avail', int),
+ ('cpu_weight', float),
+ ('memory', int),
+ ('maxmem', int),
+ ('bootloader', str),
]
+
+ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
+
+
+##
+# All entries written to the store. This is VM_CONFIGURATION_PARAMS, plus
+# those entries written to the store that cannot be reconfigured on-the-fly.
+#
+VM_STORE_ENTRIES = [
+ ('uuid', str),
+ ('ssidref', int),
+ ('vcpus', int),
+ ('vcpu_avail', int),
+ ('memory', int),
+ ('maxmem', int),
+ ]
+
+VM_STORE_ENTRIES += VM_CONFIG_PARAMS
#
@@ -156,6 +183,7 @@
vm.initDomain()
vm.storeVmDetails()
vm.storeDomDetails()
+ vm.registerWatch()
vm.refreshShutdown()
return vm
except:
@@ -211,6 +239,7 @@
vm.storeVmDetails()
vm.storeDomDetails()
+ vm.registerWatch()
vm.refreshShutdown(xeninfo)
return vm
@@ -371,12 +400,50 @@
self.console_port = None
self.console_mfn = None
+ self.vmWatch = None
+
self.state = STATE_DOM_OK
self.state_updated = threading.Condition()
self.refresh_shutdown_lock = threading.Condition()
## private:
+
+ def readVMDetails(self, params):
+ """Read from the store all of those entries that we consider
+ """
+ try:
+ return self.gatherVm(*params)
+ except ValueError:
+ # One of the int/float entries in params has a corresponding store
+ # entry that is invalid. We recover, because older versions of
+ # Xend may have put the entry there (memory/target, for example),
+ # but this is in general a bad situation to have reached.
+ log.exception(
+ "Store corrupted at %s! Domain %d's configuration may be "
+ "affected.", self.vmpath, self.domid)
+ return []
+
+
+ def storeChanged(self):
+ log.debug("XendDomainInfo.storeChanged");
+
+ changed = False
+
+ def f(x, y):
+ if y is not None and self.info[x[0]] != y:
+ self.info[x[0]] = y
+ changed = True
+
+ map(f, VM_CONFIG_PARAMS, self.readVMDetails(VM_CONFIG_PARAMS))
+
+ if changed:
+ # Update the domain section of the store, as this contains some
+ # parameters derived from the VM configuration.
+ self.storeDomDetails()
+
+ return 1
+
def augmentInfo(self):
"""Augment self.info, as given to us through {@link #recreate}, with
@@ -387,30 +454,8 @@
if not self.infoIsSet(name) and val is not None:
self.info[name] = val
- params = (("name", str),
- ("on_poweroff", str),
- ("on_reboot", str),
- ("on_crash", str),
- ("image", str),
- ("memory", int),
- ("maxmem", int),
- ("vcpus", int),
- ("vcpu_avail", int),
- ("start_time", float))
-
- try:
- from_store = self.gatherVm(*params)
- except ValueError, exn:
- # One of the int/float entries in params has a corresponding store
- # entry that is invalid. We recover, because older versions of
- # Xend may have put the entry there (memory/target, for example),
- # but this is in general a bad situation to have reached.
- log.exception(
- "Store corrupted at %s! Domain %d's configuration may be "
- "affected.", self.vmpath, self.domid)
- return
-
- map(lambda x, y: useIfNeeded(x[0], y), params, from_store)
+ map(lambda x, y: useIfNeeded(x[0], y), VM_STORE_ENTRIES,
+ self.readVMDetails(VM_STORE_ENTRIES))
device = []
for c in controllerClasses:
@@ -536,23 +581,24 @@
self.introduceDomain()
self.storeDomDetails()
+ self.registerWatch()
self.refreshShutdown()
log.debug("XendDomainInfo.completeRestore done")
def storeVmDetails(self):
- to_store = {
- 'uuid': self.info['uuid']
- }
+ to_store = {}
+
+ for k in VM_STORE_ENTRIES:
+ if self.infoIsSet(k[0]):
+ to_store[k[0]] = str(self.info[k[0]])
if self.infoIsSet('image'):
to_store['image'] = sxp.to_string(self.info['image'])
- for k in ['name', 'ssidref', 'memory', 'maxmem', 'on_poweroff',
- 'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail']:
- if self.infoIsSet(k):
- to_store[k] = str(self.info[k])
+ if self.infoIsSet('start_time'):
+ to_store['start_time'] = str(self.info['start_time'])
log.debug("Storing VM details: %s", to_store)
@@ -599,13 +645,16 @@
return result
- def setDomid(self, domid):
- """Set the domain id.
-
- @param dom: domain id
- """
- self.domid = domid
- self.storeDom("domid", self.domid)
+ ## public:
+
+ def registerWatch(self):
+ """Register a watch on this VM's entries in the store, so that
+ when they are changed externally, we keep up to date. This should
+ only be called by {@link #create}, {@link #recreate}, or {@link
+ #restore}, once the domain's details have been written, but before the
+ new instance is returned."""
+ self.vmWatch = xswatch(self.vmpath, self.storeChanged)
+
def getDomid(self):
return self.domid
@@ -1116,6 +1165,13 @@
"""Cleanup VM resources. Idempotent. Nothrow guarantee."""
try:
+ try:
+ if self.vmWatch:
+ self.vmWatch.unwatch()
+ self.vmWatch = None
+ except:
+ log.exception("Unwatching VM path failed.")
+
self.removeVm()
except:
log.exception("Removing VM path failed.")
diff -r cbf6f95e9c62 -r 3f39f030fa89 tools/python/xen/xend/xenstore/xswatch.py
--- a/tools/python/xen/xend/xenstore/xswatch.py Wed Nov 16 19:33:12 2005
+++ b/tools/python/xen/xend/xenstore/xswatch.py Wed Nov 16 19:33:23 2005
@@ -20,6 +20,10 @@
self.kwargs = kwargs
watchStart()
xs.watch(path, self)
+
+
+ def unwatch(self):
+ xs.unwatch(self.path, self)
watchThread = None
@@ -49,7 +53,7 @@
watch = we[1]
res = watch.fn(*watch.args, **watch.kwargs)
if not res:
- xs.unwatch(watch.path, watch)
+ watch.unwatch()
except:
log.exception("read_watch failed")
# Ignore this exception -- there's no point throwing it
diff -r cbf6f95e9c62 -r 3f39f030fa89 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Nov 16 19:33:12 2005
+++ b/xen/arch/x86/mm.c Wed Nov 16 19:33:23 2005
@@ -898,6 +898,7 @@
return 1;
fail:
+ MEM_LOG("Failure in alloc_l3_table: entry %d", i);
while ( i-- > 0 )
if ( is_guest_l3_slot(i) )
put_page_from_l3e(pl3e[i], pfn);
@@ -948,6 +949,7 @@
return 1;
fail:
+ MEM_LOG("Failure in alloc_l4_table: entry %d", i);
while ( i-- > 0 )
if ( is_guest_l4_slot(i) )
put_page_from_l4e(pl4e[i], pfn);
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|