# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1214965837 -32400
# Node ID 08f77df14cba8e2dfe580779bb9ca2f64e1ae0ae
# Parent 11318234588e61b45df5a06fe6a29264854ba22a
# Parent 19970181d6a46aee1199857b6d3c6bedc7507121
merge with xen-unstable.hg
---
docs/ChangeLog | 9
extras/mini-os/arch/x86/mm.c | 11
extras/mini-os/blkfront.c | 1
extras/mini-os/fbfront.c | 2
extras/mini-os/fs-front.c | 10
extras/mini-os/lib/sys.c | 2
extras/mini-os/netfront.c | 6
stubdom/grub.patches/99minios | 10
stubdom/grub/Makefile | 2
tools/blktap/drivers/Makefile | 10
tools/blktap/drivers/blktapctrl.c | 2
tools/blktap/drivers/block-qcow.c | 35 +
tools/blktap/drivers/block-qcow2.c | 5
tools/blktap/drivers/check_gcrypt | 14
tools/blktap/lib/blktaplib.h | 2
tools/debugger/xenitp/xenitp.c | 24
tools/examples/xend-config.sxp | 3
tools/firmware/hvmloader/hvmloader.c | 10
tools/firmware/rombios/rombios.c | 35 -
tools/ioemu/hw/xen_console.c | 8
tools/ioemu/target-i386-dm/exec-dm.c | 17
tools/ioemu/xenstore.c | 11
tools/libxc/ia64/xc_ia64_hvm_build.c | 7
tools/libxc/ia64/xc_ia64_linux_restore.c | 24
tools/libxc/ia64/xc_ia64_linux_save.c | 19
tools/libxc/xc_core.c | 8
tools/libxc/xc_core_ia64.c | 3
tools/libxc/xc_core_ia64.h | 2
tools/libxc/xc_domain.c | 65 --
tools/libxc/xc_domain_restore.c | 12
tools/libxc/xc_domain_save.c | 20
tools/libxc/xc_misc.c | 28
tools/libxc/xc_pagetab.c | 4
tools/libxc/xc_private.h | 4
tools/libxc/xc_ptrace.c | 34 -
tools/libxc/xc_ptrace_core.c | 8
tools/libxc/xc_resume.c | 10
tools/libxc/xenctrl.h | 44 +
tools/libxc/xg_save_restore.h | 22
tools/python/xen/util/blkif.py | 41 -
tools/python/xen/xend/XendConfig.py | 2
tools/python/xen/xend/XendOptions.py | 7
tools/python/xen/xend/image.py | 20
tools/python/xen/xend/server/blkif.py | 6
tools/python/xen/xm/main.py | 3
tools/tests/test_x86_emulator.c | 9
tools/xenballoon/xenballoon-monitor | 43 +
tools/xenballoon/xenballoon.conf | 91 +++
tools/xenballoon/xenballoond | 205 ++++++
tools/xenballoon/xenballoond.README | 82 ++
tools/xenballoon/xenballoond.init | 91 +++
tools/xentrace/xenctx.c | 8
tools/xm-test/lib/XmTestLib/block_utils.py | 2
xen/arch/ia64/vmx/vmx_hypercall.c | 47 +
xen/arch/ia64/xen/mm.c | 6
xen/arch/x86/acpi/cpufreq/Makefile | 1
xen/arch/x86/acpi/cpufreq/cpufreq.c | 139 +++-
xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c | 14
xen/arch/x86/acpi/cpufreq/powernow.c | 305 ++++++++++
xen/arch/x86/acpi/cpufreq/utility.c | 103 +++
xen/arch/x86/acpi/pmstat.c | 7
xen/arch/x86/acpi/power.c | 25
xen/arch/x86/hvm/emulate.c | 113 +--
xen/arch/x86/hvm/hvm.c | 60 +
xen/arch/x86/hvm/vmx/vmcs.c | 100 +--
xen/arch/x86/hvm/vmx/vmx.c | 11
xen/arch/x86/hvm/vmx/vpmu_core2.c | 20
xen/arch/x86/mm.c | 45 +
xen/arch/x86/mm/shadow/common.c | 811 ++++++++++++++++++++++++++-
xen/arch/x86/mm/shadow/multi.c | 559 +++++++++++++++++-
xen/arch/x86/mm/shadow/multi.h | 14
xen/arch/x86/mm/shadow/private.h | 130 ++++
xen/arch/x86/mm/shadow/types.h | 5
xen/arch/x86/platform_hypercall.c | 7
xen/arch/x86/x86_emulate/x86_emulate.c | 700 ++++++++++++++++++-----
xen/arch/x86/x86_emulate/x86_emulate.h | 37 -
xen/common/domain.c | 259 ++++----
xen/drivers/passthrough/vtd/dmar.c | 3
xen/drivers/passthrough/vtd/dmar.h | 16
xen/drivers/passthrough/vtd/intremap.c | 7
xen/drivers/passthrough/vtd/iommu.c | 16
xen/drivers/passthrough/vtd/qinval.c | 16
xen/drivers/passthrough/vtd/utils.c | 2
xen/include/acpi/cpufreq/cpufreq.h | 3
xen/include/acpi/cpufreq/processor_perf.h | 13
xen/include/asm-x86/domain.h | 14
xen/include/asm-x86/hvm/vmx/vmcs.h | 8
xen/include/asm-x86/mm.h | 8
xen/include/asm-x86/perfc_defn.h | 15
xen/include/public/hvm/hvm_op.h | 13
xen/include/xen/domain.h | 3
xen/include/xen/sched.h | 12
92 files changed, 3996 insertions(+), 824 deletions(-)
diff -r 11318234588e -r 08f77df14cba docs/ChangeLog
--- a/docs/ChangeLog Thu Jun 19 12:48:04 2008 +0900
+++ b/docs/ChangeLog Wed Jul 02 11:30:37 2008 +0900
@@ -16,6 +16,15 @@ Xen 3.3 release
Xen 3.3 release
---------------
+17903: Add greater than 16 xvd device availability
+http://xenbits.xensource.com/xen-unstable.hg?rev/0728459b3c8d
+
+The tools can now attach a disk of the form:
+(1<<28) | (device<<8) | partition
+to support many more xvd disks and up to 256 partitions.
+The linux guest frontend has been expanded to support
+this new construct, while legacy guests should just ignore it.
+
17538: Add XENPF_set_processor_pminfo
http://xenbits.xensource.com/xen-unstable.hg?rev/5bb9093eb0e9
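
The extended xvd numbering above is a plain bit layout; as a rough sketch (not part of the changeset, helper name illustrative only), a tool could build such a device number like this:

    /* Sketch only: extended xvd device number as quoted in the ChangeLog
     * entry above -- bit 28 set, disk index in bits 8 and up, partition
     * number (up to 256 per disk) in the low 8 bits. */
    #include <stdint.h>

    static uint32_t xvd_ext_devnum(uint32_t disk, uint32_t partition)
    {
        return (1u << 28) | (disk << 8) | partition;
    }
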
diff -r 11318234588e -r 08f77df14cba extras/mini-os/arch/x86/mm.c
--- a/extras/mini-os/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900
@@ -528,18 +528,13 @@ void *map_frames_ex(unsigned long *f, un
static void clear_bootstrap(void)
{
- xen_pfn_t mfns[] = { virt_to_mfn(&shared_info) };
- int n = sizeof(mfns)/sizeof(*mfns);
pte_t nullpte = { };
/* Use first page as the CoW zero page */
memset(&_text, 0, PAGE_SIZE);
- mfn_zero = pfn_to_mfn((unsigned long) &_text);
- if (HYPERVISOR_update_va_mapping((unsigned long) &_text, nullpte, UVMF_INVLPG))
- printk("Unable to unmap first page\n");
-
- if (free_physical_pages(mfns, n) != n)
- printk("Unable to free bootstrap pages\n");
+ mfn_zero = virt_to_mfn((unsigned long) &_text);
+ if (HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG))
+ printk("Unable to unmap NULL page\n");
}
void arch_init_p2m(unsigned long max_pfn)
diff -r 11318234588e -r 08f77df14cba extras/mini-os/blkfront.c
--- a/extras/mini-os/blkfront.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/blkfront.c Wed Jul 02 11:30:37 2008 +0900
@@ -125,7 +125,6 @@ struct blkfront_dev *init_blkfront(char
dev->events = NULL;
- // FIXME: proper frees on failures
again:
err = xenbus_transaction_start(&xbt);
if (err) {
diff -r 11318234588e -r 08f77df14cba extras/mini-os/fbfront.c
--- a/extras/mini-os/fbfront.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/fbfront.c Wed Jul 02 11:30:37 2008 +0900
@@ -100,7 +100,6 @@ struct kbdfront_dev *init_kbdfront(char
s->in_cons = s->in_prod = 0;
s->out_cons = s->out_prod = 0;
- // FIXME: proper frees on failures
again:
err = xenbus_transaction_start(&xbt);
if (err) {
@@ -408,7 +407,6 @@ struct fbfront_dev *init_fbfront(char *n
s->pd[i] = 0;
- // FIXME: proper frees on failures
again:
err = xenbus_transaction_start(&xbt);
if (err) {
diff -r 11318234588e -r 08f77df14cba extras/mini-os/fs-front.c
--- a/extras/mini-os/fs-front.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/fs-front.c Wed Jul 02 11:30:37 2008 +0900
@@ -136,8 +136,8 @@ again:
again:
old_id = freelist[0];
/* Note: temporal inconsistency, since freelist[0] can be changed by someone
- * else, but we are a sole owner of freelist[id], it's OK. */
- freelist[id] = old_id;
+ * else, but we are a sole owner of freelist[id + 1], it's OK. */
+ freelist[id + 1] = old_id;
new_id = id;
if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
{
@@ -154,7 +154,7 @@ static inline unsigned short get_id_from
again:
old_id = freelist[0];
- new_id = freelist[old_id];
+ new_id = freelist[old_id + 1];
if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
{
printk("Cmpxchg on freelist remove failed.\n");
@@ -785,8 +785,8 @@ static void alloc_request_table(struct f
printk("Allocating request array for import %d, nr_entries = %d.\n",
import->import_id, import->nr_entries);
requests = xmalloc_array(struct fs_request, import->nr_entries);
- import->freelist = xmalloc_array(unsigned short, import->nr_entries);
- memset(import->freelist, 0, sizeof(unsigned short) * import->nr_entries);
+ import->freelist = xmalloc_array(unsigned short, import->nr_entries + 1);
+ memset(import->freelist, 0, sizeof(unsigned short) * (import->nr_entries + 1));
for(i=0; i<import->nr_entries; i++)
{
/* TODO: that's a lot of memory */
diff -r 11318234588e -r 08f77df14cba extras/mini-os/lib/sys.c
--- a/extras/mini-os/lib/sys.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/lib/sys.c Wed Jul 02 11:30:37 2008 +0900
@@ -686,7 +686,7 @@ static int select_poll(int nfds, fd_set
#ifdef LIBC_VERBOSE
static int nb;
static int nbread[NOFILE], nbwrite[NOFILE], nbexcept[NOFILE];
- static s64_t lastshown;
+ static s_time_t lastshown;
nb++;
#endif
diff -r 11318234588e -r 08f77df14cba extras/mini-os/netfront.c
--- a/extras/mini-os/netfront.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/netfront.c Wed Jul 02 11:30:37 2008 +0900
@@ -38,7 +38,7 @@ struct netfront_dev {
struct netfront_dev {
domid_t dom;
- unsigned short tx_freelist[NET_TX_RING_SIZE];
+ unsigned short tx_freelist[NET_TX_RING_SIZE + 1];
struct semaphore tx_sem;
struct net_buffer rx_buffers[NET_RX_RING_SIZE];
@@ -70,14 +70,14 @@ void init_rx_buffers(struct netfront_dev
static inline void add_id_to_freelist(unsigned int id,unsigned short* freelist)
{
- freelist[id] = freelist[0];
+ freelist[id + 1] = freelist[0];
freelist[0] = id;
}
static inline unsigned short get_id_from_freelist(unsigned short* freelist)
{
unsigned int id = freelist[0];
- freelist[0] = freelist[id];
+ freelist[0] = freelist[id + 1];
return id;
}
diff -r 11318234588e -r 08f77df14cba stubdom/grub.patches/99minios
--- a/stubdom/grub.patches/99minios Thu Jun 19 12:48:04 2008 +0900
+++ b/stubdom/grub.patches/99minios Wed Jul 02 11:30:37 2008 +0900
@@ -832,7 +832,18 @@ Index: grub/stage2/fsys_reiserfs.c
Index: grub/stage2/fsys_reiserfs.c
===================================================================
--- grub.orig/stage2/fsys_reiserfs.c 2008-06-16 15:18:03.410933000 +0100
-+++ grub/stage2/fsys_reiserfs.c 2008-06-16 15:18:14.786009000 +0100
++++ grub/stage2/fsys_reiserfs.c 2008-06-20 18:33:52.002100000 +0100
+@@ -224,8 +224,8 @@
+
+ struct disk_child
+ {
+- unsigned long dc_block_number; /* Disk child's block number. */
+- unsigned short dc_size; /* Disk child's used space. */
++ __u32 dc_block_number; /* Disk child's block number. */
++ __u16 dc_size; /* Disk child's used space. */
+ };
+
+ #define DC_SIZE (sizeof (struct disk_child))
@@ -369,7 +369,14 @@
static __inline__ unsigned long
log2 (unsigned long word)
diff -r 11318234588e -r 08f77df14cba stubdom/grub/Makefile
--- a/stubdom/grub/Makefile Thu Jun 19 12:48:04 2008 +0900
+++ b/stubdom/grub/Makefile Wed Jul 02 11:30:37 2008 +0900
@@ -5,7 +5,7 @@ vpath %.c ../grub-cvs
BOOT=boot-$(XEN_TARGET_ARCH).o
-DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I.
+DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I$(XEN_ROOT)/tools/include -I.
DEF_CPPFLAGS += -I../grub-cvs/stage1
DEF_CPPFLAGS += -I../grub-cvs/stage2
DEF_CPPFLAGS += -I../grub-cvs/netboot
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/Makefile
--- a/tools/blktap/drivers/Makefile Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/Makefile Wed Jul 02 11:30:37 2008 +0900
@@ -17,8 +17,16 @@ CFLAGS += -Wp,-MD,.$(@F).d
CFLAGS += -Wp,-MD,.$(@F).d
DEPS = .*.d
+ifeq ($(shell . ./check_gcrypt),"yes")
+CFLAGS += -DUSE_GCRYPT
+CRYPT_LIB := -lgcrypt
+else
+CRYPT_LIB := -lcrypto
+$(warning *** libgcrypt not installed: falling back to libcrypto ***)
+endif
+
LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap
-LDFLAGS_img := $(LIBAIO_DIR)/libaio.a -lcrypto -lpthread -lz
+LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lz
BLK-OBJS-y := block-aio.o
BLK-OBJS-y += block-sync.o
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/blktapctrl.c
--- a/tools/blktap/drivers/blktapctrl.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/blktapctrl.c Wed Jul 02 11:30:37 2008 +0900
@@ -127,7 +127,7 @@ static int get_new_dev(int *major, int *
char *devname;
tr.domid = blkif->domid;
- tr.busid = (unsigned short)blkif->be_id;
+ tr.busid = blkif->be_id;
ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr );
if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) {
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/block-qcow.c Wed Jul 02 11:30:37 2008 +0900
@@ -33,7 +33,6 @@
#include <zlib.h>
#include <inttypes.h>
#include <libaio.h>
-#include <openssl/md5.h>
#include "bswap.h"
#include "aes.h"
#include "tapdisk.h"
@@ -146,6 +145,35 @@ struct tdqcow_state {
static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
+#ifdef USE_GCRYPT
+
+#include <gcrypt.h>
+
+static uint32_t gen_cksum(char *ptr, int len)
+{
+ int i;
+ uint32_t md[4];
+
+ /* Convert L1 table to big endian */
+ for(i = 0; i < len / sizeof(uint64_t); i++) {
+ cpu_to_be64s(&((uint64_t*) ptr)[i]);
+ }
+
+ /* Generate checksum */
+ gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
+
+ /* Convert L1 table back to native endianess */
+ for(i = 0; i < len / sizeof(uint64_t); i++) {
+ be64_to_cpus(&((uint64_t*) ptr)[i]);
+ }
+
+ return md[0];
+}
+
+#else /* use libcrypto */
+
+#include <openssl/md5.h>
+
static uint32_t gen_cksum(char *ptr, int len)
{
int i;
@@ -153,9 +181,8 @@ static uint32_t gen_cksum(char *ptr, int
uint32_t ret;
md = malloc(MD5_DIGEST_LENGTH);
-
if(!md) return 0;
-
+
/* Convert L1 table to big endian */
for(i = 0; i < len / sizeof(uint64_t); i++) {
cpu_to_be64s(&((uint64_t*) ptr)[i]);
@@ -175,6 +202,8 @@ static uint32_t gen_cksum(char *ptr, int
free(md);
return ret;
}
+
+#endif
static int get_filesize(char *filename, uint64_t *size, struct stat *st)
{
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow2.c
--- a/tools/blktap/drivers/block-qcow2.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/block-qcow2.c Wed Jul 02 11:30:37 2008 +0900
@@ -254,10 +254,7 @@ static int bdrv_pread(int fd, int64_t of
*/
static int bdrv_pwrite(int fd, int64_t offset, const void *buf, int count)
{
- int ret;
-
- ret = lseek(fd, offset, SEEK_SET);
- if (ret != offset) {
+ if (lseek(fd, offset, SEEK_SET) == -1) {
DPRINTF("bdrv_pwrite failed seek (%#"PRIx64").\n", offset);
return -1;
}
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/check_gcrypt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap/drivers/check_gcrypt Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+cat > .gcrypt.c << EOF
+#include <gcrypt.h>
+int main(void) { return 0; }
+EOF
+
+if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
+ echo "yes"
+else
+ echo "no"
+fi
+
+rm -f .gcrypt*
diff -r 11318234588e -r 08f77df14cba tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/lib/blktaplib.h Wed Jul 02 11:30:37 2008 +0900
@@ -161,7 +161,7 @@ typedef struct tapdev_info {
typedef struct domid_translate {
unsigned short domid;
- unsigned short busid;
+ uint32_t busid;
} domid_translate_t ;
typedef struct image {
diff -r 11318234588e -r 08f77df14cba tools/debugger/xenitp/xenitp.c
--- a/tools/debugger/xenitp/xenitp.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/debugger/xenitp/xenitp.c Wed Jul 02 11:30:37 2008 +0900
@@ -57,6 +57,16 @@ static int cur_vcpu;
#define CFM_SOF_MASK 0x3f
int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr);
+
+/* wrapper for vcpu_gest_context_any_t */
+static int xc_ia64_vcpu_getcontext(int xc_handle,
+ uint32_t domid,
+ uint32_t vcpu,
+ vcpu_guest_context_t *ctxt)
+{
+ return xc_vcpu_getcontext(xc_handle, domid, vcpu,
+ (vcpu_guest_context_any_t *)ctxt);
+}
static inline unsigned int ctx_slot (vcpu_guest_context_t *ctx)
{
@@ -729,7 +739,7 @@ int wait_domain (int vcpu, vcpu_guest_co
fflush (stdout);
nanosleep (&ts, NULL);
}
- return xc_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
+ return xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
}
int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr)
@@ -945,13 +955,13 @@ char *parse_arg (char **buf)
return res;
}
-vcpu_guest_context_t vcpu_ctx[MAX_VIRT_CPUS];
+vcpu_guest_context_any_t vcpu_ctx_any[MAX_VIRT_CPUS];
int vcpu_setcontext (int vcpu)
{
int ret;
- ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx[vcpu]);
+ ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx_any[vcpu]);
if (ret < 0)
perror ("xc_vcpu_setcontext");
@@ -1518,7 +1528,7 @@ enum cmd_status do_command (int vcpu, ch
int flag_ambiguous;
cur_vcpu = vcpu;
- cur_ctx = &vcpu_ctx[vcpu];
+ cur_ctx = &vcpu_ctx_any[vcpu].c;
/* Handle repeat last-command. */
if (*line == 0) {
@@ -1575,7 +1585,7 @@ void xenitp (int vcpu)
int ret;
struct sigaction sa;
- cur_ctx = &vcpu_ctx[vcpu];
+ cur_ctx = &vcpu_ctx_any[vcpu].c;
xc_handle = xc_interface_open (); /* for accessing control interface */
@@ -1588,9 +1598,9 @@ void xenitp (int vcpu)
exit (-1);
}
- ret = xc_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
+ ret = xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
if (ret < 0) {
- perror ("xc_vcpu_getcontext");
+ perror ("xc_ia64_vcpu_getcontext");
exit (-1);
}
diff -r 11318234588e -r 08f77df14cba tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/examples/xend-config.sxp Wed Jul 02 11:30:37 2008 +0900
@@ -242,3 +242,6 @@
# Script to run when the label of a resource has changed.
#(resource-label-change-script '')
+
+# Rotation count of qemu-dm log file.
+#(qemu-dm-logrotate-count 10)
diff -r 11318234588e -r 08f77df14cba tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/firmware/hvmloader/hvmloader.c Wed Jul 02 11:30:37 2008 +0900
@@ -206,10 +206,12 @@ static void pci_setup(void)
pci_writew(devfn, 0x3d, 0x0001);
break;
case 0x0101:
- /* PIIX3 IDE */
- ASSERT((vendor_id == 0x8086) && (device_id == 0x7010));
- pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
- pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
+ if ( vendor_id == 0x8086 )
+ {
+ /* Intel ICHs since PIIX3: enable IDE legacy mode. */
+ pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
+ pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
+ }
break;
}
diff -r 11318234588e -r 08f77df14cba tools/firmware/rombios/rombios.c
--- a/tools/firmware/rombios/rombios.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/firmware/rombios/rombios.c Wed Jul 02 11:30:37 2008 +0900
@@ -9783,6 +9783,27 @@ smbios_init:
#endif
+#if BX_TCGBIOS
+; The section between the POST entry and the NMI entry is filling up
+; and causes crashes if this code was directly there
+tcpa_post_part1:
+ call _tcpa_acpi_init
+
+ push dword #0
+ call _tcpa_initialize_tpm
+ add sp, #4
+
+ call _tcpa_do_measure_POSTs
+ call _tcpa_wake_event /* specs: 3.2.3.7 */
+ ret
+
+tcpa_post_part2:
+ call _tcpa_calling_int19h /* specs: 8.2.3 step 1 */
+ call _tcpa_add_event_separators /* specs: 8.2.3 step 2 */
+ /* we do not call int 19h handler but keep following eventlog */
+ call _tcpa_returned_int19h /* specs: 8.2.3 step 3/7 */
+ ret
+#endif
;; for 'C' strings and other data, insert them here with
@@ -10003,14 +10024,7 @@ post_default_ints:
mov 0x0410, ax
#if BX_TCGBIOS
- call _tcpa_acpi_init
-
- push dword #0
- call _tcpa_initialize_tpm
- add sp, #4
-
- call _tcpa_do_measure_POSTs
- call _tcpa_wake_event /* specs: 3.2.3.7 */
+ call tcpa_post_part1
#endif
;; Parallel setup
@@ -10138,10 +10152,7 @@ post_default_ints:
call _interactive_bootkey
#if BX_TCGBIOS
- call _tcpa_calling_int19h /* specs: 8.2.3 step 1 */
- call _tcpa_add_event_separators /* specs: 8.2.3 step 2 */
- /* we do not call int 19h handler but keep following eventlog */
- call _tcpa_returned_int19h /* specs: 8.2.3 step 3/7 */
+ call tcpa_post_part2
#endif
;; Start the boot sequence. See the comments in int19_relocated
diff -r 11318234588e -r 08f77df14cba tools/ioemu/hw/xen_console.c
--- a/tools/ioemu/hw/xen_console.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/hw/xen_console.c Wed Jul 02 11:30:37 2008 +0900
@@ -160,16 +160,18 @@ int xs_gather(struct xs_handle *xs, cons
static int domain_create_ring(struct domain *dom)
{
- int err, remote_port, ring_ref, rc;
+ int err, remote_port, ring_ref, limit, rc;
err = xs_gather(dom->xsh, dom->serialpath,
"ring-ref", "%u", &ring_ref,
"port", "%i", &remote_port,
+ "limit", "%i", &limit,
NULL);
if (err) {
err = xs_gather(dom->xsh, dom->conspath,
"ring-ref", "%u", &ring_ref,
"port", "%i", &remote_port,
+ "limit", "%i", &limit,
NULL);
if (err) {
fprintf(stderr, "Console: failed to find ring-ref/port
yet\n");
@@ -178,7 +180,9 @@ static int domain_create_ring(struct dom
dom->use_consolepath = 1;
} else
dom->use_consolepath = 0;
- fprintf(stderr, "Console: got ring-ref %d port %d\n", ring_ref,
remote_port);
+ dom->buffer.max_capacity = limit;
+ fprintf(stderr, "Console: got ring-ref %d port %d limit %d\n",
+ ring_ref, remote_port, limit);
if ((ring_ref == dom->ring_ref) && (remote_port == dom->remote_port))
goto out;
diff -r 11318234588e -r 08f77df14cba tools/ioemu/target-i386-dm/exec-dm.c
--- a/tools/ioemu/target-i386-dm/exec-dm.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/target-i386-dm/exec-dm.c Wed Jul 02 11:30:37 2008 +0900
@@ -483,9 +483,11 @@ static void memcpy_words(void *dst, void
}
#endif
-void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
- int len, int is_write)
-{
+void cpu_physical_memory_rw(target_phys_addr_t _addr, uint8_t *buf,
+ int _len, int is_write)
+{
+ target_phys_addr_t addr = _addr;
+ int len = _len;
int l, io_index;
uint8_t *ptr;
uint32_t val;
@@ -520,6 +522,7 @@ void cpu_physical_memory_rw(target_phys_
} else if ((ptr = phys_ram_addr(addr)) != NULL) {
/* Writing to RAM */
memcpy_words(ptr, buf, l);
+#ifndef CONFIG_STUBDOM
if (logdirty_bitmap != NULL) {
/* Record that we have dirtied this frame */
unsigned long pfn = addr >> TARGET_PAGE_BITS;
@@ -531,6 +534,7 @@ void cpu_physical_memory_rw(target_phys_
|= 1UL << pfn % HOST_LONG_BITS;
}
}
+#endif
#ifdef __ia64__
sync_icache(ptr, l);
#endif
@@ -566,6 +570,13 @@ void cpu_physical_memory_rw(target_phys_
addr += l;
}
+#ifdef CONFIG_STUBDOM
+ if (logdirty_bitmap != NULL)
+ xc_hvm_modified_memory(xc_handle, domid, _addr >> TARGET_PAGE_BITS,
+ (_addr + _len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS
+ - _addr >> TARGET_PAGE_BITS);
+#endif
+
mapcache_unlock();
}
#endif
diff -r 11318234588e -r 08f77df14cba tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/xenstore.c Wed Jul 02 11:30:37 2008 +0900
@@ -260,8 +260,6 @@ void xenstore_parse_domain_config(int hv
/* autoguess qcow vs qcow2 */
} else if (!strcmp(drv,"file") || !strcmp(drv,"phy")) {
format = &bdrv_raw;
- } else if (!strcmp(drv,"phy")) {
- format = &bdrv_raw;
} else {
format = bdrv_find_format(drv);
if (!format) {
@@ -404,6 +402,10 @@ void xenstore_process_logdirty_event(voi
/* No key yet: wait for the next watch */
return;
+#ifdef CONFIG_STUBDOM
+ /* We pass the writes to hypervisor */
+ seg = (void*)1;
+#else
strncpy(key_terminated, key_ascii, 16);
free(key_ascii);
key = (key_t) strtoull(key_terminated, NULL, 16);
@@ -419,11 +421,6 @@ void xenstore_process_logdirty_event(voi
fprintf(logfile, "%s: key=%16.16llx size=%lu\n", __FUNCTION__,
(unsigned long long)key, logdirty_bitmap_size);
-#ifdef CONFIG_STUBDOM
- /* XXX we just can't use shm. */
- fprintf(logfile, "Log dirty is not implemented in stub domains!\n");
- return;
-#else
shmid = shmget(key, 2 * logdirty_bitmap_size, S_IRUSR|S_IWUSR);
if (shmid == -1) {
fprintf(logfile, "Log-dirty: shmget failed: segment %16.16llx "
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_hvm_build.c
--- a/tools/libxc/ia64/xc_ia64_hvm_build.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_hvm_build.c Wed Jul 02 11:30:37 2008 +0900
@@ -1052,7 +1052,8 @@ int
int
xc_hvm_build(int xc_handle, uint32_t domid, int memsize, const char *image_name)
{
- vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
+ vcpu_guest_context_any_t st_ctxt_any;
+ vcpu_guest_context_t *ctxt = &st_ctxt_any.c;
char *image = NULL;
unsigned long image_size;
unsigned long nr_pages;
@@ -1079,14 +1080,14 @@ xc_hvm_build(int xc_handle, uint32_t dom
free(image);
- memset(ctxt, 0, sizeof(*ctxt));
+ memset(&st_ctxt_any, 0, sizeof(st_ctxt_any));
ctxt->regs.ip = 0x80000000ffffffb0UL;
ctxt->regs.ar.fpsr = xc_ia64_fpsr_default();
ctxt->regs.cr.itir = 14 << 2;
ctxt->regs.psr = IA64_PSR_AC | IA64_PSR_BN;
ctxt->regs.cr.dcr = 0;
ctxt->regs.cr.pta = 15 << 2;
- return xc_vcpu_setcontext(xc_handle, domid, 0, ctxt);
+ return xc_vcpu_setcontext(xc_handle, domid, 0, &st_ctxt_any);
error_out:
free(image);
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_restore.c
--- a/tools/libxc/ia64/xc_ia64_linux_restore.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_restore.c Wed Jul 02 11:30:37 2008 +0900
@@ -117,8 +117,9 @@ xc_ia64_recv_unallocated_list(int xc_han
static int
xc_ia64_recv_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
- uint32_t vcpu, vcpu_guest_context_t *ctxt)
-{
+ uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
+{
+ vcpu_guest_context_t *ctxt = &ctxt_any->c;
if (read_exact(io_fd, ctxt, sizeof(*ctxt))) {
ERROR("Error when reading ctxt");
return -1;
@@ -128,14 +129,14 @@ xc_ia64_recv_vcpu_context(int xc_handle,
/* Initialize and set registers. */
ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online;
- if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt) != 0) {
+ if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt_any) != 0) {
ERROR("Couldn't set vcpu context");
return -1;
}
/* Just a check. */
ctxt->flags = 0;
- if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
+ if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
ERROR("Could not get vcpu context");
return -1;
}
@@ -226,19 +227,20 @@ xc_ia64_pv_recv_vcpu_context(int xc_hand
int rc = -1;
/* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
-
- if (lock_pages(&ctxt, sizeof(ctxt))) {
+ vcpu_guest_context_any_t ctxt_any;
+ vcpu_guest_context_t *ctxt = &ctxt_any.c;
+
+ if (lock_pages(&ctxt_any, sizeof(ctxt_any))) {
/* needed for build domctl, but might as well do early */
ERROR("Unable to lock_pages ctxt");
return -1;
}
- if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt))
+ if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt_any))
goto out;
/* Then get privreg page. */
- if (read_page(xc_handle, io_fd, dom, ctxt.privregs_pfn) < 0) {
+ if (read_page(xc_handle, io_fd, dom, ctxt->privregs_pfn) < 0) {
ERROR("Could not read vcpu privregs");
goto out;
}
@@ -441,12 +443,12 @@ xc_ia64_hvm_recv_context(int xc_handle,
/* vcpu context */
for (i = 0; i <= info.max_vcpu_id; i++) {
/* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
+ vcpu_guest_context_any_t ctxt_any;
if (!__test_bit(i, vcpumap))
continue;
- if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+ if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
goto out;
/* system context of vcpu is recieved as hvm context. */
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c Wed Jul 02 11:30:37 2008 +0900
@@ -180,9 +180,10 @@ xc_ia64_send_unallocated_list(int xc_han
static int
xc_ia64_send_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
- uint32_t vcpu, vcpu_guest_context_t *ctxt)
-{
- if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
+ uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
+{
+ vcpu_guest_context_t *ctxt = &ctxt_any->c;
+ if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
ERROR("Could not get vcpu context");
return -1;
}
@@ -269,17 +270,19 @@ xc_ia64_pv_send_context(int xc_handle, i
/* vcpu context */
for (i = 0; i <= info->max_vcpu_id; i++) {
/* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
+ vcpu_guest_context_any_t ctxt_any;
+ vcpu_guest_context_t *ctxt = &ctxt_any.c;
+
char *mem;
if (!__test_bit(i, vcpumap))
continue;
- if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+ if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
goto out;
mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
- PROT_READ|PROT_WRITE, ctxt.privregs_pfn);
+ PROT_READ|PROT_WRITE, ctxt->privregs_pfn);
if (mem == NULL) {
ERROR("cannot map privreg page");
goto out;
@@ -337,12 +340,12 @@ xc_ia64_hvm_send_context(int xc_handle,
/* vcpu context */
for (i = 0; i <= info->max_vcpu_id; i++) {
/* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
+ vcpu_guest_context_any_t ctxt_any;
if (!__test_bit(i, vcpumap))
continue;
- if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+ if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
goto out;
/* system context of vcpu is sent as hvm context. */
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core.c
--- a/tools/libxc/xc_core.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core.c Wed Jul 02 11:30:37 2008 +0900
@@ -407,7 +407,7 @@ xc_domain_dumpcore_via_callback(int xc_h
int nr_vcpus = 0;
char *dump_mem, *dump_mem_start = NULL;
- vcpu_guest_context_t ctxt[MAX_VIRT_CPUS];
+ vcpu_guest_context_any_t ctxt[MAX_VIRT_CPUS];
struct xc_core_arch_context arch_ctxt;
char dummy[PAGE_SIZE];
int dummy_len;
@@ -581,10 +581,10 @@ xc_domain_dumpcore_via_callback(int xc_h
PERROR("Could not get section header for .xen_prstatus");
goto out;
}
- filesz = sizeof(ctxt[0]) * nr_vcpus;
+ filesz = sizeof(ctxt[0].c) * nr_vcpus;
sts = xc_core_shdr_set(shdr, strtab, XEN_DUMPCORE_SEC_PRSTATUS,
SHT_PROGBITS, offset, filesz,
- __alignof__(ctxt[0]), sizeof(ctxt[0]));
+ __alignof__(ctxt[0].c), sizeof(ctxt[0].c));
if ( sts != 0 )
goto out;
offset += filesz;
@@ -707,7 +707,7 @@ xc_domain_dumpcore_via_callback(int xc_h
goto out;
/* prstatus: .xen_prstatus */
- sts = dump_rtn(args, (char *)&ctxt, sizeof(ctxt[0]) * nr_vcpus);
+ sts = dump_rtn(args, (char *)&ctxt[0].c, sizeof(ctxt[0].c) * nr_vcpus);
if ( sts != 0 )
goto out;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.c
--- a/tools/libxc/xc_core_ia64.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core_ia64.c Wed Jul 02 11:30:37 2008 +0900
@@ -308,9 +308,10 @@ xc_core_arch_context_free(struct xc_core
int
xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
- vcpu_guest_context_t* ctxt,
+ vcpu_guest_context_any_t* ctxt_any,
int xc_handle, uint32_t domid)
{
+ vcpu_guest_context_t *ctxt = &ctxt_any->c;
mapped_regs_t* mapped_regs;
if ( ctxt->privregs_pfn == VGC_PRIVREGS_HVM )
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.h
--- a/tools/libxc/xc_core_ia64.h Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core_ia64.h Wed Jul 02 11:30:37 2008 +0900
@@ -40,7 +40,7 @@ xc_core_arch_context_free(struct xc_core
xc_core_arch_context_free(struct xc_core_arch_context* arch_ctxt);
int
xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
- vcpu_guest_context_t* ctxt,
+ vcpu_guest_context_any_t* ctxt,
int xc_handle, uint32_t domid);
int
xc_core_arch_context_get_shdr(struct xc_core_arch_context* arch_ctxt,
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain.c Wed Jul 02 11:30:37 2008 +0900
@@ -298,30 +298,21 @@ int xc_vcpu_getcontext(int xc_handle,
int xc_vcpu_getcontext(int xc_handle,
uint32_t domid,
uint32_t vcpu,
- vcpu_guest_context_t *ctxt)
-{
- int rc;
- DECLARE_DOMCTL;
- size_t sz = sizeof(vcpu_guest_context_either_t);
+ vcpu_guest_context_any_t *ctxt)
+{
+ int rc;
+ DECLARE_DOMCTL;
+ size_t sz = sizeof(vcpu_guest_context_any_t);
domctl.cmd = XEN_DOMCTL_getvcpucontext;
domctl.domain = (domid_t)domid;
domctl.u.vcpucontext.vcpu = (uint16_t)vcpu;
- set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
-
- /*
- * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
- * larger of the two if possible, otherwise fall back to native size.
- */
+ set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
+
+
if ( (rc = lock_pages(ctxt, sz)) != 0 )
- {
- sz = sizeof(*ctxt);
- if ( (rc = lock_pages(ctxt, sz)) != 0 )
- return rc;
- }
-
+ return rc;
rc = do_domctl(xc_handle, &domctl);
-
unlock_pages(ctxt, sz);
return rc;
@@ -626,32 +617,28 @@ int xc_vcpu_setcontext(int xc_handle,
int xc_vcpu_setcontext(int xc_handle,
uint32_t domid,
uint32_t vcpu,
- vcpu_guest_context_t *ctxt)
-{
- DECLARE_DOMCTL;
- int rc;
- size_t sz = sizeof(vcpu_guest_context_either_t);
+ vcpu_guest_context_any_t *ctxt)
+{
+ DECLARE_DOMCTL;
+ int rc;
+ size_t sz = sizeof(vcpu_guest_context_any_t);
+
+ if (ctxt == NULL)
+ {
+ errno = EINVAL;
+ return -1;
+ }
domctl.cmd = XEN_DOMCTL_setvcpucontext;
domctl.domain = domid;
domctl.u.vcpucontext.vcpu = vcpu;
- set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
-
- /*
- * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
- * larger of the two if possible, otherwise fall back to native size.
- */
- if ( (ctxt != NULL) && (rc = lock_pages(ctxt, sz)) != 0 )
- {
- sz = sizeof(*ctxt);
- if ( (rc = lock_pages(ctxt, sz)) != 0 )
- return rc;
- }
-
+ set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
+
+ if ( (rc = lock_pages(ctxt, sz)) != 0 )
+ return rc;
rc = do_domctl(xc_handle, &domctl);
-
- if ( ctxt != NULL )
- unlock_pages(ctxt, sz);
+
+ unlock_pages(ctxt, sz);
return rc;
}
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain_restore.c Wed Jul 02 11:30:37 2008 +0900
@@ -153,7 +153,7 @@ static xen_pfn_t *load_p2m_frame_list(
int io_fd, int *pae_extended_cr3, int *ext_vcpucontext)
{
xen_pfn_t *p2m_frame_list;
- vcpu_guest_context_either_t ctxt;
+ vcpu_guest_context_any_t ctxt;
xen_pfn_t p2m_fl_zero;
/* Read first entry of P2M list, or extended-info signature (~0UL). */
@@ -284,12 +284,12 @@ int xc_domain_restore(int xc_handle, int
/* The new domain's shared-info frame number. */
unsigned long shared_info_frame;
unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
- shared_info_either_t *old_shared_info =
- (shared_info_either_t *)shared_info_page;
- shared_info_either_t *new_shared_info;
+ shared_info_any_t *old_shared_info =
+ (shared_info_any_t *)shared_info_page;
+ shared_info_any_t *new_shared_info;
/* A copy of the CPU context of the guest. */
- vcpu_guest_context_either_t ctxt;
+ vcpu_guest_context_any_t ctxt;
/* A table containing the type of each PFN (/not/ MFN!). */
unsigned long *pfn_type = NULL;
@@ -304,7 +304,7 @@ int xc_domain_restore(int xc_handle, int
xen_pfn_t *p2m_frame_list = NULL;
/* A temporary mapping of the guest's start_info page. */
- start_info_either_t *start_info;
+ start_info_any_t *start_info;
/* Our mapping of the current region (batch) */
char *region_base;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain_save.c Wed Jul 02 11:30:37 2008 +0900
@@ -412,7 +412,7 @@ static int suspend_and_state(int (*suspe
** it to update the MFN to a reasonable value.
*/
static void *map_frame_list_list(int xc_handle, uint32_t dom,
- shared_info_either_t *shinfo)
+ shared_info_any_t *shinfo)
{
int count = 100;
void *p;
@@ -628,9 +628,9 @@ static xen_pfn_t *map_and_save_p2m_table
int io_fd,
uint32_t dom,
unsigned long p2m_size,
- shared_info_either_t *live_shinfo)
-{
- vcpu_guest_context_either_t ctxt;
+ shared_info_any_t *live_shinfo)
+{
+ vcpu_guest_context_any_t ctxt;
/* Double and single indirect references to the live P2M table */
void *live_p2m_frame_list_list = NULL;
@@ -735,7 +735,7 @@ static xen_pfn_t *map_and_save_p2m_table
p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
}
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
+ if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
{
ERROR("Could not get vcpu context");
goto out;
@@ -814,7 +814,7 @@ int xc_domain_save(int xc_handle, int io
unsigned long shared_info_frame;
/* A copy of the CPU context of the guest. */
- vcpu_guest_context_either_t ctxt;
+ vcpu_guest_context_any_t ctxt;
/* A table containing the type of each PFN (/not/ MFN!). */
unsigned long *pfn_type = NULL;
@@ -824,7 +824,7 @@ int xc_domain_save(int xc_handle, int io
char page[PAGE_SIZE];
/* Live mapping of shared info structure */
- shared_info_either_t *live_shinfo = NULL;
+ shared_info_any_t *live_shinfo = NULL;
/* base of the region in which domain memory is mapped */
unsigned char *region_base = NULL;
@@ -1536,7 +1536,7 @@ int xc_domain_save(int xc_handle, int io
}
}
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
+ if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
{
ERROR("Could not get vcpu context");
goto out;
@@ -1556,7 +1556,7 @@ int xc_domain_save(int xc_handle, int io
if ( !(vcpumap & (1ULL << i)) )
continue;
- if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) )
+ if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
{
ERROR("No context for VCPU%d", i);
goto out;
@@ -1624,7 +1624,7 @@ int xc_domain_save(int xc_handle, int io
* Reset the MFN to be a known-invalid value. See map_frame_list_list().
*/
memcpy(page, live_shinfo, PAGE_SIZE);
- SET_FIELD(((shared_info_either_t *)page),
+ SET_FIELD(((shared_info_any_t *)page),
arch.pfn_to_mfn_frame_list_list, 0);
if ( write_exact(io_fd, page, PAGE_SIZE) )
{
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_misc.c Wed Jul 02 11:30:37 2008 +0900
@@ -253,6 +253,34 @@ int xc_hvm_track_dirty_vram(
arg.first_pfn = first_pfn;
arg.nr = nr;
set_xen_guest_handle(arg.dirty_bitmap, (uint8_t *)dirty_bitmap);
+
+ if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
+ {
+ PERROR("Could not lock memory");
+ return rc;
+ }
+
+ rc = do_xen_hypercall(xc_handle, &hypercall);
+
+ unlock_pages(&arg, sizeof(arg));
+
+ return rc;
+}
+
+int xc_hvm_modified_memory(
+ int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr)
+{
+ DECLARE_HYPERCALL;
+ struct xen_hvm_modified_memory arg;
+ int rc;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_modified_memory;
+ hypercall.arg[1] = (unsigned long)&arg;
+
+ arg.domid = dom;
+ arg.first_pfn = first_pfn;
+ arg.nr = nr;
if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
{
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_pagetab.c
--- a/tools/libxc/xc_pagetab.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_pagetab.c Wed Jul 02 11:30:37 2008 +0900
@@ -48,7 +48,7 @@ unsigned long xc_translate_foreign_addre
unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom,
int vcpu, unsigned long long virt )
{
- vcpu_guest_context_t ctx;
+ vcpu_guest_context_any_t ctx;
unsigned long long cr3;
void *pd, *pt, *pdppage = NULL, *pdp, *pml = NULL;
unsigned long long pde, pte, pdpe, pmle;
@@ -78,7 +78,7 @@ unsigned long xc_translate_foreign_addre
DPRINTF("failed to retreive vcpu context\n");
goto out;
}
- cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.ctrlreg[3])) << PAGE_SHIFT;
+ cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.c.ctrlreg[3])) << PAGE_SHIFT;
/* Page Map Level 4 */
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_private.h
--- a/tools/libxc/xc_private.h Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_private.h Wed Jul 02 11:30:37 2008 +0900
@@ -188,9 +188,9 @@ int xc_map_foreign_ranges(int xc_handle,
privcmd_mmap_entry_t *entries, int nr);
void *map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
- vcpu_guest_context_t *ctxt);
+ vcpu_guest_context_any_t *ctxt);
int xc_waitdomain_core(int xc_handle, int domain, int *status,
- int options, vcpu_guest_context_t *ctxt);
+ int options, vcpu_guest_context_any_t *ctxt);
void bitmap_64_to_byte(uint8_t *bp, const uint64_t *lp, int nbits);
void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits);
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace.c
--- a/tools/libxc/xc_ptrace.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_ptrace.c Wed Jul 02 11:30:37 2008 +0900
@@ -40,9 +40,9 @@ static int current_isfile;
static int current_isfile;
static int current_is_hvm;
-static uint64_t online_cpumap;
-static uint64_t regs_valid;
-static vcpu_guest_context_t ctxt[MAX_VIRT_CPUS];
+static uint64_t online_cpumap;
+static uint64_t regs_valid;
+static vcpu_guest_context_any_t ctxt[MAX_VIRT_CPUS];
extern int ffsll(long long int);
#define FOREACH_CPU(cpumap, i) for ( cpumap = online_cpumap; (i = ffsll(cpumap)); cpumap &= ~(1 << (index - 1)) )
@@ -96,9 +96,9 @@ xc_register_event_handler(thr_ev_handler
}
static inline int
-paging_enabled(vcpu_guest_context_t *v)
-{
- unsigned long cr0 = v->ctrlreg[0];
+paging_enabled(vcpu_guest_context_any_t *v)
+{
+ unsigned long cr0 = v->c.ctrlreg[0];
return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
}
@@ -174,7 +174,7 @@ map_domain_va_32(
l2 = xc_map_foreign_range(
xc_handle, current_domid, PAGE_SIZE, PROT_READ,
- xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+ xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
if ( l2 == NULL )
return NULL;
@@ -216,7 +216,7 @@ map_domain_va_pae(
l3 = xc_map_foreign_range(
xc_handle, current_domid, PAGE_SIZE, PROT_READ,
- xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+ xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
if ( l3 == NULL )
return NULL;
@@ -264,12 +264,12 @@ map_domain_va_64(
uint64_t *l4, *l3, *l2, *l1;
static void *v[MAX_VIRT_CPUS];
- if ((ctxt[cpu].ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
+ if ((ctxt[cpu].c.ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
return map_domain_va_32(xc_handle, cpu, guest_va, perm);
l4 = xc_map_foreign_range(
xc_handle, current_domid, PAGE_SIZE, PROT_READ,
- xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+ xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
if ( l4 == NULL )
return NULL;
@@ -494,26 +494,26 @@ xc_ptrace(
case PTRACE_GETREGS:
if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
goto out_error;
- SET_PT_REGS(pt, ctxt[cpu].user_regs);
+ SET_PT_REGS(pt, ctxt[cpu].c.user_regs);
memcpy(data, &pt, sizeof(struct gdb_regs));
break;
case PTRACE_GETFPREGS:
if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
goto out_error;
- memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof (elf_fpregset_t));
+ memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof (elf_fpregset_t));
break;
case PTRACE_GETFPXREGS:
if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
goto out_error;
- memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt));
+ memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof(ctxt[cpu].c.fpu_ctxt));
break;
case PTRACE_SETREGS:
if (current_isfile)
goto out_unsupported; /* XXX not yet supported */
- SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].user_regs);
+ SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].c.user_regs);
if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
&ctxt[cpu])))
goto out_error_domctl;
@@ -525,7 +525,7 @@ xc_ptrace(
/* XXX we can still have problems if the user switches threads
* during single-stepping - but that just seems retarded
*/
- ctxt[cpu].user_regs.eflags |= PSL_T;
+ ctxt[cpu].c.user_regs.eflags |= PSL_T;
if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
&ctxt[cpu])))
goto out_error_domctl;
@@ -542,9 +542,9 @@ xc_ptrace(
if (fetch_regs(xc_handle, cpu, NULL))
goto out_error;
/* Clear trace flag */
- if ( ctxt[cpu].user_regs.eflags & PSL_T )
+ if ( ctxt[cpu].c.user_regs.eflags & PSL_T )
{
- ctxt[cpu].user_regs.eflags &= ~PSL_T;
+ ctxt[cpu].c.user_regs.eflags &= ~PSL_T;
if ((retval = xc_vcpu_setcontext(xc_handle, current_domid,
cpu, &ctxt[cpu])))
goto out_error_domctl;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace_core.c
--- a/tools/libxc/xc_ptrace_core.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_ptrace_core.c Wed Jul 02 11:30:37 2008 +0900
@@ -641,24 +641,24 @@ static const struct xc_core_format_type*
void *
map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
- vcpu_guest_context_t *ctxt)
+ vcpu_guest_context_any_t *ctxt)
{
if (current_format_type == NULL)
return NULL;
return (current_format_type->map_domain_va_core)(domfd, cpu, guest_va,
- ctxt);
+ &ctxt->c);
}
int
xc_waitdomain_core(int xc_handle, int domfd, int *status, int options,
- vcpu_guest_context_t *ctxt)
+ vcpu_guest_context_any_t *ctxt)
{
int ret;
int i;
for (i = 0; i < NR_FORMAT_TYPE; i++) {
ret = (format_type[i].waitdomain_core)(xc_handle, domfd, status,
- options, ctxt);
+ options, &ctxt->c);
if (ret == 0) {
current_format_type = &format_type[i];
break;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_resume.c Wed Jul 02 11:30:37 2008 +0900
@@ -13,7 +13,7 @@
static int modify_returncode(int xc_handle, uint32_t domid)
{
- vcpu_guest_context_either_t ctxt;
+ vcpu_guest_context_any_t ctxt;
xc_dominfo_t info;
xen_capabilities_info_t caps;
int rc;
@@ -39,7 +39,7 @@ static int modify_returncode(int xc_hand
return -1;
}
- if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
+ if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt)) != 0 )
return rc;
if ( !info.hvm )
@@ -49,7 +49,7 @@ static int modify_returncode(int xc_hand
else
ctxt.x32.user_regs.eax = 1;
- if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
+ if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt)) != 0 )
return rc;
return 0;
@@ -89,7 +89,7 @@ static int xc_domain_resume_any(int xc_h
int i, rc = -1;
#if defined(__i386__) || defined(__x86_64__)
unsigned long mfn, p2m_size = 0;
- vcpu_guest_context_t ctxt;
+ vcpu_guest_context_any_t ctxt;
start_info_t *start_info;
shared_info_t *shinfo = NULL;
xen_pfn_t *p2m_frame_list_list = NULL;
@@ -167,7 +167,7 @@ static int xc_domain_resume_any(int xc_h
goto out;
}
- mfn = ctxt.user_regs.edx;
+ mfn = ctxt.c.user_regs.edx;
start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
PROT_READ | PROT_WRITE, mfn);
diff -r 11318234588e -r 08f77df14cba tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xenctrl.h Wed Jul 02 11:30:37 2008 +0900
@@ -30,6 +30,11 @@
#include <xen/xsm/acm.h>
#include <xen/xsm/acm_ops.h>
#include <xen/xsm/flask_op.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <xen/foreign/x86_32.h>
+#include <xen/foreign/x86_64.h>
+#endif
#ifdef __ia64__
#define XC_PAGE_SHIFT 14
@@ -162,6 +167,35 @@ typedef struct xc_dominfo {
} xc_dominfo_t;
typedef xen_domctl_getdomaininfo_t xc_domaininfo_t;
+
+typedef union
+{
+#if defined(__i386__) || defined(__x86_64__)
+ vcpu_guest_context_x86_64_t x64;
+ vcpu_guest_context_x86_32_t x32;
+#endif
+ vcpu_guest_context_t c;
+} vcpu_guest_context_any_t;
+
+typedef union
+{
+#if defined(__i386__) || defined(__x86_64__)
+ shared_info_x86_64_t x64;
+ shared_info_x86_32_t x32;
+#endif
+ shared_info_t s;
+} shared_info_any_t;
+
+typedef union
+{
+#if defined(__i386__) || defined(__x86_64__)
+ start_info_x86_64_t x64;
+ start_info_x86_32_t x32;
+#endif
+ start_info_t s;
+} start_info_any_t;
+
+
int xc_domain_create(int xc_handle,
uint32_t ssidref,
xen_domain_handle_t handle,
@@ -307,7 +341,7 @@ int xc_vcpu_setcontext(int xc_handle,
int xc_vcpu_setcontext(int xc_handle,
uint32_t domid,
uint32_t vcpu,
- vcpu_guest_context_t *ctxt);
+ vcpu_guest_context_any_t *ctxt);
/**
* This function will return information about one or more domains, using a
* single hypercall. The domain information will be stored into the supplied
@@ -368,7 +402,7 @@ int xc_vcpu_getcontext(int xc_handle,
int xc_vcpu_getcontext(int xc_handle,
uint32_t domid,
uint32_t vcpu,
- vcpu_guest_context_t *ctxt);
+ vcpu_guest_context_any_t *ctxt);
typedef xen_domctl_getvcpuinfo_t xc_vcpuinfo_t;
int xc_vcpu_getinfo(int xc_handle,
@@ -894,6 +928,12 @@ int xc_hvm_track_dirty_vram(
int xc_handle, domid_t dom,
uint64_t first_pfn, uint64_t nr,
unsigned long *bitmap);
+
+/*
+ * Notify that some pages got modified by the Device Model
+ */
+int xc_hvm_modified_memory(
+ int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr);
typedef enum {
XC_ERROR_NONE = 0,
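
For reference, a rough caller-side sketch (not part of the changeset; function name illustrative, x86 only): the reworked interface takes the vcpu_guest_context_any_t union, and the native layout is reached through its .c member, as the converted call sites (xc_pagetab.c, xc_ptrace.c, xc_resume.c) in this patch do:

    /* Sketch only: fetch VCPU0's context through the new union type and
     * read a register via the native-layout view, mirroring xc_pagetab.c
     * above. */
    #include <stdio.h>
    #include <stdint.h>
    #include <xenctrl.h>

    static int print_vcpu0_cr3(int xc_handle, uint32_t domid)
    {
        vcpu_guest_context_any_t ctxt;

        if (xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt) != 0)
            return -1;                        /* hypercall failed */
        printf("dom%u vcpu0 cr3 = %#lx\n", domid,
               (unsigned long)ctxt.c.ctrlreg[3]);
        return 0;
    }
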
diff -r 11318234588e -r 08f77df14cba tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xg_save_restore.h Wed Jul 02 11:30:37 2008 +0900
@@ -112,28 +112,6 @@ static inline int get_platform_info(int
#define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL))
-/* 32-on-64 support: saving 32bit guests from 64bit tools and vice versa */
-typedef union
-{
- vcpu_guest_context_x86_64_t x64;
- vcpu_guest_context_x86_32_t x32;
- vcpu_guest_context_t c;
-} vcpu_guest_context_either_t;
-
-typedef union
-{
- shared_info_x86_64_t x64;
- shared_info_x86_32_t x32;
- shared_info_t s;
-} shared_info_either_t;
-
-typedef union
-{
- start_info_x86_64_t x64;
- start_info_x86_32_t x32;
- start_info_t s;
-} start_info_either_t;
-
#define GET_FIELD(_p, _f) ((guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f))
#define SET_FIELD(_p, _f, _v) do { \
diff -r 11318234588e -r 08f77df14cba tools/python/xen/util/blkif.py
--- a/tools/python/xen/util/blkif.py Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/util/blkif.py Wed Jul 02 11:30:37 2008 +0900
@@ -16,8 +16,11 @@ def blkdev_name_to_number(name):
n = expand_dev_name(name)
+ devname = 'virtual-device'
+ devnum = None
+
try:
- return os.stat(n).st_rdev
+ return (devname, os.stat(n).st_rdev)
except Exception, ex:
pass
@@ -25,28 +28,30 @@ def blkdev_name_to_number(name):
if re.match( '/dev/sd[a-z]([1-9]|1[0-5])?$', n):
major = scsi_major[(ord(n[7:8]) - ord('a')) / 16]
minor = ((ord(n[7:8]) - ord('a')) % 16) * 16 + int(n[8:] or 0)
- return major * 256 + minor
- if re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
+ devnum = major * 256 + minor
+ elif re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
major = scsi_major[((ord(n[7:8]) - ord('a') + 1) * 26 + (ord(n[8:9]) - ord('a'))) / 16 ]
minor = (((ord(n[7:8]) - ord('a') + 1 ) * 26 + (ord(n[8:9]) - ord('a'))) % 16) * 16 + int(n[9:] or 0)
- return major * 256 + minor
-
- if re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
+ devnum = major * 256 + minor
+ elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
ide_majors = [ 3, 22, 33, 34, 56, 57, 88, 89, 90, 91 ]
major = ide_majors[(ord(n[7:8]) - ord('a')) / 2]
minor = ((ord(n[7:8]) - ord('a')) % 2) * 64 + int(n[8:] or 0)
- return major * 256 + minor
+ devnum = major * 256 + minor
+ elif re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?$', n):
+ devnum = (202 << 8) + ((ord(n[8:9]) - ord('a')) << 4) + int(n[9:] or 0)
+ elif re.match('/dev/xvd[q-z]([1-9]|1[0-5])?$', n):
+ devname = 'virtual-device-ext'
+ devnum = (1 << 28) + ((ord(n[8:9]) - ord('a')) << 8) + int(n[9:] or 0)
+ elif re.match('/dev/xvd[a-i][a-z]([1-9]|1[0-5])?$', n):
+ devname = 'virtual-device-ext'
+ devnum = (1 << 28) + (((ord(n[8:9]) - ord('a') + 1) * 26 + (ord(n[9:10]) - ord('a'))) << 8) + int(n[10:] or 0)
+ elif re.match( '^(0x)[0-9a-fA-F]+$', name ):
+ devnum = string.atoi(name, 16)
+ elif re.match('^[0-9]+$', name):
+ devnum = string.atoi(name, 10)
- if re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?', n):
- return 202 * 256 + 16 * (ord(n[8:9]) - ord('a')) + int(n[9:] or 0)
-
- if re.match( '^(0x)[0-9a-fA-F]+$', name ):
- return string.atoi(name,16)
-
- if re.match('^[0-9]+$', name):
- return string.atoi(name, 10)
-
- return None
+ return (devname, devnum)
def blkdev_segment(name):
"""Take the given block-device name (e.g. '/dev/sda1', 'hda')
@@ -58,7 +63,7 @@ def blkdev_segment(name):
type: 'Disk' or identifying name for partition type
"""
val = None
- n = blkdev_name_to_number(name)
+ (name, n) = blkdev_name_to_number(name)
if not n is None:
val = { 'device' : n,
'start_sector' : long(0),
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py Wed Jul 02 11:30:37 2008 +0900
@@ -1123,7 +1123,7 @@ class XendConfig(dict):
try:
devid = int(dev2)
except ValueError:
- devid = blkdev_name_to_number(dev2)
+ (xenbus, devid) = blkdev_name_to_number(dev2)
if devid == None:
log.debug("The device %s is not device name", dev2)
return None
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendOptions.py
--- a/tools/python/xen/xend/XendOptions.py Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/XendOptions.py Wed Jul 02 11:30:37 2008 +0900
@@ -132,6 +132,9 @@ class XendOptions:
"""Default script to configure a backend network interface"""
vif_script = osdep.vif_script
+ """Default rotation count of qemu-dm log file."""
+ qemu_dm_logrotate_count = 10
+
def __init__(self):
self.configure()
@@ -350,6 +353,10 @@ class XendOptions:
def get_vnc_x509_verify(self):
return self.get_config_string('vnc-x509-verify',
self.xend_vnc_x509_verify)
+
+ def get_qemu_dm_logrotate_count(self):
+ return self.get_config_int("qemu-dm-logrotate-count",
+ self.qemu_dm_logrotate_count)
class XendOptionsFile(XendOptions):
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/image.py Wed Jul 02 11:30:37 2008 +0900
@@ -378,13 +378,23 @@ class ImageHandler:
# keep track of pid and spawned options to kill it later
self.logfile = "/var/log/xen/qemu-dm-%s.log" %
str(self.vm.info['name_label'])
- if os.path.exists(self.logfile):
- if os.path.exists(self.logfile + ".1"):
- os.unlink(self.logfile + ".1")
- os.rename(self.logfile, self.logfile + ".1")
+
+ # rotate log
+ logfile_mode = os.O_WRONLY|os.O_CREAT|os.O_APPEND
+ logrotate_count = XendOptions.instance().get_qemu_dm_logrotate_count()
+ if logrotate_count > 0:
+ logfile_mode |= os.O_TRUNC
+ if os.path.exists("%s.%d" % (self.logfile, logrotate_count)):
+ os.unlink("%s.%d" % (self.logfile, logrotate_count))
+ for n in range(logrotate_count - 1, 0, -1):
+ if os.path.exists("%s.%d" % (self.logfile, n)):
+ os.rename("%s.%d" % (self.logfile, n),
+ "%s.%d" % (self.logfile, (n + 1)))
+ if os.path.exists(self.logfile):
+ os.rename(self.logfile, self.logfile + ".1")
null = os.open("/dev/null", os.O_RDONLY)
- logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
+ logfd = os.open(self.logfile, logfile_mode)
sys.stderr.flush()
pid = os.fork()
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/server/blkif.py
--- a/tools/python/xen/xend/server/blkif.py Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/server/blkif.py Wed Jul 02 11:30:37 2008 +0900
@@ -81,11 +81,11 @@ class BlkifController(DevController):
if security.on() == xsconstants.XS_POLICY_ACM:
self.do_access_control(config, uname)
- devid = blkif.blkdev_name_to_number(dev)
+ (device_path, devid) = blkif.blkdev_name_to_number(dev)
if devid is None:
raise VmError('Unable to find number for device (%s)' % (dev))
- front = { 'virtual-device' : "%i" % devid,
+ front = { device_path : "%i" % devid,
'device-type' : dev_type
}
@@ -204,5 +204,5 @@ class BlkifController(DevController):
dev = devid.split('/')[-1]
dev = int(dev)
except ValueError:
- dev = blkif.blkdev_name_to_number(dev)
+ (device_path, dev) = blkif.blkdev_name_to_number(dev)
return dev
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xm/main.py Wed Jul 02 11:30:37 2008 +0900
@@ -2022,8 +2022,7 @@ def xm_block_list(args):
map(server.xenapi.VBD.get_runtime_properties, vbd_refs)
vbd_devs = \
map(server.xenapi.VBD.get_device, vbd_refs)
- vbd_devids = \
- map(blkdev_name_to_number, vbd_devs)
+ vbd_devids = [blkdev_name_to_number(x)[1] for x in vbd_devs]
devs = map(lambda (devid, prop): [devid, map2sxp(prop)],
zip(vbd_devids, vbd_properties))
else:
diff -r 11318234588e -r 08f77df14cba tools/tests/test_x86_emulator.c
--- a/tools/tests/test_x86_emulator.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/tests/test_x86_emulator.c Wed Jul 02 11:30:37 2008 +0900
@@ -22,23 +22,22 @@ static int read(
static int read(
unsigned int seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
- *val = 0;
- memcpy(val, (void *)offset, bytes);
+ memcpy(p_data, (void *)offset, bytes);
return X86EMUL_OKAY;
}
static int write(
unsigned int seg,
unsigned long offset,
- unsigned long val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
- memcpy((void *)offset, &val, bytes);
+ memcpy((void *)offset, p_data, bytes);
return X86EMUL_OKAY;
}
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon-monitor
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoon-monitor Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# xenballoon-monitor - monitor certain stats from xenballoond
+# (run in dom0 with "watch -d xenballoon-monitor" for xentop-like output)
+#
+# Copyright (C) 2009 Oracle Corporation and/or its affiliates.
+# All rights reserved
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# Hint: Use "xm sched-credit -d 0 -w 2000" to watch on heavily loaded machines
+#
+echo "id mem-kb tgt-kb commit swapin swapout pgin pgout
active(sec)"
+for i in `xenstore-list /local/domain`; do
+ if [ "$i" -ne 0 ]; then
+ tot=0; tgt=0; sin=0; sout=0; pgin=0; pgout=0; cmt=0; up=0; idle=0; act=0;
+ if xenstore-exists /local/domain/$i/memory/meminfo; then
+ tot=`xenstore-read /local/domain/$i/memory/meminfo | grep MemTotal \
+ | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
+ cmt=`xenstore-read /local/domain/$i/memory/meminfo | grep Committed_AS \
+ | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
+ fi
+ if xenstore-exists /local/domain/$i/memory/selftarget; then
+ tgt=`xenstore-read /local/domain/$i/memory/selftarget`
+ fi
+ if xenstore-exists /local/domain/$i/memory/vmstat; then
+ sin=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpin \
+ | cut -d" " -f2`
+ sout=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpout \
+ | cut -d" " -f2`
+ pgin=`xenstore-read /local/domain/$i/memory/vmstat | grep pgpgin \
+ | cut -d" " -f2`
+ pgout=`xenstore-read /local/domain/$i/memory/vmstat | grep pgout \
+ | cut -d" " -f2`
+ fi
+ if xenstore-exists /local/domain/$i/memory/uptime; then
+ up=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f1`
+ idle=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f2`
+ act=`echo $up - $idle | bc -iq`
+ fi
+ printf "%2d %8d%8d%8d%9d%9d%10d%10d%10.2f\n" $i $tot $tgt $cmt $sin $sout
$pgin $pgout $act
+ fi
+done
+echo Free memory: `xm info | grep free | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` MB
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon.conf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoon.conf Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,91 @@
+## Path: System/xen
+## Description: xen domain start/stop on boot
+## Type: string
+## Default:
+
+# NOTE: "xenbus is enabled" means not only that /proc/xen/xenbus exists
+# but also that /usr/bin/xenstore-* tools are installed.
+
+## Type: boolean
+## Default: false
+#
+# If XENBALLOON_SELF is true, selfballooning will occur, meaning the
+# balloon driver will grow and shrink according to available memory.
+# If xenbus is enabled, may be overridden by {memory/selfballoon}==0
+# If false but xenballoond is able to communicate with domain0 via
+# xenbus, balloon targets will be set by domain0
+#
+XENBALLOON_SELF=false
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# If self-ballooning, number of seconds between checks/adjustments.
+# If xenbus is enabled, may be overridden by {memory/interval}
+XENBALLOON_SELF_INTERVAL=1
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# If NOT self-ballooning but xenbus is enabled, number of seconds between
+# checks/adjustments. May be overridden by {memory/interval}
+XENBALLOON_INTERVAL=1
+
+## Type: integer (must be > 0)
+## Default: 10
+#
+# When current > target, reduces rate at which target memory is ballooned
+# out. For a value of n, 1/n of the difference will be ballooned.
+# This value applies both to selfballooning and directed ballooning.
+# May be overridden by {memory/downhysteresis}
+XENBALLOON_AUTO_DOWNHYSTERESIS=10
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# When current < target, reduces rate at which target memory is reclaimed
+# (if available). For a value of n, 1/n of the difference will be ballooned.
+# This value applies both to selfballooning and directed ballooning.
+# May be overridden by {memory/uphysteresis}
+XENBALLOON_AUTO_UPHYSTERESIS=1
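To make the two hysteresis divisors concrete, here is a worked Python sketch of the per-interval step they produce; the numbers are illustrative and the integer arithmetic simply mirrors balloon_to_target in xenballoond below.

    # Worked illustration of the hysteresis divisors (not part of the patch).
    def next_balloon_size(cur_bytes, tgt_bytes, downhys=10, uphys=1):
        if cur_bytes > tgt_bytes and downhys:
            # Balloon out only 1/downhys of the gap this interval.
            return cur_bytes - (cur_bytes - tgt_bytes) // downhys
        if cur_bytes < tgt_bytes:
            # Reclaim 1/uphys of the gap this interval.
            return cur_bytes + (tgt_bytes - cur_bytes) // uphys
        return cur_bytes

    # 1 GiB current, 512 MiB target, downhysteresis=10 -> shrink by ~51 MiB.
    print(next_balloon_size(1024 << 20, 512 << 20))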
+
+## Type: integer (must be >= 0)
+## Default: 0
+#
+# In order to avoid ballooning so much memory that a guest experiences
+# out-of-memory errors (OOMs), memory will not be ballooned out below
+# a minimum target, in MB. If this value is 0 (default), a heuristic
+# based on the maximum amount of memory will be used. (The heuristic
+# provides the same minimum as recent versions of the balloon driver but
+# early versions of the balloon driver did not enforce a minimum.)
+XENBALLOON_MINMEM=0
+
+## Type: string
+## Default: "/var/run/xenballoon-maxmem"
+#
+# Location where memory high-water mark is stored; if a guest supports
+# hot-add memory, maxmem might increase across time and the minimum
+# target heuristic is based on max memory. NOTE: Reboot after changing
+# this variable, else overballooning may occur.
+XENBALLOON_MAXMEMFILE=/var/run/xenballoon-maxmem
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/meminfo' on xenbus at memory/meminfo
+XENBALLOON_SEND_MEMINFO=1
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/vmstat' on xenbus at memory/vmstat
+XENBALLOON_SEND_VMSTAT=1
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/uptime' on xenbus at memory/uptime
+XENBALLOON_SEND_UPTIME=1
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,205 @@
+#!/bin/bash
+#
+# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+# All rights reserved.
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# xenballoond - In-guest engine for Xen memory ballooning
+# Version: 080630
+#
+# Two "policies" are implemented:
+# - Selfballooning: Adjust memory periodically, with no (or little) input
+# from domain0. Target memory is determined solely by the
+# Committed_AS line in /proc/meminfo, but parameters may adjust
+# the rate at which the target is achieved.
+# - Directed ballooning: Adjust memory solely as directed by domain0
+#
+# Under some circumstances, "output" may also be generated; the contents
+# of /proc/meminfo and /proc/vmstat may be periodically placed on xenbus.
+#
+# If xenbus is running and the /usr/bin/xenstore-* tools are installed,
+# "xenbus is enabled".
+#
+# Parameters are documented in /etc/sysconfig/xenballoon.conf. Although
+# some are not used with directed ballooning, all must be set properly.
+# If xenbus is enabled, some of these parameters may be overridden by values
+# set by domain0 via xenbus.
+
+minmb() {
+ RETVAL=$XENBALLOON_MINMEM
+ if [ $RETVAL -ne 0 ]; then
+ return $RETVAL
+ fi
+ kb=`cat $XENBALLOON_MAXMEMFILE`
+ let "mb=$kb/1024"
+ let "pages=$kb/4"
+ # this algorithm from drivers/xen/balloon/balloon.c:minimum_target()
+ # which was added to balloon.c in 2008 to avoid ballooning too small
+ # it is unnecessary here except to accommodate pre-2008 balloon drivers
+ # note that ranges are adjusted because a VM with "memory=1024"
+ # gets somewhat less than 1024MB
+ if [ $mb -lt 125 ]; then
+ let RETVAL="$(( 8 + ($pages >> 9) ))"
+ elif [ $mb -lt 500 ]; then
+ let RETVAL="$(( 40 + ($pages >> 10) ))"
+ elif [ $mb -lt 2000 ]; then
+ let RETVAL="$(( 104 + ($pages >> 11) ))"
+ else
+ let RETVAL="$(( 296 + ($pages >> 13) ))"
+ fi
+ return # value returned in RETVAL in MB
+}
+
+curkb() {
+ kb=`grep MemTotal /proc/meminfo | sed 's/ */ /' | \
+ cut -f2 -d' '`
+ RETVAL=$kb
+ return # value returned in RETVAL in kB
+}
+
+downhysteresis() {
+ RETVAL=$XENBALLOON_AUTO_DOWNHYSTERESIS
+ if [ $xenstore_enabled = "true" ]; then
+ if xenstore-exists memory/downhysteresis ; then
+ RETVAL=`xenstore-read memory/downhysteresis`
+ fi
+ fi
+ return
+}
+
+uphysteresis() {
+ RETVAL=$XENBALLOON_AUTO_UPHYSTERESIS
+ if [ $xenstore_enabled = "true" ]; then
+ if xenstore-exists memory/uphysteresis ; then
+ RETVAL=`xenstore-read memory/uphysteresis`
+ fi
+ fi
+ return
+}
+
+selfballoon_eval() {
+ if [ $xenstore_enabled = "true" ]; then
+ if xenstore-exists memory/selfballoon; then
+ RETVAL=`xenstore-read memory/selfballoon`
+ if [ $RETVAL -eq 1 ]; then
+ selfballoon_enabled=true
+ return
+ fi
+ fi
+ fi
+ selfballoon_enabled=$XENBALLOON_SELF
+ return
+}
+
+selftarget() {
+ tgtkb=`grep Committed_AS /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '`
+ minmb
+ let "minbytes=$RETVAL*1024*1024"
+ let "tgtbytes=$tgtkb*1024"
+ if [ $tgtbytes -lt $minbytes ]; then
+ let "tgtbytes=$minbytes"
+ fi
+ RETVAL=$tgtbytes # value returned in RETVAL in bytes
+ return
+}
+
+# $1 == 1 means use selftarget, else target in kB
+balloon_to_target() {
+ if [ "$1" -eq 1 ]; then
+ selftarget
+ tgtbytes=$RETVAL
+ else
+ let "tgtbytes=$(( $1 * 1024 ))"
+ fi
+ curkb
+ let "curbytes=$RETVAL*1024"
+ if [ $curbytes -gt $tgtbytes ]; then
+ downhysteresis
+ downhys=$RETVAL
+ if [ $downhys -ne 0 ]; then
+ let "tgtbytes=$(( $curbytes - \
+ ( ( $curbytes - $tgtbytes ) / $downhys ) ))"
+ fi
+ else if [ $curbytes -lt $tgtbytes ]; then
+ uphysteresis
+ uphys=$RETVAL
+ let "tgtbytes=$(( $curbytes + \
+ ( ( $tgtbytes - $curbytes ) / $uphys ) ))"
+ fi
+ fi
+ echo $tgtbytes > /proc/xen/balloon
+ if [ $xenstore_enabled = "true" ]; then
+ let "tgtkb=$(( $tgtbytes/1024 ))"
+ xenstore-write memory/selftarget $tgtkb
+ fi
+}
+
+send_memory_stats() {
+ if [ ! $xenstore_enabled = "true" ]; then
+ return
+ fi
+ if [ $XENBALLOON_SEND_MEMINFO ]; then
+ xenstore-write memory/meminfo "`cat /proc/meminfo`"
+ fi
+ if [ $XENBALLOON_SEND_VMSTAT ]; then
+ xenstore-write memory/vmstat "`cat /proc/vmstat`"
+ fi
+ if [ $XENBALLOON_SEND_UPTIME ]; then
+ xenstore-write memory/uptime "`cat /proc/uptime`"
+ fi
+}
+
+if [ ! -f /proc/xen/balloon ]; then
+ echo "$0: no balloon driver installed"
+ exit 0
+fi
+if [ ! -f /proc/meminfo ]; then
+ echo "$0: can't read /proc/meminfo"
+ exit 0
+fi
+xenstore_enabled=true
+if [ -f /usr/bin/xenstore-exists -a -f /usr/bin/xenstore-read -a \
+ -f /usr/bin/xenstore-write ]; then
+ xenstore_enabled=true
+else
+ echo "$0: missing /usr/bin/xenstore-* tools, disabling directed
ballooning"
+ xenstore_enabled=false
+fi
+
+. /etc/sysconfig/xenballoon.conf
+
+while true;
+do
+ # handle special case for PV domains with hot-add memory
+ if [ ! -f $XENBALLOON_MAXMEMFILE ]; then
+ maxkb=0
+ else
+ maxkb=`cat $XENBALLOON_MAXMEMFILE`
+ fi
+ curkb=`grep MemTotal /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '`
+ if [ $curkb -gt $maxkb ]; then
+ echo $curkb > $XENBALLOON_MAXMEMFILE
+ fi
+ interval=$XENBALLOON_INTERVAL
+ # do self-ballooning
+ selfballoon_eval
+ if [ $selfballoon_enabled = "true" ]; then
+ balloon_to_target 1
+ interval=$XENBALLOON_SELF_INTERVAL
+ # or do directed ballooning
+ elif [ $xenstore_enabled = "true" ]; then
+ if xenstore-exists memory/target ; then
+ tgtkb=`xenstore-read memory/target`
+ balloon_to_target $tgtkb
+ fi
+ interval=$XENBALLOON_INTERVAL
+ fi
+ send_memory_stats
+ if [ $xenstore_enabled = "true" ]; then
+ if xenstore-exists memory/interval ; then
+ interval=`xenstore-read memory/interval`
+ fi
+ fi
+ sleep $interval
+done &
+
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond.README Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,82 @@
+Xenballoond.README
+Preliminary version 0.1, 2008/06/30
+
+Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+All rights reserved.
+Written by Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+
+INTRODUCTION
+
+Xenballoond runs in guest domains and both implements selfballooning and
+provides metrics to dom0 for (future) directed ballooning. Both capabilities
+provide a foundation for basic "memory overcommit" functionality.
+
+With selfballooning enabled, xenballoond uses the Committed_AS value found
+in /proc/meminfo as a first approximation of how much memory is required
+by the guest and feeds this statistic back to the balloon driver to inflate
+or deflate the balloon as required to achieve the target guest memory size.
+Hysteresis parameters may be adjusted to rate-limit balloon inflation
+and deflation.
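The per-pass computation can be summarized as follows; this is a simplified Python paraphrase of the bash daemon (it omits the hysteresis step and the xenstore overrides), not code shipped by this patch.

    # Simplified paraphrase of one selfballooning pass (illustrative only).
    def selfballoon_pass(min_target_mb):
        meminfo = dict(line.split(':', 1) for line in open('/proc/meminfo'))
        committed_kb = int(meminfo['Committed_AS'].split()[0])
        # Target Committed_AS, but never below the configured/heuristic minimum.
        target_bytes = max(committed_kb * 1024, min_target_mb * 1024 * 1024)
        with open('/proc/xen/balloon', 'w') as f:
            f.write(str(target_bytes))  # balloon driver moves toward this size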
+
+If configured, certain selfballooning parameters -- including notably
+enabling/disabling of self-ballooning -- can be controlled from domain0.
+(These are fully documented in xenballoon.conf.)
+
+If configured, the following guest statistics are sent back to domain0:
+- /proc/meminfo
+- /proc/vmstat
+- /proc/uptime
+In a future release, some of these values will be used by a policy module
+in domain0 to control guest balloon size and provide memory balancing
+across all guests on a given system.
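A domain0 policy module is still listed as TODO below, but the xenstore plumbing already exists; a hypothetical sketch of how dom0 could read the published statistics and push a directed target (in kB) for the guest loop to pick up from memory/target might look like this (per-domain paths as used by xenballoon-monitor; the target path is an assumption based on the guest-relative key the daemon reads):

    # Hypothetical dom0-side helpers (not part of this patch).
    import subprocess

    def read_guest_stat(domid, key):
        # key is one of: "meminfo", "vmstat", "uptime", "selftarget"
        path = "/local/domain/%d/memory/%s" % (domid, key)
        return subprocess.check_output(["xenstore-read", path]).decode()

    def set_guest_target_kb(domid, kb):
        path = "/local/domain/%d/memory/target" % domid
        subprocess.check_call(["xenstore-write", path, str(kb)])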
+
+Note that no page sharing (content-based or otherwise) is implemented
+and no VMM-based swapping is necessary.
+
+For more information, see:
+http://www.xen.org/files/xensummitboston08/MemoryOvercommit-XenSummit2008.pdf
+http://wiki.xensource.com/xenwiki/Open_Topics_For_Discussion?action=AttachFile&do=get&target=Memory+Overcommit.pdf
+
+INSTALLATION AND DEPLOYMENT
+
+In this preliminary release:
+- directed ballooning is not implemented, though a monitor is provided
+- only Redhat-based guests are supported
+
+Guest prerequisites to use xenballoond:
+- each guest must be configured with adequate[1] swap space
+- each guest must have the balloon driver installed (/proc/xen/balloon exists)
+- if directed ballooning (or monitoring) is desired, xenstore tools must be
+ installed in each guest in /usr/bin [2]
+
+[1] for best results, for a guest that is configured with maxmem=N and
+ requires Z MB of swap space without xenballoond, available swap should
+ be increased to N+Z MB when xenballoond is running
+[2] specifically xenstore-read, xenstore-exists, and xenstore-write must
+ be installed. Binaries can be obtained, for example, by building
+ xen-vvv.gz/tools in a guest-binary-compatible development tree
+
+Instructions to install/deploy xenballoond (in Redhat-based system):
+- in each guest:
+ - ensure pre-requisites are met (see above)
+ - place xenballoon.conf in /etc/sysconfig
+ - place xenballoond in /usr/sbin
+ - copy xenballoond.init to /etc/rc.d/init.d/xenballoond (note file rename)
+ - edit /etc/sysconfig/xenballoon.conf as desired (especially note that
+ selfballooning defaults to off)
+ - start xenballoond with "service xenballoond start", and/or configure
+ xenballoond to start at init (e.g. "chkconfig xenballoond on")
+- in domain0:
+ - if monitoring is desired, xenballoon-monitor may be installed in /usr/sbin
+- note that certain xenballoond.conf variables may be overridden by domain0
+ if xenstore is running in the guest; these are fully documented in
+ xenballoond.conf
+
+TODO:
+080630 modifications to support SUSE-based and debian-based guests
+080630 domain0 ballooning policy module
+080630 experiment with more aggressive (optionally) memory minimum targets
+080630 BUG: xenballoond doesn't properly record the fact that it's running;
+ e.g. flipping between run levels 5 and 3 launches additional daemons
+080630 BUG: reports of possible incompatibilities between ballooning and
+ save/restore/migrate have not been duplicated
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.init
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond.init Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# xenballoond Script to start and stop Xen ballooning daemon.
+#
+# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+# All rights reserved.
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# chkconfig: 2345 98 01
+# description: Starts and stops the Xen ballooning daemon.
+### BEGIN INIT INFO
+# Provides: xenballoond
+# Required-Start: $syslog $remote_fs
+# Should-Start:
+# Required-Stop: $syslog $remote_fs
+# Should-Stop:
+# Default-Start: 3 4 5
+# Default-Stop: 0 1 2 6
+# Default-Enabled: yes
+# Short-Description: Start/stop xenballoond
+# Description: Starts and stops the Xen ballooning daemon.
+### END INIT INFO
+
+# Source function library
+. /etc/init.d/functions
+
+#don't use in domain0
+[ -f /proc/xen/capabilities ] && \
+ grep -q "control_d" /proc/xen/capabilities && exit 0
+
+if [ -f /etc/sysconfig/xenballoon.conf ]; then
+ . /etc/sysconfig/xenballoon.conf
+fi
+
+# Check that balloon driver is present
+[ ! -f /proc/xen/balloon ] && exit 0
+
+# Record original memory (in kB)
+[ -z "$XENBALLOON_MAXMEMFILE" ] && exit 0
+let maxmem=`grep MemTotal /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '`
+if [ -f "$XENBALLOON_MAXMEMFILE" ]; then
+ let oldmax=`cat $XENBALLOON_MAXMEMFILE`
+ if [ $oldmax -gt $maxmem ]; then
+ let maxmem=oldmax
+ fi
+fi
+echo $maxmem > $XENBALLOON_MAXMEMFILE
+
+RETVAL=0
+prog="xenballoond"
+
+start() {
+ # Start daemons.
+ echo -n $"Starting $prog: "
+ daemon xenballoond $OPTIONS
+ RETVAL=$?
+ echo
+ return $RETVAL
+}
+
+stop() {
+ echo -n $"Shutting down $prog: "
+ killproc xenballoond
+ RETVAL=$?
+ echo
+ return $RETVAL
+}
+
+# See how we were called.
+case "$1" in
+ start)
+ start
+ ;;
+ stop)
+ stop
+ ;;
+ status)
+ status xenballoond
+ RETVAL=$?
+ ;;
+ restart|reload)
+ stop
+ start
+ RETVAL=$?
+ ;;
+ *)
+ echo $"Usage: $0 {start|stop|restart|status}"
+ exit 1
+esac
+
+exit $RETVAL
diff -r 11318234588e -r 08f77df14cba tools/xentrace/xenctx.c
--- a/tools/xentrace/xenctx.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/xentrace/xenctx.c Wed Jul 02 11:30:37 2008 +0900
@@ -702,7 +702,7 @@ void dump_ctx(int vcpu)
void dump_ctx(int vcpu)
{
int ret;
- vcpu_guest_context_t ctx;
+ vcpu_guest_context_any_t ctx;
xc_dominfo_t dominfo;
xc_handle = xc_interface_open(); /* for accessing control interface */
@@ -727,10 +727,10 @@ void dump_ctx(int vcpu)
exit(-1);
}
- print_ctx(&ctx);
+ print_ctx(&ctx.c);
#ifndef NO_TRANSLATION
- if (is_kernel_text(INSTR_POINTER((&ctx.user_regs))))
- print_stack(&ctx, vcpu);
+ if (is_kernel_text(INSTR_POINTER((&ctx.c.user_regs))))
+ print_stack(&ctx.c, vcpu);
#endif
if (!dominfo.paused) {
diff -r 11318234588e -r 08f77df14cba tools/xm-test/lib/XmTestLib/block_utils.py
--- a/tools/xm-test/lib/XmTestLib/block_utils.py Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/xm-test/lib/XmTestLib/block_utils.py Wed Jul 02 11:30:37 2008 +0900
@@ -15,7 +15,7 @@ __all__ = [ "block_attach", "block_detac
def get_state(domain, devname):
- number = xen.util.blkif.blkdev_name_to_number(devname)
+ (path, number) = xen.util.blkif.blkdev_name_to_number(devname)
s, o = traceCommand("xm block-list %s | awk '/^%d/ {print $4}'" %
(domain.getName(), number))
if s != 0:
diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/vmx/vmx_hypercall.c
--- a/xen/arch/ia64/vmx/vmx_hypercall.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Jul 02 11:30:37 2008 +0900
@@ -204,6 +204,53 @@ do_hvm_op(unsigned long op, XEN_GUEST_HA
rc = -ENOSYS;
break;
+ case HVMOP_modified_memory:
+ {
+ struct xen_hvm_modified_memory a;
+ struct domain *d;
+ unsigned long pfn;
+
+ if ( copy_from_guest(&a, arg, 1) )
+ return -EFAULT;
+
+ if ( a.domid == DOMID_SELF )
+ {
+ d = rcu_lock_current_domain();
+ }
+ else
+ {
+ if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
+ return -ESRCH;
+ if ( !IS_PRIV_FOR(current->domain, d) )
+ {
+ rc = -EPERM;
+ goto param_fail3;
+ }
+ }
+
+ rc = -EINVAL;
+ if ( !is_hvm_domain(d) )
+ goto param_fail3;
+
+ rc = -EINVAL;
+ if ( a.first_pfn > domain_get_maximum_gpfn(d)
+ || a.first_pfn + a.nr - 1 < a.first_pfn
+ || a.first_pfn + a.nr - 1 > domain_get_maximum_gpfn(d))
+ goto param_fail3;
+
+ rc = 0;
+ if ( !d->arch.shadow_bitmap )
+ goto param_fail3;
+
+ for (pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++)
+ if (pfn < d->arch.shadow_bitmap_size)
+ set_bit(pfn, d->arch.shadow_bitmap);
+
+ param_fail3:
+ rcu_unlock_domain(d);
+ break;
+ }
+
default:
gdprintk(XENLOG_INFO, "Bad HVM op %ld.\n", op);
rc = -ENOSYS;
diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/ia64/xen/mm.c Wed Jul 02 11:30:37 2008 +0900
@@ -207,7 +207,7 @@ alloc_dom_xen_and_dom_io(void)
* Any Xen-heap pages that we will allow to be mapped will have
* their domain field set to dom_xen.
*/
- dom_xen = alloc_domain(DOMID_XEN);
+ dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
BUG_ON(dom_xen == NULL);
/*
@@ -215,7 +215,7 @@ alloc_dom_xen_and_dom_io(void)
* This domain owns I/O pages that are within the range of the page_info
* array. Mappings occur at the priv of the caller.
*/
- dom_io = alloc_domain(DOMID_IO);
+ dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
BUG_ON(dom_io == NULL);
}
@@ -1553,7 +1553,7 @@ expose_p2m_init(void)
* Initialise our DOMID_P2M domain.
* This domain owns m2p table pages.
*/
- dom_p2m = alloc_domain(DOMID_P2M);
+ dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0);
BUG_ON(dom_p2m == NULL);
dom_p2m->max_pages = ~0U;
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/Makefile
--- a/xen/arch/x86/acpi/cpufreq/Makefile Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/Makefile Wed Jul 02 11:30:37 2008 +0900
@@ -1,3 +1,4 @@ obj-y += cpufreq.o
obj-y += cpufreq.o
obj-y += utility.o
obj-y += cpufreq_ondemand.o
+obj-y += powernow.o
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c Wed Jul 02 11:30:37 2008 +0900
@@ -47,6 +47,10 @@ struct processor_pminfo processor_pminfo
struct processor_pminfo processor_pminfo[NR_CPUS];
struct cpufreq_policy xen_px_policy[NR_CPUS];
+static cpumask_t *cpufreq_dom_pt;
+static cpumask_t cpufreq_dom_mask;
+static unsigned int cpufreq_dom_max;
+
enum {
UNDEFINED_CAPABLE = 0,
SYSTEM_INTEL_MSR_CAPABLE,
@@ -60,7 +64,6 @@ struct acpi_cpufreq_data {
struct processor_performance *acpi_data;
struct cpufreq_frequency_table *freq_table;
unsigned int max_freq;
- unsigned int resume;
unsigned int cpu_feature;
};
@@ -328,14 +331,16 @@ static int acpi_cpufreq_target(struct cp
next_perf_state = data->freq_table[next_state].index;
if (perf->state == next_perf_state) {
- if (unlikely(data->resume)) {
- printk("xen_pminfo: @acpi_cpufreq_target, "
- "Called after resume, resetting to P%d\n",
+ if (unlikely(policy->resume)) {
+ printk(KERN_INFO "Called after resume, resetting to P%d\n",
next_perf_state);
- data->resume = 0;
+ policy->resume = 0;
}
- else
+ else {
+ printk(KERN_INFO "Already at target state (P%d)\n",
+ next_perf_state);
return 0;
+ }
}
switch (data->cpu_feature) {
@@ -531,7 +536,7 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
* the first call to ->target() should result in us actually
* writing something to the appropriate registers.
*/
- data->resume = 1;
+ policy->resume = 1;
return result;
@@ -549,61 +554,101 @@ static struct cpufreq_driver acpi_cpufre
.init = acpi_cpufreq_cpu_init,
};
-int acpi_cpufreq_init(void)
-{
- unsigned int i, ret = 0;
- unsigned int dom, max_dom = 0;
- cpumask_t *pt, dom_mask;
-
- cpus_clear(dom_mask);
+void cpufreq_dom_exit(void)
+{
+ cpufreq_dom_max = 0;
+ cpus_clear(cpufreq_dom_mask);
+ if (cpufreq_dom_pt)
+ xfree(cpufreq_dom_pt);
+}
+
+int cpufreq_dom_init(void)
+{
+ unsigned int i;
+
+ cpufreq_dom_max = 0;
+ cpus_clear(cpufreq_dom_mask);
for_each_online_cpu(i) {
- cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
- if (max_dom < processor_pminfo[i].perf.domain_info.domain)
- max_dom = processor_pminfo[i].perf.domain_info.domain;
- }
- max_dom++;
-
- pt = xmalloc_array(cpumask_t, max_dom);
- if (!pt)
+ cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
+ if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
+ cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
+ }
+ cpufreq_dom_max++;
+
+ cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
+ if (!cpufreq_dom_pt)
return -ENOMEM;
- memset(pt, 0, max_dom * sizeof(cpumask_t));
-
- /* get cpumask of each psd domain */
+ memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));
+
for_each_online_cpu(i)
- cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
+ cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);
for_each_online_cpu(i)
- processor_pminfo[i].perf.shared_cpu_map =
- pt[processor_pminfo[i].perf.domain_info.domain];
-
- cpufreq_driver = &acpi_cpufreq_driver;
-
- /* setup cpufreq infrastructure */
+ processor_pminfo[i].perf.shared_cpu_map =
+ cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain];
+
+ return 0;
+}
+
+static int cpufreq_cpu_init(void)
+{
+ int i, ret = 0;
+
for_each_online_cpu(i) {
xen_px_policy[i].cpu = i;
ret = px_statistic_init(i);
if (ret)
- goto out;
+ return ret;
ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]);
if (ret)
- goto out;
- }
-
- /* setup ondemand cpufreq */
- for (dom=0; dom<max_dom; dom++) {
- if (!cpu_isset(dom, dom_mask))
+ return ret;
+ }
+ return ret;
+}
+
+int cpufreq_dom_dbs(unsigned int event)
+{
+ int cpu, dom, ret = 0;
+
+ for (dom=0; dom<cpufreq_dom_max; dom++) {
+ if (!cpu_isset(dom, cpufreq_dom_mask))
continue;
- i = first_cpu(pt[dom]);
- ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
+ cpu = first_cpu(cpufreq_dom_pt[dom]);
+ ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
if (ret)
- goto out;
- }
-
-out:
- xfree(pt);
-
+ return ret;
+ }
return ret;
}
+
+int acpi_cpufreq_init(void)
+{
+ int ret = 0;
+
+ /* setup cpumask of psd dom and shared cpu map of cpu */
+ ret = cpufreq_dom_init();
+ if (ret)
+ goto err;
+
+ /* setup cpufreq driver */
+ cpufreq_driver = &acpi_cpufreq_driver;
+
+ /* setup cpufreq infrastructure */
+ ret = cpufreq_cpu_init();
+ if (ret)
+ goto err;
+
+ /* setup cpufreq dbs according to dom coordination */
+ ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
+ if (ret)
+ goto err;
+
+ return ret;
+
+err:
+ cpufreq_dom_exit();
+ return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Wed Jul 02 11:30:37 2008 +0900
@@ -52,7 +52,7 @@ static struct dbs_tuners {
static struct timer dbs_timer[NR_CPUS];
-static inline uint64_t get_cpu_idle_time(unsigned int cpu)
+inline uint64_t get_cpu_idle_time(unsigned int cpu)
{
uint64_t idle_ns;
struct vcpu *v;
@@ -79,6 +79,12 @@ static void dbs_check_cpu(struct cpu_dbs
return;
policy = this_dbs_info->cur_policy;
+
+ if (unlikely(policy->resume)) {
+ __cpufreq_driver_target(policy, policy->max,CPUFREQ_RELATION_H);
+ return;
+ }
+
cur_ns = NOW();
total_ns = cur_ns - this_dbs_info->prev_cpu_wall;
this_dbs_info->prev_cpu_wall = NOW();
@@ -217,8 +223,7 @@ int cpufreq_governor_dbs(struct cpufreq_
break;
case CPUFREQ_GOV_STOP:
- if (this_dbs_info->enable)
- dbs_timer_exit(this_dbs_info);
+ dbs_timer_exit(this_dbs_info);
dbs_enable--;
break;
@@ -233,5 +238,4 @@ int cpufreq_governor_dbs(struct cpufreq_
break;
}
return 0;
-}
-
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/powernow.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,305 @@
+/*
+ * powernow - AMD Architectural P-state Driver ($Revision: 1.4 $)
+ *
+ * Copyright (C) 2008 Mark Langsdorf <mark.langsdorf@xxxxxxx>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/delay.h>
+#include <xen/cpumask.h>
+#include <xen/timer.h>
+#include <xen/xmalloc.h>
+#include <asm/bug.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/config.h>
+#include <asm/processor.h>
+#include <asm/percpu.h>
+#include <asm/cpufeature.h>
+#include <acpi/acpi.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
+#define USE_HW_PSTATE 0x00000080
+#define HW_PSTATE_MASK 0x00000007
+#define HW_PSTATE_VALID_MASK 0x80000000
+#define HW_PSTATE_MAX_MASK 0x000000f0
+#define HW_PSTATE_MAX_SHIFT 4
+#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
+#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
+#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
+#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
+
+extern struct processor_pminfo processor_pminfo[NR_CPUS];
+extern struct cpufreq_policy xen_px_policy[NR_CPUS];
+
+struct powernow_cpufreq_data {
+ struct processor_performance *acpi_data;
+ struct cpufreq_frequency_table *freq_table;
+ unsigned int max_freq;
+ unsigned int resume;
+ unsigned int cpu_feature;
+};
+
+static struct powernow_cpufreq_data *drv_data[NR_CPUS];
+
+struct drv_cmd {
+ unsigned int type;
+ cpumask_t mask;
+ u64 addr;
+ u32 val;
+};
+
+static void transition_pstate(void *drvcmd)
+{
+ struct drv_cmd *cmd;
+ cmd = (struct drv_cmd *) drvcmd;
+
+ wrmsr(MSR_PSTATE_CTRL, cmd->val, 0);
+}
+
+static int powernow_cpufreq_target(struct cpufreq_policy *policy,
+ unsigned int target_freq, unsigned int relation)
+{
+ struct powernow_cpufreq_data *data = drv_data[policy->cpu];
+ struct processor_performance *perf;
+ struct cpufreq_freqs freqs;
+ cpumask_t online_policy_cpus;
+ struct drv_cmd cmd;
+ unsigned int next_state = 0; /* Index into freq_table */
+ unsigned int next_perf_state = 0; /* Index into perf table */
+ int result = 0;
+
+ if (unlikely(data == NULL ||
+ data->acpi_data == NULL || data->freq_table == NULL)) {
+ return -ENODEV;
+ }
+
+ perf = data->acpi_data;
+ result = cpufreq_frequency_table_target(policy,
+ data->freq_table,
+ target_freq,
+ relation, &next_state);
+ if (unlikely(result))
+ return -ENODEV;
+
+ online_policy_cpus = policy->cpus;
+
+ next_perf_state = data->freq_table[next_state].index;
+ if (perf->state == next_perf_state) {
+ if (unlikely(data->resume))
+ data->resume = 0;
+ else
+ return 0;
+ }
+
+ cpus_clear(cmd.mask);
+
+ if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
+ cmd.mask = online_policy_cpus;
+ else
+ cpu_set(policy->cpu, cmd.mask);
+
+ freqs.old = perf->states[perf->state].core_frequency * 1000;
+ freqs.new = data->freq_table[next_state].frequency;
+
+ cmd.val = next_perf_state;
+
+ on_selected_cpus( cmd.mask, transition_pstate, (void *) &cmd, 0, 0);
+
+ perf->state = next_perf_state;
+ policy->cur = freqs.new;
+
+ return result;
+}
+
+static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+ unsigned int i;
+ unsigned int valid_states = 0;
+ unsigned int cpu = policy->cpu;
+ struct powernow_cpufreq_data *data;
+ unsigned int result = 0;
+ struct processor_performance *perf;
+ u32 max_hw_pstate, hi = 0, lo = 0;
+
+ data = xmalloc(struct powernow_cpufreq_data);
+ if (!data)
+ return -ENOMEM;
+ memset(data, 0, sizeof(struct powernow_cpufreq_data));
+
+ drv_data[cpu] = data;
+
+ data->acpi_data = &processor_pminfo[cpu].perf;
+
+ perf = data->acpi_data;
+ policy->shared_type = perf->shared_type;
+
+ /*
+ * Will let policy->cpus know about dependency only when software
+ * coordination is required.
+ */
+ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
+ policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+ policy->cpus = perf->shared_cpu_map;
+ } else {
+ policy->cpus = cpumask_of_cpu(cpu);
+ }
+
+ /* capability check */
+ if (perf->state_count <= 1) {
+ printk("No P-States\n");
+ result = -ENODEV;
+ goto err_unreg;
+ }
+ rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
+ max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
+
+ if (perf->control_register.space_id != perf->status_register.space_id) {
+ result = -ENODEV;
+ goto err_unreg;
+ }
+
+ data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
+ (perf->state_count+1));
+ if (!data->freq_table) {
+ result = -ENOMEM;
+ goto err_unreg;
+ }
+
+ /* detect transition latency */
+ policy->cpuinfo.transition_latency = 0;
+ for (i=0; i<perf->state_count; i++) {
+ if ((perf->states[i].transition_latency * 1000) >
+ policy->cpuinfo.transition_latency)
+ policy->cpuinfo.transition_latency =
+ perf->states[i].transition_latency * 1000;
+ }
+
+ data->max_freq = perf->states[0].core_frequency * 1000;
+ /* table init */
+ for (i=0; i<perf->state_count && i<max_hw_pstate; i++) {
+ if (i>0 && perf->states[i].core_frequency >=
+ data->freq_table[valid_states-1].frequency / 1000)
+ continue;
+
+ data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK;
+ data->freq_table[valid_states].frequency =
+ perf->states[i].core_frequency * 1000;
+ valid_states++;
+ }
+ data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
+ perf->state = 0;
+
+ result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
+ if (result)
+ goto err_freqfree;
+
+ /*
+ * the first call to ->target() should result in us actually
+ * writing something to the appropriate registers.
+ */
+ data->resume = 1;
+
+ policy->cur = data->freq_table[i].frequency;
+ return result;
+
+err_freqfree:
+ xfree(data->freq_table);
+err_unreg:
+ xfree(data);
+ drv_data[cpu] = NULL;
+
+ return result;
+}
+
+static struct cpufreq_driver powernow_cpufreq_driver = {
+ .target = powernow_cpufreq_target,
+ .init = powernow_cpufreq_cpu_init,
+};
+
+int powernow_cpufreq_init(void)
+{
+ unsigned int i, ret = 0;
+ unsigned int dom, max_dom = 0;
+ cpumask_t *pt, dom_mask;
+
+ cpus_clear(dom_mask);
+
+ for_each_online_cpu(i) {
+ struct cpuinfo_x86 *c = &cpu_data[i];
+ if (c->x86_vendor != X86_VENDOR_AMD)
+ ret = -ENODEV;
+ else
+ {
+ u32 eax, ebx, ecx, edx;
+ cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
+ if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE)
+ ret = -ENODEV;
+ }
+ if (ret)
+ return ret;
+ cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
+ if (max_dom < processor_pminfo[i].perf.domain_info.domain)
+ max_dom = processor_pminfo[i].perf.domain_info.domain;
+ }
+ max_dom++;
+
+ pt = xmalloc_array(cpumask_t, max_dom);
+ if (!pt)
+ return -ENOMEM;
+ memset(pt, 0, max_dom * sizeof(cpumask_t));
+
+ /* get cpumask of each psd domain */
+ for_each_online_cpu(i)
+ cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
+
+ for_each_online_cpu(i)
+ processor_pminfo[i].perf.shared_cpu_map =
+ pt[processor_pminfo[i].perf.domain_info.domain];
+
+ cpufreq_driver = &powernow_cpufreq_driver;
+
+ /* setup cpufreq infrastructure */
+ for_each_online_cpu(i) {
+ xen_px_policy[i].cpu = i;
+
+ ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]);
+ if (ret)
+ goto cpufreq_init_out;
+ }
+
+ /* setup ondemand cpufreq */
+ for (dom=0; dom<max_dom; dom++) {
+ if (!cpu_isset(dom, dom_mask))
+ continue;
+ i = first_cpu(pt[dom]);
+ ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
+ if (ret)
+ goto cpufreq_init_out;
+ }
+
+cpufreq_init_out:
+ xfree(pt);
+
+ return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/utility.c
--- a/xen/arch/x86/acpi/cpufreq/utility.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/utility.c Wed Jul 02 11:30:37 2008 +0900
@@ -37,6 +37,41 @@ struct cpufreq_driver *cpufreq_driver;
* Px STATISTIC INFO *
*********************************************************************/
+void px_statistic_suspend(void)
+{
+ int cpu;
+ uint64_t now;
+
+ now = NOW();
+
+ for_each_online_cpu(cpu) {
+ struct pm_px *pxpt = &px_statistic_data[cpu];
+ uint64_t total_idle_ns;
+ uint64_t tmp_idle_ns;
+
+ total_idle_ns = get_cpu_idle_time(cpu);
+ tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
+
+ pxpt->u.pt[pxpt->u.cur].residency +=
+ now - pxpt->prev_state_wall;
+ pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
+ }
+}
+
+void px_statistic_resume(void)
+{
+ int cpu;
+ uint64_t now;
+
+ now = NOW();
+
+ for_each_online_cpu(cpu) {
+ struct pm_px *pxpt = &px_statistic_data[cpu];
+ pxpt->prev_state_wall = now;
+ pxpt->prev_idle_wall = get_cpu_idle_time(cpu);
+ }
+}
+
void px_statistic_update(cpumask_t cpumask, uint8_t from, uint8_t to)
{
uint32_t i;
@@ -47,15 +82,22 @@ void px_statistic_update(cpumask_t cpuma
for_each_cpu_mask(i, cpumask) {
struct pm_px *pxpt = &px_statistic_data[i];
uint32_t statnum = processor_pminfo[i].perf.state_count;
+ uint64_t total_idle_ns;
+ uint64_t tmp_idle_ns;
+
+ total_idle_ns = get_cpu_idle_time(i);
+ tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
pxpt->u.last = from;
pxpt->u.cur = to;
pxpt->u.pt[to].count++;
pxpt->u.pt[from].residency += now - pxpt->prev_state_wall;
+ pxpt->u.pt[from].residency -= tmp_idle_ns;
(*(pxpt->u.trans_pt + from*statnum + to))++;
pxpt->prev_state_wall = now;
+ pxpt->prev_idle_wall = total_idle_ns;
}
}
@@ -87,6 +129,7 @@ int px_statistic_init(int cpuid)
pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency;
pxpt->prev_state_wall = NOW();
+ pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
return 0;
}
@@ -107,6 +150,7 @@ void px_statistic_reset(int cpuid)
}
pxpt->prev_state_wall = NOW();
+ pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
}
@@ -242,3 +286,62 @@ int __cpufreq_driver_getavg(struct cpufr
return ret;
}
+
+
+/*********************************************************************
+ * CPUFREQ SUSPEND/RESUME *
+ *********************************************************************/
+
+void cpufreq_suspend(void)
+{
+ int cpu;
+
+ /* to protect the case when Px was controlled by dom0-kernel */
+ /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
+ for_each_online_cpu(cpu) {
+ struct processor_performance *perf = &processor_pminfo[cpu].perf;
+
+ if (!perf->init)
+ return;
+ }
+
+ cpufreq_dom_dbs(CPUFREQ_GOV_STOP);
+
+ cpufreq_dom_exit();
+
+ px_statistic_suspend();
+}
+
+int cpufreq_resume(void)
+{
+ int cpu, ret = 0;
+
+ /* 1. to protect the case when Px was controlled by dom0-kernel */
+ /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
+ /* 2. set state and resume flag to sync cpu to right state and freq */
+ for_each_online_cpu(cpu) {
+ struct processor_performance *perf = &processor_pminfo[cpu].perf;
+ struct cpufreq_policy *policy = &xen_px_policy[cpu];
+
+ if (!perf->init)
+ goto err;
+ perf->state = 0;
+ policy->resume = 1;
+ }
+
+ px_statistic_resume();
+
+ ret = cpufreq_dom_init();
+ if (ret)
+ goto err;
+
+ ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
+ if (ret)
+ goto err;
+
+ return ret;
+
+err:
+ cpufreq_dom_exit();
+ return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/pmstat.c
--- a/xen/arch/x86/acpi/pmstat.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/pmstat.c Wed Jul 02 11:30:37 2008 +0900
@@ -71,11 +71,18 @@ int do_get_pm_info(struct xen_sysctl_get
case PMSTAT_get_pxstat:
{
uint64_t now, ct;
+ uint64_t total_idle_ns;
+ uint64_t tmp_idle_ns;
+
+ total_idle_ns = get_cpu_idle_time(op->cpuid);
+ tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
now = NOW();
pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc;
pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall;
+ pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
pxpt->prev_state_wall = now;
+ pxpt->prev_idle_wall = total_idle_ns;
ct = pmpt->perf.state_count;
if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) )
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/power.c
--- a/xen/arch/x86/acpi/power.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/power.c Wed Jul 02 11:30:37 2008 +0900
@@ -27,7 +27,7 @@
#include <public/platform.h>
#include <asm/tboot.h>
-#define pmprintk(_l, _f, _a...) printk(_l "<PM> " _f "\n", ## _a )
+#include <acpi/cpufreq/cpufreq.h>
static char opt_acpi_sleep[20];
string_param("acpi_sleep", opt_acpi_sleep);
@@ -124,9 +124,11 @@ static int enter_state(u32 state)
if ( !spin_trylock(&pm_lock) )
return -EBUSY;
- pmprintk(XENLOG_INFO, "Preparing system for ACPI S%d state.", state);
+ printk(XENLOG_INFO "Preparing system for ACPI S%d state.", state);
freeze_domains();
+
+ cpufreq_suspend();
disable_nonboot_cpus();
if ( num_online_cpus() != 1 )
@@ -139,11 +141,14 @@ static int enter_state(u32 state)
acpi_sleep_prepare(state);
+ console_start_sync();
+ printk("Entering ACPI S%d state.\n", state);
+
local_irq_save(flags);
if ( (error = device_power_down()) )
{
- pmprintk(XENLOG_ERR, "Some devices failed to power down.");
+ printk(XENLOG_ERR "Some devices failed to power down.");
goto done;
}
@@ -162,8 +167,6 @@ static int enter_state(u32 state)
break;
}
- pmprintk(XENLOG_DEBUG, "Back to C.");
-
/* Restore CR4 and EFER from cached values. */
write_cr4(read_cr4());
if ( cpu_has_efer )
@@ -171,16 +174,18 @@ static int enter_state(u32 state)
device_power_up();
- pmprintk(XENLOG_INFO, "Finishing wakeup from ACPI S%d state.", state);
+ printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.", state);
done:
local_irq_restore(flags);
+ console_end_sync();
acpi_sleep_post(state);
if ( !hvm_cpu_up() )
BUG();
enable_cpu:
enable_nonboot_cpus();
+ cpufreq_resume();
thaw_domains();
spin_unlock(&pm_lock);
return error;
@@ -206,7 +211,7 @@ int acpi_enter_sleep(struct xenpf_enter_
((sleep->pm1a_cnt_val ^ sleep->pm1b_cnt_val) &
ACPI_BITMASK_SLEEP_ENABLE) )
{
- pmprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
+ gdprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
return -EINVAL;
}
@@ -278,7 +283,7 @@ acpi_status asmlinkage acpi_enter_sleep_
if ( tboot_in_measured_env() )
{
tboot_sleep(sleep_state);
- pmprintk(XENLOG_ERR, "TBOOT failed entering s3 state\n");
+ printk(XENLOG_ERR "TBOOT failed entering s3 state\n");
return_ACPI_STATUS(AE_ERROR);
}
@@ -320,7 +325,7 @@ static int __init acpi_sleep_init(void)
p += strspn(p, ", \t");
}
- printk(XENLOG_INFO "<PM> ACPI (supports");
+ printk(XENLOG_INFO "ACPI sleep modes:");
for ( i = 0; i < ACPI_S_STATE_COUNT; i++ )
{
if ( i == ACPI_STATE_S3 )
@@ -331,7 +336,7 @@ static int __init acpi_sleep_init(void)
else
sleep_states[i] = 0;
}
- printk(")\n");
+ printk("\n");
return 0;
}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/emulate.c
--- a/xen/arch/x86/hvm/emulate.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/emulate.c Wed Jul 02 11:30:37 2008 +0900
@@ -21,15 +21,33 @@
static int hvmemul_do_io(
int is_mmio, paddr_t addr, unsigned long *reps, int size,
- paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
+ paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+ paddr_t value = ram_gpa;
+ int value_is_ptr = (p_data == NULL);
struct vcpu *curr = current;
vcpu_iodata_t *vio = get_ioreq(curr);
ioreq_t *p = &vio->vp_ioreq;
int rc;
- /* Only retrieve the value from singleton (non-REP) reads. */
- ASSERT((val == NULL) || ((dir == IOREQ_READ) && !value_is_ptr));
+ /*
+ * Weird-sized accesses have undefined behaviour: we discard writes
+ * and read all-ones.
+ */
+ if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
+ {
+ gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
+ ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
+ if ( dir == IOREQ_READ )
+ memset(p_data, ~0, size);
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ( (p_data != NULL) && (dir == IOREQ_WRITE) )
+ {
+ memcpy(&value, p_data, size);
+ p_data = NULL;
+ }
if ( is_mmio && !value_is_ptr )
{
@@ -47,8 +65,7 @@ static int hvmemul_do_io(
unsigned int bytes = curr->arch.hvm_vcpu.mmio_large_read_bytes;
if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
{
- *val = 0;
- memcpy(val, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
+ memcpy(p_data, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
size);
return X86EMUL_OKAY;
}
@@ -61,7 +78,7 @@ static int hvmemul_do_io(
break;
case HVMIO_completed:
curr->arch.hvm_vcpu.io_state = HVMIO_none;
- if ( val == NULL )
+ if ( p_data == NULL )
return X86EMUL_UNHANDLEABLE;
goto finish_access;
case HVMIO_dispatched:
@@ -82,7 +99,7 @@ static int hvmemul_do_io(
}
curr->arch.hvm_vcpu.io_state =
- (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
+ (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
p->dir = dir;
p->data_is_ptr = value_is_ptr;
@@ -116,7 +133,7 @@ static int hvmemul_do_io(
break;
case X86EMUL_UNHANDLEABLE:
hvm_send_assist_req(curr);
- rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
+ rc = (p_data != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
break;
default:
BUG();
@@ -126,8 +143,8 @@ static int hvmemul_do_io(
return rc;
finish_access:
- if ( val != NULL )
- *val = curr->arch.hvm_vcpu.io_data;
+ if ( p_data != NULL )
+ memcpy(p_data, &curr->arch.hvm_vcpu.io_data, size);
if ( is_mmio && !value_is_ptr )
{
@@ -152,7 +169,7 @@ static int hvmemul_do_io(
sizeof(curr->arch.hvm_vcpu.mmio_large_read)) )
{
memcpy(&curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
- val, size);
+ p_data, size);
curr->arch.hvm_vcpu.mmio_large_read_bytes += size;
}
}
@@ -163,18 +180,16 @@ static int hvmemul_do_io(
static int hvmemul_do_pio(
unsigned long port, unsigned long *reps, int size,
- paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
- return hvmemul_do_io(0, port, reps, size, value,
- dir, df, value_is_ptr, val);
+ paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+ return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data);
}
static int hvmemul_do_mmio(
paddr_t gpa, unsigned long *reps, int size,
- paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
- return hvmemul_do_io(1, gpa, reps, size, value,
- dir, df, value_is_ptr, val);
+ paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+ return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data);
}
/*
@@ -287,7 +302,7 @@ static int __hvmemul_read(
static int __hvmemul_read(
enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
enum hvm_access_type access_type,
struct hvm_emulate_ctxt *hvmemul_ctxt)
@@ -302,8 +317,6 @@ static int __hvmemul_read(
seg, offset, bytes, access_type, hvmemul_ctxt, &addr);
if ( rc != X86EMUL_OKAY )
return rc;
-
- *val = 0;
if ( unlikely(curr->arch.hvm_vcpu.mmio_gva == (addr & PAGE_MASK)) &&
curr->arch.hvm_vcpu.mmio_gva )
@@ -314,7 +327,7 @@ static int __hvmemul_read(
gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
if ( (off + bytes) <= PAGE_SIZE )
return hvmemul_do_mmio(gpa, &reps, bytes, 0,
- IOREQ_READ, 0, 0, val);
+ IOREQ_READ, 0, p_data);
}
if ( (seg != x86_seg_none) &&
@@ -322,15 +335,13 @@ static int __hvmemul_read(
pfec |= PFEC_user_mode;
rc = ((access_type == hvm_access_insn_fetch) ?
- hvm_fetch_from_guest_virt(val, addr, bytes, pfec) :
- hvm_copy_from_guest_virt(val, addr, bytes, pfec));
+ hvm_fetch_from_guest_virt(p_data, addr, bytes, pfec) :
+ hvm_copy_from_guest_virt(p_data, addr, bytes, pfec));
if ( rc == HVMCOPY_bad_gva_to_gfn )
return X86EMUL_EXCEPTION;
if ( rc == HVMCOPY_bad_gfn_to_mfn )
{
- unsigned long reps = 1;
-
if ( access_type == hvm_access_insn_fetch )
return X86EMUL_UNHANDLEABLE;
@@ -339,7 +350,7 @@ static int __hvmemul_read(
if ( rc != X86EMUL_OKAY )
return rc;
- return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
+ return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data);
}
return X86EMUL_OKAY;
@@ -348,19 +359,19 @@ static int hvmemul_read(
static int hvmemul_read(
enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
return __hvmemul_read(
- seg, offset, val, bytes, hvm_access_read,
+ seg, offset, p_data, bytes, hvm_access_read,
container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
}
static int hvmemul_insn_fetch(
enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
@@ -371,19 +382,18 @@ static int hvmemul_insn_fetch(
/* Fall back if requested bytes are not in the prefetch cache. */
if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
return __hvmemul_read(
- seg, offset, val, bytes,
+ seg, offset, p_data, bytes,
hvm_access_insn_fetch, hvmemul_ctxt);
/* Hit the cache. Simple memcpy. */
- *val = 0;
- memcpy(val, &hvmemul_ctxt->insn_buf[insn_off], bytes);
+ memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
return X86EMUL_OKAY;
}
static int hvmemul_write(
enum x86_segment seg,
unsigned long offset,
- unsigned long val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
@@ -406,29 +416,27 @@ static int hvmemul_write(
unsigned int off = addr & (PAGE_SIZE - 1);
gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
if ( (off + bytes) <= PAGE_SIZE )
- return hvmemul_do_mmio(gpa, &reps, bytes, val,
- IOREQ_WRITE, 0, 0, NULL);
+ return hvmemul_do_mmio(gpa, &reps, bytes, 0,
+ IOREQ_WRITE, 0, p_data);
}
if ( (seg != x86_seg_none) &&
(hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) )
pfec |= PFEC_user_mode;
- rc = hvm_copy_to_guest_virt(addr, &val, bytes, pfec);
+ rc = hvm_copy_to_guest_virt(addr, p_data, bytes, pfec);
if ( rc == HVMCOPY_bad_gva_to_gfn )
return X86EMUL_EXCEPTION;
if ( rc == HVMCOPY_bad_gfn_to_mfn )
{
- unsigned long reps = 1;
-
rc = hvmemul_linear_to_phys(
addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt);
if ( rc != X86EMUL_OKAY )
return rc;
- return hvmemul_do_mmio(gpa, &reps, bytes, val,
- IOREQ_WRITE, 0, 0, NULL);
+ return hvmemul_do_mmio(gpa, &reps, bytes, 0,
+ IOREQ_WRITE, 0, p_data);
}
return X86EMUL_OKAY;
@@ -442,12 +450,8 @@ static int hvmemul_cmpxchg(
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
- unsigned long new = 0;
- if ( bytes > sizeof(new) )
- return X86EMUL_UNHANDLEABLE;
- memcpy(&new, p_new, bytes);
/* Fix this in case the guest is really relying on r-m-w atomicity. */
- return hvmemul_write(seg, offset, new, bytes, ctxt);
+ return hvmemul_write(seg, offset, p_new, bytes, ctxt);
}
static int hvmemul_rep_ins(
@@ -480,7 +484,7 @@ static int hvmemul_rep_ins(
return rc;
return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+ !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
}
static int hvmemul_rep_outs(
@@ -513,7 +517,7 @@ static int hvmemul_rep_outs(
return rc;
return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+ !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
}
static int hvmemul_rep_movs(
@@ -563,14 +567,14 @@ static int hvmemul_rep_movs(
if ( !p2m_is_ram(p2mt) )
return hvmemul_do_mmio(
sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+ !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
(void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt);
if ( p2m_is_ram(p2mt) )
return X86EMUL_UNHANDLEABLE;
return hvmemul_do_mmio(
dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+ !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
}
static int hvmemul_read_segment(
@@ -607,7 +611,8 @@ static int hvmemul_read_io(
struct x86_emulate_ctxt *ctxt)
{
unsigned long reps = 1;
- return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
+ *val = 0;
+ return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val);
}
static int hvmemul_write_io(
@@ -617,7 +622,7 @@ static int hvmemul_write_io(
struct x86_emulate_ctxt *ctxt)
{
unsigned long reps = 1;
- return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL);
+ return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val);
}
static int hvmemul_read_cr(
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c Wed Jul 02 11:30:37 2008 +0900
@@ -2529,6 +2529,66 @@ long do_hvm_op(unsigned long op, XEN_GUE
break;
}
+ case HVMOP_modified_memory:
+ {
+ struct xen_hvm_modified_memory a;
+ struct domain *d;
+ unsigned long pfn;
+
+ if ( copy_from_guest(&a, arg, 1) )
+ return -EFAULT;
+
+ if ( a.domid == DOMID_SELF )
+ {
+ d = rcu_lock_current_domain();
+ }
+ else
+ {
+ if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
+ return -ESRCH;
+ if ( !IS_PRIV_FOR(current->domain, d) )
+ {
+ rc = -EPERM;
+ goto param_fail3;
+ }
+ }
+
+ rc = -EINVAL;
+ if ( !is_hvm_domain(d) )
+ goto param_fail3;
+
+ rc = xsm_hvm_param(d, op);
+ if ( rc )
+ goto param_fail3;
+
+ rc = -EINVAL;
+ if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
+ ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
+ ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
+ goto param_fail3;
+
+ rc = 0;
+ if ( !paging_mode_log_dirty(d) )
+ goto param_fail3;
+
+ for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
+ {
+ p2m_type_t t;
+ mfn_t mfn = gfn_to_mfn(d, pfn, &t);
+ if ( mfn_x(mfn) != INVALID_MFN )
+ {
+ paging_mark_dirty(d, mfn_x(mfn));
+ /* These are most probably not page tables any more */
+ /* don't take a long time and don't die either */
+ sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
+ }
+ }
+
+ param_fail3:
+ rcu_unlock_domain(d);
+ break;
+ }
+
default:
{
gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Wed Jul 02 11:30:37 2008 +0900
@@ -677,10 +677,11 @@ static int construct_vmcs(struct vcpu *v
return 0;
}
-int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val)
-{
- unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
- const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_read_guest_msr(u32 msr, u64 *val)
+{
+ struct vcpu *curr = current;
+ unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+ const struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
for ( i = 0; i < msr_count; i++ )
{
@@ -694,10 +695,11 @@ int vmx_read_guest_msr(struct vcpu *v, u
return -ESRCH;
}
-int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val)
-{
- unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
- struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_write_guest_msr(u32 msr, u64 val)
+{
+ struct vcpu *curr = current;
+ unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
for ( i = 0; i < msr_count; i++ )
{
@@ -711,10 +713,20 @@ int vmx_write_guest_msr(struct vcpu *v,
return -ESRCH;
}
-int vmx_add_guest_msr(struct vcpu *v, u32 msr)
-{
- unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
- struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_add_guest_msr(u32 msr)
+{
+ struct vcpu *curr = current;
+ unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+
+ if ( msr_area == NULL )
+ {
+ if ( (msr_area = alloc_xenheap_page()) == NULL )
+ return -ENOMEM;
+ curr->arch.hvm_vmx.msr_area = msr_area;
+ __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
+ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+ }
for ( i = 0; i < msr_count; i++ )
if ( msr_area[i].index == msr )
@@ -723,29 +735,29 @@ int vmx_add_guest_msr(struct vcpu *v, u3
if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
return -ENOSPC;
- if ( msr_area == NULL )
- {
- if ( (msr_area = alloc_xenheap_page()) == NULL )
- return -ENOMEM;
- v->arch.hvm_vmx.msr_area = msr_area;
- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
- }
-
msr_area[msr_count].index = msr;
msr_area[msr_count].mbz = 0;
msr_area[msr_count].data = 0;
- v->arch.hvm_vmx.msr_count = ++msr_count;
+ curr->arch.hvm_vmx.msr_count = ++msr_count;
__vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count);
__vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count);
return 0;
}
-int vmx_add_host_load_msr(struct vcpu *v, u32 msr)
-{
- unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count;
- struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area;
+int vmx_add_host_load_msr(u32 msr)
+{
+ struct vcpu *curr = current;
+ unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count;
+ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area;
+
+ if ( msr_area == NULL )
+ {
+ if ( (msr_area = alloc_xenheap_page()) == NULL )
+ return -ENOMEM;
+ curr->arch.hvm_vmx.host_msr_area = msr_area;
+ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+ }
for ( i = 0; i < msr_count; i++ )
if ( msr_area[i].index == msr )
@@ -754,18 +766,10 @@ int vmx_add_host_load_msr(struct vcpu *v
if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
return -ENOSPC;
- if ( msr_area == NULL )
- {
- if ( (msr_area = alloc_xenheap_page()) == NULL )
- return -ENOMEM;
- v->arch.hvm_vmx.host_msr_area = msr_area;
- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
- }
-
msr_area[msr_count].index = msr;
msr_area[msr_count].mbz = 0;
rdmsrl(msr, msr_area[msr_count].data);
- v->arch.hvm_vmx.host_msr_count = ++msr_count;
+ curr->arch.hvm_vmx.host_msr_count = ++msr_count;
__vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);
return 0;
@@ -776,21 +780,17 @@ int vmx_create_vmcs(struct vcpu *v)
struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
int rc;
- if ( arch_vmx->vmcs == NULL )
- {
- if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
- return -ENOMEM;
-
- INIT_LIST_HEAD(&arch_vmx->active_list);
- __vmpclear(virt_to_maddr(arch_vmx->vmcs));
- arch_vmx->active_cpu = -1;
- arch_vmx->launched = 0;
- }
+ if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&arch_vmx->active_list);
+ __vmpclear(virt_to_maddr(arch_vmx->vmcs));
+ arch_vmx->active_cpu = -1;
+ arch_vmx->launched = 0;
if ( (rc = construct_vmcs(v)) != 0 )
{
vmx_free_vmcs(arch_vmx->vmcs);
- arch_vmx->vmcs = NULL;
return rc;
}
@@ -801,13 +801,13 @@ void vmx_destroy_vmcs(struct vcpu *v)
{
struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
- if ( arch_vmx->vmcs == NULL )
- return;
-
vmx_clear_vmcs(v);
vmx_free_vmcs(arch_vmx->vmcs);
- arch_vmx->vmcs = NULL;
+
+ free_xenheap_page(v->arch.hvm_vmx.host_msr_area);
+ free_xenheap_page(v->arch.hvm_vmx.msr_area);
+ free_xenheap_page(v->arch.hvm_vmx.msr_bitmap);
}
void vm_launch_fail(void)
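[Note on the vmcs.c hunks above: the MSR save/load helpers now act on the currently running vcpu and allocate the MSR area lazily on first use, and vmx_destroy_vmcs() now frees that area together with the host MSR area and the MSR bitmap. A minimal usage sketch follows; example_seed_guest_msr is a made-up name for illustration, and the error codes simply follow the -ENOMEM/-ENOSPC/-ESRCH returns visible in the hunks above.]

    /* Illustrative only -- not part of this patch. */
    static int example_seed_guest_msr(u32 msr, u64 initial_value)
    {
        int rc = vmx_add_guest_msr(msr);   /* allocates the MSR area on first use */

        if ( rc != 0 )
            return rc;                     /* -ENOMEM or -ENOSPC */

        /* Returns 0, or -ESRCH if the MSR is somehow not in the area. */
        return vmx_write_guest_msr(msr, initial_value);
    }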
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Jul 02 11:30:37 2008 +0900
@@ -1523,7 +1523,8 @@ static int vmx_cr_access(unsigned long e
break;
case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
value = v->arch.hvm_vcpu.guest_cr[0];
- value = (value & ~0xFFFF) | ((exit_qualification >> 16) & 0xFFFF);
+ /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
+ value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
HVMTRACE_LONG_1D(LMSW, current, value);
return !hvm_set_cr0(value);
default:
@@ -1655,7 +1656,7 @@ static int vmx_msr_read_intercept(struct
goto done;
}
- if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
+ if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
break;
if ( is_last_branch_msr(ecx) )
@@ -1817,12 +1818,12 @@ static int vmx_msr_write_intercept(struc
for ( ; (rc == 0) && lbr->count; lbr++ )
for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
- if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
+ if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
vmx_disable_intercept_for_msr(v, lbr->base + i);
}
if ( (rc < 0) ||
- (vmx_add_host_load_msr(v, ecx) < 0) )
+ (vmx_add_host_load_msr(ecx) < 0) )
vmx_inject_hw_exception(v, TRAP_machine_check, 0);
else
{
@@ -1842,7 +1843,7 @@ static int vmx_msr_write_intercept(struc
switch ( long_mode_do_msr_write(regs) )
{
case HNDL_unhandled:
- if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
+ if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
!is_last_branch_msr(ecx) )
wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
break;
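[Note on the LMSW hunk above: the new mask lets LMSW set CR0 bits 0-3 (PE/MP/EM/TS) and clear bits 1-3, but never clear PE. A worked sketch of the masking; lmsw_apply is a hypothetical helper that only mirrors the expression used in vmx_cr_access().]

    /* Illustrative only -- mirrors (value & ~0xe) | (operand & 0xf). */
    static unsigned long lmsw_apply(unsigned long cr0, unsigned int operand)
    {
        return (cr0 & ~0xeUL) | (operand & 0xfUL);
    }

    /* e.g. cr0 = 0x8000003b, operand = 0x0000  ->  0x80000031:
     * MP/EM/TS (bits 1-3) are cleared, PE (bit 0) stays set. */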
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vpmu_core2.c
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Wed Jul 02 11:30:37 2008 +0900
@@ -219,12 +219,12 @@ static int core2_vpmu_alloc_resource(str
return 0;
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
- return 0;
-
- if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
- return 0;
- vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
+ if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+ return 0;
+
+ if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+ return 0;
+ vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
pmu_enable = xmalloc_bytes(sizeof(struct core2_pmu_enable) +
(core2_get_pmc_count()-1)*sizeof(char));
@@ -347,7 +347,7 @@ static int core2_vpmu_do_wrmsr(struct cp
break;
case MSR_CORE_PERF_FIXED_CTR_CTRL:
non_global_ctrl = msr_content;
- vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
+ vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
global_ctrl >>= 32;
for ( i = 0; i < 3; i++ )
{
@@ -359,7 +359,7 @@ static int core2_vpmu_do_wrmsr(struct cp
break;
default:
tmp = ecx - MSR_P6_EVNTSEL0;
- vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
+ vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
if ( tmp >= 0 && tmp < core2_get_pmc_count() )
core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] =
(global_ctrl >> tmp) & (msr_content >> 22) & 1;
@@ -385,7 +385,7 @@ static int core2_vpmu_do_wrmsr(struct cp
if ( type != MSR_TYPE_GLOBAL )
wrmsrl(ecx, msr_content);
else
- vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
return 1;
}
@@ -410,7 +410,7 @@ static int core2_vpmu_do_rdmsr(struct cp
msr_content = core2_vpmu_cxt->global_ovf_status;
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
- vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
+ vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
break;
default:
rdmsrl(regs->ecx, msr_content);
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900
@@ -219,7 +219,7 @@ void __init arch_init_memory(void)
* Any Xen-heap pages that we will allow to be mapped will have
* their domain field set to dom_xen.
*/
- dom_xen = alloc_domain(DOMID_XEN);
+ dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
BUG_ON(dom_xen == NULL);
/*
@@ -227,7 +227,7 @@ void __init arch_init_memory(void)
* This domain owns I/O pages that are within the range of the page_info
* array. Mappings occur at the priv of the caller.
*/
- dom_io = alloc_domain(DOMID_IO);
+ dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
BUG_ON(dom_io == NULL);
/* First 1MB of RAM is historically marked as I/O. */
@@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
{
struct domain *d = page_get_owner(page);
- /* Never allow a shadowed frame to go from type count 0 to 1 */
- if ( d && shadow_mode_enabled(d) )
- shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+ /* Normally we should never let a page go from type count 0
+ * to type count 1 when it is shadowed. One exception:
+ * out-of-sync shadowed pages are allowed to become
+ * writeable. */
+ if ( d && shadow_mode_enabled(d)
+ && (page->count_info & PGC_page_table)
+ && !((page->shadow_flags & (1u<<29))
+ && type == PGT_writable_page) )
+ shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
ASSERT(!(x & PGT_pae_xen_l2));
if ( (x & PGT_type_mask) != type )
@@ -3533,15 +3539,14 @@ static int ptwr_emulated_read(
static int ptwr_emulated_read(
enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
unsigned int rc;
unsigned long addr = offset;
- *val = 0;
- if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
+ if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
{
propagate_page_fault(addr + bytes - rc, 0); /* read fault */
return X86EMUL_EXCEPTION;
@@ -3568,7 +3573,7 @@ static int ptwr_emulated_update(
/* Only allow naturally-aligned stores within the original %cr2 page. */
if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
{
- MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
+ MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
ptwr_ctxt->cr2, addr, bytes);
return X86EMUL_UNHANDLEABLE;
}
@@ -3676,10 +3681,21 @@ static int ptwr_emulated_write(
static int ptwr_emulated_write(
enum x86_segment seg,
unsigned long offset,
- unsigned long val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
+ paddr_t val = 0;
+
+ if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
+ {
+ MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
+ offset, bytes);
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ memcpy(&val, p_data, bytes);
+
return ptwr_emulated_update(
offset, 0, val, bytes, 0,
container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
@@ -3694,10 +3710,17 @@ static int ptwr_emulated_cmpxchg(
struct x86_emulate_ctxt *ctxt)
{
paddr_t old = 0, new = 0;
- if ( bytes > sizeof(paddr_t) )
+
+ if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
+ {
+ MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
+ offset, bytes);
return X86EMUL_UNHANDLEABLE;
+ }
+
memcpy(&old, p_old, bytes);
memcpy(&new, p_new, bytes);
+
return ptwr_emulated_update(
offset, old, new, bytes, 1,
container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
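[Note on the ptwr hunks above: emulated writes now go through a byte buffer, and both ptwr_emulated_write() and ptwr_emulated_cmpxchg() reject any access whose size is not a power of two or exceeds sizeof(paddr_t). The size test is the usual bit trick; a small sketch, with ptwr_bad_size being a hypothetical name for the open-coded expression.]

    /* Illustrative only: non-zero iff the access size is unsupported. */
    static inline int ptwr_bad_size(unsigned int bytes)
    {
        return (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1));
    }

    /* bytes = 1, 2, 4, 8 -> accepted;  bytes = 3, 6, 16 -> rejected. */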
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/common.c Wed Jul 02 11:30:37 2008 +0900
@@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
/* Use shadow pagetables for log-dirty support */
paging_log_dirty_init(d, shadow_enable_log_dirty,
shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ d->arch.paging.shadow.oos_active = 0;
+#endif
}
/* Setup the shadow-specific parts of a vcpu struct. Note: The most important
@@ -64,6 +68,16 @@ void shadow_domain_init(struct domain *d
*/
void shadow_vcpu_init(struct vcpu *v)
{
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ int i;
+
+ for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
+ {
+ v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
+ v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
+ }
+#endif
+
v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
}
@@ -131,7 +145,7 @@ static int
static int
hvm_read(enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
enum hvm_access_type access_type,
struct sh_emulate_ctxt *sh_ctxt)
@@ -144,12 +158,10 @@ hvm_read(enum x86_segment seg,
if ( rc )
return rc;
- *val = 0;
-
if ( access_type == hvm_access_insn_fetch )
- rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0);
+ rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
else
- rc = hvm_copy_from_guest_virt(val, addr, bytes, 0);
+ rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
switch ( rc )
{
@@ -167,20 +179,20 @@ static int
static int
hvm_emulate_read(enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
if ( !is_x86_user_segment(seg) )
return X86EMUL_UNHANDLEABLE;
- return hvm_read(seg, offset, val, bytes, hvm_access_read,
+ return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
container_of(ctxt, struct sh_emulate_ctxt, ctxt));
}
static int
hvm_emulate_insn_fetch(enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
@@ -192,19 +204,18 @@ hvm_emulate_insn_fetch(enum x86_segment
/* Fall back if requested bytes are not in the prefetch cache. */
if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
- return hvm_read(seg, offset, val, bytes,
+ return hvm_read(seg, offset, p_data, bytes,
hvm_access_insn_fetch, sh_ctxt);
/* Hit the cache. Simple memcpy. */
- *val = 0;
- memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
+ memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
return X86EMUL_OKAY;
}
static int
hvm_emulate_write(enum x86_segment seg,
unsigned long offset,
- unsigned long val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
@@ -227,7 +238,7 @@ hvm_emulate_write(enum x86_segment seg,
return rc;
return v->arch.paging.mode->shadow.x86_emulate_write(
- v, addr, &val, bytes, sh_ctxt);
+ v, addr, p_data, bytes, sh_ctxt);
}
static int
@@ -279,7 +290,7 @@ static int
static int
pv_emulate_read(enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
@@ -288,8 +299,7 @@ pv_emulate_read(enum x86_segment seg,
if ( !is_x86_user_segment(seg) )
return X86EMUL_UNHANDLEABLE;
- *val = 0;
- if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
+ if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
{
propagate_page_fault(offset + bytes - rc, 0); /* read fault */
return X86EMUL_EXCEPTION;
@@ -301,7 +311,7 @@ static int
static int
pv_emulate_write(enum x86_segment seg,
unsigned long offset,
- unsigned long val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
@@ -311,7 +321,7 @@ pv_emulate_write(enum x86_segment seg,
if ( !is_x86_user_segment(seg) )
return X86EMUL_UNHANDLEABLE;
return v->arch.paging.mode->shadow.x86_emulate_write(
- v, offset, &val, bytes, sh_ctxt);
+ v, offset, p_data, bytes, sh_ctxt);
}
static int
@@ -427,6 +437,585 @@ void shadow_continue_emulation(struct sh
}
}
}
+
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Out-of-sync shadows. */
+
+/* From time to time, we let a shadowed pagetable page go out of sync
+ * with its shadow: the guest is allowed to write directly to the page,
+ * and those writes are not synchronously reflected in the shadow.
+ * This lets us avoid many emulations if the guest is writing a lot to a
+ * pagetable, but it relaxes a pretty important invariant in the shadow
+ * pagetable design. Therefore, some rules:
+ *
+ * 1. Only L1 pagetables may go out of sync: any page that is shadowed
+ * at a higher level must be synchronously updated. This makes
+ * using linear shadow pagetables much less dangerous.
+ * That means that: (a) unsyncing code needs to check for higher-level
+ * shadows, and (b) promotion code needs to resync.
+ *
+ * 2. All shadow operations on a guest page require the page to be brought
+ * back into sync before proceeding. This must be done under the
+ * shadow lock so that the page is guaranteed to remain synced until
+ * the operation completes.
+ *
+ * Exceptions to this rule: the pagefault and invlpg handlers may
+ * update only one entry on an out-of-sync page without resyncing it.
+ *
+ * 3. Operations on shadows that do not start from a guest page need to
+ * be aware that they may be handling an out-of-sync shadow.
+ *
+ * 4. Operations that do not normally take the shadow lock (fast-path
+ * #PF handler, INVLPG) must fall back to a locking, syncing version
+ * if they see an out-of-sync table.
+ *
+ * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
+ * must explicitly resync all relevant pages or update their
+ * shadows.
+ *
+ * Currently out-of-sync pages are listed in a simple open-addressed
+ * hash table with a second chance (must resist temptation to radically
+ * over-engineer hash tables...) The virtual address of the access
+ * which caused us to unsync the page is also kept in the hash table, as
+ * a hint for finding the writable mappings later.
+ *
+ * We keep a hash per vcpu, because we want as much as possible to do
+ * the re-sync on the same vcpu we did the unsync on, so the VA hint
+ * will be valid.
+ */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+static void sh_oos_audit(struct domain *d)
+{
+ int idx, expected_idx, expected_idx_alt;
+ struct page_info *pg;
+ struct vcpu *v;
+
+ for_each_vcpu(d, v)
+ {
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ {
+ mfn_t *oos = v->arch.paging.shadow.oos;
+ if ( !mfn_valid(oos[idx]) )
+ continue;
+
+ expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
+ expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
+ if ( idx != expected_idx && idx != expected_idx_alt )
+ {
+ printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
+ __func__, idx, mfn_x(oos[idx]),
+ expected_idx, expected_idx_alt);
+ BUG();
+ }
+ pg = mfn_to_page(oos[idx]);
+ if ( !(pg->count_info & PGC_page_table) )
+ {
+ printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
+ __func__, idx, mfn_x(oos[idx]), pg->count_info);
+ BUG();
+ }
+ if ( !(pg->shadow_flags & SHF_out_of_sync) )
+ {
+ printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
+ __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+ BUG();
+ }
+ if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
+ {
+ printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
+ __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+ BUG();
+ }
+ }
+ }
+}
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
+{
+ int idx;
+ struct vcpu *v;
+ mfn_t *oos;
+
+ ASSERT(mfn_is_out_of_sync(gmfn));
+
+ for_each_vcpu(d, v)
+ {
+ oos = v->arch.paging.shadow.oos;
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+
+ if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+ return;
+ }
+
+ SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
+ BUG();
+}
+#endif
+
+/* Update the shadow, but keep the page out of sync. */
+static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
+{
+ struct page_info *pg = mfn_to_page(gmfn);
+
+ ASSERT(mfn_valid(gmfn));
+ ASSERT(page_is_out_of_sync(pg));
+
+ /* Call out to the appropriate per-mode resyncing function */
+ if ( pg->shadow_flags & SHF_L1_32 )
+ SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
+ else if ( pg->shadow_flags & SHF_L1_PAE )
+ SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( pg->shadow_flags & SHF_L1_64 )
+ SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
+#endif
+}
+
+#define _FIXUP_IDX(_b, _i) ((_b) * SHADOW_OOS_FT_HASH + (_i))
+
+void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
+ mfn_t smfn, unsigned long off)
+{
+ int idx, i, free = 0, free_slot = 0;
+ struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+ idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+ for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+ {
+ if ( !mfn_valid(fixups[_FIXUP_IDX(idx, i)].gmfn)
+ || !mfn_is_out_of_sync(fixups[_FIXUP_IDX(idx, i)].gmfn) )
+ {
+ free = 1;
+ free_slot = _FIXUP_IDX(idx, i);
+ }
+ else if ( (mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn))
+ && (mfn_x(fixups[_FIXUP_IDX(idx, i)].smfn) == mfn_x(smfn))
+ && (fixups[_FIXUP_IDX(idx, i)].off == off) )
+ {
+ perfc_incr(shadow_oos_fixup_no_add);
+ return;
+ }
+ }
+
+ if ( free )
+ {
+ if ( !v->arch.paging.shadow.oos_fixup_used )
+ v->arch.paging.shadow.oos_fixup_used = 1;
+ fixups[free_slot].gmfn = gmfn;
+ fixups[free_slot].smfn = smfn;
+ fixups[free_slot].off = off;
+ perfc_incr(shadow_oos_fixup_add_ok);
+ return;
+ }
+
+
+ perfc_incr(shadow_oos_fixup_add_fail);
+}
+
+void oos_fixup_remove(struct vcpu *v, mfn_t gmfn)
+{
+ int idx, i;
+ struct domain *d = v->domain;
+
+ perfc_incr(shadow_oos_fixup_remove);
+
+ /* If the domain is dying we might get called when deallocating
+ * the shadows. Fixup tables are already freed so exit now. */
+ if ( d->is_dying )
+ return;
+
+ idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+ for_each_vcpu(d, v)
+ {
+ struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+ for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+ if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn) )
+ fixups[_FIXUP_IDX(idx, i)].gmfn = _mfn(INVALID_MFN);
+ }
+}
+
+int oos_fixup_flush(struct vcpu *v)
+{
+ int i, rc = 0;
+ struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+ perfc_incr(shadow_oos_fixup_flush);
+
+ if ( !v->arch.paging.shadow.oos_fixup_used )
+ return 0;
+
+ for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
+ {
+ if ( mfn_valid(fixups[i].gmfn) )
+ {
+ if ( mfn_is_out_of_sync(fixups[i].gmfn) )
+ rc |= sh_remove_write_access_from_sl1p(v, fixups[i].gmfn,
+ fixups[i].smfn,
+ fixups[i].off);
+ fixups[i].gmfn = _mfn(INVALID_MFN);
+ }
+ }
+
+ v->arch.paging.shadow.oos_fixup_used = 0;
+
+ return rc;
+}
+
+int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn)
+{
+ int idx, i, rc = 0;
+ struct domain *d = v->domain;
+
+ perfc_incr(shadow_oos_fixup_flush_gmfn);
+
+ idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+ for_each_vcpu(d, v)
+ {
+ struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+ for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+ {
+ if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) != mfn_x(gmfn) )
+ continue;
+
+ rc |= sh_remove_write_access_from_sl1p(v,
+ fixups[_FIXUP_IDX(idx,i)].gmfn,
+ fixups[_FIXUP_IDX(idx,i)].smfn,
+ fixups[_FIXUP_IDX(idx,i)].off);
+
+ fixups[_FIXUP_IDX(idx,i)].gmfn = _mfn(INVALID_MFN);
+ }
+ }
+
+ return rc;
+}
+
+static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+ int ftlb = 0;
+
+ ftlb |= oos_fixup_flush_gmfn(v, gmfn);
+
+ switch ( sh_remove_write_access(v, gmfn, 0, va) )
+ {
+ default:
+ case 0:
+ break;
+
+ case 1:
+ ftlb |= 1;
+ break;
+
+ case -1:
+ /* An unfindable writeable typecount has appeared, probably via a
+ * grant table entry: can't shoot the mapping, so try to unshadow
+ * the page. If that doesn't work either, the guest is granting
+ * his pagetables and must be killed after all.
+ * This will flush the tlb, so we can return with no worries. */
+ sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+ return 1;
+ }
+
+ if ( ftlb )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+
+ return 0;
+}
+
+
+/* Pull all the entries on an out-of-sync page back into sync. */
+static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va, mfn_t snp)
+{
+ struct page_info *pg = mfn_to_page(gmfn);
+
+ ASSERT(shadow_locked_by_me(v->domain));
+ ASSERT(mfn_is_out_of_sync(gmfn));
+ /* Guest page must be shadowed *only* as L1 when out of sync. */
+ ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
+ & ~SHF_L1_ANY));
+ ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
+
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+ /* Need to pull write access so the page *stays* in sync. */
+ if ( oos_remove_write_access(v, gmfn, va) )
+ {
+ /* Page has been unshadowed. */
+ return;
+ }
+
+ /* No more writable mappings of this page, please */
+ pg->shadow_flags &= ~SHF_oos_may_write;
+
+ /* Update the shadows with current guest entries. */
+ _sh_resync_l1(v, gmfn, snp);
+
+ /* Now we know all the entries are synced, and will stay that way */
+ pg->shadow_flags &= ~SHF_out_of_sync;
+ perfc_incr(shadow_resync);
+}
+
+
+/* Add an MFN to the list of out-of-sync guest pagetables */
+static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+ int idx, oidx, swap = 0;
+ void *gptr, *gsnpptr;
+ mfn_t *oos = v->arch.paging.shadow.oos;
+ unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+ mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ oidx = idx;
+
+ if ( mfn_valid(oos[idx])
+ && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
+ {
+ /* Punt the current occupant into the next slot */
+ SWAP(oos[idx], gmfn);
+ SWAP(oos_va[idx], va);
+ swap = 1;
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+ }
+ if ( mfn_valid(oos[idx]) )
+ {
+ /* Crush the current occupant. */
+ _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
+ perfc_incr(shadow_unsync_evict);
+ }
+ oos[idx] = gmfn;
+ oos_va[idx] = va;
+
+ if ( swap )
+ SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
+
+ gptr = sh_map_domain_page(oos[oidx]);
+ gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
+ memcpy(gsnpptr, gptr, PAGE_SIZE);
+ sh_unmap_domain_page(gptr);
+ sh_unmap_domain_page(gsnpptr);
+}
+
+/* Remove an MFN from the list of out-of-sync guest pagetables */
+static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+{
+ int idx;
+ mfn_t *oos;
+ struct domain *d = v->domain;
+
+ SHADOW_PRINTK("D%dV%d gmfn %lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+ for_each_vcpu(d, v)
+ {
+ oos = v->arch.paging.shadow.oos;
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+ {
+ oos[idx] = _mfn(INVALID_MFN);
+ return;
+ }
+ }
+
+ SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+ BUG();
+}
+
+mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
+{
+ int idx;
+ mfn_t *oos;
+ mfn_t *oos_snapshot;
+ struct domain *d = v->domain;
+
+ for_each_vcpu(d, v)
+ {
+ oos = v->arch.paging.shadow.oos;
+ oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+ {
+ return oos_snapshot[idx];
+ }
+ }
+
+ SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+ BUG();
+ return _mfn(INVALID_MFN);
+}
+
+/* Pull a single guest page back into sync */
+void sh_resync(struct vcpu *v, mfn_t gmfn)
+{
+ int idx;
+ mfn_t *oos;
+ unsigned long *oos_va;
+ mfn_t *oos_snapshot;
+ struct domain *d = v->domain;
+
+ for_each_vcpu(d, v)
+ {
+ oos = v->arch.paging.shadow.oos;
+ oos_va = v->arch.paging.shadow.oos_va;
+ oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+
+ if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+ {
+ _sh_resync(v, gmfn, oos_va[idx], oos_snapshot[idx]);
+ oos[idx] = _mfn(INVALID_MFN);
+ return;
+ }
+ }
+
+ SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+ BUG();
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table,
+ * by making a call out to the mode in which that shadow was made. */
+static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+ struct page_info *pg = mfn_to_page(gl1mfn);
+ if ( pg->shadow_flags & SHF_L1_32 )
+ return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
+ else if ( pg->shadow_flags & SHF_L1_PAE )
+ return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( pg->shadow_flags & SHF_L1_64 )
+ return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
+#endif
+ SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
+ mfn_x(gl1mfn));
+ BUG();
+ return 0; /* BUG() is no longer __attribute__((noreturn)). */
+}
+
+
+/* Pull all out-of-sync pages back into sync. Pages brought out of sync
+ * on other vcpus are allowed to remain out of sync, but their contents
+ * will be made safe (TLB flush semantics); pages unsynced by this vcpu
+ * are brought back into sync and write-protected. If skip != 0, we try
+ * to avoid resyncing at all if we think we can get away with it. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
+{
+ int idx;
+ struct vcpu *other;
+ mfn_t *oos = v->arch.paging.shadow.oos;
+ unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+ mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+
+ SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
+
+ ASSERT(do_locking || shadow_locked_by_me(v->domain));
+
+ if ( !this )
+ goto resync_others;
+
+ if ( do_locking )
+ shadow_lock(v->domain);
+
+ if ( oos_fixup_flush(v) )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+
+ /* First: resync all of this vcpu's oos pages */
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ if ( mfn_valid(oos[idx]) )
+ {
+ /* Write-protect and sync contents */
+ _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
+ oos[idx] = _mfn(INVALID_MFN);
+ }
+
+ if ( do_locking )
+ shadow_unlock(v->domain);
+
+ resync_others:
+ if ( !others )
+ return;
+
+ /* Second: make all *other* vcpus' oos pages safe. */
+ for_each_vcpu(v->domain, other)
+ {
+ if ( v == other )
+ continue;
+
+ if ( do_locking )
+ shadow_lock(v->domain);
+
+ oos = other->arch.paging.shadow.oos;
+ oos_va = other->arch.paging.shadow.oos_va;
+ oos_snapshot = other->arch.paging.shadow.oos_snapshot;
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ {
+ if ( !mfn_valid(oos[idx]) )
+ continue;
+
+ if ( skip )
+ {
+ /* Update the shadows and leave the page OOS. */
+ if ( sh_skip_sync(v, oos[idx]) )
+ continue;
+ _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
+ }
+ else
+ {
+ /* Write-protect and sync contents */
+ _sh_resync(other, oos[idx], oos_va[idx], oos_snapshot[idx]);
+ oos[idx] = _mfn(INVALID_MFN);
+ }
+ }
+
+ if ( do_locking )
+ shadow_unlock(v->domain);
+ }
+}
+
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+ struct page_info *pg;
+
+ ASSERT(shadow_locked_by_me(v->domain));
+
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+ pg = mfn_to_page(gmfn);
+
+ /* Guest page must be shadowed *only* as L1 and *only* once when out
+ * of sync. Also, get out now if it's already out of sync.
+ * Finally, we can't safely unsync if some vcpus have paging disabled. */
+ if ( pg->shadow_flags &
+ ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
+ || sh_page_has_multiple_shadows(pg)
+ || !is_hvm_domain(v->domain)
+ || !v->domain->arch.paging.shadow.oos_active )
+ return 0;
+
+ pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
+ oos_hash_add(v, gmfn, va);
+ perfc_incr(shadow_unsync);
+ return 1;
+}
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
/**************************************************************************/
/* Code for "promoting" a guest page to the point where the shadow code is
@@ -440,6 +1029,12 @@ void shadow_promote(struct vcpu *v, mfn_
ASSERT(mfn_valid(gmfn));
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Is the page already shadowed and out of sync? */
+ if ( page_is_out_of_sync(page) )
+ sh_resync(v, gmfn);
+#endif
+
/* We should never try to promote a gmfn that has writeable mappings */
ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
|| (page->u.inuse.type_info & PGT_count_mask) == 0
@@ -463,7 +1058,17 @@ void shadow_demote(struct vcpu *v, mfn_t
clear_bit(type, &page->shadow_flags);
if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+ {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Was the page out of sync? */
+ if ( page_is_out_of_sync(page) )
+ {
+ oos_hash_remove(v, gmfn);
+ oos_fixup_remove(v, gmfn);
+ }
+#endif
clear_bit(_PGC_page_table, &page->count_info);
+ }
}
/**************************************************************************/
@@ -674,7 +1279,8 @@ shadow_order(unsigned int shadow_type)
0, /* SH_type_l3_64_shadow */
0, /* SH_type_l4_64_shadow */
2, /* SH_type_p2m_table */
- 0 /* SH_type_monitor_table */
+ 0, /* SH_type_monitor_table */
+ 0 /* SH_type_oos_snapshot */
};
ASSERT(shadow_type < SH_type_unused);
return type_to_order[shadow_type];
@@ -1220,6 +1826,14 @@ static unsigned int sh_set_allocation(st
sp = list_entry(d->arch.paging.shadow.freelists[order].next,
struct shadow_page_info, list);
list_del(&sp->list);
+#if defined(__x86_64__)
+ /*
+ * Re-instate lock field which we overwrite with shadow_page_info.
+ * This was safe, since the lock is only used on guest pages.
+ */
+ for ( j = 0; j < 1U << order; j++ )
+ spin_lock_init(&((struct page_info *)sp)[j].lock);
+#endif
d->arch.paging.shadow.free_pages -= 1 << order;
d->arch.paging.shadow.total_pages -= 1 << order;
free_domheap_pages((struct page_info *)sp, order);
@@ -1297,6 +1911,27 @@ static void sh_hash_audit_bucket(struct
/* Bad shadow flags on guest page? */
BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
/* Bad type count on guest page? */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( sp->type == SH_type_l1_32_shadow
+ || sp->type == SH_type_l1_pae_shadow
+ || sp->type == SH_type_l1_64_shadow )
+ {
+ if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+ && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ if ( !page_is_out_of_sync(gpg) )
+ {
+ SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+ " and not OOS but has typecount %#lx\n",
+ sp->backpointer,
+ mfn_x(shadow_page_to_mfn(sp)),
+ gpg->u.inuse.type_info);
+ BUG();
+ }
+ }
+ }
+ else /* Not an l1 */
+#endif
if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
&& (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
{
@@ -1608,7 +2243,8 @@ void sh_destroy_shadow(struct vcpu *v, m
/* Remove all writeable mappings of a guest frame from the shadow tables
* Returns non-zero if we need to flush TLBs.
 * level and fault_addr describe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
+ * level==0 means we have some other reason for revoking write access.
+ * If level==0 we are allowed to fail, returning -1. */
int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
unsigned int level,
@@ -1659,7 +2295,12 @@ int sh_remove_write_access(struct vcpu *
return 0;
/* Early exit if it's already a pagetable, or otherwise not writeable */
- if ( sh_mfn_is_a_page_table(gmfn)
+ if ( (sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Unless they've been allowed to go out of sync with their shadows */
+ && !mfn_oos_may_write(gmfn)
+#endif
+ )
|| (pg->u.inuse.type_info & PGT_count_mask) == 0 )
return 0;
@@ -1676,7 +2317,7 @@ int sh_remove_write_access(struct vcpu *
}
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
- if ( v == current && level != 0 )
+ if ( v == current )
{
unsigned long gfn;
/* Heuristic: there is likely to be only one writeable mapping,
@@ -1690,6 +2331,8 @@ int sh_remove_write_access(struct vcpu *
return 1; \
} while (0)
+ if ( level == 0 && fault_addr )
+ GUESS(fault_addr, 6);
if ( v->arch.paging.mode->guest_levels == 2 )
{
@@ -1773,13 +2416,19 @@ int sh_remove_write_access(struct vcpu *
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
/* Brute-force search of all the shadows, by walking the hash */
- perfc_incr(shadow_writeable_bf);
+ if ( level == 0 )
+ perfc_incr(shadow_writeable_bf_1);
+ else
+ perfc_incr(shadow_writeable_bf);
hash_foreach(v, callback_mask, callbacks, gmfn);
/* If that didn't catch the mapping, then there's some non-pagetable
* mapping -- ioreq page, grant mapping, &c. */
if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
{
+ if ( level == 0 )
+ return -1;
+
SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
"%lu special-use mappings of it\n", mfn_x(gmfn),
(mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
@@ -1790,7 +2439,34 @@ int sh_remove_write_access(struct vcpu *
return 1;
}
-
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+ mfn_t smfn, unsigned long off)
+{
+ struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
+
+ ASSERT(mfn_valid(smfn));
+ ASSERT(mfn_valid(gmfn));
+
+ if ( sp->type == SH_type_l1_32_shadow )
+ {
+ return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
+ (v, gmfn, smfn, off);
+ }
+#if CONFIG_PAGING_LEVELS >= 3
+ else if ( sp->type == SH_type_l1_pae_shadow )
+ return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
+ (v, gmfn, smfn, off);
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( sp->type == SH_type_l1_64_shadow )
+ return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
+ (v, gmfn, smfn, off);
+#endif
+#endif
+
+ return 0;
+}
+#endif
/**************************************************************************/
/* Remove all mappings of a guest frame from the shadow tables.
@@ -2127,6 +2803,36 @@ static void sh_update_paging_modes(struc
}
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( v->arch.paging.shadow.oos_fixups == NULL )
+ {
+ int i;
+ v->arch.paging.shadow.oos_fixups =
+ alloc_xenheap_pages(SHADOW_OOS_FT_ORDER);
+ if ( v->arch.paging.shadow.oos_fixups == NULL )
+ {
+ SHADOW_ERROR("Could not allocate OOS fixup table"
+ " for dom %u vcpu %u\n",
+ v->domain->domain_id, v->vcpu_id);
+ domain_crash(v->domain);
+ return;
+ }
+ for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
+ v->arch.paging.shadow.oos_fixups[i].gmfn = _mfn(INVALID_MFN);
+ }
+
+ if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
+ {
+ int i;
+ for(i = 0; i < SHADOW_OOS_PAGES; i++)
+ {
+ shadow_prealloc(d, SH_type_oos_snapshot, 1);
+ v->arch.paging.shadow.oos_snapshot[i] =
+ shadow_alloc(d, SH_type_oos_snapshot, 0);
+ }
+ }
+#endif /* OOS */
+
// Valid transitions handled by this function:
// - For PV guests:
// - after a shadow mode has been changed
@@ -2158,6 +2864,13 @@ static void sh_update_paging_modes(struc
///
ASSERT(shadow_mode_translate(d));
ASSERT(shadow_mode_external(d));
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Need to resync all our pages now, because if a page goes out
+ * of sync with paging enabled and is resynced with paging
+ * disabled, the resync will go wrong. */
+ shadow_resync_all(v, 0);
+#endif /* OOS */
if ( !hvm_paging_enabled(v) )
{
@@ -2254,6 +2967,27 @@ static void sh_update_paging_modes(struc
// This *does* happen, at least for CR4.PGE...
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* We need to check that all the vcpus have paging enabled to
+ * unsync PTs. */
+ if ( is_hvm_domain(d) )
+ {
+ int pe = 1;
+ struct vcpu *vptr;
+
+ for_each_vcpu(d, vptr)
+ {
+ if ( !hvm_paging_enabled(vptr) )
+ {
+ pe = 0;
+ break;
+ }
+ }
+
+ d->arch.paging.shadow.oos_active = pe;
+ }
+#endif /* OOS */
+
v->arch.paging.mode->update_cr3(v, 0);
}
@@ -2426,17 +3160,36 @@ void shadow_teardown(struct domain *d)
}
}
-#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
+#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
/* Free the virtual-TLB array attached to each vcpu */
for_each_vcpu(d, v)
{
+#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
if ( v->arch.paging.vtlb )
{
xfree(v->arch.paging.vtlb);
v->arch.paging.vtlb = NULL;
}
- }
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( v->arch.paging.shadow.oos_fixups )
+ {
+ free_xenheap_pages(v->arch.paging.shadow.oos_fixups,
+ SHADOW_OOS_FT_ORDER);
+ v->arch.paging.shadow.oos_fixups = NULL;
+ }
+
+ {
+ int i;
+ mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+ for(i = 0; i < SHADOW_OOS_PAGES; i++)
+ if ( mfn_valid(oos_snapshot[i]) )
+ shadow_free(d, oos_snapshot[i]);
+ }
+#endif /* OOS */
+ }
+#endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
{
@@ -3044,7 +3797,11 @@ void shadow_audit_tables(struct vcpu *v)
if ( !(SHADOW_AUDIT_ENABLE) )
return;
-
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ sh_oos_audit(v->domain);
+#endif
+
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
mask = ~1; /* Audit every table in the system */
else
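[Note on the out-of-sync machinery added to common.c above: each vcpu keeps its unsynced pages in a tiny open-addressed hash with one alternate slot ("second chance"), so a gmfn lives either in its home slot or in the next one. The same two-slot probe is repeated in oos_hash_remove(), oos_snapshot_lookup() and sh_resync(); a minimal sketch of the shared lookup follows, with oos_hash_find being a hypothetical name.]

    /* Illustrative only: return the slot holding gmfn, or -1 if absent. */
    static int oos_hash_find(mfn_t *oos, mfn_t gmfn)
    {
        int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;

        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
            idx = (idx + 1) % SHADOW_OOS_PAGES;   /* second chance */

        return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;
    }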
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.c Wed Jul 02 11:30:37 2008 +0900
@@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
}
/* Remove write access permissions from a gwalk_t in a batch, and
- * return OR-ed result for TLB flush hint
+ * return an OR-ed result: whether a TLB flush is needed and whether the
+ * guest pages need to be re-walked.
+ *
+ * Syncing a page will remove write access to that page; but it may
+ * also give write access to other pages in the path. If we resync any
+ * pages, re-walk from the beginning.
*/
+#define GW_RMWR_FLUSHTLB 1
+#define GW_RMWR_REWALK 2
+
static inline uint32_t
gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
{
- int rc = 0;
+ uint32_t rc = 0;
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
-#endif
- rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
-#endif
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( mfn_is_out_of_sync(gw->l3mfn) )
+ {
+ sh_resync(v, gw->l3mfn);
+ rc = GW_RMWR_REWALK;
+ }
+ else
+#endif /* OOS */
+ if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+ rc = GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( mfn_is_out_of_sync(gw->l2mfn) )
+ {
+ sh_resync(v, gw->l2mfn);
+ rc |= GW_RMWR_REWALK;
+ }
+ else
+#endif /* OOS */
+ if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+ rc |= GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
if ( !(guest_supports_superpages(v) &&
- (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
- rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+ (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ && !mfn_is_out_of_sync(gw->l1mfn)
+#endif /* OOS */
+ && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+ rc |= GW_RMWR_FLUSHTLB;
return rc;
}
@@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v,
// protect guest page tables
//
- if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+ if ( unlikely((level == 1)
+ && sh_mfn_is_a_page_table(target_mfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ && !mfn_oos_may_write(target_mfn)
+#endif /* OOS */
+ ) )
{
if ( shadow_mode_trap_reads(d) )
{
@@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
domain_crash(v->domain);
return SHADOW_SET_ERROR;
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ shadow_resync_all(v, 0);
+#endif
}
/* Write the new entry */
@@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
| (((unsigned long)sl3e) & ~PAGE_MASK));
if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+ {
/* About to install a new reference */
if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
{
domain_crash(v->domain);
return SHADOW_SET_ERROR;
- }
+ }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ shadow_resync_all(v, 0);
+#endif
+ }
/* Write the new entry */
shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
@@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
| (((unsigned long)sl2e) & ~PAGE_MASK));
if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+ {
+ mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
+
/* About to install a new reference */
- if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
+ if ( !sh_get_ref(v, sl1mfn, paddr) )
{
domain_crash(v->domain);
return SHADOW_SET_ERROR;
- }
+ }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ {
+ struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
+ mfn_t gl1mfn = _mfn(sp->backpointer);
+
+ /* If the shadow is a fl1 then the backpointer contains
+ the GFN instead of the GMFN, and it's definitely not
+ OOS. */
+ if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+ && mfn_is_out_of_sync(gl1mfn) )
+ sh_resync(v, gl1mfn);
+ }
+#endif
+ }
/* Write the new entry */
#if GUEST_PAGING_LEVELS == 2
@@ -1347,6 +1409,9 @@ static int shadow_set_l1e(struct vcpu *v
int flags = 0;
struct domain *d = v->domain;
shadow_l1e_t old_sl1e;
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+ mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
+#endif
ASSERT(sl1e != NULL);
old_sl1e = *sl1e;
@@ -1363,8 +1428,18 @@ static int shadow_set_l1e(struct vcpu *v
/* Doesn't look like a pagetable. */
flags |= SHADOW_SET_ERROR;
new_sl1e = shadow_l1e_empty();
- } else {
+ }
+ else
+ {
shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+ if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
+ && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) )
+ {
+ oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
+ }
+#endif
+
}
}
}
@@ -2532,6 +2607,9 @@ static int validate_gl1e(struct vcpu *v,
mfn_t gmfn;
p2m_type_t p2mt;
int result = 0;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ mfn_t gl1mfn;
+#endif /* OOS */
perfc_incr(shadow_validate_gl1e_calls);
@@ -2539,10 +2617,138 @@ static int validate_gl1e(struct vcpu *v,
gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
+ result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+ if ( mfn_valid(gl1mfn)
+ && mfn_is_out_of_sync(gl1mfn) )
+ {
+ /* Update the OOS snapshot. */
+ mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
+ guest_l1e_t *snp;
+
+ ASSERT(mfn_valid(snpmfn));
+
+ snp = sh_map_domain_page(snpmfn);
+ snp[guest_index(new_ge)] = new_gl1e;
+ sh_unmap_domain_page(snp);
+ }
+#endif /* OOS */
+
+ return result;
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Special validation function for re-syncing out-of-sync shadows.
+ * Walks the *shadow* page, and for every entry that it finds,
+ * revalidates the guest entry that corresponds to it.
+ * N.B. This function is called with the vcpu that unsynced the page,
+ * *not* the one that is causing it to be resynced. */
+void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
+{
+ mfn_t sl1mfn;
+ shadow_l1e_t *sl1p;
+ guest_l1e_t *gl1p, *gp, *snp;
+ int rc = 0;
+
+ ASSERT(mfn_valid(snpmfn));
+
+ sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+ ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
+
+ snp = sh_map_domain_page(snpmfn);
+ gp = sh_map_domain_page(gl1mfn);
+ gl1p = gp;
+
+ SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
+ guest_l1e_t gl1e = *gl1p;
+ guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
+
+ if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
+ {
+ gfn_t gfn;
+ mfn_t gmfn;
+ p2m_type_t p2mt;
+ shadow_l1e_t nsl1e;
+
+ gfn = guest_l1e_get_gfn(gl1e);
+ gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
+ l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
+ rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
+
+ *snpl1p = gl1e;
+ }
+ });
+
+ sh_unmap_domain_page(gp);
+ sh_unmap_domain_page(snp);
+
+ /* Setting shadow L1 entries should never need us to flush the TLB */
+ ASSERT(!(rc & SHADOW_SET_FLUSH));
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table.
+ * That is: if we can tell that it's only used once, and that the
+ * toplevel shadow responsible is not one of ours.
+ * N.B. This function is called with the vcpu that required the resync,
+ * *not* the one that originally unsynced the page, but it is
+ * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
+int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+ struct shadow_page_info *sp;
+ mfn_t smfn;
+
+ smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+ ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
- result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
- return result;
-}
+ /* Up to l2 */
+ sp = mfn_to_shadow_page(smfn);
+ if ( sp->count != 1 || !sp->up )
+ return 0;
+ smfn = _mfn(sp->up >> PAGE_SHIFT);
+ ASSERT(mfn_valid(smfn));
+
+#if (SHADOW_PAGING_LEVELS == 4)
+ /* up to l3 */
+ sp = mfn_to_shadow_page(smfn);
+ if ( sp->count != 1 || !sp->up )
+ return 0;
+ smfn = _mfn(sp->up >> PAGE_SHIFT);
+ ASSERT(mfn_valid(smfn));
+
+ /* up to l4 */
+ sp = mfn_to_shadow_page(smfn);
+ if ( sp->count != 1
+ || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
+ return 0;
+ smfn = _mfn(sp->up >> PAGE_SHIFT);
+ ASSERT(mfn_valid(smfn));
+
+#if (GUEST_PAGING_LEVELS == 2)
+ /* In 2-on-3 shadow mode the up pointer contains the link to the
+ * shadow page, but the shadow_table contains only the first of the
+ * four pages that makes the PAE top shadow tables. */
+ smfn = _mfn(mfn_x(smfn) & ~0x3UL);
+#endif
+
+#endif
+
+ if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
+#if (SHADOW_PAGING_LEVELS == 3)
+ || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
+ || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
+ || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
+#endif
+ )
+ return 0;
+
+ /* Only in use in one toplevel shadow, and it's not the one we're
+ * running on */
+ return 1;
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
/**************************************************************************/
@@ -2725,6 +2931,10 @@ static void sh_prefetch(struct vcpu *v,
shadow_l1e_t sl1e;
u32 gflags;
p2m_type_t p2mt;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ guest_l1e_t *snpl1p = NULL;
+#endif /* OOS */
+
/* Prefetch no further than the end of the _shadow_ l1 MFN */
dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
@@ -2737,6 +2947,17 @@ static void sh_prefetch(struct vcpu *v,
/* Normal guest page; grab the next guest entry */
gl1p = sh_map_domain_page(gw->l1mfn);
gl1p += guest_l1_table_offset(gw->va);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( mfn_is_out_of_sync(gw->l1mfn) )
+ {
+ mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
+
+ ASSERT(mfn_valid(snpmfn));
+ snpl1p = sh_map_domain_page(snpmfn);
+ snpl1p += guest_l1_table_offset(gw->va);
+ }
+#endif /* OOS */
}
for ( i = 1; i < dist ; i++ )
@@ -2774,9 +2995,18 @@ static void sh_prefetch(struct vcpu *v,
/* Propagate the entry. */
l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
(void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( snpl1p != NULL )
+ snpl1p[i] = gl1e;
+#endif /* OOS */
}
if ( gl1p != NULL )
sh_unmap_domain_page(gl1p);
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( snpl1p != NULL )
+ sh_unmap_domain_page(snpl1p);
+#endif /* OOS */
}
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
@@ -2805,6 +3035,7 @@ static int sh_page_fault(struct vcpu *v,
int r;
fetch_type_t ft = 0;
p2m_type_t p2mt;
+ uint32_t rc;
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
int fast_emul = 0;
#endif
@@ -2830,6 +3061,17 @@ static int sh_page_fault(struct vcpu *v,
{
fast_emul = 1;
gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Fall back to the slow path if we're trying to emulate
+ writes to an out of sync page. */
+ if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
+ {
+ v->arch.paging.last_write_emul_ok = 0;
+ goto page_fault_slow_path;
+ }
+#endif /* OOS */
+
perfc_incr(shadow_fault_fast_emulate);
goto early_emulation;
}
@@ -2855,6 +3097,31 @@ static int sh_page_fault(struct vcpu *v,
sizeof(sl1e)) == 0)
&& sh_l1e_is_magic(sl1e)) )
{
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* First, need to check that this isn't an out-of-sync
+ * shadow l1e. If it is, we fall back to the slow path, which
+ * will sync it up again. */
+ {
+ shadow_l2e_t sl2e;
+ mfn_t gl1mfn;
+ if ( (__copy_from_user(&sl2e,
+ (sh_linear_l2_table(v)
+ + shadow_l2_linear_offset(va)),
+ sizeof(sl2e)) != 0)
+ || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
+ || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
+ shadow_l2e_get_mfn(sl2e))->backpointer))
+ || unlikely(mfn_is_out_of_sync(gl1mfn)) )
+ {
+ /* Hit the slow path as if there had been no
+ * shadow entry at all, and let it tidy up */
+ ASSERT(regs->error_code & PFEC_page_present);
+ regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+ goto page_fault_slow_path;
+ }
+ }
+#endif /* SHOPT_OUT_OF_SYNC */
+
if ( sh_l1e_is_gnp(sl1e) )
{
/* Not-present in a guest PT: pass to the guest as
@@ -2890,6 +3157,10 @@ static int sh_page_fault(struct vcpu *v,
return EXCRET_fault_fixed;
}
}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ page_fault_slow_path:
+#endif
#endif /* SHOPT_FAST_FAULT_PATH */
/* Detect if this page fault happened while we were already in Xen
@@ -2904,7 +3175,21 @@ static int sh_page_fault(struct vcpu *v,
return 0;
}
- if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
+ rewalk:
+ rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( !(rc & _PAGE_PRESENT) )
+ regs->error_code |= PFEC_page_present;
+ else if ( regs->error_code & PFEC_page_present )
+ {
+ SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
+ " flushing. Have fun debugging it.\n");
+ regs->error_code &= ~PFEC_page_present;
+ }
+#endif
+
+ if ( rc != 0 )
{
perfc_incr(shadow_fault_bail_real_fault);
SHADOW_PRINTK("not a shadow fault\n");
@@ -2948,7 +3233,10 @@ static int sh_page_fault(struct vcpu *v,
shadow_lock(d);
- if ( gw_remove_write_accesses(v, va, &gw) )
+ rc = gw_remove_write_accesses(v, va, &gw);
+
+ /* First bit set: Removed write access to a page. */
+ if ( rc & GW_RMWR_FLUSHTLB )
{
/* Write permission removal is also a hint that other gwalks
* overlapping with this one may be inconsistent
@@ -2958,11 +3246,20 @@ static int sh_page_fault(struct vcpu *v,
flush_tlb_mask(d->domain_dirty_cpumask);
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Second bit set: Resynced a page. Re-walk needed. */
+ if ( rc & GW_RMWR_REWALK )
+ {
+ shadow_unlock(d);
+ goto rewalk;
+ }
+#endif /* OOS */
+
if ( !shadow_check_gwalk(v, va, &gw) )
{
perfc_incr(shadow_inconsistent_gwalk);
shadow_unlock(d);
- return EXCRET_fault_fixed;
+ goto rewalk;
}
shadow_audit_tables(v);
@@ -2991,17 +3288,45 @@ static int sh_page_fault(struct vcpu *v,
return 0;
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Always unsync when writing to L1 page tables. */
+ if ( sh_mfn_is_a_page_table(gmfn)
+ && ft == ft_demand_write )
+ sh_unsync(v, gmfn, va);
+#endif /* OOS */
+
/* Calculate the shadow entry and write it */
l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( mfn_valid(gw.l1mfn)
+ && mfn_is_out_of_sync(gw.l1mfn) )
+ {
+ /* Update the OOS snapshot. */
+ mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
+ guest_l1e_t *snp;
+
+ ASSERT(mfn_valid(snpmfn));
+
+ snp = sh_map_domain_page(snpmfn);
+ snp[guest_l1_table_offset(va)] = gw.l1e;
+ sh_unmap_domain_page(snp);
+ }
+#endif /* OOS */
+
#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
/* Prefetch some more shadow entries */
sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
#endif
/* Need to emulate accesses to page tables */
- if ( sh_mfn_is_a_page_table(gmfn) )
+ if ( sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Unless they've been allowed to go out of sync with their shadows */
+ && !mfn_is_out_of_sync(gmfn)
+#endif
+ )
{
if ( ft == ft_demand_write )
{
@@ -3215,6 +3540,7 @@ sh_invlpg(struct vcpu *v, unsigned long
* instruction should be issued on the hardware, or 0 if it's safe not
* to do so. */
{
+ mfn_t sl1mfn;
shadow_l2e_t sl2e;
perfc_incr(shadow_invlpg);
@@ -3278,12 +3604,64 @@ sh_invlpg(struct vcpu *v, unsigned long
// If so, then we'll need to flush the entire TLB (because that's
// easier than invalidating all of the individual 4K pages).
//
- if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
+ sl1mfn = shadow_l2e_get_mfn(sl2e);
+ if ( mfn_to_shadow_page(sl1mfn)->type
== SH_type_fl1_shadow )
{
flush_tlb_local();
return 0;
}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Check to see if the SL1 is out of sync. */
+ {
+ mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+ struct page_info *pg = mfn_to_page(gl1mfn);
+ if ( mfn_valid(gl1mfn)
+ && page_is_out_of_sync(pg) )
+ {
+ /* The test above may give false positives, since we don't
+ * hold the shadow lock yet. Check again with the lock held. */
+ shadow_lock(v->domain);
+
+ /* This must still be a copy-from-user because we didn't
+ * have the shadow lock last time we checked, and the
+ * higher-level shadows might have disappeared under our
+ * feet. */
+ if ( __copy_from_user(&sl2e,
+ sh_linear_l2_table(v)
+ + shadow_l2_linear_offset(va),
+ sizeof (sl2e)) != 0 )
+ {
+ perfc_incr(shadow_invlpg_fault);
+ shadow_unlock(v->domain);
+ return 0;
+ }
+
+ if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+ {
+ shadow_unlock(v->domain);
+ return 0;
+ }
+
+ sl1mfn = shadow_l2e_get_mfn(sl2e);
+ gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+ pg = mfn_to_page(gl1mfn);
+
+ if ( likely(sh_mfn_is_a_page_table(gl1mfn)
+ && page_is_out_of_sync(pg) ) )
+ {
+ shadow_l1e_t *sl1;
+ sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+ /* Remove the shadow entry that maps this VA */
+ (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
+ }
+ shadow_unlock(v->domain);
+ /* Need the invlpg to pick up the disappearance of the sl1e */
+ return 1;
+ }
+ }
+#endif
return 1;
}
@@ -3710,6 +4088,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
return;
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Need to resync all the shadow entries on a TLB flush. Resync
+ * the current vcpu's OOS pages before switching to the new shadow
+ * tables so that the VA hint is still valid. */
+ shadow_resync_current_vcpu(v, do_locking);
+#endif
+
if ( do_locking ) shadow_lock(v->domain);
ASSERT(shadow_locked_by_me(v->domain));
@@ -3938,11 +4323,70 @@ sh_update_cr3(struct vcpu *v, int do_loc
/* Release the lock, if we took it (otherwise it's the caller's problem) */
if ( do_locking ) shadow_unlock(v->domain);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Need to resync all the shadow entries on a TLB flush. We only
+ * update the shadows, leaving the pages out of sync. Also, we try
+ * to skip synchronization of shadows not mapped in the new
+ * tables. */
+ shadow_sync_other_vcpus(v, do_locking);
+#endif
+
}
/**************************************************************************/
/* Functions to revoke guest rights */
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+ mfn_t smfn, unsigned long off)
+{
+ int r;
+ shadow_l1e_t *sl1p, sl1e;
+ struct shadow_page_info *sp;
+
+ ASSERT(mfn_valid(gmfn));
+ ASSERT(mfn_valid(smfn));
+
+ sp = mfn_to_shadow_page(smfn);
+
+ if ( sp->mbz != 0 ||
+#if GUEST_PAGING_LEVELS == 4
+ (sp->type != SH_type_l1_64_shadow)
+#elif GUEST_PAGING_LEVELS == 3
+ (sp->type != SH_type_l1_pae_shadow)
+#elif GUEST_PAGING_LEVELS == 2
+ (sp->type != SH_type_l1_32_shadow)
+#endif
+ )
+ goto fail;
+
+ sl1p = sh_map_domain_page(smfn);
+ sl1p += off;
+ sl1e = *sl1p;
+ if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
+ != (_PAGE_PRESENT|_PAGE_RW))
+ || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
+ {
+ sh_unmap_domain_page(sl1p);
+ goto fail;
+ }
+
+ /* Found it! Need to remove its write permissions. */
+ sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
+ r = shadow_set_l1e(v, sl1p, sl1e, smfn);
+ ASSERT( !(r & SHADOW_SET_ERROR) );
+
+ sh_unmap_domain_page(sl1p);
+ perfc_incr(shadow_writeable_h_7);
+ return 1;
+
+ fail:
+ perfc_incr(shadow_writeable_h_8);
+ return 0;
+}
+#endif /* OOS */
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
@@ -4437,23 +4881,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
-#define AUDIT_FAIL(_level, _fmt, _a...) do { \
- printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
- "gl" #_level "mfn = %" PRI_mfn \
- " sl" #_level "mfn = %" PRI_mfn \
- " &gl" #_level "e = %p &sl" #_level "e = %p" \
- " gl" #_level "e = %" SH_PRI_gpte \
- " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
- GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
- _level, guest_index(gl ## _level ## e), \
- mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
- gl ## _level ## e, sl ## _level ## e, \
- gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
- ##_a); \
- BUG(); \
- done = 1; \
+#define AUDIT_FAIL(_level, _fmt, _a...) do { \
+ printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
+ "gl" #_level "mfn = %" PRI_mfn \
+ " sl" #_level "mfn = %" PRI_mfn \
+ " &gl" #_level "e = %p &sl" #_level "e = %p" \
+ " gl" #_level "e = %" SH_PRI_gpte \
+ " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
+ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
+ _level, guest_index(gl ## _level ## e), \
+ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
+ gl ## _level ## e, sl ## _level ## e, \
+ gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+ ##_a); \
+ BUG(); \
+ done = 1; \
} while (0)
+#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
+ printk("Shadow %u-on-%u audit failed at level %i\n" \
+ "gl" #_level "mfn = %" PRI_mfn \
+ " sl" #_level "mfn = %" PRI_mfn \
+ " Error: " _fmt "\n", \
+ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
+ _level, \
+ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
+ ##_a); \
+ BUG(); \
+ done = 1; \
+} while (0)
static char * sh_audit_flags(struct vcpu *v, int level,
int gflags, int sflags)
@@ -4494,6 +4950,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
/* Follow the backpointer */
gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
+ if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
+ {
+ oos_audit_hash_is_present(v->domain, gl1mfn);
+ return 0;
+ }
+#endif
+
gl1e = gp = sh_map_domain_page(gl1mfn);
SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
@@ -4574,6 +5040,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
/* Follow the backpointer */
gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Only L1's may be out of sync. */
+ if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
+ AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
+#endif
+
gl2e = gp = sh_map_domain_page(gl2mfn);
SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
@@ -4616,6 +5089,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
/* Follow the backpointer */
gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Only L1's may be out of sync. */
+ if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
+ AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
+#endif
+
gl3e = gp = sh_map_domain_page(gl3mfn);
SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
@@ -4656,6 +5136,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
/* Follow the backpointer */
gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Only L1's may be out of sync. */
+ if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
+ AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
+#endif
+
gl4e = gp = sh_map_domain_page(gl4mfn);
SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
{
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.h Wed Jul 02 11:30:37 2008 +0900
@@ -115,3 +115,17 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
extern struct paging_mode
SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+extern void
+SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t gmfn, mfn_t snpmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
+ (struct vcpu*v, mfn_t gmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+#endif
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/private.h Wed Jul 02 11:30:37 2008 +0900
@@ -63,8 +63,9 @@ extern int shadow_audit_enable;
#define SHOPT_SKIP_VERIFY 0x20 /* Skip PTE v'fy when safe to do so */
#define SHOPT_VIRTUAL_TLB 0x40 /* Cache guest v->p translations */
#define SHOPT_FAST_EMULATION 0x80 /* Fast write emulation */
-
-#define SHADOW_OPTIMIZATIONS 0xff
+#define SHOPT_OUT_OF_SYNC 0x100 /* Allow guest writes to L1 PTs */
+
+#define SHADOW_OPTIMIZATIONS 0x1ff
/******************************************************************************
@@ -195,9 +196,9 @@ struct shadow_page_info
u32 tlbflush_timestamp;
};
struct {
- unsigned int type:4; /* What kind of shadow is this? */
+ unsigned int type:5; /* What kind of shadow is this? */
unsigned int pinned:1; /* Is the shadow pinned? */
- unsigned int count:27; /* Reference count */
+ unsigned int count:26; /* Reference count */
u32 mbz; /* Must be zero: this is where the owner
* field lives in a non-shadow page */
} __attribute__((packed));
@@ -242,7 +243,8 @@ static inline void shadow_check_page_str
#define SH_type_max_shadow (13U)
#define SH_type_p2m_table (14U) /* in use as the p2m table */
#define SH_type_monitor_table (15U) /* in use as a monitor table */
-#define SH_type_unused (16U)
+#define SH_type_oos_snapshot (16U) /* in use as OOS snapshot */
+#define SH_type_unused (17U)
/*
* What counts as a pinnable shadow?
@@ -301,6 +303,72 @@ static inline int sh_type_is_pinnable(st
#define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
#define SHF_64 (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
+#define SHF_L1_ANY (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Marks a guest L1 page table which is shadowed but not write-protected.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed.
+ *
+ * out_of_sync indicates that the shadow tables may not reflect the
+ * guest tables. If it is clear, then the shadow tables *must* reflect
+ * the guest tables.
+ *
+ * oos_may_write indicates that a page may have writable mappings.
+ *
+ * Most of the time the flags are synonymous. There is a short period of time
+ * during resync that oos_may_write is clear but out_of_sync is not. If a
+ * codepath is called during that time and is sensitive to oos issues, it may
+ * need to use the second flag.
+ */
+#define SHF_out_of_sync (1u<<30)
+#define SHF_oos_may_write (1u<<29)
+
+/* Fixup tables are a non-complete writable-mappings reverse map for
+ OOS pages. This lets us quickly resync pages (avoiding brute-force
+ search of the shadows) when the va hint is not sufficient (i.e.,
+ the pagetable is mapped in multiple places and in multiple
+ shadows.) */
+#define SHADOW_OOS_FT_ENTRIES \
+ ((PAGE_SIZE << SHADOW_OOS_FT_ORDER) \
+ / (SHADOW_OOS_FT_HASH * sizeof(struct oos_fixup)))
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
+static inline int sh_page_has_multiple_shadows(struct page_info *pg)
+{
+ u32 shadows;
+ if ( !(pg->count_info & PGC_page_table) )
+ return 0;
+ shadows = pg->shadow_flags & SHF_page_type_mask;
+ /* More than one type bit set in shadow-flags? */
+ return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* The caller must verify this is reasonable to call; i.e., valid mfn,
+ * domain is translated, &c */
+static inline int page_is_out_of_sync(struct page_info *p)
+{
+ return (p->count_info & PGC_page_table)
+ && (p->shadow_flags & SHF_out_of_sync);
+}
+
+static inline int mfn_is_out_of_sync(mfn_t gmfn)
+{
+ return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
+}
+
+static inline int page_oos_may_write(struct page_info *p)
+{
+ return (p->count_info & PGC_page_table)
+ && (p->shadow_flags & SHF_oos_may_write);
+}
+
+static inline int mfn_oos_may_write(mfn_t gmfn)
+{
+ return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
/******************************************************************************
* Various function declarations
@@ -351,7 +419,57 @@ int shadow_cmpxchg_guest_entry(struct vc
int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
intpte_t *old, intpte_t new, mfn_t gmfn);
-
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
+
+/* Pull an out-of-sync page back into sync. */
+void sh_resync(struct vcpu *v, mfn_t gmfn);
+
+void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+
+int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+ mfn_t smfn, unsigned long offset);
+
+/* Pull all out-of-sync shadows back into sync. If skip != 0, we try
+ * to avoid resyncing where we think we can get away with it. */
+
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking);
+
+static inline void
+shadow_resync_all(struct vcpu *v, int do_locking)
+{
+ sh_resync_all(v,
+ 0 /* skip */,
+ 1 /* this */,
+ 1 /* others */,
+ do_locking);
+}
+
+static inline void
+shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
+{
+ sh_resync_all(v,
+ 0 /* skip */,
+ 1 /* this */,
+ 0 /* others */,
+ do_locking);
+}
+
+static inline void
+shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
+{
+ sh_resync_all(v,
+ 1 /* skip */,
+ 0 /* this */,
+ 1 /* others */,
+ do_locking);
+}
+
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
+mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn);
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
/******************************************************************************
* Flags used in the return value of the shadow_set_lXe() functions...
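
As a minimal sketch of how the out-of-sync predicates and resync entry points declared above are meant to combine (the wrapper below is illustrative and not part of the patch; it assumes the caller already holds the domain's shadow lock, as sh_resync()'s existing callers do):

    /* Illustrative only: bring one guest page back into sync if needed. */
    static inline void example_resync_one(struct vcpu *v, mfn_t gmfn)
    {
        if ( !mfn_valid(gmfn) )
            return;
        if ( mfn_is_out_of_sync(gmfn) )
            sh_resync(v, gmfn);
        /* During resync there is a window where out_of_sync is still set but
         * oos_may_write has already been cleared; code that only cares about
         * writable mappings should test mfn_oos_may_write() instead. */
    }
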
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/types.h Wed Jul 02 11:30:37 2008 +0900
@@ -438,6 +438,11 @@ struct shadow_walk_t
#define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap)
#define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry)
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+#define sh_resync_l1 INTERNAL_NAME(sh_resync_l1)
+#define sh_safe_not_to_sync INTERNAL_NAME(sh_safe_not_to_sync)
+#define sh_rm_write_access_from_sl1p INTERNAL_NAME(sh_rm_write_access_from_sl1p)
+#endif
/* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
#define sh_guest_map_l1e \
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/platform_hypercall.c Wed Jul 02 11:30:37 2008 +0900
@@ -408,7 +408,12 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
cpu_count++;
}
if ( cpu_count == num_online_cpus() )
- ret = acpi_cpufreq_init();
+ {
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+ ret = powernow_cpufreq_init();
+ else
+ ret = acpi_cpufreq_init();
+ }
break;
}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.c
--- a/xen/arch/x86/x86_emulate/x86_emulate.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c Wed Jul 02 11:30:37 2008 +0900
@@ -142,12 +142,14 @@ static uint8_t opcode_table[256] = {
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
/* 0xD0 - 0xD7 */
- ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
- ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
+ ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
+ ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
/* 0xD8 - 0xDF */
- 0, ImplicitOps|ModRM|Mov, 0, ImplicitOps|ModRM|Mov,
- 0, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+ ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+ ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+ ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+ ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
/* 0xE0 - 0xE7 */
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
@@ -216,7 +218,7 @@ static uint8_t twobyte_table[256] = {
ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
/* 0xA0 - 0xA7 */
ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
- DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0,
+ DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0,
/* 0xA8 - 0xAF */
ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM,
DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstReg|SrcMem|ModRM,
@@ -246,8 +248,20 @@ static uint8_t twobyte_table[256] = {
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
- unsigned int bytes;
- unsigned long val, orig_val;
+ unsigned int bytes;
+
+ /* Up to 128-bit operand value, addressable as ulong or uint32_t[]. */
+ union {
+ unsigned long val;
+ uint32_t bigval[4];
+ };
+
+ /* Up to 128-bit original operand value, addressable as ulong or uint32_t[]. */
+ union {
+ unsigned long orig_val;
+ uint32_t orig_bigval[4];
+ };
+
union {
/* OP_REG: Pointer to register field. */
unsigned long *reg;
@@ -466,7 +480,7 @@ do{ asm volatile (
/* Fetch next part of the instruction being emulated. */
#define insn_fetch_bytes(_size) \
-({ unsigned long _x, _eip = _regs.eip; \
+({ unsigned long _x = 0, _eip = _regs.eip; \
if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \
_regs.eip += (_size); /* real hardware doesn't truncate */ \
generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15, \
@@ -594,6 +608,18 @@ do{ struct fpu_insn_ctxt fic;
put_fpu(&fic); \
} while (0)
+#define emulate_fpu_insn_memsrc(_op, _arg) \
+do{ struct fpu_insn_ctxt fic; \
+ get_fpu(X86EMUL_FPU_fpu, &fic); \
+ asm volatile ( \
+ "movb $2f-1f,%0 \n" \
+ "1: " _op " %1 \n" \
+ "2: \n" \
+ : "=m" (fic.insn_bytes) \
+ : "m" (_arg) : "memory" ); \
+ put_fpu(&fic); \
+} while (0)
+
#define emulate_fpu_insn_stub(_bytes...) \
do{ uint8_t stub[] = { _bytes, 0xc3 }; \
struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; \
@@ -654,6 +680,19 @@ static void __put_rep_prefix(
if ( rep_prefix ) \
__put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
})
+
+/* Compatibility function: read guest memory, zero-extend result to a ulong. */
+static int read_ulong(
+ enum x86_segment seg,
+ unsigned long offset,
+ unsigned long *val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ *val = 0;
+ return ops->read(seg, offset, val, bytes, ctxt);
+}
/*
* Unsigned multiplication with double-word result.
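
read_ulong() exists because ops->read now fills a caller-supplied buffer (see the x86_emulate.h hunk below, where the unsigned long *val parameter becomes void *p_data): zeroing *val first preserves the old zero-extension guarantee for call sites that still want an unsigned long. A minimal sketch of a backend read callback under the new interface, against a hypothetical flat mapping of guest memory (guest_base is assumed, not part of this patch):

    /* Illustrative only: flat-memory ops->read with the void *p_data interface. */
    static int example_read(enum x86_segment seg, unsigned long offset,
                            void *p_data, unsigned int bytes,
                            struct x86_emulate_ctxt *ctxt)
    {
        extern uint8_t *guest_base;                  /* assumed mapping */
        memcpy(p_data, guest_base + offset, bytes);  /* any 0 < bytes < 4096 */
        return X86EMUL_OKAY;
    }

A 2-byte read routed through read_ulong() then yields the same zero-extended unsigned long the old interface produced.
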
@@ -841,7 +880,8 @@ static int ioport_access_check(
(tr.limit < 0x67) )
goto raise_exception;
- if ( (rc = ops->read(x86_seg_none, tr.base + 0x66, &iobmp, 2, ctxt)) )
+ if ( (rc = read_ulong(x86_seg_none, tr.base + 0x66,
+ &iobmp, 2, ctxt, ops)) )
return rc;
/* Ensure TSS includes two bytes including byte containing first port. */
@@ -849,7 +889,8 @@ static int ioport_access_check(
if ( tr.limit <= iobmp )
goto raise_exception;
- if ( (rc = ops->read(x86_seg_none, tr.base + iobmp, &iobmp, 2, ctxt)) )
+ if ( (rc = read_ulong(x86_seg_none, tr.base + iobmp,
+ &iobmp, 2, ctxt, ops)) )
return rc;
if ( (iobmp & (((1<<bytes)-1) << (first_port&7))) != 0 )
goto raise_exception;
@@ -941,12 +982,12 @@ protmode_load_seg(
goto raise_exn;
do {
- if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8),
- &val, 4, ctxt)) )
+ if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8),
+ &val, 4, ctxt, ops)) )
return rc;
desc.a = val;
- if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
- &val, 4, ctxt)) )
+ if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
+ &val, 4, ctxt, ops)) )
return rc;
desc.b = val;
@@ -992,14 +1033,15 @@ protmode_load_seg(
if ( (desc.b & (5u<<9)) == (4u<<9) )
goto raise_exn;
/* Non-conforming segment: check DPL against RPL and CPL. */
- if ( ((desc.b & (6u<<9)) != (6u<<9)) && ((dpl < cpl) || (dpl < rpl)) )
+ if ( ((desc.b & (6u<<9)) != (6u<<9)) &&
+ ((dpl < cpl) || (dpl < rpl)) )
goto raise_exn;
break;
}
/* Ensure Accessed flag is set. */
new_desc_b = desc.b | 0x100;
- rc = ((desc.b & 0x100) ? X86EMUL_OKAY :
+ rc = ((desc.b & 0x100) ? X86EMUL_OKAY :
ops->cmpxchg(
x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
&desc.b, &new_desc_b, 4, ctxt));
@@ -1061,16 +1103,16 @@ decode_register(
case 2: p = &regs->edx; break;
case 3: p = &regs->ebx; break;
case 4: p = (highbyte_regs ?
- ((unsigned char *)&regs->eax + 1) :
+ ((unsigned char *)&regs->eax + 1) :
(unsigned char *)&regs->esp); break;
case 5: p = (highbyte_regs ?
- ((unsigned char *)&regs->ecx + 1) :
+ ((unsigned char *)&regs->ecx + 1) :
(unsigned char *)&regs->ebp); break;
case 6: p = (highbyte_regs ?
- ((unsigned char *)&regs->edx + 1) :
+ ((unsigned char *)&regs->edx + 1) :
(unsigned char *)&regs->esi); break;
case 7: p = (highbyte_regs ?
- ((unsigned char *)&regs->ebx + 1) :
+ ((unsigned char *)&regs->ebx + 1) :
(unsigned char *)&regs->edi); break;
#if defined(__x86_64__)
case 8: p = &regs->r8; break;
@@ -1402,8 +1444,8 @@ x86_emulate(
case 8: src.val = *(uint64_t *)src.reg; break;
}
}
- else if ( (rc = ops->read(src.mem.seg, src.mem.off,
- &src.val, src.bytes, ctxt)) )
+ else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
+ &src.val, src.bytes, ctxt, ops)) )
goto done;
break;
case SrcImm:
@@ -1494,8 +1536,8 @@ x86_emulate(
}
else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
{
- if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
- &dst.val, dst.bytes, ctxt)) )
+ if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+ &dst.val, dst.bytes, ctxt, ops)) )
goto done;
dst.orig_val = dst.val;
}
@@ -1571,8 +1613,8 @@ x86_emulate(
int lb, ub, idx;
generate_exception_if(mode_64bit() || (src.type != OP_MEM),
EXC_UD, -1);
- if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes,
- &src_val2, op_bytes, ctxt)) )
+ if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes,
+ &src_val2, op_bytes, ctxt, ops)) )
goto done;
ub = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2;
lb = (op_bytes == 2) ? (int16_t)src.val : (int32_t)src.val;
@@ -1588,8 +1630,8 @@ x86_emulate(
/* movsxd */
if ( src.type == OP_REG )
src.val = *(int32_t *)src.reg;
- else if ( (rc = ops->read(src.mem.seg, src.mem.off,
- &src.val, 4, ctxt)) )
+ else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
+ &src.val, 4, ctxt, ops)) )
goto done;
dst.val = (int32_t)src.val;
}
@@ -1613,8 +1655,8 @@ x86_emulate(
unsigned long src1; /* ModR/M source operand */
if ( ea.type == OP_REG )
src1 = *ea.reg;
- else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
- &src1, op_bytes, ctxt)) )
+ else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
+ &src1, op_bytes, ctxt, ops)) )
goto done;
_regs.eflags &= ~(EFLG_OF|EFLG_CF);
switch ( dst.bytes )
@@ -1720,8 +1762,8 @@ x86_emulate(
/* 64-bit mode: POP defaults to a 64-bit operand. */
if ( mode_64bit() && (dst.bytes == 4) )
dst.bytes = 8;
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
- &dst.val, dst.bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+ &dst.val, dst.bytes, ctxt, ops)) != 0 )
goto done;
break;
@@ -1773,8 +1815,8 @@ x86_emulate(
dst.val = x86_seg_es;
les: /* dst.val identifies the segment */
generate_exception_if(src.type != OP_MEM, EXC_UD, -1);
- if ( (rc = ops->read(src.mem.seg, src.mem.off + src.bytes,
- &sel, 2, ctxt)) != 0 )
+ if ( (rc = read_ulong(src.mem.seg, src.mem.off + src.bytes,
+ &sel, 2, ctxt, ops)) != 0 )
goto done;
if ( (rc = load_seg(dst.val, (uint16_t)sel, ctxt, ops)) != 0 )
goto done;
@@ -2020,8 +2062,8 @@ x86_emulate(
dst.bytes = op_bytes = 8;
if ( dst.type == OP_REG )
dst.val = *dst.reg;
- else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
- &dst.val, 8, ctxt)) != 0 )
+ else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+ &dst.val, 8, ctxt, ops)) != 0 )
goto done;
}
src.val = _regs.eip;
@@ -2036,8 +2078,8 @@ x86_emulate(
generate_exception_if(dst.type != OP_MEM, EXC_UD, -1);
- if ( (rc = ops->read(dst.mem.seg, dst.mem.off+dst.bytes,
- &sel, 2, ctxt)) )
+ if ( (rc = read_ulong(dst.mem.seg, dst.mem.off+dst.bytes,
+ &sel, 2, ctxt, ops)) )
goto done;
if ( (modrm_reg & 7) == 3 ) /* call */
@@ -2046,9 +2088,9 @@ x86_emulate(
fail_if(ops->read_segment == NULL);
if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
(rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
- reg.sel, op_bytes, ctxt)) ||
+ &reg.sel, op_bytes, ctxt)) ||
(rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
- _regs.eip, op_bytes, ctxt)) )
+ &_regs.eip, op_bytes, ctxt)) )
goto done;
}
@@ -2066,12 +2108,12 @@ x86_emulate(
dst.bytes = 8;
if ( dst.type == OP_REG )
dst.val = *dst.reg;
- else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
- &dst.val, 8, ctxt)) != 0 )
+ else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+ &dst.val, 8, ctxt, ops)) != 0 )
goto done;
}
if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
- dst.val, dst.bytes, ctxt)) != 0 )
+ &dst.val, dst.bytes, ctxt)) != 0 )
goto done;
dst.type = OP_NONE;
break;
@@ -2106,7 +2148,7 @@ x86_emulate(
&dst.val, dst.bytes, ctxt);
else
rc = ops->write(
- dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt);
+ dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
if ( rc != 0 )
goto done;
default:
@@ -2153,7 +2195,7 @@ x86_emulate(
if ( mode_64bit() && (op_bytes == 4) )
op_bytes = 8;
if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
- reg.sel, op_bytes, ctxt)) != 0 )
+ &reg.sel, op_bytes, ctxt)) != 0 )
goto done;
break;
}
@@ -2165,8 +2207,8 @@ x86_emulate(
/* 64-bit mode: POP defaults to a 64-bit operand. */
if ( mode_64bit() && (op_bytes == 4) )
op_bytes = 8;
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
- &dst.val, op_bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+ &dst.val, op_bytes, ctxt, ops)) != 0 )
goto done;
if ( (rc = load_seg(src.val, (uint16_t)dst.val, ctxt, ops)) != 0 )
return rc;
@@ -2275,8 +2317,8 @@ x86_emulate(
dst.bytes = op_bytes;
if ( mode_64bit() && (dst.bytes == 4) )
dst.bytes = 8;
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
- &dst.val, dst.bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+ &dst.val, dst.bytes, ctxt, ops)) != 0 )
goto done;
break;
@@ -2288,7 +2330,7 @@ x86_emulate(
generate_exception_if(mode_64bit(), EXC_UD, -1);
for ( i = 0; i < 8; i++ )
if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
- regs[i], op_bytes, ctxt)) != 0 )
+ &regs[i], op_bytes, ctxt)) != 0 )
goto done;
break;
}
@@ -2303,8 +2345,8 @@ x86_emulate(
generate_exception_if(mode_64bit(), EXC_UD, -1);
for ( i = 0; i < 8; i++ )
{
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
- &dst.val, op_bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+ &dst.val, op_bytes, ctxt, ops)) != 0 )
goto done;
switch ( op_bytes )
{
@@ -2382,8 +2424,8 @@ x86_emulate(
}
else
{
- if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
- &dst.val, dst.bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+ &dst.val, dst.bytes, ctxt, ops)) != 0 )
goto done;
fail_if(ops->write_io == NULL);
if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
@@ -2455,9 +2497,9 @@ x86_emulate(
if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
(rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
- reg.sel, op_bytes, ctxt)) ||
+ &reg.sel, op_bytes, ctxt)) ||
(rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
- _regs.eip, op_bytes, ctxt)) )
+ &_regs.eip, op_bytes, ctxt)) )
goto done;
if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 )
@@ -2483,8 +2525,8 @@ x86_emulate(
/* 64-bit mode: POP defaults to a 64-bit operand. */
if ( mode_64bit() && (op_bytes == 4) )
op_bytes = 8;
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
- &dst.val, op_bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+ &dst.val, op_bytes, ctxt, ops)) != 0 )
goto done;
if ( op_bytes == 2 )
dst.val = (uint16_t)dst.val | (_regs.eflags & 0xffff0000u);
@@ -2507,8 +2549,8 @@ x86_emulate(
dst.type = OP_REG;
dst.reg = (unsigned long *)&_regs.eax;
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes),
- &dst.val, dst.bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes),
+ &dst.val, dst.bytes, ctxt, ops)) != 0 )
goto done;
break;
@@ -2536,8 +2578,8 @@ x86_emulate(
}
else
{
- if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
- &dst.val, dst.bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+ &dst.val, dst.bytes, ctxt, ops)) != 0 )
goto done;
dst.type = OP_MEM;
nr_reps = 1;
@@ -2556,10 +2598,10 @@ x86_emulate(
unsigned long next_eip = _regs.eip;
get_rep_prefix();
src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
- &dst.val, dst.bytes, ctxt)) ||
- (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
- &src.val, src.bytes, ctxt)) )
+ if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+ &dst.val, dst.bytes, ctxt, ops)) ||
+ (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
+ &src.val, src.bytes, ctxt, ops)) )
goto done;
register_address_increment(
_regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -2592,8 +2634,8 @@ x86_emulate(
dst.type = OP_REG;
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.reg = (unsigned long *)&_regs.eax;
- if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
- &dst.val, dst.bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+ &dst.val, dst.bytes, ctxt, ops)) != 0 )
goto done;
register_address_increment(
_regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -2606,8 +2648,8 @@ x86_emulate(
get_rep_prefix();
src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.val = _regs.eax;
- if ( (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
- &src.val, src.bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
+ &src.val, src.bytes, ctxt, ops)) != 0 )
goto done;
register_address_increment(
_regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes);
@@ -2624,8 +2666,8 @@ x86_emulate(
case 0xc3: /* ret (near) */ {
int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0;
op_bytes = mode_64bit() ? 8 : op_bytes;
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
- &dst.val, op_bytes, ctxt)) != 0 )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
+ &dst.val, op_bytes, ctxt, ops)) != 0 )
goto done;
_regs.eip = dst.val;
break;
@@ -2640,7 +2682,7 @@ x86_emulate(
dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
dst.reg = (unsigned long *)&_regs.ebp;
if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
- _regs.ebp, dst.bytes, ctxt)) )
+ &_regs.ebp, dst.bytes, ctxt)) )
goto done;
dst.val = _regs.esp;
@@ -2650,14 +2692,14 @@ x86_emulate(
{
unsigned long ebp, temp_data;
ebp = truncate_word(_regs.ebp - i*dst.bytes, ctxt->sp_size/8);
- if ( (rc = ops->read(x86_seg_ss, ebp,
- &temp_data, dst.bytes, ctxt)) ||
+ if ( (rc = read_ulong(x86_seg_ss, ebp,
+ &temp_data, dst.bytes, ctxt, ops)) ||
(rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
- temp_data, dst.bytes, ctxt)) )
+ &temp_data, dst.bytes, ctxt)) )
goto done;
}
if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
- dst.val, dst.bytes, ctxt)) )
+ &dst.val, dst.bytes, ctxt)) )
goto done;
}
@@ -2683,8 +2725,8 @@ x86_emulate(
/* Second writeback, to %%ebp. */
dst.reg = (unsigned long *)&_regs.ebp;
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
- &dst.val, dst.bytes, ctxt)) )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+ &dst.val, dst.bytes, ctxt, ops)) )
goto done;
break;
@@ -2692,10 +2734,10 @@ x86_emulate(
case 0xcb: /* ret (far) */ {
int offset = (b == 0xca) ? insn_fetch_type(uint16_t) : 0;
op_bytes = mode_64bit() ? 8 : op_bytes;
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
- &dst.val, op_bytes, ctxt)) ||
- (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
- &src.val, op_bytes, ctxt)) ||
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+ &dst.val, op_bytes, ctxt, ops)) ||
+ (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
+ &src.val, op_bytes, ctxt, ops)) ||
(rc = load_seg(x86_seg_cs, (uint16_t)src.val, ctxt, ops)) )
goto done;
_regs.eip = dst.val;
@@ -2729,12 +2771,12 @@ x86_emulate(
if ( !mode_iopl() )
mask |= EFLG_IF;
fail_if(!in_realmode(ctxt, ops));
- if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
- &eip, op_bytes, ctxt)) ||
- (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
- &cs, op_bytes, ctxt)) ||
- (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
- &eflags, op_bytes, ctxt)) )
+ if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+ &eip, op_bytes, ctxt, ops)) ||
+ (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+ &cs, op_bytes, ctxt, ops)) ||
+ (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+ &eflags, op_bytes, ctxt, ops)) )
goto done;
if ( op_bytes == 2 )
eflags = (uint16_t)eflags | (_regs.eflags & 0xffff0000u);
@@ -2779,12 +2821,64 @@ x86_emulate(
case 0xd7: /* xlat */ {
unsigned long al = (uint8_t)_regs.eax;
- if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al),
- &al, 1, ctxt)) != 0 )
+ if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.ebx + al),
+ &al, 1, ctxt, ops)) != 0 )
goto done;
*(uint8_t *)&_regs.eax = al;
break;
}
+
+ case 0xd8: /* FPU 0xd8 */
+ switch ( modrm )
+ {
+ case 0xc0 ... 0xc7: /* fadd %stN,%stN */
+ case 0xc8 ... 0xcf: /* fmul %stN,%stN */
+ case 0xd0 ... 0xd7: /* fcom %stN,%stN */
+ case 0xd8 ... 0xdf: /* fcomp %stN,%stN */
+ case 0xe0 ... 0xe7: /* fsub %stN,%stN */
+ case 0xe8 ... 0xef: /* fsubr %stN,%stN */
+ case 0xf0 ... 0xf7: /* fdiv %stN,%stN */
+ case 0xf8 ... 0xff: /* fdivr %stN,%stN */
+ emulate_fpu_insn_stub(0xd8, modrm);
+ break;
+ default:
+ fail_if(modrm >= 0xc0);
+ ea.bytes = 4;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fadd */
+ emulate_fpu_insn_memsrc("fadds", src.val);
+ break;
+ case 1: /* fmul */
+ emulate_fpu_insn_memsrc("fmuls", src.val);
+ break;
+ case 2: /* fcom */
+ emulate_fpu_insn_memsrc("fcoms", src.val);
+ break;
+ case 3: /* fcomp */
+ emulate_fpu_insn_memsrc("fcomps", src.val);
+ break;
+ case 4: /* fsub */
+ emulate_fpu_insn_memsrc("fsubs", src.val);
+ break;
+ case 5: /* fsubr */
+ emulate_fpu_insn_memsrc("fsubrs", src.val);
+ break;
+ case 6: /* fdiv */
+ emulate_fpu_insn_memsrc("fdivs", src.val);
+ break;
+ case 7: /* fdivr */
+ emulate_fpu_insn_memsrc("fdivrs", src.val);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ }
+ break;
case 0xd9: /* FPU 0xd9 */
switch ( modrm )
@@ -2822,28 +2916,269 @@ x86_emulate(
emulate_fpu_insn_stub(0xd9, modrm);
break;
default:
- fail_if((modrm_reg & 7) != 7);
fail_if(modrm >= 0xc0);
- /* fnstcw m2byte */
- ea.bytes = 2;
- dst = ea;
- emulate_fpu_insn_memdst("fnstcw", dst.val);
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fld m32fp */
+ ea.bytes = 4;
+ src = ea;
+ if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memsrc("flds", src.val);
+ break;
+ case 2: /* fst m32fp */
+ ea.bytes = 4;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fsts", dst.val);
+ break;
+ case 3: /* fstp m32fp */
+ ea.bytes = 4;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fstps", dst.val);
+ break;
+ /* case 4: fldenv - TODO */
+ case 5: /* fldcw m2byte */
+ ea.bytes = 2;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memsrc("fldcw", src.val);
+ break;
+ /* case 6: fstenv - TODO */
+ case 7: /* fnstcw m2byte */
+ ea.bytes = 2;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fnstcw", dst.val);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ }
+ break;
+
+ case 0xda: /* FPU 0xda */
+ switch ( modrm )
+ {
+ case 0xc0 ... 0xc7: /* fcmovb %stN */
+ case 0xc8 ... 0xcf: /* fcmove %stN */
+ case 0xd0 ... 0xd7: /* fcmovbe %stN */
+ case 0xd8 ... 0xdf: /* fcmovu %stN */
+ case 0xe9: /* fucompp */
+ emulate_fpu_insn_stub(0xda, modrm);
+ break;
+ default:
+ fail_if(modrm >= 0xc0);
+ ea.bytes = 4;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fiadd m32i */
+ emulate_fpu_insn_memsrc("fiaddl", src.val);
+ break;
+ case 1: /* fimul m32i */
+ emulate_fpu_insn_memsrc("fimull", src.val);
+ break;
+ case 2: /* ficom m32i */
+ emulate_fpu_insn_memsrc("ficoml", src.val);
+ break;
+ case 3: /* ficomp m32i */
+ emulate_fpu_insn_memsrc("ficompl", src.val);
+ break;
+ case 4: /* fisub m32i */
+ emulate_fpu_insn_memsrc("fisubl", src.val);
+ break;
+ case 5: /* fisubr m32i */
+ emulate_fpu_insn_memsrc("fisubrl", src.val);
+ break;
+ case 6: /* fidiv m32i */
+ emulate_fpu_insn_memsrc("fidivl", src.val);
+ break;
+ case 7: /* fidivr m32i */
+ emulate_fpu_insn_memsrc("fidivrl", src.val);
+ break;
+ default:
+ goto cannot_emulate;
+ }
}
break;
case 0xdb: /* FPU 0xdb */
- fail_if(modrm != 0xe3);
- /* fninit */
- emulate_fpu_insn("fninit");
+ switch ( modrm )
+ {
+ case 0xc0 ... 0xc7: /* fcmovnb %stN */
+ case 0xc8 ... 0xcf: /* fcmovne %stN */
+ case 0xd0 ... 0xd7: /* fcmovnbe %stN */
+ case 0xd8 ... 0xdf: /* fcmovnu %stN */
+ emulate_fpu_insn_stub(0xdb, modrm);
+ break;
+ case 0xe2: /* fnclex */
+ emulate_fpu_insn("fnclex");
+ break;
+ case 0xe3: /* fninit */
+ emulate_fpu_insn("fninit");
+ break;
+ case 0xe4: /* fsetpm - 287 only, ignored by 387 */
+ break;
+ case 0xe8 ... 0xef: /* fucomi %stN */
+ case 0xf0 ... 0xf7: /* fcomi %stN */
+ emulate_fpu_insn_stub(0xdb, modrm);
+ break;
+ default:
+ fail_if(modrm >= 0xc0);
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fild m32i */
+ ea.bytes = 4;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memsrc("fildl", src.val);
+ break;
+ case 1: /* fisttp m32i */
+ ea.bytes = 4;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fisttpl", dst.val);
+ break;
+ case 2: /* fist m32i */
+ ea.bytes = 4;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fistl", dst.val);
+ break;
+ case 3: /* fistp m32i */
+ ea.bytes = 4;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fistpl", dst.val);
+ break;
+ case 5: /* fld m80fp */
+ ea.bytes = 10;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off,
+ &src.val, src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memdst("fldt", src.val);
+ break;
+ case 7: /* fstp m80fp */
+ ea.bytes = 10;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fstpt", dst.val);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ }
+ break;
+
+ case 0xdc: /* FPU 0xdc */
+ switch ( modrm )
+ {
+ case 0xc0 ... 0xc7: /* fadd %stN */
+ case 0xc8 ... 0xcf: /* fmul %stN */
+ case 0xe0 ... 0xe7: /* fsubr %stN */
+ case 0xe8 ... 0xef: /* fsub %stN */
+ case 0xf0 ... 0xf7: /* fdivr %stN */
+ case 0xf8 ... 0xff: /* fdiv %stN */
+ emulate_fpu_insn_stub(0xdc, modrm);
+ break;
+ default:
+ fail_if(modrm >= 0xc0);
+ ea.bytes = 8;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fadd m64fp */
+ emulate_fpu_insn_memsrc("faddl", src.val);
+ break;
+ case 1: /* fmul m64fp */
+ emulate_fpu_insn_memsrc("fmull", src.val);
+ break;
+ case 2: /* fcom m64fp */
+ emulate_fpu_insn_memsrc("fcoml", src.val);
+ break;
+ case 3: /* fcomp m64fp */
+ emulate_fpu_insn_memsrc("fcompl", src.val);
+ break;
+ case 4: /* fsub m64fp */
+ emulate_fpu_insn_memsrc("fsubl", src.val);
+ break;
+ case 5: /* fsubr m64fp */
+ emulate_fpu_insn_memsrc("fsubrl", src.val);
+ break;
+ case 6: /* fdiv m64fp */
+ emulate_fpu_insn_memsrc("fdivl", src.val);
+ break;
+ case 7: /* fdivr m64fp */
+ emulate_fpu_insn_memsrc("fdivrl", src.val);
+ break;
+ }
+ }
break;
case 0xdd: /* FPU 0xdd */
- fail_if((modrm_reg & 7) != 7);
- fail_if(modrm >= 0xc0);
- /* fnstsw m2byte */
- ea.bytes = 2;
- dst = ea;
- emulate_fpu_insn_memdst("fnstsw", dst.val);
+ switch ( modrm )
+ {
+ case 0xc0 ... 0xc7: /* ffree %stN */
+ case 0xd0 ... 0xd7: /* fst %stN */
+ case 0xd8 ... 0xdf: /* fstp %stN */
+ case 0xe0 ... 0xe7: /* fucom %stN */
+ case 0xe8 ... 0xef: /* fucomp %stN */
+ emulate_fpu_insn_stub(0xdd, modrm);
+ break;
+ default:
+ fail_if(modrm >= 0xc0);
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fld m64fp */
+ ea.bytes = 8;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memsrc("fldl", src.val);
+ break;
+ case 1: /* fisttp m64i */
+ ea.bytes = 8;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fisttpll", dst.val);
+ break;
+ case 2: /* fst m64fp */
+ ea.bytes = 8;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memsrc("fstl", dst.val);
+ break;
+ case 3: /* fstp m64fp */
+ ea.bytes = 8;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fstpl", dst.val);
+ break;
+ case 7: /* fnstsw m2byte */
+ ea.bytes = 2;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fnstsw", dst.val);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ }
break;
case 0xde: /* FPU 0xde */
@@ -2859,17 +3194,120 @@ x86_emulate(
emulate_fpu_insn_stub(0xde, modrm);
break;
default:
- goto cannot_emulate;
+ fail_if(modrm >= 0xc0);
+ ea.bytes = 2;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fiadd m16i */
+ emulate_fpu_insn_memsrc("fiadd", src.val);
+ break;
+ case 1: /* fimul m16i */
+ emulate_fpu_insn_memsrc("fimul", src.val);
+ break;
+ case 2: /* ficom m16i */
+ emulate_fpu_insn_memsrc("ficom", src.val);
+ break;
+ case 3: /* ficomp m16i */
+ emulate_fpu_insn_memsrc("ficomp", src.val);
+ break;
+ case 4: /* fisub m16i */
+ emulate_fpu_insn_memsrc("fisub", src.val);
+ break;
+ case 5: /* fisubr m16i */
+ emulate_fpu_insn_memsrc("fisubr", src.val);
+ break;
+ case 6: /* fidiv m16i */
+ emulate_fpu_insn_memsrc("fidiv", src.val);
+ break;
+ case 7: /* fidivr m16i */
+ emulate_fpu_insn_memsrc("fidivr", src.val);
+ break;
+ default:
+ goto cannot_emulate;
+ }
}
break;
case 0xdf: /* FPU 0xdf */
- fail_if(modrm != 0xe0);
- /* fnstsw %ax */
- dst.bytes = 2;
- dst.type = OP_REG;
- dst.reg = (unsigned long *)&_regs.eax;
- emulate_fpu_insn_memdst("fnstsw", dst.val);
+ switch ( modrm )
+ {
+ case 0xe0:
+ /* fnstsw %ax */
+ dst.bytes = 2;
+ dst.type = OP_REG;
+ dst.reg = (unsigned long *)&_regs.eax;
+ emulate_fpu_insn_memdst("fnstsw", dst.val);
+ break;
+ case 0xf0 ... 0xf7: /* fcomip %stN */
+ case 0xf8 ... 0xff: /* fucomip %stN */
+ emulate_fpu_insn_stub(0xdf, modrm);
+ break;
+ default:
+ fail_if(modrm >= 0xc0);
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* fild m16i */
+ ea.bytes = 2;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memsrc("fild", src.val);
+ break;
+ case 1: /* fisttp m16i */
+ ea.bytes = 2;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fisttp", dst.val);
+ break;
+ case 2: /* fist m16i */
+ ea.bytes = 2;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fist", dst.val);
+ break;
+ case 3: /* fistp m16i */
+ ea.bytes = 2;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fistp", dst.val);
+ break;
+ case 4: /* fbld m80dec */
+ ea.bytes = 10;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off,
+ &src.val, src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memdst("fbld", src.val);
+ break;
+ case 5: /* fild m64i */
+ ea.bytes = 8;
+ src = ea;
+ if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+ src.bytes, ctxt)) != 0 )
+ goto done;
+ emulate_fpu_insn_memsrc("fildll", src.val);
+ break;
+ case 6: /* fbstp packed bcd */
+ ea.bytes = 10;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fbstp", dst.val);
+ break;
+ case 7: /* fistp m64i */
+ ea.bytes = 8;
+ dst = ea;
+ dst.type = OP_MEM;
+ emulate_fpu_insn_memdst("fistpll", dst.val);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ }
break;
case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
@@ -2924,7 +3362,6 @@ x86_emulate(
/* out */
fail_if(ops->write_io == NULL);
rc = ops->write_io(port, op_bytes, _regs.eax, ctxt);
-
}
else
{
@@ -3242,9 +3679,9 @@ x86_emulate(
if ( op_bytes == 2 )
reg.base &= 0xffffff;
if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0,
- reg.limit, 2, ctxt)) ||
+ &reg.limit, 2, ctxt)) ||
(rc = ops->write(ea.mem.seg, ea.mem.off+2,
- reg.base, mode_64bit() ? 8 : 4, ctxt)) )
+ &reg.base, mode_64bit() ? 8 : 4, ctxt)) )
goto done;
break;
case 2: /* lgdt */
@@ -3252,10 +3689,10 @@ x86_emulate(
generate_exception_if(ea.type != OP_MEM, EXC_UD, -1);
fail_if(ops->write_segment == NULL);
memset(&reg, 0, sizeof(reg));
- if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0,
- &limit, 2, ctxt)) ||
- (rc = ops->read(ea.mem.seg, ea.mem.off+2,
- &base, mode_64bit() ? 8 : 4, ctxt)) )
+ if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
+ &limit, 2, ctxt, ops)) ||
+ (rc = read_ulong(ea.mem.seg, ea.mem.off+2,
+ &base, mode_64bit() ? 8 : 4, ctxt, ops)) )
goto done;
reg.base = base;
reg.limit = limit;
@@ -3267,7 +3704,8 @@ x86_emulate(
goto done;
break;
case 4: /* smsw */
- ea.bytes = 2;
+ if ( ea.type == OP_MEM )
+ ea.bytes = 2;
dst = ea;
fail_if(ops->read_cr == NULL);
if ( (rc = ops->read_cr(0, &dst.val, ctxt)) )
@@ -3281,11 +3719,11 @@ x86_emulate(
goto done;
if ( ea.type == OP_REG )
cr0w = *ea.reg;
- else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
- &cr0w, 2, ctxt)) )
+ else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
+ &cr0w, 2, ctxt, ops)) )
goto done;
- cr0 &= 0xffff0000;
- cr0 |= (uint16_t)cr0w;
+ /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
+ cr0 = (cr0 & ~0xe) | (cr0w & 0xf);
if ( (rc = ops->write_cr(0, cr0, ctxt)) )
goto done;
break;
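
To sanity-check the new LMSW masking with illustrative values: if cr0 = 0x8005003b (PE, MP, TS, ET, NE, WP, AM, PG) and the guest's 16-bit source is 0x0006 (MP|EM), then cr0 & ~0xe = 0x80050031 clears MP, EM and TS, and OR-ing in cr0w & 0xf = 0x6 gives 0x80050037: MP and EM are loaded, TS is cleared, and PE stays set because bit 0 is never masked off, matching the rule that LMSW may set but not clear PE.
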
@@ -3404,8 +3842,10 @@ x86_emulate(
if ( ea.type == OP_MEM )
{
unsigned long lval, hval;
- if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
- (rc = ops->read(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
+ if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
+ &lval, 4, ctxt, ops)) ||
+ (rc = read_ulong(ea.mem.seg, ea.mem.off+4,
+ &hval, 4, ctxt, ops)) )
goto done;
val = ((uint64_t)hval << 32) | (uint32_t)lval;
stub[2] = modrm & 0x38; /* movq (%eax),%mmN */
@@ -3428,8 +3868,8 @@ x86_emulate(
if ( ea.type == OP_MEM )
{
unsigned long lval = (uint32_t)val, hval = (uint32_t)(val >> 32);
- if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, lval, 4, ctxt)) ||
- (rc = ops->write(ea.mem.seg, ea.mem.off+4, hval, 4, ctxt)) )
+ if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
+ (rc = ops->write(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
goto done;
}
break;
@@ -3481,8 +3921,8 @@ x86_emulate(
/* Get actual old value. */
for ( i = 0; i < (op_bytes/sizeof(long)); i++ )
- if ( (rc = ops->read(ea.mem.seg, ea.mem.off + i*sizeof(long),
- &old[i], sizeof(long), ctxt)) != 0 )
+ if ( (rc = read_ulong(ea.mem.seg, ea.mem.off + i*sizeof(long),
+ &old[i], sizeof(long), ctxt, ops)) != 0 )
goto done;
/* Get expected and proposed values. */
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.h
--- a/xen/arch/x86/x86_emulate/x86_emulate.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h Wed Jul 02 11:30:37 2008 +0900
@@ -102,7 +102,8 @@ enum x86_emulate_fpu_type {
};
/*
- * These operations represent the instruction emulator's interface to memory.
+ * These operations represent the instruction emulator's interface to memory,
+ * I/O ports, privileged state... pretty much everything other than GPRs.
*
* NOTES:
* 1. If the access fails (cannot emulate, or a standard access faults) then
@@ -110,8 +111,7 @@ enum x86_emulate_fpu_type {
* some out-of-band mechanism, unknown to the emulator. The memop signals
* failure by returning X86EMUL_EXCEPTION to the emulator, which will
* then immediately bail.
- * 2. Valid access sizes are 1, 2, 4 and 8 (x86/64 only) bytes.
- * 3. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ * 2. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
*/
struct x86_emulate_ops
{
@@ -121,19 +121,25 @@ struct x86_emulate_ops
* All memory-access functions:
* @seg: [IN ] Segment being dereferenced (specified as x86_seg_??).
* @offset:[IN ] Offset within segment.
+ * @p_data:[IN ] Pointer to i/o data buffer (length is @bytes)
* Read functions:
* @val: [OUT] Value read, zero-extended to 'ulong'.
* Write functions:
* @val: [IN ] Value to write (low-order bytes used as req'd).
* Variable-length access functions:
- * @bytes: [IN ] Number of bytes to read or write.
- */
-
- /* read: Emulate a memory read. */
+ * @bytes: [IN ] Number of bytes to read or write. Valid access sizes are
+ * 1, 2, 4 and 8 (x86/64 only) bytes, unless otherwise
+ * stated.
+ */
+
+ /*
+ * read: Emulate a memory read.
+ * @bytes: Access length (0 < @bytes < 4096).
+ */
int (*read)(
enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt);
@@ -144,15 +150,18 @@ struct x86_emulate_ops
int (*insn_fetch)(
enum x86_segment seg,
unsigned long offset,
- unsigned long *val,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt);
-
- /* write: Emulate a memory write. */
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt);
+
+ /*
+ * write: Emulate a memory write.
+ * @bytes: Access length (0 < @bytes < 4096).
+ */
int (*write)(
enum x86_segment seg,
unsigned long offset,
- unsigned long val,
+ void *p_data,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt);
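
The write side mirrors the read side: call sites earlier in the patch now pass &dst.val, &reg.sel and so on, so the same entry point can carry payloads wider than an unsigned long (for example the 10-byte x87 fstpt store emitted above). A minimal sketch of a matching backend write callback, using the same hypothetical guest_base mapping as the read example:

    /* Illustrative only: flat-memory ops->write with the void *p_data interface. */
    static int example_write(enum x86_segment seg, unsigned long offset,
                             void *p_data, unsigned int bytes,
                             struct x86_emulate_ctxt *ctxt)
    {
        extern uint8_t *guest_base;                  /* assumed mapping */
        memcpy(guest_base + offset, p_data, bytes);
        return X86EMUL_OKAY;
    }
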
diff -r 11318234588e -r 08f77df14cba xen/common/domain.c
--- a/xen/common/domain.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/common/domain.c Wed Jul 02 11:30:37 2008 +0900
@@ -73,21 +73,133 @@ int current_domain_id(void)
return current->domain->domain_id;
}
-struct domain *alloc_domain(domid_t domid)
+static struct domain *alloc_domain_struct(void)
+{
+ return xmalloc(struct domain);
+}
+
+static void free_domain_struct(struct domain *d)
+{
+ xfree(d);
+}
+
+static void __domain_finalise_shutdown(struct domain *d)
+{
+ struct vcpu *v;
+
+ BUG_ON(!spin_is_locked(&d->shutdown_lock));
+
+ if ( d->is_shut_down )
+ return;
+
+ for_each_vcpu ( d, v )
+ if ( !v->paused_for_shutdown )
+ return;
+
+ d->is_shut_down = 1;
+ send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+}
+
+static void vcpu_check_shutdown(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+
+ spin_lock(&d->shutdown_lock);
+
+ if ( d->is_shutting_down )
+ {
+ if ( !v->paused_for_shutdown )
+ vcpu_pause_nosync(v);
+ v->paused_for_shutdown = 1;
+ v->defer_shutdown = 0;
+ __domain_finalise_shutdown(d);
+ }
+
+ spin_unlock(&d->shutdown_lock);
+}
+
+struct vcpu *alloc_vcpu(
+ struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
+{
+ struct vcpu *v;
+
+ BUG_ON(d->vcpu[vcpu_id] != NULL);
+
+ if ( (v = alloc_vcpu_struct()) == NULL )
+ return NULL;
+
+ v->domain = d;
+ v->vcpu_id = vcpu_id;
+
+ v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
+ v->runstate.state_entry_time = NOW();
+
+ if ( !is_idle_domain(d) )
+ {
+ set_bit(_VPF_down, &v->pause_flags);
+ v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
+ }
+
+ if ( sched_init_vcpu(v, cpu_id) != 0 )
+ {
+ free_vcpu_struct(v);
+ return NULL;
+ }
+
+ if ( vcpu_initialise(v) != 0 )
+ {
+ sched_destroy_vcpu(v);
+ free_vcpu_struct(v);
+ return NULL;
+ }
+
+ d->vcpu[vcpu_id] = v;
+ if ( vcpu_id != 0 )
+ d->vcpu[v->vcpu_id-1]->next_in_list = v;
+
+ /* Must be called after making new vcpu visible to for_each_vcpu(). */
+ vcpu_check_shutdown(v);
+
+ return v;
+}
+
+struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
{
struct domain *d;
-
- if ( (d = xmalloc(struct domain)) == NULL )
+ struct vcpu *v;
+ unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
+
+ if ( (v = idle_vcpu[cpu_id]) != NULL )
+ return v;
+
+ d = (vcpu_id == 0) ?
+ domain_create(IDLE_DOMAIN_ID, 0, 0) :
+ idle_vcpu[cpu_id - vcpu_id]->domain;
+ BUG_ON(d == NULL);
+
+ v = alloc_vcpu(d, vcpu_id, cpu_id);
+ idle_vcpu[cpu_id] = v;
+
+ return v;
+}
+
+struct domain *domain_create(
+ domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
+{
+ struct domain *d, **pd;
+ enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
+ INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 };
+ int init_status = 0;
+
+ if ( (d = alloc_domain_struct()) == NULL )
return NULL;
memset(d, 0, sizeof(*d));
d->domain_id = domid;
if ( xsm_alloc_security_domain(d) != 0 )
- {
- free_domain(d);
- return NULL;
- }
+ goto fail;
+ init_status |= INIT_xsm;
atomic_set(&d->refcnt, 1);
spin_lock_init(&d->domain_lock);
@@ -97,132 +209,17 @@ struct domain *alloc_domain(domid_t domi
INIT_LIST_HEAD(&d->page_list);
INIT_LIST_HEAD(&d->xenpage_list);
- return d;
-}
-
-void free_domain(struct domain *d)
-{
- xsm_free_security_domain(d);
- xfree(d);
-}
-
-static void __domain_finalise_shutdown(struct domain *d)
-{
- struct vcpu *v;
-
- BUG_ON(!spin_is_locked(&d->shutdown_lock));
-
- if ( d->is_shut_down )
- return;
-
- for_each_vcpu ( d, v )
- if ( !v->paused_for_shutdown )
- return;
-
- d->is_shut_down = 1;
- send_guest_global_virq(dom0, VIRQ_DOM_EXC);
-}
-
-static void vcpu_check_shutdown(struct vcpu *v)
-{
- struct domain *d = v->domain;
-
- spin_lock(&d->shutdown_lock);
-
- if ( d->is_shutting_down )
- {
- if ( !v->paused_for_shutdown )
- vcpu_pause_nosync(v);
- v->paused_for_shutdown = 1;
- v->defer_shutdown = 0;
- __domain_finalise_shutdown(d);
- }
-
- spin_unlock(&d->shutdown_lock);
-}
-
-struct vcpu *alloc_vcpu(
- struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
-{
- struct vcpu *v;
-
- BUG_ON(d->vcpu[vcpu_id] != NULL);
-
- if ( (v = alloc_vcpu_struct()) == NULL )
- return NULL;
-
- v->domain = d;
- v->vcpu_id = vcpu_id;
-
- v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
- v->runstate.state_entry_time = NOW();
-
- if ( !is_idle_domain(d) )
- {
- set_bit(_VPF_down, &v->pause_flags);
- v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
- }
-
- if ( sched_init_vcpu(v, cpu_id) != 0 )
- {
- free_vcpu_struct(v);
- return NULL;
- }
-
- if ( vcpu_initialise(v) != 0 )
- {
- sched_destroy_vcpu(v);
- free_vcpu_struct(v);
- return NULL;
- }
-
- d->vcpu[vcpu_id] = v;
- if ( vcpu_id != 0 )
- d->vcpu[v->vcpu_id-1]->next_in_list = v;
-
- /* Must be called after making new vcpu visible to for_each_vcpu(). */
- vcpu_check_shutdown(v);
-
- return v;
-}
-
-struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
-{
- struct domain *d;
- struct vcpu *v;
- unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
-
- if ( (v = idle_vcpu[cpu_id]) != NULL )
- return v;
-
- d = (vcpu_id == 0) ?
- domain_create(IDLE_DOMAIN_ID, 0, 0) :
- idle_vcpu[cpu_id - vcpu_id]->domain;
- BUG_ON(d == NULL);
-
- v = alloc_vcpu(d, vcpu_id, cpu_id);
- idle_vcpu[cpu_id] = v;
-
- return v;
-}
-
-struct domain *domain_create(
- domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
-{
- struct domain *d, **pd;
- enum { INIT_evtchn = 1, INIT_gnttab = 2, INIT_arch = 8 };
- int init_status = 0;
-
- if ( (d = alloc_domain(domid)) == NULL )
- return NULL;
-
if ( domcr_flags & DOMCRF_hvm )
d->is_hvm = 1;
if ( (domid == 0) && opt_dom0_vcpus_pin )
d->is_pinned = 1;
+ if ( domcr_flags & DOMCRF_dummy )
+ return d;
+
rangeset_domain_initialise(d);
+ init_status |= INIT_rangeset;
if ( !is_idle_domain(d) )
{
@@ -278,8 +275,11 @@ struct domain *domain_create(
grant_table_destroy(d);
if ( init_status & INIT_evtchn )
evtchn_destroy(d);
- rangeset_domain_destroy(d);
- free_domain(d);
+ if ( init_status & INIT_rangeset )
+ rangeset_domain_destroy(d);
+ if ( init_status & INIT_xsm )
+ xsm_free_security_domain(d);
+ free_domain_struct(d);
return NULL;
}
@@ -535,7 +535,8 @@ static void complete_domain_destroy(stru
if ( d->target != NULL )
put_domain(d->target);
- free_domain(d);
+ xsm_free_security_domain(d);
+ free_domain_struct(d);
send_guest_global_virq(dom0, VIRQ_DOM_EXC);
}
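
The rewritten domain_create() failure path above records each completed initialisation stage in init_status and tears down only those stages, in reverse order. A minimal sketch of the same pattern with hypothetical stage names, in case the idiom is unfamiliar:

    struct example;
    extern int  setup_a(struct example *), setup_b(struct example *);
    extern void teardown_a(struct example *), teardown_b(struct example *);

    static int example_create(struct example *obj)
    {
        enum { DID_a = 1u << 0, DID_b = 1u << 1 };
        int done = 0;

        if ( setup_a(obj) != 0 )
            goto fail;
        done |= DID_a;

        if ( setup_b(obj) != 0 )
            goto fail;
        done |= DID_b;

        return 0;

     fail:
        /* Undo only what actually completed, most recent first. */
        if ( done & DID_b )
            teardown_b(obj);
        if ( done & DID_a )
            teardown_a(obj);
        return -1;
    }
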
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.c
--- a/xen/drivers/passthrough/vtd/dmar.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/dmar.c Wed Jul 02 11:30:37 2008 +0900
@@ -383,7 +383,8 @@ acpi_parse_one_drhd(struct acpi_dmar_ent
dmaru->address = drhd->address;
dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
INIT_LIST_HEAD(&dmaru->ioapic_list);
- dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %lx\n", dmaru->address);
+ dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n",
+ dmaru->address);
dev_scope_start = (void *)(drhd + 1);
dev_scope_end = ((void *)drhd) + header->length;
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.h
--- a/xen/drivers/passthrough/vtd/dmar.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/dmar.h Wed Jul 02 11:30:37 2008 +0900
@@ -42,28 +42,28 @@ struct acpi_ioapic_unit {
struct acpi_drhd_unit {
struct list_head list;
- unsigned long address; /* register base address of the unit */
- struct pci_dev *devices; /* target devices */
+ u64 address; /* register base address of the unit */
+ struct pci_dev *devices; /* target devices */
int devices_cnt;
- u8 include_all:1;
+ u8 include_all:1;
struct iommu *iommu;
struct list_head ioapic_list;
};
struct acpi_rmrr_unit {
struct list_head list;
- unsigned long base_address;
- unsigned long end_address;
+ u64 base_address;
+ u64 end_address;
struct pci_dev *devices; /* target devices */
int devices_cnt;
- u8 allow_all:1;
+ u8 allow_all:1;
};
struct acpi_atsr_unit {
struct list_head list;
- struct pci_dev *devices; /* target devices */
+ struct pci_dev *devices; /* target devices */
int devices_cnt;
- u8 all_ports:1;
+ u8 all_ports:1;
};
#define for_each_iommu(domain, iommu) \
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/intremap.c
--- a/xen/drivers/passthrough/vtd/intremap.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/intremap.c Wed Jul 02 11:30:37 2008 +0900
@@ -52,7 +52,7 @@ static void remap_entry_to_ioapic_rte(
unsigned long flags;
struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
- if ( ir_ctrl == NULL || ir_ctrl->iremap_index < 0 )
+ if ( ir_ctrl == NULL )
{
dprintk(XENLOG_ERR VTDPREFIX,
"remap_entry_to_ioapic_rte: ir_ctl is not ready\n");
@@ -153,6 +153,7 @@ static void ioapic_rte_to_remap_entry(st
}
memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+ iommu_flush_cache_entry(iremap_entry);
iommu_flush_iec_index(iommu, 0, index);
invalidate_sync(iommu);
@@ -170,7 +171,8 @@ unsigned int io_apic_read_remap_rte(
struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid);
struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
- if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 )
+ if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ||
+ ir_ctrl->iremap_index == -1 )
{
*IO_APIC_BASE(apic) = reg;
return *(IO_APIC_BASE(apic)+4);
@@ -377,6 +379,7 @@ static void msi_msg_to_remap_entry(
remap_rte->data = 0;
memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+ iommu_flush_cache_entry(iremap_entry);
iommu_flush_iec_index(iommu, 0, index);
invalidate_sync(iommu);
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/iommu.c Wed Jul 02 11:30:37 2008 +0900
@@ -1269,7 +1269,6 @@ static int domain_context_mapping(
}
static int domain_context_unmap_one(
- struct domain *domain,
struct iommu *iommu,
u8 bus, u8 devfn)
{
@@ -1300,7 +1299,6 @@ static int domain_context_unmap_one(
}
static int domain_context_unmap(
- struct domain *domain,
struct iommu *iommu,
struct pci_dev *pdev)
{
@@ -1320,14 +1318,13 @@ static int domain_context_unmap(
PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
break;
case DEV_TYPE_PCIe_ENDPOINT:
- ret = domain_context_unmap_one(domain, iommu,
+ ret = domain_context_unmap_one(iommu,
(u8)(pdev->bus), (u8)(pdev->devfn));
break;
case DEV_TYPE_PCI:
if ( pdev->bus == 0 )
ret = domain_context_unmap_one(
- domain, iommu,
- (u8)(pdev->bus), (u8)(pdev->devfn));
+ iommu, (u8)(pdev->bus), (u8)(pdev->devfn));
else
{
if ( bus2bridge[pdev->bus].bus != 0 )
@@ -1335,7 +1332,7 @@ static int domain_context_unmap(
"domain_context_unmap:"
"bus2bridge[%d].bus != 0\n", pdev->bus);
- ret = domain_context_unmap_one(domain, iommu,
+ ret = domain_context_unmap_one(iommu,
(u8)(bus2bridge[pdev->bus].bus),
(u8)(bus2bridge[pdev->bus].devfn));
@@ -1345,8 +1342,7 @@ static int domain_context_unmap(
for ( func = 0; func < 8; func++ )
{
ret = domain_context_unmap_one(
- domain, iommu,
- pdev->bus, (u8)PCI_DEVFN(dev, func));
+ iommu, pdev->bus, (u8)PCI_DEVFN(dev, func));
if ( ret )
return ret;
}
@@ -1389,7 +1385,7 @@ void reassign_device_ownership(
found:
drhd = acpi_find_matched_drhd_unit(pdev);
iommu = drhd->iommu;
- domain_context_unmap(source, iommu, pdev);
+ domain_context_unmap(iommu, pdev);
/* Move pci device from the source domain to target domain. */
spin_lock_irqsave(&source_hd->iommu_list_lock, flags);
@@ -1589,7 +1585,7 @@ static int iommu_prepare_rmrr_dev(
struct pci_dev *pdev)
{
struct acpi_drhd_unit *drhd;
- unsigned long size;
+ u64 size;
int ret;
/* page table init */
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/qinval.c
--- a/xen/drivers/passthrough/vtd/qinval.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/qinval.c Wed Jul 02 11:30:37 2008 +0900
@@ -222,7 +222,7 @@ int invalidate_sync(struct iommu *iommu)
int ret = -1;
struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
- if ( qi_ctrl->qinval_maddr == 0 )
+ if ( qi_ctrl->qinval_maddr != 0 )
{
ret = queue_invalidate_wait(iommu,
0, 1, 1, 1, &qi_ctrl->qinval_poll_status);
@@ -416,7 +416,6 @@ int qinval_setup(struct iommu *iommu)
int qinval_setup(struct iommu *iommu)
{
s_time_t start_time;
- u32 status = 0;
struct qi_ctrl *qi_ctrl;
struct iommu_flush *flush;
@@ -450,15 +449,12 @@ int qinval_setup(struct iommu *iommu)
/* Make sure hardware complete it */
start_time = NOW();
- for ( ; ; )
- {
- status = dmar_readl(iommu->reg, DMAR_GSTS_REG);
- if ( status & DMA_GSTS_QIES )
- break;
+ while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) )
+ {
if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
panic("Cannot set QIE field for queue invalidation\n");
cpu_relax();
}
- status = 0;
- return status;
-}
+
+ return 0;
+}
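
The invalidate_sync() hunk inverts the test so the wait descriptor is only queued when the invalidation queue has actually been set up, and the rewritten tail of qinval_setup() is the usual poll-until-bit-set-or-timeout idiom. A user-space model of that loop shape, with NOW(), dmar_readl() and the status register replaced by hypothetical stand-ins:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define GSTS_QIES      (1u << 26)      /* stand-in for DMA_GSTS_QIES */
    #define OP_TIMEOUT_NS  1000000000ull   /* stand-in for DMAR_OPERATION_TIMEOUT */

    static uint32_t fake_gsts;             /* stand-in for the hardware register */

    static uint64_t now_ns(void)           /* stand-in for NOW() */
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    static uint32_t read_gsts(void)        /* stand-in for dmar_readl(..., DMAR_GSTS_REG) */
    {
        fake_gsts |= GSTS_QIES;            /* pretend the hardware latched the enable */
        return fake_gsts;
    }

    int main(void)
    {
        uint64_t start = now_ns();

        /* Same shape as the rewritten loop: poll until the enable bit is
         * observed, and give up after a bounded wait instead of spinning
         * forever. */
        while ( !(read_gsts() & GSTS_QIES) )
        {
            if ( now_ns() > start + OP_TIMEOUT_NS )
            {
                fprintf(stderr, "Cannot set QIE field for queue invalidation\n");
                exit(1);
            }
            /* cpu_relax() would go here in the hypervisor. */
        }

        printf("queued invalidation enabled\n");
        return 0;
    }
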
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/utils.c
--- a/xen/drivers/passthrough/vtd/utils.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/utils.c Wed Jul 02 11:30:37 2008 +0900
@@ -166,7 +166,7 @@ void print_iommu_regs(struct acpi_drhd_u
struct iommu *iommu = drhd->iommu;
printk("---- print_iommu_regs ----\n");
- printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
+ printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address);
printk("print_iommu_regs: DMAR_VER_REG = %x\n",
dmar_readl(iommu->reg,DMAR_VER_REG));
printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n",
diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/cpufreq.h
--- a/xen/include/acpi/cpufreq/cpufreq.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/acpi/cpufreq/cpufreq.h Wed Jul 02 11:30:37 2008 +0900
@@ -36,7 +36,10 @@ struct cpufreq_policy {
unsigned int max; /* in kHz */
unsigned int cur; /* in kHz, only needed if cpufreq
* governors are used */
+ unsigned int resume; /* flag for cpufreq 1st run
+ * S3 wakeup, hotplug cpu, etc */
};
+extern struct cpufreq_policy xen_px_policy[NR_CPUS];
#define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
#define CPUFREQ_SHARED_TYPE_HW (1) /* HW does needed coordination */
diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/processor_perf.h
--- a/xen/include/acpi/cpufreq/processor_perf.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/acpi/cpufreq/processor_perf.h Wed Jul 02 11:30:37 2008 +0900
@@ -6,9 +6,21 @@
int get_cpu_id(u8);
int acpi_cpufreq_init(void);
+int powernow_cpufreq_init(void);
+
void px_statistic_update(cpumask_t, uint8_t, uint8_t);
int px_statistic_init(int);
void px_statistic_reset(int);
+void px_statistic_suspend(void);
+void px_statistic_resume(void);
+
+void cpufreq_dom_exit(void);
+int cpufreq_dom_init(void);
+int cpufreq_dom_dbs(unsigned int);
+void cpufreq_suspend(void);
+int cpufreq_resume(void);
+
+inline uint64_t get_cpu_idle_time(unsigned int);
struct processor_performance {
uint32_t state;
@@ -44,6 +56,7 @@ struct pm_px {
struct pm_px {
struct px_stat u;
uint64_t prev_state_wall;
+ uint64_t prev_idle_wall;
};
extern struct pm_px px_statistic_data[NR_CPUS];
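
The new prev_idle_wall field and the get_cpu_idle_time() declaration suggest the dbs/ondemand path now derives CPU load from the busy share of the elapsed interval between samples. A hypothetical stand-alone illustration of that arithmetic (not the actual governor code):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical sketch of how prev_state_wall/prev_idle_wall style
     * bookkeeping yields a load figure: busy share of the elapsed interval. */
    static unsigned int load_pct(uint64_t prev_wall, uint64_t cur_wall,
                                 uint64_t prev_idle, uint64_t cur_idle)
    {
        uint64_t wall_delta = cur_wall - prev_wall;
        uint64_t idle_delta = cur_idle - prev_idle;

        if ( wall_delta == 0 || idle_delta > wall_delta )
            return 0;
        return (unsigned int)((wall_delta - idle_delta) * 100 / wall_delta);
    }

    int main(void)
    {
        /* 100ms sampling interval, 30ms of it idle -> 70% load. */
        printf("load = %u%%\n", load_pct(0, 100000000, 0, 30000000));
        return 0;
    }
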
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/domain.h Wed Jul 02 11:30:37 2008 +0900
@@ -103,6 +103,9 @@ struct shadow_domain {
* emulation and remove write permission
*/
atomic_t gtable_dirty_version;
+
+ /* OOS */
+ int oos_active;
};
struct shadow_vcpu {
@@ -122,6 +125,17 @@ struct shadow_vcpu {
unsigned long last_emulated_frame;
/* Last MFN that we emulated a write successfully */
unsigned long last_emulated_mfn;
+
+ /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
+ mfn_t oos[SHADOW_OOS_PAGES];
+ unsigned long oos_va[SHADOW_OOS_PAGES];
+ mfn_t oos_snapshot[SHADOW_OOS_PAGES];
+ struct oos_fixup {
+ mfn_t gmfn;
+ mfn_t smfn;
+ unsigned long off;
+ } *oos_fixups;
+ int oos_fixup_used;
};
/************************************************/
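
The per-vcpu oos[], oos_va[] and oos_snapshot[] arrays are only SHADOW_OOS_PAGES entries long (the constant is added in the mm.h hunk further down), so tracking which guest frames are currently out of sync amounts to probing a tiny fixed table. A hypothetical user-space model of such a lookup, with mfn_t reduced to unsigned long (not the actual shadow code):

    #include <stdio.h>

    #define SHADOW_OOS_PAGES 3             /* matches the mm.h hunk below */
    #define INVALID_MFN      (~0UL)

    /* The per-vcpu array is small enough that a linear scan is the
     * natural lookup. */
    static unsigned long oos[SHADOW_OOS_PAGES] =
        { INVALID_MFN, INVALID_MFN, INVALID_MFN };

    static int oos_find(unsigned long gmfn)
    {
        for ( int i = 0; i < SHADOW_OOS_PAGES; i++ )
            if ( oos[i] == gmfn )
                return i;
        return -1;                          /* not currently out of sync */
    }

    int main(void)
    {
        oos[1] = 0x1234;                    /* pretend this frame went out of sync */
        printf("slot for 0x1234: %d\n", oos_find(0x1234));
        printf("slot for 0x5678: %d\n", oos_find(0x5678));
        return 0;
    }
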
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Wed Jul 02 11:30:37 2008 +0900
@@ -333,10 +333,10 @@ enum vmcs_field {
#define VMCS_VPID_WIDTH 16
void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr);
-int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val);
-int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val);
-int vmx_add_guest_msr(struct vcpu *v, u32 msr);
-int vmx_add_host_load_msr(struct vcpu *v, u32 msr);
+int vmx_read_guest_msr(u32 msr, u64 *val);
+int vmx_write_guest_msr(u32 msr, u64 val);
+int vmx_add_guest_msr(u32 msr);
+int vmx_add_host_load_msr(u32 msr);
#endif /* ASM_X86_HVM_VMX_VMCS_H__ */
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/mm.h Wed Jul 02 11:30:37 2008 +0900
@@ -130,6 +130,14 @@ static inline u32 pickle_domptr(struct d
/* The order of the largest allocation unit we use for shadow pages */
#define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
+/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
+#define SHADOW_OOS_PAGES 3
+
+/* The order of the OOS fixup tables per vcpu */
+#define SHADOW_OOS_FT_ORDER 1
+/* OOS fixup tables hash entries */
+#define SHADOW_OOS_FT_HASH 13
+
#define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain))
#define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/perfc_defn.h Wed Jul 02 11:30:37 2008 +0900
@@ -80,7 +80,11 @@ PERFCOUNTER(shadow_writeable_h_3, "shad
PERFCOUNTER(shadow_writeable_h_3, "shadow writeable: 64b w2k3")
PERFCOUNTER(shadow_writeable_h_4, "shadow writeable: linux low/solaris")
PERFCOUNTER(shadow_writeable_h_5, "shadow writeable: linux high")
+PERFCOUNTER(shadow_writeable_h_6, "shadow writeable: unsync va")
+PERFCOUNTER(shadow_writeable_h_7, "shadow writeable: sl1p")
+PERFCOUNTER(shadow_writeable_h_8, "shadow writeable: sl1p failed")
PERFCOUNTER(shadow_writeable_bf, "shadow writeable brute-force")
+PERFCOUNTER(shadow_writeable_bf_1, "shadow writeable resync bf")
PERFCOUNTER(shadow_mappings, "shadow removes all mappings")
PERFCOUNTER(shadow_mappings_bf, "shadow rm-mappings brute-force")
PERFCOUNTER(shadow_early_unshadow, "shadow unshadows for fork/exit")
@@ -101,4 +105,15 @@ PERFCOUNTER(shadow_em_ex_non_pt, "shad
PERFCOUNTER(shadow_em_ex_non_pt, "shadow extra non-pt-write op")
PERFCOUNTER(shadow_em_ex_fail, "shadow extra emulation failed")
+PERFCOUNTER(shadow_oos_fixup_add_ok, "shadow OOS fixups adds")
+PERFCOUNTER(shadow_oos_fixup_no_add, "shadow OOS fixups no adds")
+PERFCOUNTER(shadow_oos_fixup_add_fail, "shadow OOS fixups adds failed")
+PERFCOUNTER(shadow_oos_fixup_remove, "shadow OOS fixups removes")
+PERFCOUNTER(shadow_oos_fixup_flush, "shadow OOS fixups flushes")
+PERFCOUNTER(shadow_oos_fixup_flush_gmfn,"shadow OOS fixups gmfn flushes")
+
+PERFCOUNTER(shadow_unsync, "shadow OOS unsyncs")
+PERFCOUNTER(shadow_unsync_evict, "shadow OOS evictions")
+PERFCOUNTER(shadow_resync, "shadow OOS resyncs")
+
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r 11318234588e -r 08f77df14cba xen/include/public/hvm/hvm_op.h
--- a/xen/include/public/hvm/hvm_op.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/public/hvm/hvm_op.h Wed Jul 02 11:30:37 2008 +0900
@@ -92,6 +92,19 @@ typedef struct xen_hvm_track_dirty_vram
typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t;
DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t);
+/* Notify that some pages got modified by the Device Model. */
+#define HVMOP_modified_memory 7
+struct xen_hvm_modified_memory {
+ /* Domain to be updated. */
+ domid_t domid;
+ /* First pfn. */
+ uint64_aligned_t first_pfn;
+ /* Number of pages. */
+ uint64_aligned_t nr;
+};
+typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t);
+
#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
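
HVMOP_modified_memory gives a device model a way to report guest frames it has written behind the hypervisor's back, e.g. so dirty-page tracking stays correct. A hypothetical caller would fill the argument like this; the struct below is a user-space stand-in (uint64_aligned_t modelled as uint64_t, domid_t as uint16_t), and the actual hypercall invocation is not shown:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the ABI struct above. */
    struct xen_hvm_modified_memory {
        uint16_t domid;      /* Domain to be updated. */
        uint64_t first_pfn;  /* First pfn. */
        uint64_t nr;         /* Number of pages. */
    };

    int main(void)
    {
        /* A device model that wrote into guest frames 0x100..0x10f would
         * report the range like this before handing the struct to the
         * HVMOP_modified_memory hypercall (whether via HYPERVISOR_hvm_op
         * or a libxc wrapper is not shown here). */
        struct xen_hvm_modified_memory arg = {
            .domid     = 1,
            .first_pfn = 0x100,
            .nr        = 16,
        };

        printf("dom%u: pfns 0x%" PRIx64 "..0x%" PRIx64 " marked dirty\n",
               (unsigned)arg.domid, arg.first_pfn, arg.first_pfn + arg.nr - 1);
        return 0;
    }
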
diff -r 11318234588e -r 08f77df14cba xen/include/xen/domain.h
--- a/xen/include/xen/domain.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/xen/domain.h Wed Jul 02 11:30:37 2008 +0900
@@ -15,9 +15,6 @@ int boot_vcpu(
struct domain *d, int vcpuid, vcpu_guest_context_u ctxt);
struct vcpu *alloc_idle_vcpu(unsigned int cpu_id);
void vcpu_reset(struct vcpu *v);
-
-struct domain *alloc_domain(domid_t domid);
-void free_domain(struct domain *d);
struct xen_domctl_getdomaininfo;
void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
diff -r 11318234588e -r 08f77df14cba xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/xen/sched.h Wed Jul 02 11:30:37 2008 +0900
@@ -315,10 +315,14 @@ struct domain *domain_create(
struct domain *domain_create(
domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
/* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
-#define _DOMCRF_hvm 0
-#define DOMCRF_hvm (1U<<_DOMCRF_hvm)
-#define _DOMCRF_hap 1
-#define DOMCRF_hap (1U<<_DOMCRF_hap)
+#define _DOMCRF_hvm 0
+#define DOMCRF_hvm (1U<<_DOMCRF_hvm)
+ /* DOMCRF_hap: Create a domain with hardware-assisted paging. */
+#define _DOMCRF_hap 1
+#define DOMCRF_hap (1U<<_DOMCRF_hap)
+ /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */
+#define _DOMCRF_dummy 2
+#define DOMCRF_dummy (1U<<_DOMCRF_dummy)
int construct_dom0(
struct domain *d,
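
The new DOMCRF_dummy flag (a dummy domain is never scheduled and never appears on the domain list, per the comment above) follows the same bit-flag convention as DOMCRF_hvm and DOMCRF_hap. A small stand-alone illustration of how domain-creation flags compose and are tested (domain_create() itself is hypervisor-internal and not reproduced here):

    #include <stdio.h>

    /* Mirrors the sched.h definitions above. */
    #define _DOMCRF_hvm   0
    #define DOMCRF_hvm    (1U << _DOMCRF_hvm)
    #define _DOMCRF_hap   1
    #define DOMCRF_hap    (1U << _DOMCRF_hap)
    #define _DOMCRF_dummy 2
    #define DOMCRF_dummy  (1U << _DOMCRF_dummy)

    int main(void)
    {
        /* e.g. an HVM guest with hardware-assisted paging: */
        unsigned int domcr_flags = DOMCRF_hvm | DOMCRF_hap;

        printf("hvm:   %s\n", (domcr_flags & DOMCRF_hvm)   ? "yes" : "no");
        printf("hap:   %s\n", (domcr_flags & DOMCRF_hap)   ? "yes" : "no");
        printf("dummy: %s\n", (domcr_flags & DOMCRF_dummy) ? "yes" : "no");
        return 0;
    }
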
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog