[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH][HVM] vmx domain save/restore support



Attached is the VMX domain save/restore patch. It works well for different guest/host combinations without breaking domU save/restore, according to my testing.
Please apply Xiaowei's qemu-dm fix first.

=== known issues ===
*  shpage pfn *
The hypervisor looks for the shared-page (shpage) pfn in an e820 entry at init time, but some guests (Windows, EM64T Linux) will reuse this e820 RAM, which causes the shpage pfn to be lost on restore. So this entry is marked as "reserved" to prevent guest reuse (in this patch, xc_hvm_build.c). We can change this if a better solution is found in the future.

*  64bit host vmx restore python err *
When restoring a VMX guest on an EM64T host, I got the error "ERROR (xmlrpclib2:167) int exceeds XML-RPC limits", though it does not block the restore. Restarting xend ("xend restart") works around it.

*  guest smp support *
I'm currently working on guest SMP support, including APIC/VMCS save/restore, so turning on "apic" in the config file may cause save/restore failures for now.

*  guest save/restore across platform*
E.g. save a 32-bit guest on a 64-bit host, then restore that guest on a 32-bit host. We can't support this because save/restore encounters a different vcpu_context format on each host type; a universal format would be needed for this.

=== test report ===

"+" stands for okay, "-" stands for fail

   32b host:
       + 32/32
       + 32win/32

   pae host:
       + 32/pae
       + pae/pae
       + 32win/pae
       + pae_win/pae


   em64t host:
       + 32/64
       + pae/64
       + 64/64
       + 32win/64
       + pae_win/64

Sometimes pae_win/64 is not stable. :(



# HG changeset patch
# User Edwin Zhai <edwin.zhai@xxxxxxxxx>
# Node ID 2abb1c801ab72ee7e88b144871162fe2e47a0970
# Parent  98c3ddf83a59b0cbbdce63bb210adfd0d2ec1aea
vmx save/restore support

Signed-off-by: Zhai Edwin <edwin.zhai@xxxxxxxxx>
Signed-off-by: Dong Eddie <eddie.dong@xxxxxxxxx>
Signed-off-by: Nakajima Jun <jun.nakajima@xxxxxxxxx>

diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c       Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/ioemu/hw/cirrus_vga.c       Wed Jul 19 16:09:59 2006 +0800
@@ -3010,11 +3010,44 @@ static CPUWriteMemoryFunc *cirrus_mmio_w
     cirrus_mmio_writel,
 };
 
+void cirrus_stop_acc(CirrusVGAState *s)
+{
+    if (s->map_addr){
+        int error;
+        s->map_addr = 0;
+        error = unset_vram_mapping(s->cirrus_lfb_addr,
+                s->cirrus_lfb_end);
+        fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n");
+
+        munmap(s->vram_ptr, VGA_RAM_SIZE);
+    }
+}
+
+void cirrus_restart_acc(CirrusVGAState *s)
+{
+    if (s->cirrus_lfb_addr && s->cirrus_lfb_end) {
+        void *vram_pointer, *old_vram;
+        fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, lfb_end=0x%lx.\n",
+                s->cirrus_lfb_addr, s->cirrus_lfb_end);
+        vram_pointer = set_vram_mapping(s->cirrus_lfb_addr ,s->cirrus_lfb_end);
+        if (!vram_pointer){
+            fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n");
+        } else {
+            old_vram = vga_update_vram((VGAState *)s, vram_pointer,
+                    VGA_RAM_SIZE);
+            qemu_free(old_vram);
+            s->map_addr = s->cirrus_lfb_addr;
+            s->map_end = s->cirrus_lfb_end;
+        }
+    }
+}
+
 /* load/save state */
 
 static void cirrus_vga_save(QEMUFile *f, void *opaque)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc;
 
     qemu_put_be32s(f, &s->latch);
     qemu_put_8s(f, &s->sr_index);
@@ -3049,11 +3082,20 @@ static void cirrus_vga_save(QEMUFile *f,
     qemu_put_be32s(f, &s->hw_cursor_y);
     /* XXX: we do not save the bitblt state - we assume we do not save
        the state when the blitter is active */
+
+    vga_acc = (!!s->map_addr);
+    qemu_put_8s(f, &vga_acc);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
+    if (vga_acc)
+        cirrus_stop_acc(s);
 }
 
 static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc = 0;
 
     if (version_id != 1)
         return -EINVAL;
@@ -3091,6 +3133,14 @@ static int cirrus_vga_load(QEMUFile *f, 
 
     qemu_get_be32s(f, &s->hw_cursor_x);
     qemu_get_be32s(f, &s->hw_cursor_y);
+
+    qemu_get_8s(f, &vga_acc);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
+    if (vga_acc){
+        cirrus_restart_acc(s);
+    }
 
     /* force refresh */
     s->graphic_mode = -1;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/ioemu/target-i386-dm/helper2.c
--- a/tools/ioemu/target-i386-dm/helper2.c      Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/ioemu/target-i386-dm/helper2.c      Wed Jul 19 16:09:59 2006 +0800
@@ -457,6 +457,7 @@ int main_loop(void)
 {
     extern int vm_running;
     extern int shutdown_requested;
+    extern int suspend_requested;
     CPUState *env = cpu_single_env;
     int evtchn_fd = xc_evtchn_fd(xce_handle);
 
@@ -472,6 +473,10 @@ int main_loop(void)
                 qemu_system_reset();
                 reset_requested = 0;
             }
+            if (suspend_requested) {
+                fprintf(logfile, "device model received suspend signal!\n");
+                break;
+            }
         }
 
         /* Wait up to 10 msec. */
@@ -483,7 +488,15 @@ int main_loop(void)
                              shared_page->vcpu_iodata[send_vcpu].dm_eport);
         }
     }
-    destroy_hvm_domain();
+    if (!suspend_requested)
+        destroy_hvm_domain();
+    else {
+        char qemu_file[20];
+        sprintf(qemu_file, "/tmp/xen.qemu-dm.%d", domid);
+        if (qemu_savevm(qemu_file) < 0)
+            fprintf(stderr, "qemu save fail.\n");
+    }
+
     return 0;
 }
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c  Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/ioemu/vl.c  Wed Jul 19 16:09:59 2006 +0800
@@ -3884,6 +3884,11 @@ int qemu_loadvm(const char *filename)
         qemu_fseek(f, cur_pos + record_len, SEEK_SET);
     }
     fclose(f);
+
+    /* del tmp file */
+    if (unlink(filename) == -1)
+        fprintf(stderr, "delete tmp qemu state file failed.\n");
+
     ret = 0;
  the_end:
     if (saved_vm_running)
@@ -4470,6 +4475,7 @@ static QEMUResetEntry *first_reset_entry
 static QEMUResetEntry *first_reset_entry;
 int reset_requested;
 int shutdown_requested;
+int suspend_requested;
 static int powerdown_requested;
 
 void qemu_register_reset(QEMUResetHandler *func, void *opaque)
@@ -5242,6 +5248,14 @@ int set_mm_mapping(int xc_handle, uint32
 #endif
 
     return 0;
+}
+
+void suspend(int sig)
+{
+    fprintf(logfile, "suspend sig handler called with requested=%d!\n", suspend_requested);
+    if (sig != SIGUSR1)
+        fprintf(logfile, "suspend signal dismatch, get sig=%d!\n", sig);
+    suspend_requested = 1;
 }
 
 int main(int argc, char **argv)
@@ -6010,6 +6024,27 @@ int main(int argc, char **argv)
             vm_start();
         }
     }
+
+    /* register signal for the suspend request when save */
+    {
+        struct sigaction act;
+        sigset_t set;
+        act.sa_handler = suspend;
+        act.sa_flags = SA_RESTART;
+        sigemptyset(&act.sa_mask);
+
+        if (sigaction(SIGUSR1, &act, 0) == -1)
+            fprintf(stderr, "sigaction fail!\n");
+
+        /* control panel mask some signals when spawn qemu, need unmask here*/
+        sigemptyset(&set);
+        sigaddset(&set, SIGUSR1);
+        sigaddset(&set, SIGTERM);
+        if (sigprocmask(SIG_UNBLOCK, &set, NULL) == -1)
+            fprintf(stderr, "unblock signal fail!\n");
+
+    }
+
     main_loop();
     quit_timers();
     return 0;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/Makefile      Wed Jul 19 16:09:59 2006 +0800
@@ -33,7 +33,7 @@ GUEST_SRCS-$(CONFIG_X86) += xc_linux_bui
 GUEST_SRCS-$(CONFIG_X86) += xc_linux_build.c
 GUEST_SRCS-$(CONFIG_IA64) += xc_ia64_stubs.c xc_linux_build.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_linux_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_restore.c xc_hvm_save.c
 
 CFLAGS   += -Werror
 CFLAGS   += -fno-strict-aliasing
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xc_domain.c   Wed Jul 19 16:09:59 2006 +0800
@@ -182,6 +182,50 @@ int xc_domain_getinfolist(int xc_handle,
         ret = -1;
 
     return ret;
+}
+
+/* get info from hvm guest for save */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOM0_OP;
+
+    op.cmd = DOM0_GETHVMCONTEXT;
+    op.u.gethvmcontext.domain = (domid_t)domid;
+    set_xen_guest_handle(op.u.gethvmcontext.hvm_ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_dom0_op(xc_handle, &op);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
+}
+
+/* set info to hvm guest for restore */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOM0_OP;
+
+    op.cmd = DOM0_SETHVMCONTEXT;
+    op.u.sethvmcontext.domain = (domid_t)domid;
+    set_xen_guest_handle(op.u.sethvmcontext.hvm_ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_dom0_op(xc_handle, &op);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
 }
 
 int xc_vcpu_getcontext(int xc_handle,
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xc_hvm_build.c        Wed Jul 19 16:09:59 2006 +0800
@@ -60,11 +60,11 @@ static unsigned char build_e820map(void 
 
     /* XXX: Doesn't work for > 4GB yet */
     e820entry[nr_map].addr = 0x0;
-    e820entry[nr_map].size = 0x9F800;
+    e820entry[nr_map].size = 0x90000;
     e820entry[nr_map].type = E820_RAM;
     nr_map++;
 
-    e820entry[nr_map].addr = 0x9F800;
+    e820entry[nr_map].addr = 0x90000;
     e820entry[nr_map].size = 0x800;
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xc_linux_save.c       Wed Jul 19 16:09:59 2006 +0800
@@ -261,15 +261,6 @@ static int ratewrite(int io_fd, void *bu
 #endif
 
 
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
-    if(write(fd, buf, count) != count)
-        return 0;
-    return 1;
-}
-
-
-
 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                        xc_shadow_control_stats_t *stats, int print)
 {
@@ -358,7 +349,7 @@ static int analysis_phase(int xc_handle,
 }
 
 
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                              int dom, xc_dominfo_t *info,
                              vcpu_guest_context_t *ctxt)
 {
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xenctrl.h     Wed Jul 19 16:09:59 2006 +0800
@@ -286,6 +286,30 @@ int xc_domain_getinfolist(int xc_handle,
                           xc_domaininfo_t *info);
 
 /**
+ * This function returns information about the context of a hvm domain
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to get information from
+ * @parm hvm_ctxt a pointer to a structure to store the execution context of the
+ *            hvm domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
+ * This function will set the context for hvm domain
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to set the hvm domain context for
+ * @parm hvm_ctxt pointer to the the hvm context with the values to set
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
  *
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xenguest.h    Wed Jul 19 16:09:59 2006 +0800
@@ -11,6 +11,7 @@
 
 #define XCFLAGS_LIVE      1
 #define XCFLAGS_DEBUG     2
+#define XCFLAGS_HVM       4
 
 
 /**
@@ -25,6 +26,13 @@ int xc_linux_save(int xc_handle, int io_
                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
                   int (*suspend)(int domid));
 
+/**
+ * This function will save a hvm domain running unmodified guest.
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+                  int (*suspend)(int domid));
 
 /**
  * This function will restore a saved domain running Linux.
@@ -41,6 +49,17 @@ int xc_linux_restore(int xc_handle, int 
                      unsigned long nr_pfns, unsigned int store_evtchn,
                      unsigned long *store_mfn, unsigned int console_evtchn,
                      unsigned long *console_mfn);
+
+/**
+ * This function will restore a saved hvm domain running unmodified guest.
+ *
+ * @parm store_mfn pass mem size & returned with the mfn of the store page
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned long nr_pfns, unsigned int store_evtchn,
+                      unsigned long *store_mfn, unsigned int console_evtchn,
+                      unsigned long *console_mfn);
 
 /**
  * This function will create a domain for a paravirtualized Linux
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xg_save_restore.h     Wed Jul 19 16:09:59 2006 +0800
@@ -65,6 +65,16 @@ static int get_platform_info(int xc_hand
     return 1;
 }
 
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    if(write(fd, buf, count) != count)
+        return 0;
+    return 1;
+}
+
+extern int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt);
 
 /*
 ** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Jul 19 16:09:59 2006 +0800
@@ -132,6 +132,20 @@ static PyObject *pyxc_domain_destroy(XcO
 static PyObject *pyxc_domain_destroy(XcObject *self, PyObject *args)
 {
     return dom_op(self, args, xc_domain_destroy);
+}
+
+static PyObject *pyxc_domain_shutdown(XcObject *self, PyObject *args)
+{
+    uint32_t dom, reason;
+
+    if (!PyArg_ParseTuple(args, "ii", &dom, &reason))
+      return NULL;
+
+    if (xc_domain_shutdown(self->xc_handle, dom, reason) != 0)
+        return PyErr_SetFromErrno(xc_error);
+    
+    Py_INCREF(zero);
+    return zero;
 }
 
 
@@ -966,6 +980,14 @@ static PyMethodDef pyxc_methods[] = {
       METH_VARARGS, "\n"
       "Destroy a domain.\n"
       " dom [int]:    Identifier of domain to be destroyed.\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "domain_shutdown", 
+      (PyCFunction)pyxc_domain_shutdown,
+      METH_VARARGS, "\n"
+      "Shutdown a domain.\n"
+      " dom       [int, 0]:      Domain identifier to use.\n"
+      " reason     [int, 0]:      Reason for shutdown.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "vcpu_setaffinity", 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/xend/XendCheckpoint.py   Wed Jul 19 16:09:59 2006 +0800
@@ -25,11 +25,14 @@ from XendDomainInfo import DEV_MIGRATE_S
 from XendDomainInfo import DEV_MIGRATE_STEP3
 
 SIGNATURE = "LinuxGuestRecord"
+QEMU_SIGNATURE = "QemuDeviceModelRecord"
+dm_batch = 512
 XC_SAVE = "xc_save"
 XC_RESTORE = "xc_restore"
 
 
 sizeof_int = calcsize("i")
+sizeof_unsigned_int = calcsize("I")
 sizeof_unsigned_long = calcsize("L")
 
 
@@ -72,6 +75,10 @@ def save(fd, dominfo, network, live, dst
                     "could not write guest state file: config len")
         write_exact(fd, config, "could not write guest state file: config")
 
+        hvm = 0
+        if dominfo.info['image'][0] == 'hvm':
+            hvm = 1
+        log.info("save hvm domain %d", hvm)
         # xc_save takes three customization parameters: maxit, max_f, and
         # flags the last controls whether or not save is 'live', while the
         # first two further customize behaviour when 'live' save is
@@ -79,7 +86,7 @@ def save(fd, dominfo, network, live, dst
         # libxenguest; see the comments and/or code in xc_linux_save() for
         # more information.
         cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(xc.handle()), str(fd),
-               str(dominfo.getDomid()), "0", "0", str(int(live)) ]
+               str(dominfo.getDomid()), "0", "0", str(int(live) | int(hvm << 2)) ]
         log.debug("[xc_save]: %s", string.join(cmd))
 
         def saveInputHandler(line, tochild):
@@ -93,11 +100,28 @@ def save(fd, dominfo, network, live, dst
                 log.info("Domain %d suspended.", dominfo.getDomid())
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP3,
                                        domain_name)
+                #send signal to device model for save
+                if hvm == 1:
+                    log.info("release_devices for hvm domain")
+                    dominfo.release_devices(True)
                 tochild.write("done\n")
                 tochild.flush()
                 log.debug('Written done')
 
         forkHelper(cmd, fd, saveInputHandler, False)
+
+        # put qemu device model state
+        if hvm:
+            write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature")
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(), os.O_RDONLY)
+            while True:
+                buf = os.read(qemu_fd, dm_batch)
+                if len(buf):
+                    write_exact(fd, buf, "could not write device model state")
+                else:
+                    break
+            os.close(qemu_fd)
+            os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid())
 
         dominfo.destroyDomain()
 
@@ -139,10 +163,21 @@ def restore(xd, fd):
     assert store_port
     assert console_port
 
-    try:
-        l = read_exact(fd, sizeof_unsigned_long,
-                       "not a valid guest state file: pfn count read")
-        nr_pfns = unpack("L", l)[0]    # native sizeof long
+    #if hvm, pass mem size to calculate the store_mfn
+    hvm = 0
+    if dominfo.info['image'][0] == 'hvm':
+        hvm = dominfo.info['memory']
+        log.info("restore hvm domain %d, mem=%d", dominfo.domid, hvm)
+
+    try:
+        if hvm:
+            l = read_exact(fd, sizeof_unsigned_int,
+                    "not a valid hvm guest state file: pfn count read")
+            nr_pfns = unpack("I", l)[0]    # native sizeof int
+        else:
+            l = read_exact(fd, sizeof_unsigned_long,
+                           "not a valid guest state file: pfn count read")
+            nr_pfns = unpack("L", l)[0]    # native sizeof long
         if nr_pfns > 16*1024*1024:     # XXX 
             raise XendError(
                 "not a valid guest state file: pfn count out of range")
@@ -151,7 +186,7 @@ def restore(xd, fd):
 
         cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
                         xc.handle(), fd, dominfo.getDomid(), nr_pfns,
-                        store_port, console_port])
+                        store_port, console_port, hvm])
         log.debug("[xc_restore]: %s", string.join(cmd))
 
         handler = RestoreInputHandler()
@@ -163,6 +198,23 @@ def restore(xd, fd):
 
         dominfo.unpause()
 
+        # get qemu state and create a tmp file for dm restore
+        if hvm:
+            qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
+                "not a valid device model state: signature read")
+            if qemu_signature != QEMU_SIGNATURE:
+                raise XendError("not a valid device model state: found '%s'" %
+                                signature)
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(),
+                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
+            while True:
+                buf = os.read(fd, dm_batch)
+                if len(buf):
+                    write_exact(qemu_fd, buf, "could not write dm state to tmp file")
+                else:
+                    break
+            os.close(qemu_fd)
+        
         dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
 
         return dominfo
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/xend/XendDomainInfo.py   Wed Jul 19 16:09:59 2006 +0800
@@ -668,6 +668,13 @@ class XendDomainInfo:
         self.console_mfn = console_mfn
 
         self.introduceDomain()
+        if self.info['image'][0] == 'hvm':
+            self.image = image.create(self,
+                                      self.info['image'],
+                                      self.info['device'])
+            if self.image:
+                log.debug("createDevicemodel for hvm domain restore")
+                self.image.createDeviceModel(True)
         self.storeDomDetails()
         self.registerWatches()
         self.refreshShutdown()
@@ -945,6 +952,13 @@ class XendDomainInfo:
             raise XendError('Invalid reason: %s' % reason)
         self.storeDom("control/shutdown", reason)
 
+        ## shutdown hypercall for hvm domain desides xenstore write
+        if self.info['image'][0] == 'hvm':
+            for code in shutdown_reasons.keys():
+                if shutdown_reasons[code] == reason:
+                    break
+            xc.domain_shutdown(self.domid, code)
+
 
     ## private:
 
@@ -1417,8 +1431,11 @@ class XendDomainInfo:
 
     ## private:
 
-    def release_devices(self):
+    def release_devices(self, suspend = False):
         """Release all domain's devices.  Nothrow guarantee."""
+        if suspend and self.image:
+            self.image.destroy(suspend)
+            return
 
         while True:
             t = xstransact("%s/device" % self.dompath)
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/xend/image.py    Wed Jul 19 16:09:59 2006 +0800
@@ -157,7 +157,7 @@ class ImageHandler:
         """Build the domain. Define in subclass."""
         raise NotImplementedError()
 
-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         """Create device model for the domain (define in subclass if needed)."""
         pass
     
@@ -331,7 +331,7 @@ class HVMImageHandler(ImageHandler):
             ret = ret + ['-vnc', '%d' % vncdisplay, '-k', 'en-us']
         return ret
 
-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         if self.pid:
             return
         # Execute device model.
@@ -340,6 +340,8 @@ class HVMImageHandler(ImageHandler):
         args = args + ([ "-d",  "%d" % self.vm.getDomid(),
                   "-m", "%s" % (self.vm.getMemoryTarget() / 1024)])
         args = args + self.dmargs
+        if restore:
+            args = args + ([ "-loadvm", "/tmp/xen.qemu-dm.%d" % self.vm.getDomid() ])
         env = dict(os.environ)
         if self.display:
             env['DISPLAY'] = self.display
@@ -351,12 +353,16 @@ class HVMImageHandler(ImageHandler):
         self.pid = os.spawnve(os.P_NOWAIT, self.device_model, args, env)
         log.info("device model pid: %d", self.pid)
 
-    def destroy(self):
+    def destroy(self, suspend = False):
         self.unregister_shutdown_watch();
         import signal
         if not self.pid:
             return
-        os.kill(self.pid, signal.SIGKILL)
+        sig = signal.SIGKILL
+        if suspend:
+            log.info("use sigusr1 to signal qemu %d", self.pid)
+            sig = signal.SIGUSR1
+        os.kill(self.pid, sig)
         os.waitpid(self.pid, 0)
         self.pid = 0
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/xcutils/xc_restore.c
--- a/tools/xcutils/xc_restore.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/xcutils/xc_restore.c        Wed Jul 19 16:09:59 2006 +0800
@@ -18,12 +18,13 @@ main(int argc, char **argv)
 main(int argc, char **argv)
 {
     unsigned int xc_fd, io_fd, domid, nr_pfns, store_evtchn, console_evtchn;
+    unsigned int hvm;
     int ret;
     unsigned long store_mfn, console_mfn;
 
-    if (argc != 7)
+    if (argc != 8)
        errx(1,
-            "usage: %s xcfd iofd domid nr_pfns store_evtchn console_evtchn",
+            "usage: %s xcfd iofd domid nr_pfns store_evtchn console_evtchn is_hvm",
             argv[0]);
 
     xc_fd = atoi(argv[1]);
@@ -32,9 +33,17 @@ main(int argc, char **argv)
     nr_pfns = atoi(argv[4]);
     store_evtchn = atoi(argv[5]);
     console_evtchn = atoi(argv[6]);
+    hvm = atoi(argv[7]);
 
-    ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
-                          &store_mfn, console_evtchn, &console_mfn);
+    if (hvm) {
+        /* pass the memsize to xc_hvm_restore to find the store_mfn */
+        store_mfn = hvm;
+        ret = xc_hvm_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn);
+    } else
+        ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn);
+
     if (ret == 0) {
        printf("store-mfn %li\n", store_mfn);
        printf("console-mfn %li\n", console_mfn);
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/xcutils/xc_save.c   Wed Jul 19 16:09:59 2006 +0800
@@ -47,5 +47,8 @@ main(int argc, char **argv)
     max_f = atoi(argv[5]);
     flags = atoi(argv[6]);
 
-    return xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    if (flags & XCFLAGS_HVM)
+        return xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    else
+        return xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
 }
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/dom0_ops.c   Wed Jul 19 16:09:59 2006 +0800
@@ -454,6 +454,7 @@ void arch_getdomaininfo_ctxt(
     if ( hvm_guest(v) )
     {
         hvm_store_cpu_guest_regs(v, &c->user_regs, c->ctrlreg);
+        hvm_save_cpu_context(v, &c->hvmcpu_ctxt);
     }
     else
     {
@@ -473,6 +474,25 @@ void arch_getdomaininfo_ctxt(
     c->ctrlreg[3] = xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table));
 
     c->vm_assist = v->domain->vm_assist;
+}
+
+void arch_gethvm_ctxt(
+    struct vcpu *v, struct hvm_domain_context *c)
+{
+    if ( !hvm_guest(v) )
+        return;
+
+    hvm_save(v, c);
+
+}
+
+void arch_sethvm_ctxt(
+        struct vcpu *v, struct hvm_domain_context *c)
+{
+    if ( !hvm_guest(v) )
+        return;
+
+    hvm_load(v, c);
 }
 
 /*
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/hvm.c    Wed Jul 19 16:09:59 2006 +0800
@@ -182,6 +182,8 @@ static void hvm_get_info(struct domain *
     unmap_domain_page(p);
 }
 
+extern void arch_sethvm_ctxt(
+    struct vcpu *, struct hvm_domain_context *);
 void hvm_setup_platform(struct domain* d)
 {
     struct hvm_domain *platform;
@@ -211,6 +213,16 @@ void hvm_setup_platform(struct domain* d
 
     init_timer(&platform->pl_time.periodic_tm.timer, pt_timer_fn, v, v->processor);
     pit_init(v, cpu_khz);
+
+    /* restore hvm context including pic/pit/shpage */
+    shpage_init(get_sp(d));
+
+    if (platform->hvm_ctxt) {
+        arch_sethvm_ctxt(current, platform->hvm_ctxt);
+        xfree(platform->hvm_ctxt);
+        platform->hvm_ctxt = NULL;
+    }
+
 }
 
 void pic_irq_request(void *data, int level)
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/i8254.c  Wed Jul 19 16:09:59 2006 +0800
@@ -357,6 +357,142 @@ static uint32_t pit_ioport_read(void *op
     return ret;
 }
 
+void pit_info(PITState *pit)
+{
+    PITChannelState *s;
+    int i;
+
+    for(i = 0; i < 3; i++) {
+        printk("*****pit channel %d's state:*****\n", i);
+        s = &pit->channels[i];
+        printk("pit 0x%x.\n", s->count);
+        printk("pit 0x%x.\n", s->latched_count);
+        printk("pit 0x%x.\n", s->count_latched);
+        printk("pit 0x%x.\n", s->status_latched);
+        printk("pit 0x%x.\n", s->status);
+        printk("pit 0x%x.\n", s->read_state);
+        printk("pit 0x%x.\n", s->write_state);
+        printk("pit 0x%x.\n", s->write_latch);
+        printk("pit 0x%x.\n", s->rw_mode);
+        printk("pit 0x%x.\n", s->mode);
+        printk("pit 0x%x.\n", s->bcd);
+        printk("pit 0x%x.\n", s->gate);
+        printk("pit %"PRId64"\n", s->count_load_time);
+
+        if (s->pt) {
+            struct periodic_time *pt = s->pt;
+            printk("pit channel %d has a periodic timer:\n", i);
+            printk("pt %d.\n", pt->enabled);
+            printk("pt %d.\n", pt->one_shot);
+            printk("pt %d.\n", pt->irq);
+            printk("pt %d.\n", pt->first_injected);
+
+            printk("pt %d.\n", pt->pending_intr_nr);
+            printk("pt %d.\n", pt->period);
+            printk("pt %"PRId64"\n", pt->period_cycles);
+            printk("pt %"PRId64"\n", pt->last_plt_gtime);
+        }
+    }
+
+}
+
+static void pit_save(hvm_domain_context_t *h, void *opaque)
+{
+    PITState *pit = opaque;
+    PITChannelState *s;
+    struct periodic_time *pt;
+    int i, pti = -1;
+    
+#ifdef HVM_DEBUG_SUSPEND
+    pit_info(pit);
+#endif
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        hvm_put_32u(h, s->count);
+        hvm_put_16u(h, s->latched_count);
+        hvm_put_8u(h, s->count_latched);
+        hvm_put_8u(h, s->status_latched);
+        hvm_put_8u(h, s->status);
+        hvm_put_8u(h, s->read_state);
+        hvm_put_8u(h, s->write_state);
+        hvm_put_8u(h, s->write_latch);
+        hvm_put_8u(h, s->rw_mode);
+        hvm_put_8u(h, s->mode);
+        hvm_put_8u(h, s->bcd);
+        hvm_put_8u(h, s->gate);
+        hvm_put_64u(h, s->count_load_time);
+
+        if (s->pt && pti == -1)
+            pti = i;
+    }
+
+    /* save guest time */
+    pt = pit->channels[pti].pt;
+    hvm_put_8u(h, pti);
+    hvm_put_8u(h, pt->first_injected);
+    hvm_put_32u(h, pt->pending_intr_nr);
+    hvm_put_64u(h, pt->last_plt_gtime);
+
+}
+
+static int pit_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    PITState *pit = opaque;
+    PITChannelState *s;
+    int i, pti;
+    u32 period;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        s->count = hvm_get_32u(h);
+        s->latched_count = hvm_get_16u(h);
+        s->count_latched = hvm_get_8u(h);
+        s->status_latched = hvm_get_8u(h);
+        s->status = hvm_get_8u(h);
+        s->read_state = hvm_get_8u(h);
+        s->write_state = hvm_get_8u(h);
+        s->write_latch = hvm_get_8u(h);
+        s->rw_mode = hvm_get_8u(h);
+        s->mode = hvm_get_8u(h);
+        s->bcd = hvm_get_8u(h);
+        s->gate = hvm_get_8u(h);
+        s->count_load_time = hvm_get_64u(h);
+    }
+
+    pti = hvm_get_8u(h);
+    s = &pit->channels[pti];
+    period = DIV_ROUND((s->count * 1000000000ULL), PIT_FREQ);
+
+    printk("recreate periodic timer %d in mode %d, freq=%d.\n", pti, s->mode, period);
+    switch (s->mode) {
+        case 2:
+            /* create periodic time */
+            s->pt = create_periodic_time(s, period, 0, 0);
+            break;
+        case 1:
+            /* create one shot time */
+            s->pt = create_periodic_time(s, period, 0, 1);
+            break;
+        default:
+            break;
+    }
+
+    s->pt->first_injected = hvm_get_8u(h);
+    s->pt->pending_intr_nr = hvm_get_32u(h);
+    s->pt->last_plt_gtime = hvm_get_64u(h);
+    /*XXX: need set_guest_time here or do this when post_inject? */
+ 
+#ifdef HVM_DEBUG_SUSPEND
+    pit_info(pit);
+#endif
+
+    return 0;
+}
+
 static void pit_reset(void *opaque)
 {
     PITState *pit = opaque;
@@ -385,6 +521,8 @@ void pit_init(struct vcpu *v, unsigned l
     s->vcpu = v;
     s++; s->vcpu = v;
     s++; s->vcpu = v;
+
+    hvm_register_savevm("xen_hvm_i8254", PIT_BASE, 1, pit_save, pit_load, pit);
 
     register_portio_handler(PIT_BASE, 4, handle_pit_io);
     /* register the speaker port */
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/i8259.c
--- a/xen/arch/x86/hvm/i8259.c  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/i8259.c  Wed Jul 19 16:09:59 2006 +0800
@@ -454,12 +454,91 @@ static uint32_t elcr_ioport_read(void *o
     return s->elcr;
 }
 
+void pic_info(PicState *s)
+{
+    /* Debug dump of one i8259's state.  Each field is labelled so the
+     * output can actually be matched against the structure (the
+     * original printed 16 indistinguishable "pic 0x%x" lines). */
+    printk("*****pic state:*****\n");
+    printk("pic last_irr=0x%x.\n", s->last_irr);
+    printk("pic irr=0x%x.\n", s->irr);
+    printk("pic imr=0x%x.\n", s->imr);
+    printk("pic isr=0x%x.\n", s->isr);
+    printk("pic priority_add=0x%x.\n", s->priority_add);
+    printk("pic irq_base=0x%x.\n", s->irq_base);
+    printk("pic read_reg_select=0x%x.\n", s->read_reg_select);
+    printk("pic poll=0x%x.\n", s->poll);
+    printk("pic special_mask=0x%x.\n", s->special_mask);
+    printk("pic init_state=0x%x.\n", s->init_state);
+    printk("pic auto_eoi=0x%x.\n", s->auto_eoi);
+    printk("pic rotate_on_auto_eoi=0x%x.\n", s->rotate_on_auto_eoi);
+    printk("pic special_fully_nested_mode=0x%x.\n", s->special_fully_nested_mode);
+    printk("pic init4=0x%x.\n", s->init4);
+    printk("pic elcr=0x%x.\n", s->elcr);
+    printk("pic elcr_mask=0x%x.\n", s->elcr_mask);
+}
+
+static void pic_save(hvm_domain_context_t *h, void *opaque)
+{
+    /* Serialise one i8259 PIC's state into the save image.
+     * Field order must match pic_load() exactly. */
+    PicState *s = opaque;
+    
+#ifdef HVM_DEBUG_SUSPEND
+    pic_info(s);
+#endif
+
+    hvm_put_8u(h, s->last_irr);
+    hvm_put_8u(h, s->irr);
+    hvm_put_8u(h, s->imr);
+    hvm_put_8u(h, s->isr);
+    hvm_put_8u(h, s->priority_add);
+    hvm_put_8u(h, s->irq_base);
+    hvm_put_8u(h, s->read_reg_select);
+    hvm_put_8u(h, s->poll);
+    hvm_put_8u(h, s->special_mask);
+    hvm_put_8u(h, s->init_state);
+    hvm_put_8u(h, s->auto_eoi);
+    hvm_put_8u(h, s->rotate_on_auto_eoi);
+    hvm_put_8u(h, s->special_fully_nested_mode);
+    hvm_put_8u(h, s->init4);
+    hvm_put_8u(h, s->elcr);
+    hvm_put_8u(h, s->elcr_mask);
+}
+
+static int pic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    /* Restore one i8259 PIC's state.  Reads fields in exactly the
+     * order pic_save() wrote them; returns -EINVAL on an unknown
+     * record version, 0 on success. */
+    PicState *s = opaque;
+    
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->last_irr = hvm_get_8u(h);
+    s->irr = hvm_get_8u(h);
+    s->imr = hvm_get_8u(h);
+    s->isr = hvm_get_8u(h);
+    s->priority_add = hvm_get_8u(h);
+    s->irq_base = hvm_get_8u(h);
+    s->read_reg_select= hvm_get_8u(h);
+    s->poll = hvm_get_8u(h);
+    s->special_mask = hvm_get_8u(h);
+    s->init_state = hvm_get_8u(h);
+    s->auto_eoi = hvm_get_8u(h);
+    s->rotate_on_auto_eoi = hvm_get_8u(h);
+    s->special_fully_nested_mode = hvm_get_8u(h);
+    s->init4 = hvm_get_8u(h);
+    s->elcr = hvm_get_8u(h);
+    s->elcr_mask = hvm_get_8u(h);
+
+#ifdef HVM_DEBUG_SUSPEND
+    pic_info(s);
+#endif
+
+    return 0;
+}
+
 /* XXX: add generic master/slave system */
 /* Caller must hold vpic lock */
 static void pic_init1(int io_addr, int elcr_addr, PicState *s)
 {
     BUG_ON(!spin_is_locked(&s->pics_state->lock));
 
+    hvm_register_savevm("xen_hvm_i8259", io_addr, 1, pic_save, pic_load, s);
     pic_reset(s);
 }
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/intercept.c
--- a/xen/arch/x86/hvm/intercept.c      Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/intercept.c      Wed Jul 19 16:09:59 2006 +0800
@@ -29,6 +29,8 @@
 #include <asm/current.h>
 #include <io_ports.h>
 #include <xen/event.h>
+#include <xen/compile.h>
+#include <public/version.h>
 
 
 extern struct hvm_mmio_handler vlapic_mmio_handler;
@@ -303,6 +305,266 @@ void destroy_periodic_time(struct period
     }
 }
 
+/* save/restore support */
+#define HVM_FILE_MAGIC   0x54381286
+#define HVM_FILE_VERSION 0x00000001
+
+int hvm_register_savevm(const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque)
+{
+    HVMStateEntry *se, **pse;
+    struct vcpu *v = current;
+
+    /* Register a device save/load handler pair for the current HVM
+     * domain.  Entries are appended, so the save image layout follows
+     * registration order.  Returns 0 on success, -1 on failure. */
+    if (!hvm_guest(v)) {
+        printk("register savevm only for hvm guest!\n");
+        return -1;
+    }
+
+    if ( (se = xmalloc(struct HVMStateEntry)) == NULL ){
+        printk("allocate hvmstate entry fail.\n");
+        return -1;
+    }
+
+    strncpy(se->idstr, idstr, HVM_SE_IDSTR_LEN);
+    /* strncpy() does not NUL-terminate when idstr has
+     * HVM_SE_IDSTR_LEN or more characters; terminate explicitly so
+     * find_se()'s strncmp and hvm_save()'s strnlen always see a
+     * valid string. */
+    se->idstr[HVM_SE_IDSTR_LEN - 1] = '\0';
+
+    se->instance_id = instance_id;
+    se->version_id = version_id;
+    se->save_state = save_state;
+    se->load_state = load_state;
+    se->opaque = opaque;
+    se->next = NULL;
+
+    /* add at the end of list */
+    pse = &v->domain->arch.hvm_domain.first_se;
+    while (*pse != NULL)
+        pse = &(*pse)->next;
+    *pse = se;
+    return 0;
+}
+
+int hvm_save(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, len_pos, cur_pos;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char *chgset;
+
+    /*
+     * Build the domain save image in 'h'.  Layout:
+     *   magic, version, changeset (1 len byte + chars), cpuid(1).eax,
+     * then one record per registered device:
+     *   idstr (1 len byte + chars), instance_id, version_id,
+     *   payload length (patched in after the handler runs), payload.
+     * hvm_load() parses this same layout.  Returns 0 on success.
+     */
+    if (!hvm_guest(v)) {
+        printk("hvm_save only for hvm guest!\n");
+        return -1;
+    }
+
+    memset(h, 0, sizeof(hvm_domain_context_t));
+    hvm_put_32u(h, HVM_FILE_MAGIC);
+    hvm_put_32u(h, HVM_FILE_VERSION);
+
+    /* save xen changeset */
+    chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = strlen(chgset);
+    hvm_put_8u(h, len);
+    hvm_put_buffer(h, chgset, len);
+
+    /* save cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    hvm_put_32u(h, eax);
+
+    for(se = v->domain->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        /* ID string */
+        len = strnlen(se->idstr, HVM_SE_IDSTR_LEN);
+        hvm_put_8u(h, len);
+        hvm_put_buffer(h, se->idstr, len);
+
+        hvm_put_32u(h, se->instance_id);
+        hvm_put_32u(h, se->version_id);
+
+        /* record size: write a placeholder, run the handler, then seek
+         * back and patch in the real payload length */
+        len_pos = hvm_ctxt_tell(h);
+        hvm_put_32u(h, 0);
+
+        se->save_state(h, se->opaque);
+
+        cur_pos = hvm_ctxt_tell(h);
+        len = cur_pos - len_pos - 4;
+        hvm_ctxt_seek(h, len_pos);
+        hvm_put_32u(h, len);
+        hvm_ctxt_seek(h, cur_pos);
+
+    }
+
+    h->size = hvm_ctxt_tell(h);
+    hvm_ctxt_seek(h, 0);
+
+    return 0;
+
+}
+
+/* Look up the handler registered for this device id / instance pair;
+ * returns NULL when the device was not registered on this host.
+ * (Signature was line-wrap corrupted in the original patch email.) */
+static HVMStateEntry *find_se(struct domain *d, const char *idstr, int instance_id)
+{
+    HVMStateEntry *se;
+
+    for(se = d->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        if (!strncmp(se->idstr, idstr, HVM_SE_IDSTR_LEN) &&
+            instance_id == se->instance_id){
+            return se;
+        }
+    }
+    return NULL;
+}
+
+int hvm_load(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, rec_len, rec_pos, magic, instance_id, version_id;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char idstr[HVM_SE_IDSTR_LEN];
+    xen_changeset_info_t chgset;
+    char *cur_chgset;
+
+    /*
+     * Parse and apply a save image produced by hvm_save(): header
+     * (magic, version), changeset string, cpuid word, then a sequence
+     * of per-device records dispatched to the registered handlers.
+     * All length bytes come from the (untrusted) image and are
+     * bounds-checked before use.  Returns 0 on success.
+     */
+    if (!hvm_guest(v)) {
+        printk("hvm_load only for hvm guest!\n");
+        return -1;
+    }
+
+    hvm_ctxt_seek(h, 0);
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_MAGIC) {
+        printk("HVM restore magic mismatch!\n");
+        return -1;
+    }
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_VERSION) {
+        printk("HVM restore version mismatch!\n");
+        return -1;
+    }
+
+    /* check xen change set */
+    cur_chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = hvm_get_8u(h);
+    /* len is image-controlled (up to 255); must fit the chgset buffer
+     * including the NUL written below */
+    if (len >= sizeof(chgset)) {
+        printk("HVM restore changeset string too long!\n");
+        return -1;
+    }
+    hvm_get_buffer(h, chgset, len);
+    chgset[len] = '\0';
+    if (strncmp(cur_chgset, chgset, len + 1))
+        printk("warning: try to restore hvm guest(%s) on a different changeset %s.\n",
+                chgset, cur_chgset);
+
+    /* check cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    ebx = hvm_get_32u(h);
+    /*TODO: need define how big difference is acceptable */
+    if (ebx != eax)
+        printk("warning: try to restore hvm guest(0x%"PRIx32") "
+               "on a different type processor(0x%"PRIx32").\n",
+                ebx,
+                eax);
+
+    while(1) {
+        if (hvm_ctxt_end(h)) {
+            break;
+        }
+
+        /* ID string: must leave room for the NUL terminator below;
+         * the original only warned and then overflowed idstr[] */
+        len = hvm_get_8u(h);
+        if (len >= HVM_SE_IDSTR_LEN) {
+            printk("HVM save entry idstr len wrong!\n");
+            return -1;
+        }
+
+        hvm_get_buffer(h, idstr, len);
+        idstr[len] = '\0';
+
+        instance_id = hvm_get_32u(h);
+        version_id = hvm_get_32u(h);
+
+        rec_len = hvm_get_32u(h);
+        rec_pos = hvm_ctxt_tell(h);
+
+        se = find_se(v->domain, idstr, instance_id);
+        if (se)
+            se->load_state(h, se->opaque, version_id);
+        else
+            /* was printing version_id for the instance number */
+            printk("warning: hvm load can't find device %s's instance %d!\n",
+                    idstr, instance_id);
+
+        /* make sure to jump end of record */
+        if ( hvm_ctxt_tell(h) - rec_pos != rec_len) {
+            printk("wrong hvm record size, maybe some mismatch between save&restore handler!\n");
+        }
+        hvm_ctxt_seek(h, rec_pos + rec_len);
+    }
+
+    return 0;
+}
+
+void shpage_info(shared_iopage_t *sh)
+{
+    /* Debug dump of vcpu0's in-flight ioreq slot and the global PIC
+     * line state kept on the shared I/O page.  (The final printk was
+     * line-wrap corrupted in the original patch email.) */
+    vcpu_iodata_t *p = &sh->vcpu_iodata[0];
+    ioreq_t *req = &p->vp_ioreq;
+    printk("*****sharepage_info******!\n");
+    printk("vp_eport=%d,dm_eport=%d\n", p->vp_eport, p->dm_eport);
+    printk("io packet: "
+                     "state:%x, pvalid: %x, dir:%x, port: %"PRIx64", "
+                     "data: %"PRIx64", count: %"PRIx64", size: %"PRIx64"\n",
+                     req->state, req->pdata_valid, req->dir, req->addr,
+                     req->u.data, req->count, req->size);
+    printk("pic_elcr=0x%x, pic_irr=0x%x, pic_last_irr=0x%x, pic_clear_irr=0x%x.\n",
+            sh->sp_global.pic_elcr,
+            sh->sp_global.pic_irr,
+            sh->sp_global.pic_last_irr,
+            sh->sp_global.pic_clear_irr);
+}
+
+static void shpage_save(hvm_domain_context_t *h, void *opaque)
+{
+    /* Save the shared I/O page: global PIC line state plus vcpu0's
+     * in-flight ioreq.  Only vcpu0 is handled (see XXX below). */
+    struct shared_iopage *s = opaque;
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+
+#ifdef HVM_DEBUG_SUSPEND
+    shpage_info(s);
+#endif
+
+    hvm_put_16u(h, s->sp_global.pic_elcr);
+    hvm_put_16u(h, s->sp_global.pic_irr);
+    hvm_put_16u(h, s->sp_global.pic_last_irr);
+    hvm_put_16u(h, s->sp_global.pic_clear_irr);
+
+    /* the ioreq struct is copied raw; shpage_load() must read the
+     * same sizeof(struct ioreq) bytes back */
+    hvm_put_buffer(h, (char*)req, sizeof(struct ioreq));
+}
+
+static int shpage_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    /* Restore the shared I/O page state saved by shpage_save();
+     * reads fields in the identical order.  Returns -EINVAL on an
+     * unknown record version, 0 on success. */
+    struct shared_iopage *s = opaque;
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->sp_global.pic_elcr = hvm_get_16u(h);
+    s->sp_global.pic_irr = hvm_get_16u(h);
+    s->sp_global.pic_last_irr = hvm_get_16u(h);
+    s->sp_global.pic_clear_irr = hvm_get_16u(h);
+
+    hvm_get_buffer(h, (char*)req, sizeof(struct ioreq));
+
+#ifdef HVM_DEBUG_SUSPEND
+    shpage_info(s);
+#endif
+
+    return 0;
+}
+
+void shpage_init(shared_iopage_t *sp)
+{
+    /* Include the shared I/O page (PIC lines + pending ioreq) in the
+     * save/restore image.  Instance id 0x10 is arbitrary but must stay
+     * the same across save and restore.  (The call was line-wrap
+     * corrupted in the original patch email.) */
+    hvm_register_savevm("xen_hvm_shpage", 0x10, 1, shpage_save, shpage_load, sp);
+}
+
 /*
  * Local variables:
  * mode: C
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Jul 19 16:09:59 2006 +0800
@@ -763,6 +763,7 @@ static void svm_relinquish_guest_resourc
 {
     extern void destroy_vmcb(struct arch_svm_struct *); /* XXX */
     struct vcpu *v;
+    HVMStateEntry *se, *dse;
 
     for_each_vcpu ( d, v )
     {
@@ -780,6 +781,13 @@ static void svm_relinquish_guest_resourc
     }
 
     kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
+
+    se = d->arch.hvm_domain.first_se;
+    while (se) {
+        dse = se;
+        se = se->next;
+        xfree(dse);
+    }
 
     if ( d->arch.hvm_domain.shared_page_va )
         unmap_domain_page_global(
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Jul 19 16:09:59 2006 +0800
@@ -572,6 +572,7 @@ void arch_vmx_do_launch(struct vcpu *v)
     }
 
     vmx_do_launch(v);
+    hvm_load_cpu_context(v, &v->arch.guest_context.hvmcpu_ctxt);
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Jul 19 16:09:59 2006 +0800
@@ -126,6 +126,7 @@ static void vmx_relinquish_guest_resourc
 static void vmx_relinquish_guest_resources(struct domain *d)
 {
     struct vcpu *v;
+    HVMStateEntry *se, *dse;
 
     for_each_vcpu ( d, v )
     {
@@ -142,6 +143,13 @@ static void vmx_relinquish_guest_resourc
     }
 
     kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
+
+    se = d->arch.hvm_domain.first_se;
+    while (se) {
+        dse = se;
+        se = se->next;
+        xfree(dse);
+    }
 
     if ( d->arch.hvm_domain.shared_page_va )
         unmap_domain_page_global(
@@ -521,6 +529,337 @@ static void vmx_store_cpu_guest_regs(
     }
 
     vmx_vmcs_exit(v);
+}
+
+int vmx_vmcs_save(struct vcpu *v, struct vmcs_data *c)
+{
+    /* Snapshot the guest architectural state out of the currently
+     * loaded VMCS into 'c'.  Returns nonzero on success, 0 if any
+     * __vmread failed.  Caller must hold the VMCS (vmx_vmcs_enter).
+     * NOTE(review): inst_len is only consumed by the debug printk
+     * below but is read unconditionally. */
+    unsigned long inst_len;
+    int error = 0;
+
+    error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
+    error |= __vmread(GUEST_RIP, &c->eip);
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: inst_len=0x%lx, eip=0x%"PRIx64".\n", 
+            inst_len, c->eip);
+#endif
+
+    error |= __vmread(GUEST_RSP, &c->esp);
+    error |= __vmread(GUEST_RFLAGS, &c->eflags);
+
+    /* cr0 comes from the read shadow (guest-visible value), cr3 from
+     * the cached guest cr3, not the shadow pagetable in GUEST_CR3 */
+    error |= __vmread(CR0_READ_SHADOW, &c->cr0);
+
+    c->cr3 = v->arch.hvm_vmx.cpu_cr3;
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: cr3=0x%"PRIx64".\n", c->cr3);
+#endif
+
+    error |= __vmread(CR4_READ_SHADOW, &c->cr4);
+
+    error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
+    error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
+
+    error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
+    error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
+
+    /* segment registers: selector, limit, base, access-rights each */
+    error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
+    error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
+    error |= __vmread(GUEST_CS_BASE, &c->cs_base);
+    error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes);
+
+    error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
+    error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
+    error |= __vmread(GUEST_DS_BASE, &c->ds_base);
+    error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes);
+
+    error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
+    error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
+    error |= __vmread(GUEST_ES_BASE, &c->es_base);
+    error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes);
+
+    error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
+    error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
+    error |= __vmread(GUEST_SS_BASE, &c->ss_base);
+    error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes);
+
+    error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
+    error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
+    error |= __vmread(GUEST_FS_BASE, &c->fs_base);
+    error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes);
+
+    error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
+    error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
+    error |= __vmread(GUEST_GS_BASE, &c->gs_base);
+    error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes);
+
+    error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
+    error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
+    error |= __vmread(GUEST_TR_BASE, &c->tr_base);
+    error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes);
+
+    error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
+    error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
+    error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
+    error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes);
+
+    error |= __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
+    error |= __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
+    error |= __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
+
+    return !error;
+}
+
+int vmx_vmcs_restore(struct vcpu *v, struct vmcs_data *c)
+{
+    unsigned long mfn, old_cr4, old_base_mfn;
+    int error = 0;
+
+    /*
+     * Write the saved architectural state in 'c' back into the current
+     * VMCS and rebuild the shadow pagetable reference for the saved
+     * guest cr3.  Returns nonzero on success (same convention as
+     * vmx_vmcs_save()).  Caller must hold the VMCS.
+     */
+    error |= __vmwrite(GUEST_RIP, c->eip);
+    error |= __vmwrite(GUEST_RSP, c->esp);
+    error |= __vmwrite(GUEST_RFLAGS, c->eflags);
+
+    error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
+
+    if (!vmx_paging_enabled(v)) {
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+        goto skip_cr3;
+    }
+
+    if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
+        /*
+         * This is simple TLB flush, implying the guest has
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
+        if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        shadow_sync_all(v->domain);
+    } else {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        /* fix: cr3 is 64-bit; %x mismatched the argument width */
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
+        if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
+        if(!get_page(mfn_to_page(mfn), v->domain))
+                return 0;
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+        if (old_base_mfn)
+             put_page(mfn_to_page(old_base_mfn));
+        /*
+         * arch.shadow_table should now hold the next CR3 for shadow
+         */
+        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
+        update_pagetables(v);
+        /* fix: cr3 is 64-bit; %x mismatched the argument width */
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %"PRIx64"", c->cr3);
+        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+    }
+
+ skip_cr3:
+
+    /* NOTE(review): old_cr4 is read but never used; kept so the error
+     * accumulation is unchanged — candidate for removal. */
+    error |= __vmread(CR4_READ_SHADOW, &old_cr4);
+    error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
+    error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
+
+    error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
+    error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
+    error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
+
+    error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
+
+    error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    error |= __vmwrite(GUEST_ES_BASE, c->es_base);
+    error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
+
+    error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
+
+    error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
+
+    error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
+
+    error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
+    error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
+
+    error |= __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
+    error |= __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
+    error |= __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
+
+    return !error;
+}
+
+/* Debug dump of the lazily-switched guest MSR set kept in
+ * arch.hvm_vmx.msr_content. */
+void dump_msr_state(struct vmx_msr_state *m)
+{
+    int i = 0;
+    printk("**** msr state ****\n");
+    printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        printk("0x%lx,", m->msr_items[i]);
+    printk("\n");
+}
+        
+/* Copy the lazily-switched MSR state and the cpu_state mode bits out
+ * of the vcpu into the save record.  No VMCS access is needed, so no
+ * vmx_vmcs_enter is required here. */
+void vmx_save_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_content;
+    unsigned long guest_flags = guest_state->flags;
+    int i = 0;
+
+    data->shadow_gs = guest_state->shadow_gs;
+    data->cpu_state = v->arch.hvm_vmx.cpu_state;
+    /* save msrs */
+    data->flags = guest_flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        data->msr_items[i] = guest_state->msr_items[i];
+
+#ifdef HVM_DEBUG_SUSPEND
+    dump_msr_state(guest_state);
+    printk("saved cpu_state=0x%"PRIX64"\n", data->cpu_state);
+#endif
+}
+
+/* Restore the MSR set and cpu_state mode bits saved by
+ * vmx_save_cpu_state(), then re-derive the VM-entry IA32E control and
+ * the shadow paging level (L4/L3/L2) from the restored LME/LMA/PAE
+ * bits. */
+void vmx_load_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    int i = 0;
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_content;
+
+    /* restore msrs */
+    guest_state->flags = data->flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        guest_state->msr_items[i] = data->msr_items[i];
+
+    guest_state->shadow_gs = data->shadow_gs;
+
+    vmx_restore_msrs(v);
+
+    v->arch.hvm_vmx.cpu_state = data->cpu_state;
+
+#ifdef HVM_DEBUG_SUSPEND
+    dump_msr_state(guest_state);
+    printk("restore cpu_state=0x%lx.\n", v->arch.hvm_vmx.cpu_state);
+
+#endif
+
+#if defined(__x86_64__)
+        /* long-mode-capable guest: mirror the LMA bit into the
+         * VM-entry controls and use 4-level shadow when active */
+        if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
+                     &v->arch.hvm_vmx.cpu_state) )
+        {
+            unsigned long vm_entry_value;
+            if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
+                        &v->arch.hvm_vmx.cpu_state) ) {
+                __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
+                vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
+                __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
+
+                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
+                {
+                    printk("Unsupported guest paging levels\n");
+                    domain_crash_synchronous(); /* need to take a clean path */
+                }
+            }
+        }
+        else
+#endif  /* __x86_64__ */
+        {
+#if CONFIG_PAGING_LEVELS >= 3
+            /* seems it's a 32-bit or 32-bit PAE guest */
+            if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
+                        &v->arch.hvm_vmx.cpu_state) )
+            {
+                /* The guest enables PAE first and then it enables PG, it is
+                 * really a PAE guest */
+                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
+                {
+                    printk("Unsupported guest paging levels\n");
+                    domain_crash_synchronous();
+                }
+            }
+            else
+            {
+                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
+                {
+                    printk("Unsupported guest paging levels\n");
+                    domain_crash_synchronous(); /* need to take a clean path */
+                }
+            }
+#endif
+        }
+
+}
+
+/* hvm_funcs.save_cpu_ctxt entry point: snapshot the whole vcpu cpu
+ * context.  MSR/cpu_state first (no VMCS needed), then the VMCS fields
+ * proper under vmx_vmcs_enter/exit. */
+void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+
+    /* set valid flag to recover whole vmcs when restore */
+    ctxt->valid = 1;
+
+    vmx_save_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    if (!vmx_vmcs_save(v, data))
+        printk("vmx_vmcs save failed!\n");
+
+    vmx_vmcs_exit(v);
+
+}
+
+/* hvm_funcs.load_cpu_ctxt entry point: restore a context captured by
+ * vmx_save_vmcs_ctxt().  A cleared valid flag means there is nothing
+ * to restore (or it was already consumed) and the call is a no-op. */
+void vmx_load_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    if (!ctxt->valid)
+        return;
+
+    vmx_load_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    if (!vmx_vmcs_restore(v, &ctxt->data))
+        printk("vmx_vmcs restore failed!\n");
+
+    /* only load vmcs once */
+    ctxt->valid = 0;
+
+    vmx_vmcs_exit(v);
+
+}
 
 /*
@@ -741,6 +1080,9 @@ int start_vmx(void)
 
     hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
     hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
+
+    hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt;
+    hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt;
 
     hvm_funcs.realmode = vmx_realmode;
     hvm_funcs.paging_enabled = vmx_paging_enabled;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c     Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/common/dom0_ops.c     Wed Jul 19 16:09:59 2006 +0800
@@ -27,6 +27,8 @@ extern long arch_do_dom0_op(
     struct dom0_op *op, XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op);
 extern void arch_getdomaininfo_ctxt(
     struct vcpu *, struct vcpu_guest_context *);
+extern void arch_gethvm_ctxt(
+    struct vcpu *, struct hvm_domain_context *);
 
 static inline int is_free_domid(domid_t dom)
 {
@@ -504,6 +506,77 @@ long do_dom0_op(XEN_GUEST_HANDLE(dom0_op
     }
     break;
 
+    case DOM0_GETHVMCONTEXT:
+    { 
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->u.gethvmcontext.domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto gethvmcontext_out;
+
+        /*XXX: need check input vcpu when smp */
+        v = d->vcpu[0];
+
+        ret = -ENODATA;
+        if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+        {
+            /* fix: 'c' was leaked on this error path */
+            xfree(c);
+            goto gethvmcontext_out;
+        }
+        
+        arch_gethvm_ctxt(v, c);
+
+        ret = 0;
+        if ( copy_to_guest(op->u.gethvmcontext.hvm_ctxt, c, 1) )
+            ret = -EFAULT;
+
+        xfree(c);
+
+        if ( copy_to_guest(u_dom0_op, op, 1) )
+            ret = -EFAULT;
+
+    gethvmcontext_out:
+        put_domain(d);
+    }
+    break;
+
+    case DOM0_SETHVMCONTEXT:
+    { 
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->u.sethvmcontext.domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto sethvmcontext_out;
+
+        /*XXX: need check input vcpu when smp */
+        v = d->vcpu[0];
+        
+        ret = -EFAULT;
+        if ( copy_from_guest(c, op->u.sethvmcontext.hvm_ctxt, 1) != 0 )
+        {
+            /* fix: 'c' was leaked on this error path */
+            xfree(c);
+            goto sethvmcontext_out;
+        }
+
+        /* store the data for future use; fix: drop any context left
+         * over from an earlier SETHVMCONTEXT so it is not leaked */
+        if ( d->arch.hvm_domain.hvm_ctxt != NULL )
+            xfree(d->arch.hvm_domain.hvm_ctxt);
+        d->arch.hvm_domain.hvm_ctxt = c;
+
+        ret = 0;
+
+        if ( copy_to_guest(u_dom0_op, op, 1) )
+            ret = -EFAULT;
+
+    sethvmcontext_out:
+        put_domain(d);
+    }
+    break;
+
+
     case DOM0_GETVCPUINFO:
     { 
         struct domain *d;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/asm-x86/hvm/domain.h  Wed Jul 19 16:09:59 2006 +0800
@@ -30,6 +30,20 @@
 
 #define HVM_PBUF_SIZE   80
 
+/* Handler pair each virtual device registers via hvm_register_savevm():
+ * 'save' serialises device state into the context image and 'load'
+ * restores it (consuming exactly the bytes 'save' produced).
+ * (The LoadStateHandler typedef was line-wrap corrupted in the
+ * original patch email.) */
+typedef void SaveStateHandler(hvm_domain_context_t *h, void *opaque);
+typedef int LoadStateHandler(hvm_domain_context_t *h, void *opaque, int version_id);
+
+#define HVM_SE_IDSTR_LEN 32
+/* One registered save/restore handler; kept in a singly-linked list
+ * (hvm_domain.first_se) in registration order. */
+typedef struct HVMStateEntry {
+    char idstr[HVM_SE_IDSTR_LEN];   /* device identifier (NUL-terminated) */
+    int instance_id;                /* distinguishes multiple instances */
+    int version_id;                 /* record layout version for load_state */
+    SaveStateHandler *save_state;
+    LoadStateHandler *load_state;
+    void *opaque;                   /* device state passed back to handlers */
+    struct HVMStateEntry *next;
+} HVMStateEntry;
+
 struct hvm_domain {
     unsigned long          shared_page_va;
     unsigned int           nr_vcpus;
@@ -48,6 +62,8 @@ struct hvm_domain {
 
     int                    pbuf_index;
     char                   pbuf[HVM_PBUF_SIZE];
+    struct hvm_domain_context *hvm_ctxt;
+    HVMStateEntry *first_se;
 };
 
 #endif /* __ASM_X86_HVM_DOMAIN_H__ */
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Jul 19 16:09:59 2006 +0800
@@ -47,6 +47,13 @@ struct hvm_function_table {
         struct vcpu *v, struct cpu_user_regs *r, unsigned long *crs);
     void (*load_cpu_guest_regs)(
         struct vcpu *v, struct cpu_user_regs *r);
+
+    /* save and load hvm guest cpu context for save/restore */
+    void (*save_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+    void (*load_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+
     /*
      * Examine specifics of the guest state:
      * 1) determine whether the guest is in real or vm8086 mode,
@@ -103,6 +110,20 @@ hvm_load_cpu_guest_regs(struct vcpu *v, 
     hvm_funcs.load_cpu_guest_regs(v, r);
 }
 
+/* Dispatch to the VMX/SVM-specific handler that snapshots the vcpu's
+ * cpu context for domain save. */
+static inline void
+hvm_save_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.save_cpu_ctxt(v, ctxt);
+}
+
+/* Dispatch to the VMX/SVM-specific handler that restores a previously
+ * saved vcpu cpu context. */
+static inline void
+hvm_load_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.load_cpu_ctxt(v, ctxt);
+}
+
 static inline int
 hvm_realmode(struct vcpu *v)
 {
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/asm-x86/hvm/support.h Wed Jul 19 16:09:59 2006 +0800
@@ -25,6 +25,7 @@
 #include <asm/types.h>
 #include <asm/regs.h>
 #include <asm/processor.h>
+#include <public/dom0_ops.h>
 
 #ifndef NDEBUG
 #define HVM_DEBUG 1
@@ -136,6 +137,129 @@ extern unsigned int opt_hvm_debug_level;
         domain_crash_synchronous();                             \
     } while (0)
 
+/* save/restore support */
+
+//#define HVM_DEBUG_SUSPEND
+
+extern int hvm_register_savevm(const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque);
+
+/* Position the context image's read/write cursor at byte 'pos'. */
+static inline void hvm_ctxt_seek(hvm_domain_context_t *h, unsigned int pos)
+{
+    h->cur = pos;
+}
+
+/* Current cursor offset into the context image. */
+static inline uint32_t hvm_ctxt_tell(hvm_domain_context_t *h)
+{
+    return h->cur;
+}
+
+/* True once the cursor has consumed all valid data (h->size) or has
+ * reached the hard capacity of the image buffer. */
+static inline int hvm_ctxt_end(hvm_domain_context_t *h)
+{
+    return (h->cur >= h->size || h->cur >= HVM_CTXT_SIZE);
+}
+
+/* Append one byte to the context image.
+ * NOTE(review): on overflow the byte is silently dropped after a
+ * printk; the caller has no way to detect the truncated image. */
+static inline void hvm_put_byte(hvm_domain_context_t *h, unsigned int i)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        printk("hvm_put_byte overflow.\n");
+        return;
+    }
+    h->data[h->cur++] = (char)i;
+}
+
+/* Append an 8-bit value to the context image. */
+static inline void hvm_put_8u(hvm_domain_context_t *h, uint8_t b)
+{
+    hvm_put_byte(h, b);
+}
+
+/* Append a 16-bit value, high byte first (big-endian on the wire);
+ * hvm_get_16u() mirrors this ordering. */
+static inline void hvm_put_16u(hvm_domain_context_t *h, uint16_t b)
+{
+    hvm_put_8u(h, b >> 8);
+    hvm_put_8u(h, b);
+}
+
+/* Append a 32-bit value, high half first; mirrored by hvm_get_32u(). */
+static inline void hvm_put_32u(hvm_domain_context_t *h, uint32_t b)
+{
+    hvm_put_16u(h, b >> 16);
+    hvm_put_16u(h, b);
+}
+
+/* Append a 64-bit value, high half first; mirrored by hvm_get_64u(). */
+static inline void hvm_put_64u(hvm_domain_context_t *h, uint64_t b)
+{
+    hvm_put_32u(h, b >> 32);
+    hvm_put_32u(h, b);
+}
+
+/* Bulk append to the context image.  Unlike hvm_put_byte() the
+ * original had no bounds check, so an oversized device record could
+ * overrun h->data; check before copying.  (The signature was also
+ * line-wrap corrupted in the original patch email.)  On overflow the
+ * data is dropped and the cursor does not advance. */
+static inline void hvm_put_buffer(hvm_domain_context_t *h, const char *buf, int len)
+{
+    if (len < 0 || h->cur + len > HVM_CTXT_SIZE) {
+        printk("hvm_put_buffer overflow.\n");
+        return;
+    }
+    memcpy(&h->data[h->cur], buf, len);
+    h->cur += len;
+}
+
+
+static inline char hvm_get_byte(hvm_domain_context_t *h)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        printk("hvm_get_byte overflow.\n");
+        return -1;
+    }
+
+    if (h->cur >= h->size) {
+        printk("hvm_get_byte exceed data area.\n");
+        return -1;
+    }
+
+    return h->data[h->cur++];
+}
+
+static inline uint8_t hvm_get_8u(hvm_domain_context_t *h)
+{
+    return hvm_get_byte(h);
+}
+
+static inline uint16_t hvm_get_16u(hvm_domain_context_t *h)
+{
+    uint16_t v;
+    v =  hvm_get_8u(h) << 8;
+    v |= hvm_get_8u(h);
+
+    return v;
+}
+
+static inline uint32_t hvm_get_32u(hvm_domain_context_t *h)
+{
+    uint32_t v;
+    v =  hvm_get_16u(h) << 16;
+    v |= hvm_get_16u(h);
+
+    return v;
+}
+
+static inline uint64_t hvm_get_64u(hvm_domain_context_t *h)
+{
+    uint64_t v;
+    v =  (uint64_t)hvm_get_32u(h) << 32;
+    v |= hvm_get_32u(h);
+
+    return v;
+}
+
+static inline void hvm_get_buffer(hvm_domain_context_t *h, char *buf, int len)
+{
+    memcpy(buf, &h->data[h->cur], len);
+    h->cur += len;
+}
+
+extern int hvm_save(struct vcpu*, hvm_domain_context_t *h);
+extern int hvm_load(struct vcpu*, hvm_domain_context_t *h);
+
+extern void shpage_init(shared_iopage_t *sp);
+
 extern int hvm_enabled;
 
 enum { HVM_COPY_IN = 0, HVM_COPY_OUT };
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/public/arch-x86_32.h  Wed Jul 19 16:09:59 2006 +0800
@@ -142,6 +142,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
 DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
 
 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};
 
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled 
@@ -174,6 +181,7 @@ struct vcpu_guest_context {
     unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
     unsigned long failsafe_callback_eip;
     unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/public/arch-x86_64.h  Wed Jul 19 16:09:59 2006 +0800
@@ -212,6 +212,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
 #undef __DECL_REG
 
 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};
 
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled 
@@ -249,6 +256,7 @@ struct vcpu_guest_context {
     uint64_t      fs_base;
     uint64_t      gs_base_kernel;
     uint64_t      gs_base_user;
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/public/dom0_ops.h     Wed Jul 19 16:09:59 2006 +0800
@@ -535,6 +535,31 @@ struct dom0_settimeoffset {
 };
 typedef struct dom0_settimeoffset dom0_settimeoffset_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_settimeoffset_t);
+
+#define HVM_CTXT_SIZE        4096
+typedef struct hvm_domain_context {
+    uint32_t cur;
+    uint32_t size;
+    uint8_t data[HVM_CTXT_SIZE];
+} hvm_domain_context_t;
+DEFINE_XEN_GUEST_HANDLE(hvm_domain_context_t);
+#define DOM0_GETHVMCONTEXT   51
+typedef struct dom0_gethvmcontext {
+    /* IN variables. */
+    domid_t  domain;                  /* domain to be affected */
+    /* OUT variables. */
+    XEN_GUEST_HANDLE(hvm_domain_context_t) hvm_ctxt;
+} dom0_gethvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_gethvmcontext_t);
+
+#define DOM0_SETHVMCONTEXT   52
+typedef struct dom0_sethvmcontext {
+    /* IN variables. */
+    domid_t  domain;                  /* domain to be affected */
+    /* OUT variables. */
+    XEN_GUEST_HANDLE(hvm_domain_context_t) hvm_ctxt;
+} dom0_sethvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_sethvmcontext_t);
 
 struct dom0_op {
     uint32_t cmd;
@@ -579,6 +604,8 @@ struct dom0_op {
         struct dom0_hypercall_init    hypercall_init;
         struct dom0_domain_setup      domain_setup;
         struct dom0_settimeoffset     settimeoffset;
+        struct dom0_gethvmcontext     gethvmcontext;
+        struct dom0_sethvmcontext     sethvmcontext;
         uint8_t                       pad[128];
     } u;
 };
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_hvm_restore.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_restore.c      Wed Jul 19 16:09:59 2006 +0800
@@ -0,0 +1,264 @@
+/******************************************************************************
+ * xc_hvm_restore.c
+ *
+ * Restore the state of a HVM guest.
+ *
+ * Copyright (c) 2006 Intel Corporation
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/ioreq.h>
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+static ssize_t
+read_exact(int fd, void *buf, size_t count)
+{
+    int r = 0, s;
+    unsigned char *b = buf;
+
+    while (r < count) {
+        s = read(fd, &b[r], count - r);
+        if ((s == -1) && (errno == EINTR))
+            continue;
+        if (s <= 0) {
+            break;
+        }
+        r += s;
+    }
+
+    return (r == count) ? 1 : 0;
+}
+
+int xc_hvm_restore(int xc_handle, int io_fd,
+                     uint32_t dom, unsigned long nr_pfns,
+                     unsigned int store_evtchn, unsigned long *store_mfn,
+                     unsigned int console_evtchn, unsigned long *console_mfn)
+{
+    DECLARE_DOM0_OP;
+
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    char *region_base;
+
+    xc_mmu_t *mmu = NULL;
+
+    xc_dominfo_t info;
+    unsigned int rc = 1, i;
+    uint32_t rec_len;
+    hvm_domain_context_t hvm_ctxt;
+    unsigned int vp_eport;
+    unsigned long count;
+    unsigned long long ptr;
+    unsigned long long v_end, memsize;
+    unsigned long shared_page_frame = 0;
+    shared_iopage_t *sp;
+
+    /* hvm guest mem size (Mb) */
+    memsize = (unsigned long long)*store_mfn;
+    v_end = memsize << 20;
+
+    DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d, 
*store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld.\n", 
+            dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, 
*console_mfn);
+
+
+    max_pfn = nr_pfns;
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERR("Unable to get platform info.");
+        return 1;
+    }
+
+    DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, 
hvirt_start=%lx, pt_levels=%d\n",
+            max_pfn,
+            max_mfn,
+            hvirt_start,
+            pt_levels);
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        /* needed for build dom0 op, but might as well do early */
+        ERR("Unable to mlock ctxt");
+        return 1;
+    }
+
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    p2m        = calloc(max_pfn, sizeof(unsigned long));
+
+    if (p2m == NULL) {
+        ERR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    op.cmd = DOM0_GETDOMAININFO;
+    op.u.getdomaininfo.domain = (domid_t)dom;
+    if (xc_dom0_op(xc_handle, &op) < 0) {
+        ERR("Could not get information on new domain");
+        goto out;
+    }
+    shared_info_frame = op.u.getdomaininfo.shared_info_frame;
+
+    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if(xc_domain_memory_increase_reservation(
+           xc_handle, dom, max_pfn, 0, 0, NULL) != 0) {
+        ERR("Failed to increase reservation by %lx KB", PFN_TO_KB(max_pfn));
+        errno = ENOMEM;
+        goto out;
+    }
+
+    DPRINTF("Increased domain reservation by %lx KB\n", PFN_TO_KB(max_pfn));
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERR("Could not get domain info");
+        return 1;
+    }
+
+    DPRINTF("after increasing domain reservation, nr_pages=0x%lx, 
maxmemkb=0x%lx\n", info.nr_pages, info.max_memkb);
+
+    /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
+    if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
+        ERR("Did not read correct number of frame numbers for new dom");
+        goto out;
+    }
+
+    if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
+        ERR("Could not initialise for MMU updates");
+        goto out;
+    }
+
+    /* restore memory */
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | 
PROT_WRITE, p2m, max_pfn) ) == 0) {
+        ERR("HVM:map page_array failed!\n");
+        goto out;
+    }
+
+    for (i = 0; i < max_pfn; i++) {
+        void *zpage = region_base + i * PAGE_SIZE;
+        if (!read_exact(io_fd, zpage, PAGE_SIZE)) {
+            ERR("HVM:read page %d failed!\n", i);
+            goto out;
+        }
+    }
+
+    /* Write the machine->phys table entries. */
+    for ( count = 0; count < max_pfn; count++ )
+    {
+        ptr = (unsigned long long)p2m[count] << PAGE_SHIFT;
+        if ( xc_add_mmu_update(xc_handle, mmu,
+                               ptr | MMU_MACHPHYS_UPDATE, count) )
+            goto out;
+    }
+
+    (void)munmap(region_base, max_pfn*PAGE_SIZE);
+
+    if (xc_finish_mmu_updates(xc_handle, mmu)) {
+        ERR("HVM:Error doing finish_mmu_updates()");
+        goto out;
+    }
+
+    /* realloc an evtchn port on vcpu */
+    vp_eport = xc_evtchn_alloc_unbound(xc_handle, dom, 0);
+    if ( vp_eport < 0 ) {
+        ERR("Couldn't get unbound port from VMX guest when restore.\n");
+        goto out;
+    }
+
+    /* restore hvm context including pic/pit/shpage */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERR("error read hvm context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(hvm_ctxt)) {
+        ERR("hvm context size dismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERR("error read hvm context!\n");
+        goto out;
+    }
+
+    xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt);
+
+    /* Populate the event channel port in the shared page */
+    shared_page_frame = p2m[(v_end >> PAGE_SHIFT) - 1];
+    if ( (sp = (shared_iopage_t *) xc_map_foreign_range(
+              xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
+              shared_page_frame)) == 0 ) {
+        ERR("map share page fail");
+        goto out;
+    }
+
+    /* set new vp_eport */
+    DPRINTF("new vp_eport=%d.\n", 
+            vp_eport);
+    /*XXX: smp support */
+    sp->vcpu_iodata[0].vp_eport = vp_eport;
+
+    /* restore vcpu ctxt & vmcs */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERR("error read vcpu context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(ctxt)) {
+        ERR("vcpu context size dismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
+        ERR("error read vcpu context.\n");
+        goto out;
+    }
+
+    if ( (rc = xc_vcpu_setcontext(xc_handle, dom, 0, &ctxt)) ) {
+        ERR("Could not set vcpu context, rc=%d", rc);
+        goto out;
+    }
+
+    /* calculate the store_mfn; a wrong value causes a hang in introduceDomain */
+    *store_mfn = p2m[(v_end >> PAGE_SHIFT) - 2];
+    DPRINTF("hvm restore:calculate new store_mfn=0x%lx,v_end=0x%llx..\n", 
*store_mfn, v_end);
+
+    rc = 0;
+    goto out;
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(mmu);
+    free(p2m);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+
+    return rc;
+}
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_hvm_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_save.c Wed Jul 19 16:09:59 2006 +0800
@@ -0,0 +1,207 @@
+/******************************************************************************
+ * xc_hvm_save.c
+ *
+ * Save the state of a running HVM guest.
+ *
+ * Copyright (c) 2006 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+{
+    xc_dominfo_t info;
+
+    int rc = 1, i;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A copy of hvm domain context */
+    hvm_domain_context_t hvm_ctxt;
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    uint32_t nr_pfns, rec_size;
+    unsigned long *page_array;
+
+    DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x.\n",
+            dom, max_iters, max_factor, flags);
+
+    /* If no explicit control parameters given, use defaults */
+/*    if(!max_iters)*/
+/*        max_iters = DEF_MAX_ITERS;*/
+/*    if(!max_factor)*/
+/*        max_factor = DEF_MAX_FACTOR;*/
+
+/*    initialize_mbit_rate();*/
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERR("HVM:Unable to get platform info.");
+        return 1;
+    }
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERR("HVM:Could not get domain info");
+        return 1;
+    }
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        ERR("HVM:Unable to mlock ctxt");
+        return 1;
+    }
+
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERR("HVM:Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* A cheesy test to see whether the domain contains valid state. */
+    if (ctxt.ctrlreg[3] == 0)
+    {
+        ERR("Domain is not in a valid HVM guest state");
+        goto out;
+    }
+
+   /* cheesy sanity check */
+    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
+        ERR("Invalid HVM state record -- pfn count out of range: %lu",
+            (info.max_memkb >> (PAGE_SHIFT - 10)));
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                            PROT_READ, shared_info_frame))) {
+        ERR("HVM:Couldn't map live_shinfo");
+        goto out;
+    }
+
+    max_pfn = live_shinfo->arch.max_pfn;
+
+    DPRINTF("saved hvm domain info:max_pfn=0x%lx, max_mfn=0x%lx, 
nr_pages=0x%lx\n", max_pfn, max_mfn, info.nr_pages); 
+
+    if (live) {
+        ERR("hvm domain doesn't support live migration now.\n");
+        if (debug)
+            ERR("hvm domain debug on.\n");
+        goto out;
+    }
+
+    /* suspend hvm domain */
+    if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
+        ERR("HVM Domain appears not to have suspended");
+        goto out;
+    }
+
+    nr_pfns = info.nr_pages;
+    DPRINTF("after suspend hvm domain nr_pages=0x%x.\n", nr_pfns);
+
+    /* get all the HVM domain pfns */
+    if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) * 
nr_pfns)) == NULL) {
+        ERR("HVM:malloc fail!\n");
+        goto out;
+    }
+
+    if ( xc_get_pfn_list(xc_handle, dom, page_array, nr_pfns) != nr_pfns) {
+        ERR("HVM domain get pfn list fail!\n");
+        goto out;
+    }
+
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | 
PROT_WRITE, page_array, nr_pfns) ) == 0) {
+        ERR("HVM domain map pages failed!\n");
+        goto out;
+    }
+
+
+    /* Start writing out the saved-domain record. begin with mem */
+    if (!write_exact(io_fd, &nr_pfns, sizeof(unsigned int))) {
+        ERR("write: nr_pfns");
+        goto out;
+    }
+
+    for (i = 0; i < nr_pfns; i++) {
+        void *zpage = region_base + i * PAGE_SIZE;
+        if (ratewrite(io_fd, zpage, PAGE_SIZE) != PAGE_SIZE) {
+            ERR("HVM:write page %d failed!.\n", i);
+            goto out;
+        }
+    }
+
+    /* save hvm hypervisor state including pic/pit/shpage */
+    if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERR("Unable to mlock ctxt");
+        return 1;
+    }
+    xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt);
+
+/*    ERR("hvm_getcontext get %d, size=%d!\n", hvm_ctxt.size, 
sizeof(hvm_ctxt));*/
+    rec_size = sizeof(hvm_ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERR("error write hvm ctxt size");
+        goto out;
+    }
+
+    if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
+        ERR("write HVM info failed!\n");
+    }
+
+
+    /* save vcpu/vmcs context XXX:smp support*/
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERR("HVM:Could not get vcpu context");
+        goto out;
+    }
+
+    rec_size = sizeof(ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERR("error write vcpu ctxt size");
+        goto out;
+    }
+
+    if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
+        ERR("write vmcs failed!\n");
+
+
+
+    /* Success! */
+    rc = 0;
+
+ out:
+    return !!rc;
+}
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/vmcs_data.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/vmcs_data.h    Wed Jul 19 16:09:59 2006 +0800
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * vmcs_data.h
+ * 
+ * Copyright (c) 2006 Intel Corporation
+ * 
+ */
+
+#ifndef __XEN_PUBLIC_VMCS_DATA_H__
+#define __XEN_PUBLIC_VMCS_DATA_H__
+
+/*
+ * World vmcs state
+ */
+struct vmcs_data {
+    uint64_t  eip;        /* execution pointer */
+    uint64_t  esp;        /* stack pointer */
+    uint64_t  eflags;     /* flags register */
+    uint64_t  cr0;
+    uint64_t  cr3;        /* page table directory */
+    uint64_t  cr4;
+    uint32_t  idtr_limit; /* idt */
+    uint64_t  idtr_base;
+    uint32_t  gdtr_limit; /* gdt */
+    uint64_t  gdtr_base;
+    uint32_t  cs_sel;     /* cs selector */
+    uint32_t  cs_limit;
+    uint64_t  cs_base;
+    uint32_t  cs_arbytes;
+    uint32_t  ds_sel;     /* ds selector */
+    uint32_t  ds_limit;
+    uint64_t  ds_base;
+    uint32_t  ds_arbytes;
+    uint32_t  es_sel;     /* es selector */
+    uint32_t  es_limit;
+    uint64_t  es_base;
+    uint32_t  es_arbytes;
+    uint32_t  ss_sel;     /* ss selector */
+    uint32_t  ss_limit;
+    uint64_t  ss_base;
+    uint32_t  ss_arbytes;
+    uint32_t  fs_sel;     /* fs selector */
+    uint32_t  fs_limit;
+    uint64_t  fs_base;
+    uint32_t  fs_arbytes;
+    uint32_t  gs_sel;     /* gs selector */
+    uint32_t  gs_limit;
+    uint64_t  gs_base;
+    uint32_t  gs_arbytes;
+    uint32_t  tr_sel;     /* task selector */
+    uint32_t  tr_limit;
+    uint64_t  tr_base;
+    uint32_t  tr_arbytes;
+    uint32_t  ldtr_sel;   /* ldtr selector */
+    uint32_t  ldtr_limit;
+    uint64_t  ldtr_base;
+    uint32_t  ldtr_arbytes;
+    uint32_t  sysenter_cs;
+    uint64_t  sysenter_esp;
+    uint64_t  sysenter_eip;
+    /* msr for em64t */
+    uint64_t shadow_gs;
+    uint64_t flags;
+    /* same size as VMX_MSR_COUNT */
+    uint64_t msr_items[6];
+    uint64_t cpu_state;
+};
+typedef struct vmcs_data vmcs_data_t;
+#endif
# HG changeset patch
# User Edwin Zhai <edwin.zhai@xxxxxxxxx>
# Node ID 98c3ddf83a59b0cbbdce63bb210adfd0d2ec1aea
# Parent  ecb8ff1fcf1fc24561c8bd272a58828592d90806
cirrus&rtl8139 coexist issue fix

diff -r ecb8ff1fcf1f -r 98c3ddf83a59 tools/ioemu/target-i386-dm/exec-dm.c
--- a/tools/ioemu/target-i386-dm/exec-dm.c      Fri Jul 14 18:53:27 2006 +0100
+++ b/tools/ioemu/target-i386-dm/exec-dm.c      Wed Jul 19 13:45:04 2006 +0800
@@ -382,7 +382,7 @@ int iomem_index(target_phys_addr_t addr)
                 start = mmio[i].start;
                 end = mmio[i].start + mmio[i].size;
 
-                if ((addr >= start) && (addr <= end)){
+                if ((addr >= start) && (addr < end)){
                         return (mmio[i].io_index >> IO_MEM_SHIFT) & 
(IO_MEM_NB_ENTRIES - 1);
                 }
         }
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.