To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Implement SVM specific part for Nested Virtualization
From: Xen patchbot-unstable <patchbot@xxxxxxx>
Date: Sat, 09 Apr 2011 09:20:15 +0100
Delivery-date: Sat, 09 Apr 2011 01:25:33 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User cegger
# Date 1299670565 -3600
# Node ID a5e69b6fdd16a2c16d14afaad7025dfd794a44e1
# Parent  a21d019bb8fe8535a0bbbf4d2ecf1adab4783dc8
Implement SVM specific part for Nested Virtualization

Signed-off-by: Christoph Egger <Christoph.Egger@xxxxxxx>
Acked-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Committed-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---


diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/Makefile
--- a/xen/arch/x86/hvm/svm/Makefile     Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/Makefile     Wed Mar 09 12:36:05 2011 +0100
@@ -2,6 +2,8 @@
 obj-y += emulate.o
 obj-bin-y += entry.o
 obj-y += intr.o
+obj-y += nestedsvm.o
 obj-y += svm.o
+obj-y += svmdebug.o
 obj-y += vmcb.o
 obj-y += vpmu.o
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c    Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/emulate.c    Wed Mar 09 12:36:05 2011 +0100
@@ -102,6 +102,11 @@
 MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
 MAKE_INSTR(PAUSE,  1, 0x90);
 MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
+MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
+MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
+MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
+MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
+MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
 
 static const u8 *opc_bytes[INSTR_MAX_COUNT] = 
 {
@@ -116,6 +121,11 @@
     [INSTR_RDTSC]  = OPCODE_RDTSC,
     [INSTR_PAUSE]  = OPCODE_PAUSE,
     [INSTR_XSETBV] = OPCODE_XSETBV,
+    [INSTR_VMRUN]  = OPCODE_VMRUN,
+    [INSTR_VMLOAD] = OPCODE_VMLOAD,
+    [INSTR_VMSAVE] = OPCODE_VMSAVE,
+    [INSTR_STGI]   = OPCODE_STGI,
+    [INSTR_CLGI]   = OPCODE_CLGI,
 };
 
 static int fetch(struct vcpu *v, u8 *buf, unsigned long addr, int len)
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/entry.S
--- a/xen/arch/x86/hvm/svm/entry.S      Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/entry.S      Wed Mar 09 12:36:05 2011 +0100
@@ -54,6 +54,7 @@
 
 ENTRY(svm_asm_do_resume)
         call svm_intr_assist
+        call_with_regs(nsvm_vcpu_switch)
 
         get_current(bx)
         CLGI
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/nestedsvm.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c  Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,1279 @@
+/*
+ * nestedsvm.c: Nested Virtualization
+ * Copyright (c) 2011, Advanced Micro Devices, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <asm/hvm/support.h>
+#include <asm/hvm/svm/emulate.h>
+#include <asm/hvm/svm/svm.h>
+#include <asm/hvm/svm/vmcb.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/nestedsvm.h>
+#include <asm/hvm/svm/svmdebug.h>
+#include <asm/paging.h> /* paging_mode_hap */
+
+static int
+nestedsvm_vmcb_isvalid(struct vcpu *v, uint64_t vmcxaddr)
+{
+    if ( !hvm_svm_enabled(v) || hvm_guest_x86_mode(v) < 2 )
+        return 0;
+
+    /* Maximum valid physical address.
+     * See AMD BKDG for HSAVE_PA MSR.
+     */
+    if ( vmcxaddr > 0xfd00000000ULL )
+        return 0;
+
+    return 1;
+}
+
+int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    if (nv->nv_vvmcx != NULL && nv->nv_vvmcxaddr != vmcbaddr) {
+        ASSERT(nv->nv_vvmcx != NULL);
+        ASSERT(nv->nv_vvmcxaddr != VMCX_EADDR);
+        hvm_unmap_guest_frame(nv->nv_vvmcx);
+        nv->nv_vvmcx = NULL;
+        nv->nv_vvmcxaddr = VMCX_EADDR;
+    }
+
+    if (nv->nv_vvmcx == NULL) {
+        nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT);
+        if (nv->nv_vvmcx == NULL)
+            return 0;
+        nv->nv_vvmcxaddr = vmcbaddr;
+    }
+
+    return 1;
+}
+
+/* Interface methods */
+int nsvm_vcpu_initialise(struct vcpu *v)
+{
+    void *msrpm;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
+    svm->ns_cached_msrpm = msrpm;
+    if (msrpm == NULL)
+        goto err;
+    memset(msrpm, 0x0, MSRPM_SIZE);
+
+    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
+    svm->ns_merged_msrpm = msrpm;
+    if (msrpm == NULL)
+        goto err;
+    memset(msrpm, 0x0, MSRPM_SIZE);
+
+    nv->nv_n2vmcx = alloc_vmcb();
+    if (nv->nv_n2vmcx == NULL)
+        goto err;
+    nv->nv_n2vmcx_pa = virt_to_maddr(nv->nv_n2vmcx);
+
+    return 0;
+
+err:
+    nsvm_vcpu_destroy(v);
+    return -ENOMEM;
+}
+
+int nsvm_vcpu_destroy(struct vcpu *v)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    if (svm->ns_cached_msrpm) {
+        free_xenheap_pages(svm->ns_cached_msrpm,
+                           get_order_from_bytes(MSRPM_SIZE));
+        svm->ns_cached_msrpm = NULL;
+    }
+    if (svm->ns_merged_msrpm) {
+        free_xenheap_pages(svm->ns_merged_msrpm,
+                           get_order_from_bytes(MSRPM_SIZE));
+        svm->ns_merged_msrpm = NULL;
+    }
+    if (nv->nv_n2vmcx) {
+        free_vmcb(nv->nv_n2vmcx);
+        nv->nv_n2vmcx = NULL;
+        nv->nv_n2vmcx_pa = VMCX_EADDR;
+    }
+    if (svm->ns_iomap)
+        svm->ns_iomap = NULL;
+
+    return 0;
+}
+
+int nsvm_vcpu_reset(struct vcpu *v)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    svm->ns_msr_hsavepa = VMCX_EADDR;
+    svm->ns_ovvmcb_pa = VMCX_EADDR;
+
+    svm->ns_cr_intercepts = 0;
+    svm->ns_dr_intercepts = 0;
+    svm->ns_exception_intercepts = 0;
+    svm->ns_general1_intercepts = 0;
+    svm->ns_general2_intercepts = 0;
+    svm->ns_lbr_control.bytes = 0;
+
+    svm->ns_hap_enabled = 0;
+    svm->ns_vmcb_guestcr3 = 0;
+    svm->ns_vmcb_hostcr3 = 0;
+    svm->ns_guest_asid = 0;
+    svm->ns_hostflags.bytes = 0;
+    svm->ns_vmexit.exitinfo1 = 0;
+    svm->ns_vmexit.exitinfo2 = 0;
+
+    if (svm->ns_iomap)
+        svm->ns_iomap = NULL;
+
+    return 0;
+}
+
+static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct vmcb_struct *n1vmcb;
+
+    n1vmcb = nv->nv_n1vmcx;
+    ASSERT(n1vmcb != NULL);
+
+    n1vmcb->rip += inst_len;
+
+    /* Remember the host interrupt flag */
+    svm->ns_hostflags.fields.rflagsif =
+        (n1vmcb->rflags & X86_EFLAGS_IF) ? 1 : 0;
+
+    return 0;
+}
+
+int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct vmcb_struct *n1vmcb, *n2vmcb;
+    int rc;
+
+    n1vmcb = nv->nv_n1vmcx;
+    n2vmcb = nv->nv_n2vmcx;
+    ASSERT(n1vmcb != NULL);
+    ASSERT(n2vmcb != NULL);
+
+    /* nsvm_vmcb_prepare4vmexit() already saved register values
+     * handled by VMSAVE/VMLOAD into n1vmcb directly.
+     */
+
+    /* switch vmcb to l1 guest's vmcb */
+    v->arch.hvm_svm.vmcb = n1vmcb;
+    v->arch.hvm_svm.vmcb_pa = nv->nv_n1vmcx_pa;
+
+    /* EFER */
+    v->arch.hvm_vcpu.guest_efer = n1vmcb->_efer;
+    rc = hvm_set_efer(n1vmcb->_efer);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);
+
+    /* CR4 */
+    v->arch.hvm_vcpu.guest_cr[4] = n1vmcb->_cr4;
+    rc = hvm_set_cr4(n1vmcb->_cr4);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
+
+    /* CR0 */
+    v->arch.hvm_vcpu.guest_cr[0] = n1vmcb->_cr0 | X86_CR0_PE;
+    n1vmcb->rflags &= ~X86_EFLAGS_VM;
+    rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
+
+    /* CR2 */
+    v->arch.hvm_vcpu.guest_cr[2] = n1vmcb->_cr2;
+    hvm_update_guest_cr(v, 2);
+
+    /* CR3 */
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else {
+        /* host shadow paging + guest shadow paging. */
+
+        /* Reset MMU context  -- XXX (hostrestore) not yet working*/
+        if (!pagetable_is_null(v->arch.guest_table))
+            put_page(pagetable_get_page(v->arch.guest_table));
+        v->arch.guest_table = pagetable_null();
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    }
+    rc = hvm_set_cr3(n1vmcb->_cr3);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+
+    regs->eax = n1vmcb->rax;
+    regs->esp = n1vmcb->rsp;
+    regs->eip = n1vmcb->rip;
+    regs->eflags = n1vmcb->rflags;
+    n1vmcb->_dr7 = 0; /* disable all breakpoints */
+    n1vmcb->_cpl = 0;
+
+    /* Clear exitintinfo to prevent a fault loop of re-injecting
+     * exceptions forever.
+     */
+    n1vmcb->exitintinfo.bytes = 0;
+
+    /* Cleanbits */
+    n1vmcb->cleanbits.bytes = 0;
+
+    hvm_asid_flush_vcpu(v);
+
+    return 0;
+}
+
+static int nsvm_vmrun_permissionmap(struct vcpu *v, bool_t viopm)
+{
+    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+    struct vmcb_struct *host_vmcb = arch_svm->vmcb;
+    unsigned long *ns_msrpm_ptr;
+    unsigned int i;
+    enum hvm_copy_result ret;
+    unsigned long *ns_viomap;
+    bool_t ioport_80, ioport_ed;
+
+    ns_msrpm_ptr = (unsigned long *)svm->ns_cached_msrpm;
+
+    ret = hvm_copy_from_guest_phys(svm->ns_cached_msrpm,
+                                   ns_vmcb->_msrpm_base_pa, MSRPM_SIZE);
+    if (ret != HVMCOPY_okay) {
+        gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", ret);
+        return 1;
+    }
+
+    /* Check the l1 guest io permission map and get a shadow one based on
+     * whether the l1 guest intercepts io ports 0x80 and/or 0xED.
+     */
+    svm->ns_oiomap_pa = svm->ns_iomap_pa;
+    svm->ns_iomap_pa = ns_vmcb->_iopm_base_pa;
+
+    ns_viomap = hvm_map_guest_frame_ro(svm->ns_iomap_pa >> PAGE_SHIFT);
+    ASSERT(ns_viomap != NULL);
+    ioport_80 = test_bit(0x80, ns_viomap);
+    ioport_ed = test_bit(0xed, ns_viomap);
+    hvm_unmap_guest_frame(ns_viomap);
+
+    svm->ns_iomap = nestedhvm_vcpu_iomap_get(ioport_80, ioport_ed);
+
+    nv->nv_ioport80 = ioport_80;
+    nv->nv_ioportED = ioport_ed;
+
+    /* v->arch.hvm_svm.msrpm has type unsigned long, thus
+     * BYTES_PER_LONG.
+     */
+    for (i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++)
+        svm->ns_merged_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i];
+
+    host_vmcb->_iopm_base_pa =
+        (uint64_t)virt_to_maddr(svm->ns_iomap);
+    host_vmcb->_msrpm_base_pa =
+        (uint64_t)virt_to_maddr(svm->ns_merged_msrpm);
+
+    return 0;
+}
+
+static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb, *n1vmcb, *n2vmcb;
+    bool_t vcleanbits_valid;
+    int rc;
+
+    ns_vmcb = nv->nv_vvmcx;
+    n1vmcb = nv->nv_n1vmcx;
+    n2vmcb = nv->nv_n2vmcx;
+    ASSERT(ns_vmcb != NULL);
+    ASSERT(n1vmcb != NULL);
+    ASSERT(n2vmcb != NULL);
+
+    /* Check if virtual VMCB cleanbits are valid */
+    vcleanbits_valid = 1;
+    if (svm->ns_ovvmcb_pa == VMCX_EADDR)
+        vcleanbits_valid = 0;
+    if (svm->ns_ovvmcb_pa != nv->nv_vvmcxaddr)
+        vcleanbits_valid = 0;
+
+#define vcleanbit_set(_name)   \
+    (vcleanbits_valid && ns_vmcb->cleanbits.fields._name)
+
+    /* Enable l2 guest intercepts */
+    if (!vcleanbit_set(intercepts)) {
+        svm->ns_cr_intercepts = ns_vmcb->_cr_intercepts;
+        svm->ns_dr_intercepts = ns_vmcb->_dr_intercepts;
+        svm->ns_exception_intercepts = ns_vmcb->_exception_intercepts;
+        svm->ns_general1_intercepts = ns_vmcb->_general1_intercepts;
+        svm->ns_general2_intercepts = ns_vmcb->_general2_intercepts;
+    }
+
+    /* We could track the cleanbits of the n1vmcb from
+     * last emulated #VMEXIT to this emulated VMRUN to save the merges
+     * below. Those cleanbits would be tracked in an integer field
+     * in struct nestedsvm.
+     * But this effort is not worth doing because:
+     * - Only the intercepts bit of the n1vmcb can effectively be used here
+     * - The CPU runs more instructions for the tracking than can be
+     *   saved here.
+     * The overhead comes from (ordered from highest to lowest):
+     * - svm_ctxt_switch_to (CPU context switching)
+     * - svm_fpu_enter, svm_fpu_leave (lazy FPU switching)
+     * - emulated CLGI (clears VINTR intercept)
+     * - host clears VINTR intercept
+     * Test results show that the overhead is high enough that the
+     * tracked intercepts bit of the n1vmcb is practically *always* cleared.
+     */
+
+    n2vmcb->_cr_intercepts =
+        n1vmcb->_cr_intercepts | ns_vmcb->_cr_intercepts;
+    n2vmcb->_dr_intercepts =
+        n1vmcb->_dr_intercepts | ns_vmcb->_dr_intercepts;
+    n2vmcb->_exception_intercepts =
+        n1vmcb->_exception_intercepts | ns_vmcb->_exception_intercepts;
+    n2vmcb->_general1_intercepts =
+        n1vmcb->_general1_intercepts | ns_vmcb->_general1_intercepts;
+    n2vmcb->_general2_intercepts =
+        n1vmcb->_general2_intercepts | ns_vmcb->_general2_intercepts;
+
+    /* Nested Pause Filter */
+    if (ns_vmcb->_general1_intercepts & GENERAL1_INTERCEPT_PAUSE)
+        n2vmcb->_pause_filter_count =
+            min(n1vmcb->_pause_filter_count, ns_vmcb->_pause_filter_count);
+    else
+        n2vmcb->_pause_filter_count = n1vmcb->_pause_filter_count;
+
+    /* TSC offset */
+    n2vmcb->_tsc_offset = n1vmcb->_tsc_offset + ns_vmcb->_tsc_offset;
+
+    /* Nested IO permission bitmaps */
+    rc = nsvm_vmrun_permissionmap(v, vcleanbit_set(iopm));
+    if (rc)
+        return rc;
+
+    /* ASID */
+    hvm_asid_flush_vcpu(v);
+    /* n2vmcb->_guest_asid = ns_vmcb->_guest_asid; */
+
+    /* TLB control */
+    n2vmcb->tlb_control = n1vmcb->tlb_control | ns_vmcb->tlb_control;
+
+    /* Virtual Interrupts */
+    if (!vcleanbit_set(tpr)) {
+        n2vmcb->_vintr = ns_vmcb->_vintr;
+        n2vmcb->_vintr.fields.intr_masking = 1;
+    }
+
+    /* Shadow Mode */
+    n2vmcb->interrupt_shadow = ns_vmcb->interrupt_shadow;
+
+    /* Exit codes */
+    n2vmcb->exitcode = ns_vmcb->exitcode;
+    n2vmcb->exitinfo1 = ns_vmcb->exitinfo1;
+    n2vmcb->exitinfo2 = ns_vmcb->exitinfo2;
+    n2vmcb->exitintinfo = ns_vmcb->exitintinfo;
+
+    /* Pending Interrupts */
+    n2vmcb->eventinj = ns_vmcb->eventinj;
+
+    /* LBR virtualization */
+    if (!vcleanbit_set(lbr)) {
+        svm->ns_lbr_control = ns_vmcb->lbr_control;
+    }
+    n2vmcb->lbr_control.bytes =
+        n1vmcb->lbr_control.bytes | ns_vmcb->lbr_control.bytes;
+
+    /* NextRIP */
+    n2vmcb->nextrip = ns_vmcb->nextrip;
+
+    /*
+     * VMCB Save State Area
+     */
+
+    /* Segments */
+    if (!vcleanbit_set(seg)) {
+        n2vmcb->es = ns_vmcb->es;
+        n2vmcb->cs = ns_vmcb->cs;
+        n2vmcb->ss = ns_vmcb->ss;
+        n2vmcb->ds = ns_vmcb->ds;
+        /* CPL */
+        n2vmcb->_cpl = ns_vmcb->_cpl;
+    }
+    if (!vcleanbit_set(dt)) {
+        n2vmcb->gdtr = ns_vmcb->gdtr;
+        n2vmcb->idtr = ns_vmcb->idtr;
+    }
+
+    /* EFER */
+    v->arch.hvm_vcpu.guest_efer = ns_vmcb->_efer;
+    rc = hvm_set_efer(ns_vmcb->_efer);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);
+
+    /* CR4 */
+    v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->_cr4;
+    rc = hvm_set_cr4(ns_vmcb->_cr4);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
+
+    /* CR0 */
+    v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->_cr0;
+    rc = hvm_set_cr0(ns_vmcb->_cr0);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
+
+    /* CR2 */
+    v->arch.hvm_vcpu.guest_cr[2] = ns_vmcb->_cr2;
+    hvm_update_guest_cr(v, 2);
+
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->_cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        n2vmcb->_np_enable = 1;
+        /* Keep h_cr3 as it is. */
+        /* When l1 guest does shadow paging
+         * we assume it intercepts page faults.
+         */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->_cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else {
+        /* host shadow paging + guest shadow paging. */
+        n2vmcb->_np_enable = 0;
+        n2vmcb->_h_cr3 = 0x0;
+
+        /* TODO: Once shadow-shadow paging is in place come back to here
+         * and set host_vmcb->_cr3 to the shadowed shadow table.
+         */
+    }
+
+    /* DRn */
+    if (!vcleanbit_set(dr)) {
+        n2vmcb->_dr7 = ns_vmcb->_dr7;
+        n2vmcb->_dr6 = ns_vmcb->_dr6;
+    }
+
+    /* RFLAGS */
+    n2vmcb->rflags = ns_vmcb->rflags;
+
+    /* RIP */
+    n2vmcb->rip = ns_vmcb->rip;
+
+    /* RSP */
+    n2vmcb->rsp = ns_vmcb->rsp;
+
+    /* RAX */
+    n2vmcb->rax = ns_vmcb->rax;
+
+    /* Keep the host values of the fs, gs, ldtr, tr, kerngsbase,
+     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
+     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
+     */
+
+    /* Page tables */
+    n2vmcb->pdpe0 = ns_vmcb->pdpe0;
+    n2vmcb->pdpe1 = ns_vmcb->pdpe1;
+    n2vmcb->pdpe2 = ns_vmcb->pdpe2;
+    n2vmcb->pdpe3 = ns_vmcb->pdpe3;
+
+    /* PAT */
+    if (!vcleanbit_set(np)) {
+        n2vmcb->_g_pat = ns_vmcb->_g_pat;
+    }
+
+    if (!vcleanbit_set(lbr)) {
+        /* Debug Control MSR */
+        n2vmcb->_debugctlmsr = ns_vmcb->_debugctlmsr;
+
+        /* LBR MSRs */
+        n2vmcb->_lastbranchfromip = ns_vmcb->_lastbranchfromip;
+        n2vmcb->_lastbranchtoip = ns_vmcb->_lastbranchtoip;
+        n2vmcb->_lastintfromip = ns_vmcb->_lastintfromip;
+        n2vmcb->_lastinttoip = ns_vmcb->_lastinttoip;
+    }
+
+    /* Cleanbits */
+    n2vmcb->cleanbits.bytes = 0;
+
+    rc = svm_vmcb_isvalid(__func__, ns_vmcb, 1);
+    if (rc) {
+        gdprintk(XENLOG_ERR, "virtual vmcb invalid\n");
+        return rc;
+    }
+
+    rc = svm_vmcb_isvalid(__func__, n2vmcb, 1);
+    if (rc) {
+        gdprintk(XENLOG_ERR, "n2vmcb invalid\n");
+        return rc;
+    }
+
+    /* Switch guest registers to l2 guest */
+    regs->eax = ns_vmcb->rax;
+    regs->eip = ns_vmcb->rip;
+    regs->esp = ns_vmcb->rsp;
+    regs->eflags = ns_vmcb->rflags;
+
+#undef vcleanbit_set
+    return 0;
+}
+
+static int
+nsvm_vcpu_vmentry(struct vcpu *v, struct cpu_user_regs *regs,
+    unsigned int inst_len)
+{
+    int ret;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb;
+
+    ns_vmcb = nv->nv_vvmcx;
+    ASSERT(ns_vmcb != NULL);
+    ASSERT(nv->nv_n2vmcx != NULL);
+    ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR);
+
+    /* Save values for later use. Needed for Nested-on-Nested and
+     * Shadow-on-Shadow paging.
+     */
+    svm->ns_vmcb_guestcr3 = ns_vmcb->_cr3;
+    svm->ns_vmcb_hostcr3 = ns_vmcb->_h_cr3;
+
+    nv->nv_flushp2m = (ns_vmcb->tlb_control
+        || (svm->ns_guest_asid != ns_vmcb->_guest_asid));
+    svm->ns_guest_asid = ns_vmcb->_guest_asid;
+
+    /* nested paging for the guest */
+    svm->ns_hap_enabled = (ns_vmcb->_np_enable) ? 1 : 0;
+
+    /* Remember the V_INTR_MASK in hostflags */
+    svm->ns_hostflags.fields.vintrmask =
+        (ns_vmcb->_vintr.fields.intr_masking) ? 1 : 0;
+
+    /* Save l1 guest state (= host state) */
+    ret = nsvm_vcpu_hostsave(v, inst_len);
+    if (ret) {
+        gdprintk(XENLOG_ERR, "hostsave failed, ret = %i\n", ret);
+        return ret;
+    }
+
+    /* switch vmcb to shadow vmcb */
+    v->arch.hvm_svm.vmcb = nv->nv_n2vmcx;
+    v->arch.hvm_svm.vmcb_pa = nv->nv_n2vmcx_pa;
+
+    ret = nsvm_vmcb_prepare4vmrun(v, regs);
+    if (ret) {
+        gdprintk(XENLOG_ERR, "prepare4vmrun failed, ret = %i\n", ret);
+        return ret;
+    }
+
+    return 0;
+}
+
+int
+nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    inst_len = __get_instruction_length(v, INSTR_VMRUN);
+    if (inst_len == 0) {
+        svm->ns_vmexit.exitcode = VMEXIT_SHUTDOWN;
+        return -1;
+    }
+
+    nv->nv_vmswitch_in_progress = 1;
+    ASSERT(nv->nv_vvmcx != NULL);
+
+    /* save host state */
+    ret = nsvm_vcpu_vmentry(v, regs, inst_len);
+    if (ret) {
+        gdprintk(XENLOG_ERR,
+            "nsvm_vcpu_vmentry failed, injecting #UD\n");
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        nv->nv_vmswitch_in_progress = 0;
+        return 1;
+    }
+
+    /* Switch vcpu to guest mode
+     */
+    nestedhvm_vcpu_enter_guestmode(v);
+    nv->nv_vmswitch_in_progress = 0;
+    return 0;
+}
+
+int
+nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb;
+
+    ns_vmcb = nv->nv_vvmcx;
+
+    if (nv->nv_vmexit_pending) {
+
+        switch (exitcode) {
+        case VMEXIT_INTR:
+            if ( unlikely(ns_vmcb->eventinj.fields.v)
+                && nv->nv_vmentry_pending
+                && hvm_event_needs_reinjection(ns_vmcb->eventinj.fields.type,
+                    ns_vmcb->eventinj.fields.vector) )
+            {
+                ns_vmcb->exitintinfo.bytes = ns_vmcb->eventinj.bytes;
+            }
+            break;
+        case VMEXIT_EXCEPTION_PF:
+            ns_vmcb->_cr2 = ns_vmcb->exitinfo2;
+            /* fall through */
+        case VMEXIT_NPF:
+            /* PF error code */
+            ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1;
+            /* fault address */
+            ns_vmcb->exitinfo2 = svm->ns_vmexit.exitinfo2;
+            break;
+        case VMEXIT_EXCEPTION_NP:
+        case VMEXIT_EXCEPTION_SS:
+        case VMEXIT_EXCEPTION_GP:
+        case VMEXIT_EXCEPTION_15:
+        case VMEXIT_EXCEPTION_MF:
+        case VMEXIT_EXCEPTION_AC:
+            ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1;
+            break;
+        default:
+            break;
+        }
+    }
+
+    ns_vmcb->exitcode = exitcode;
+    ns_vmcb->eventinj.bytes = 0;
+    return 0;
+}
+
+int
+nsvm_vcpu_vmexit_trap(struct vcpu *v, unsigned int trapnr,
+                      int errcode, unsigned long cr2)
+{
+    ASSERT(vcpu_nestedhvm(v).nv_vvmcx != NULL);
+
+    nestedsvm_vmexit_defer(v, VMEXIT_EXCEPTION_DE + trapnr, errcode, cr2);
+    return NESTEDHVM_VMEXIT_DONE;
+}
+
+uint64_t nsvm_vcpu_guestcr3(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_vmcb_guestcr3;
+}
+
+uint64_t nsvm_vcpu_hostcr3(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_vmcb_hostcr3;
+}
+
+uint32_t nsvm_vcpu_asid(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_guest_asid;
+}
+
+static int
+nsvm_vmcb_guest_intercepts_msr(unsigned long *msr_bitmap,
+    uint32_t msr, bool_t write)
+{
+    bool_t enabled;
+    unsigned long *msr_bit;
+
+    msr_bit = svm_msrbit(msr_bitmap, msr);
+
+    if (msr_bit == NULL)
+        /* MSR not in the permission map: Let the guest handle it. */
+        return NESTEDHVM_VMEXIT_INJECT;
+
+    BUG_ON(msr_bit == NULL);
+    msr &= 0x1fff;
+
+    if (write)
+        /* write access */
+        enabled = test_bit(msr * 2 + 1, msr_bit);
+    else
+        /* read access */
+        enabled = test_bit(msr * 2, msr_bit);
+
+    if (!enabled)
+        return NESTEDHVM_VMEXIT_HOST;
+
+    return NESTEDHVM_VMEXIT_INJECT;
+}
+
+static int
+nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa, uint64_t exitinfo1)
+{
+    unsigned long iopm_gfn = iopm_pa >> PAGE_SHIFT;
+    unsigned long *io_bitmap = NULL;
+    ioio_info_t ioinfo;
+    uint16_t port;
+    bool_t enabled;
+
+    ioinfo.bytes = exitinfo1;
+    port = ioinfo.fields.port;
+
+    switch (port) {
+    case 0 ... 32767: /* first 4KB page */
+        io_bitmap = hvm_map_guest_frame_ro(iopm_gfn);
+        break;
+    case 32768 ... 65535: /* second 4KB page */
+        port -= 32768;
+        io_bitmap = hvm_map_guest_frame_ro(iopm_gfn+1);
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    if (io_bitmap == NULL) {
+        gdprintk(XENLOG_ERR,
+            "IOIO intercept: mapping of permission map failed\n");
+        return NESTEDHVM_VMEXIT_ERROR;
+    }
+
+    enabled = test_bit(port, io_bitmap);
+    hvm_unmap_guest_frame(io_bitmap);
+    if (!enabled)
+        return NESTEDHVM_VMEXIT_HOST;
+
+    return NESTEDHVM_VMEXIT_INJECT;
+}
+
+int
+nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
+    struct cpu_user_regs *regs, uint64_t exitcode)
+{
+    uint64_t exit_bits;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+    enum nestedhvm_vmexits vmexits;
+
+    switch (exitcode) {
+    case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
+    case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
+        exit_bits = 1ULL << (exitcode - VMEXIT_CR0_READ);
+        if (svm->ns_cr_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
+    case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
+        exit_bits = 1ULL << (exitcode - VMEXIT_DR0_READ);
+        if (svm->ns_dr_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_EXCEPTION_DE ... VMEXIT_EXCEPTION_XF:
+        exit_bits = 1ULL << (exitcode - VMEXIT_EXCEPTION_DE);
+        if (svm->ns_exception_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_INTR ... VMEXIT_SHUTDOWN:
+        exit_bits = 1ULL << (exitcode - VMEXIT_INTR);
+        if (svm->ns_general1_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_VMRUN ... VMEXIT_XSETBV:
+        exit_bits = 1ULL << (exitcode - VMEXIT_VMRUN);
+        if (svm->ns_general2_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_NPF:
+    case VMEXIT_INVALID:
+        /* Always intercepted */
+        break;
+
+    default:
+        gdprintk(XENLOG_ERR, "Illegal exitcode 0x%"PRIx64"\n", exitcode);
+        BUG();
+        break;
+    }
+
+    /* Special cases: Do more detailed checks */
+    switch (exitcode) {
+    case VMEXIT_MSR:
+        ASSERT(regs != NULL);
+        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
+        ASSERT(nv->nv_vvmcx != NULL);
+        ns_vmcb = nv->nv_vvmcx;
+        vmexits = nsvm_vmcb_guest_intercepts_msr(svm->ns_cached_msrpm,
+            regs->ecx, ns_vmcb->exitinfo1 != 0);
+        if (vmexits == NESTEDHVM_VMEXIT_HOST)
+            return 0;
+        break;
+    case VMEXIT_IOIO:
+        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
+        ASSERT(nv->nv_vvmcx != NULL);
+        ns_vmcb = nv->nv_vvmcx;
+        vmexits = nsvm_vmcb_guest_intercepts_ioio(ns_vmcb->_iopm_base_pa,
+            ns_vmcb->exitinfo1);
+        if (vmexits == NESTEDHVM_VMEXIT_HOST)
+            return 0;
+        break;
+    }
+
+    return 1;
+}
+
+int
+nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr)
+{
+    return nsvm_vmcb_guest_intercepts_exitcode(v,
+        guest_cpu_user_regs(), VMEXIT_EXCEPTION_DE + trapnr);
+}
+
+static int
+nsvm_vmcb_prepare4vmexit(struct vcpu *v)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+    struct vmcb_struct *n2vmcb = nv->nv_n2vmcx;
+
+    svm_vmsave(nv->nv_n1vmcx);
+
+    /* Cache guest physical address of virtual vmcb
+     * for VMCB Cleanbit emulation.
+     */
+    svm->ns_ovvmcb_pa = nv->nv_vvmcxaddr;
+
+    /* Intercepts - keep them as they are */
+
+    /* Pausefilter - keep it as is */
+
+    /* Nested IO permission bitmap */
+    /* Just keep the iopm_base_pa and msrpm_base_pa values.
+     * The guest must not see the virtualized values.
+     */
+
+    /* TSC offset */
+    /* Keep it. It's maintained by the l1 guest. */
+
+    /* ASID */
+    /* ns_vmcb->_guest_asid = n2vmcb->_guest_asid; */
+
+    /* TLB control */
+    ns_vmcb->tlb_control = 0;
+
+    /* Virtual Interrupts */
+    ns_vmcb->_vintr = n2vmcb->_vintr;
+    if (!(svm->ns_hostflags.fields.vintrmask))
+        ns_vmcb->_vintr.fields.intr_masking = 0;
+
+    /* Shadow mode */
+    ns_vmcb->interrupt_shadow = n2vmcb->interrupt_shadow;
+
+    /* Exit codes */
+    ns_vmcb->exitcode = n2vmcb->exitcode;
+    ns_vmcb->exitinfo1 = n2vmcb->exitinfo1;
+    ns_vmcb->exitinfo2 = n2vmcb->exitinfo2;
+    ns_vmcb->exitintinfo = n2vmcb->exitintinfo;
+
+    /* Interrupts */
+    /* If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have
+     * to make sure that we do not lose injected events. So check eventinj
+     * here and copy it to exitintinfo if it is valid.
+     * exitintinfo and eventinj can't be both valid because the case below
+     * only happens on a VMRUN instruction intercept which has no valid
+     * exitintinfo set.
+     */
+    if ( unlikely(n2vmcb->eventinj.fields.v) &&
+         hvm_event_needs_reinjection(n2vmcb->eventinj.fields.type,
+                                     n2vmcb->eventinj.fields.vector) )
+    {
+        ns_vmcb->exitintinfo = n2vmcb->eventinj;
+    }
+
+    ns_vmcb->eventinj.bytes = 0;
+
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+        ns_vmcb->_np_enable = n2vmcb->_np_enable;
+        ns_vmcb->_cr3 = n2vmcb->_cr3;
+        /* The vmcb->h_cr3 is the shadowed h_cr3. The original
+         * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3,
+         * hence we keep the ns_vmcb->h_cr3 value. */
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        ns_vmcb->_np_enable = 0;
+        /* Throw h_cr3 away. The guest is not allowed to set it,
+         * otherwise it could break out (security hole!). */
+        ns_vmcb->_h_cr3 = 0x0;
+        /* Stop intercepting #PF (already done above
+         * by restoring cached intercepts). */
+        ns_vmcb->_cr3 = n2vmcb->_cr3;
+    } else {
+        /* host shadow paging + guest shadow paging. */
+        ns_vmcb->_np_enable = 0;
+        ns_vmcb->_h_cr3 = 0x0;
+        /* The vmcb->_cr3 is the shadowed cr3. The original
+         * unshadowed guest cr3 is kept in ns_vmcb->_cr3,
+         * hence we keep the ns_vmcb->_cr3 value. */
+    }
+
+    /* LBR virtualization - keep lbr control as is */
+
+    /* NextRIP */
+    ns_vmcb->nextrip = n2vmcb->nextrip;
+
+    /*
+     * VMCB Save State Area
+     */
+
+    /* Segments */
+    ns_vmcb->es = n2vmcb->es;
+    ns_vmcb->cs = n2vmcb->cs;
+    ns_vmcb->ss = n2vmcb->ss;
+    ns_vmcb->ds = n2vmcb->ds;
+    ns_vmcb->gdtr = n2vmcb->gdtr;
+    ns_vmcb->idtr = n2vmcb->idtr;
+
+    /* CPL */
+    ns_vmcb->_cpl = n2vmcb->_cpl;
+
+    /* EFER */
+    ns_vmcb->_efer = n2vmcb->_efer;
+
+    /* CRn */
+    ns_vmcb->_cr4 = n2vmcb->_cr4;
+    ns_vmcb->_cr0 = n2vmcb->_cr0;
+
+    /* DRn */
+    ns_vmcb->_dr7 = n2vmcb->_dr7;
+    ns_vmcb->_dr6 = n2vmcb->_dr6;
+
+    /* RFLAGS */
+    ns_vmcb->rflags = n2vmcb->rflags;
+
+    /* RIP */
+    ns_vmcb->rip = n2vmcb->rip;
+
+    /* RSP */
+    ns_vmcb->rsp = n2vmcb->rsp;
+
+    /* RAX */
+    ns_vmcb->rax = n2vmcb->rax;
+
+    /* Keep the l2 guest values of the fs, gs, ldtr, tr, kerngsbase,
+     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
+     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
+     */
+
+    /* CR2 */
+    ns_vmcb->_cr2 = n2vmcb->_cr2;
+
+    /* Page tables */
+    ns_vmcb->pdpe0 = n2vmcb->pdpe0;
+    ns_vmcb->pdpe1 = n2vmcb->pdpe1;
+    ns_vmcb->pdpe2 = n2vmcb->pdpe2;
+    ns_vmcb->pdpe3 = n2vmcb->pdpe3;
+
+    /* PAT */
+    ns_vmcb->_g_pat = n2vmcb->_g_pat;
+
+    /* Debug Control MSR */
+    ns_vmcb->_debugctlmsr = n2vmcb->_debugctlmsr;
+
+    /* LBR MSRs */
+    ns_vmcb->_lastbranchfromip = n2vmcb->_lastbranchfromip;
+    ns_vmcb->_lastbranchtoip = n2vmcb->_lastbranchtoip;
+    ns_vmcb->_lastintfromip = n2vmcb->_lastintfromip;
+    ns_vmcb->_lastinttoip = n2vmcb->_lastinttoip;
+
+    return 0;
+}
+
+bool_t
+nsvm_vmcb_hap_enabled(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_hap_enabled;
+}
+
+/* MSR handling */
+int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    int ret = 1;
+
+    *msr_content = 0;
+
+    switch (msr) {
+    case MSR_K8_VM_CR:
+        break;
+    case MSR_K8_VM_HSAVE_PA:
+        *msr_content = svm->ns_msr_hsavepa;
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+
+    return ret;
+}
+
+int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content)
+{
+    int ret = 1;
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    switch (msr) {
+    case MSR_K8_VM_CR:
+        /* ignore write. handle all bits as read-only. */
+        break;
+    case MSR_K8_VM_HSAVE_PA:
+        if (!nestedsvm_vmcb_isvalid(v, msr_content)) {
+            gdprintk(XENLOG_ERR,
+                "MSR_K8_VM_HSAVE_PA value invalid 0x%"PRIx64"\n", msr_content);
+            ret = -1; /* inject #GP */
+            break;
+        }
+        svm->ns_msr_hsavepa = msr_content;
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+
+    return ret;
+}
+
+/* VMEXIT emulation */
+void
+nestedsvm_vmexit_defer(struct vcpu *v,
+    uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    svm->ns_vmexit.exitcode = exitcode;
+    svm->ns_vmexit.exitinfo1 = exitinfo1;
+    svm->ns_vmexit.exitinfo2 = exitinfo2;
+    vcpu_nestedhvm(v).nv_vmexit_pending = 1;
+}
+
+enum nestedhvm_vmexits
+nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode)
+{
+    bool_t is_intercepted;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    ASSERT(nv->nv_vmexit_pending == 0);
+    is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode);
+
+    switch (exitcode) {
+    case VMEXIT_INVALID:
+        if (is_intercepted)
+            return NESTEDHVM_VMEXIT_INJECT;
+        return NESTEDHVM_VMEXIT_HOST;
+
+    case VMEXIT_INTR:
+    case VMEXIT_NMI:
+        return NESTEDHVM_VMEXIT_HOST;
+    case VMEXIT_EXCEPTION_NM:
+        /* Host must handle lazy fpu context switching first.
+         * Then inject the VMEXIT if L1 guest intercepts this.
+         */
+        return NESTEDHVM_VMEXIT_HOST;
+
+    case VMEXIT_NPF:
+        if (nestedhvm_paging_mode_hap(v)) {
+            if (!is_intercepted)
+                return NESTEDHVM_VMEXIT_FATALERROR;
+            /* host nested paging + guest nested paging */
+            return NESTEDHVM_VMEXIT_HOST;
+        }
+        if (paging_mode_hap(v->domain)) {
+            if (is_intercepted)
+                return NESTEDHVM_VMEXIT_FATALERROR;
+            /* host nested paging + guest shadow paging */
+            return NESTEDHVM_VMEXIT_HOST;
+        }
+        /* host shadow paging + guest shadow paging */
+        /* Can this happen? */
+        BUG();
+        return NESTEDHVM_VMEXIT_FATALERROR;
+    case VMEXIT_EXCEPTION_PF:
+        if (nestedhvm_paging_mode_hap(v)) {
+            /* host nested paging + guest nested paging */
+            if (!is_intercepted)
+                /* l1 guest intercepts #PF unnecessarily */
+                return NESTEDHVM_VMEXIT_HOST;
+            /* l2 guest intercepts #PF unnecessarily */
+            return NESTEDHVM_VMEXIT_INJECT;
+        }
+        if (!paging_mode_hap(v->domain)) {
+            /* host shadow paging + guest shadow paging */
+            return NESTEDHVM_VMEXIT_HOST;
+        }
+        /* host nested paging + guest shadow paging */
+        return NESTEDHVM_VMEXIT_INJECT;
+    case VMEXIT_VMMCALL:
+        /* Always let the guest handle VMMCALL/VMCALL */
+        return NESTEDHVM_VMEXIT_INJECT;
+    default:
+        break;
+    }
+
+    if (is_intercepted)
+        return NESTEDHVM_VMEXIT_INJECT;
+    return NESTEDHVM_VMEXIT_HOST;
+}
+
+enum nestedhvm_vmexits
+nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    int rc;
+    enum nestedhvm_vmexits ret = NESTEDHVM_VMEXIT_DONE;
+
+    ASSERT(vcpu_nestedhvm(v).nv_vmswitch_in_progress);
+    ASSERT(nestedhvm_vcpu_in_guestmode(v));
+
+    rc = nsvm_vmcb_prepare4vmexit(v);
+    if (rc)
+        ret = NESTEDHVM_VMEXIT_ERROR;
+
+    rc = nhvm_vcpu_hostrestore(v, regs);
+    if (rc)
+        ret = NESTEDHVM_VMEXIT_FATALERROR;
+
+    nestedhvm_vcpu_exit_guestmode(v);
+    return ret;
+}
+
+/* The exitcode is in native SVM/VMX format. The forced exitcode
+ * is in generic format.
+ */
+static enum nestedhvm_vmexits
+nestedsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode)
+{
+    int rc;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    nv->nv_vmswitch_in_progress = 1;
+
+    ASSERT(nv->nv_vvmcx != NULL);
+
+    /* On special intercepts that the host has to handle,
+     * the vcpu is still in guest mode here.
+     */
+    if (nestedhvm_vcpu_in_guestmode(v)) {
+        enum nestedhvm_vmexits ret;
+
+        ret = nestedsvm_vmexit_n2n1(v, regs);
+        switch (ret) {
+        case NESTEDHVM_VMEXIT_FATALERROR:
+            gdprintk(XENLOG_ERR, "VMEXIT: fatal error\n");
+            return ret;
+        case NESTEDHVM_VMEXIT_HOST:
+            BUG();
+            return ret;
+        case NESTEDHVM_VMEXIT_ERROR:
+            exitcode = VMEXIT_INVALID;
+            break;
+        default:
+            ASSERT(!nestedhvm_vcpu_in_guestmode(v));
+            break;
+        }
+
+        /* host state has been restored */
+    }
+
+    ASSERT(!nestedhvm_vcpu_in_guestmode(v));
+
+    /* Prepare for running the l1 guest. Make the actual
+     * modifications to the virtual VMCB/VMCS.
+     */
+    rc = nhvm_vcpu_vmexit(v, regs, exitcode);
+
+    nv->nv_vmswitch_in_progress = 0;
+
+    if (rc)
+        return NESTEDHVM_VMEXIT_FATALERROR;
+
+    return NESTEDHVM_VMEXIT_DONE;
+}
+
+/* VCPU switch */
+asmlinkage void nsvm_vcpu_switch(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+    struct nestedvcpu *nv;
+    struct nestedsvm *svm;
+
+    if (!nestedhvm_enabled(v->domain))
+        return;
+
+    nv = &vcpu_nestedhvm(v);
+    svm = &vcpu_nestedsvm(v);
+    ASSERT(v->arch.hvm_svm.vmcb != NULL);
+    ASSERT(nv->nv_n1vmcx != NULL);
+    ASSERT(nv->nv_n2vmcx != NULL);
+    ASSERT(nv->nv_n1vmcx_pa != VMCX_EADDR);
+    ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR);
+
+    if (nv->nv_vmexit_pending) {
+ vmexit:
+        nestedsvm_vcpu_vmexit(v, regs, svm->ns_vmexit.exitcode);
+        nv->nv_vmexit_pending = 0;
+        nv->nv_vmentry_pending = 0;
+        return;
+    }
+    if (nv->nv_vmentry_pending) {
+        int ret;
+        ASSERT(!nv->nv_vmexit_pending);
+        ret = nsvm_vcpu_vmrun(v, regs);
+        if (ret < 0)
+            goto vmexit;
+        nv->nv_vmentry_pending = 0;
+        return;
+    }
+}
+
+
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Mar 09 12:36:05 2011 +0100
@@ -49,6 +49,9 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/hvm/svm/emulate.h>
 #include <asm/hvm/svm/intr.h>
+#include <asm/hvm/svm/svmdebug.h>
+#include <asm/hvm/svm/nestedsvm.h>
+#include <asm/hvm/nestedhvm.h>
 #include <asm/x86_emulate.h>
 #include <public/sched.h>
 #include <asm/hvm/vpt.h>
@@ -106,6 +109,44 @@
     write_efer(read_efer() & ~EFER_SVME);
 }
 
+unsigned long *
+svm_msrbit(unsigned long *msr_bitmap, uint32_t msr)
+{
+    unsigned long *msr_bit = NULL;
+
+    /*
+     * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
+     */
+    if ( msr <= 0x1fff )
+        msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
+    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
+        msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
+    else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
+        msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;
+
+    return msr_bit;
+}
+
+void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable)
+{
+    unsigned long *msr_bit;
+
+    msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr);
+    BUG_ON(msr_bit == NULL);
+    msr &= 0x1fff;
+
+    if ( enable )
+    {
+        __set_bit(msr * 2, msr_bit);
+        __set_bit(msr * 2 + 1, msr_bit);
+    }
+    else
+    {
+        __clear_bit(msr * 2, msr_bit);
+        __clear_bit(msr * 2 + 1, msr_bit);
+    }
+}
+
 static void svm_save_dr(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -296,7 +337,7 @@
 {
     svm_load_cpu_state(v, ctxt);
     if (svm_vmcb_restore(v, ctxt)) {
-        printk("svm_vmcb restore failed!\n");
+        gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n");
         domain_crash(v->domain);
         return -EINVAL;
     }
@@ -588,7 +629,24 @@
 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    vmcb_set_tsc_offset(vmcb, offset);
+    struct vmcb_struct *n1vmcb, *n2vmcb;
+    uint64_t n2_tsc_offset = 0;
+
+    if ( !nestedhvm_enabled(v->domain) ) {
+        vmcb_set_tsc_offset(vmcb, offset);
+        return;
+    }
+
+    n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
+    n2vmcb = vcpu_nestedhvm(v).nv_n2vmcx;
+
+    if ( nestedhvm_vcpu_in_guestmode(v) ) {
+        n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) -
+            vmcb_get_tsc_offset(n1vmcb);
+        vmcb_set_tsc_offset(n1vmcb, offset);
+    }
+
+    vmcb_set_tsc_offset(vmcb, offset + n2_tsc_offset);
 }
 
 static void svm_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
@@ -683,9 +741,13 @@
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     bool_t debug_state = v->domain->debugger_attached;
-    vintr_t intr;
+    bool_t vcpu_guestmode = 0;
 
-    if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
+        vcpu_guestmode = 1;
+
+    if ( !vcpu_guestmode &&
+        unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
     {
         uint32_t intercepts = vmcb_get_exception_intercepts(vmcb);
         uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
@@ -703,13 +765,19 @@
         hvm_asid_flush_vcpu(v);
     }
 
-    /* Reflect the vlapic's TPR in the hardware vtpr */
-    intr = vmcb_get_vintr(vmcb);
-    intr.fields.tpr =
-        (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
-    vmcb_set_vintr(vmcb, intr);
+    if ( !vcpu_guestmode )
+    {
+        vintr_t intr;
+
+        /* Reflect the vlapic's TPR in the hardware vtpr */
+        intr = vmcb_get_vintr(vmcb);
+        intr.fields.tpr =
+            (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
+        vmcb_set_vintr(vmcb, intr);
+    }
 
     hvm_do_resume(v);
+
     reset_stack_and_jump(svm_asm_do_resume);
 }
 
@@ -961,8 +1029,8 @@
         struct {
             uint64_t gpa;
             uint64_t mfn;
-            u32 qualification;
-            u32 p2mt;
+            uint32_t qualification;
+            uint32_t p2mt;
         } _d;
 
         _d.gpa = gpa;
@@ -984,12 +1052,21 @@
 
 static void svm_fpu_dirty_intercept(void)
 {
-    struct vcpu *curr = current;
-    struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
+    struct vcpu *v = current;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
-    svm_fpu_enter(curr);
+    svm_fpu_enter(v);
 
-    if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) {
+       /* Check if guest must make FPU ready for the nested guest */
+       if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS )
+           hvm_inject_exception(TRAP_no_device, HVM_DELIVER_NO_ERROR_CODE, 0);
+       else
+           vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS);
+       return;
+    }
+
+    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
         vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS);
 }
 
@@ -1003,11 +1080,14 @@
 
     hvm_cpuid(input, eax, ebx, ecx, edx);
 
-    if ( input == 0x80000001 )
-    {
+    switch (input) {
+    case 0x80000001:
         /* Fix up VLAPIC details. */
         if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
             __clear_bit(X86_FEATURE_APIC & 31, edx);
+        break;
+    default:
+        break;
     }
 
     HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
@@ -1043,6 +1123,7 @@
 
 static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
 {
+    int ret;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
@@ -1076,9 +1157,6 @@
         *msr_content = 0;
         break;
 
-    case MSR_K8_VM_HSAVE_PA:
-        goto gpf;
-
     case MSR_IA32_DEBUGCTLMSR:
         *msr_content = vmcb_get_debugctlmsr(vmcb);
         break;
@@ -1111,6 +1189,11 @@
         break;
 
     default:
+        ret = nsvm_rdmsr(v, msr, msr_content);
+        if ( ret < 0 )
+            goto gpf;
+        else if ( ret )
+            break;
 
         if ( rdmsr_viridian_regs(msr, msr_content) ||
              rdmsr_hypervisor_regs(msr, msr_content) )
@@ -1133,6 +1216,7 @@
 
 static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
 {
+    int ret;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     int sync = 0;
@@ -1153,9 +1237,6 @@
 
     switch ( msr )
     {
-    case MSR_K8_VM_HSAVE_PA:
-        goto gpf;
-
     case MSR_IA32_SYSENTER_CS:
         vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
         break;
@@ -1215,6 +1296,12 @@
         break;
 
     default:
+        ret = nsvm_wrmsr(v, msr, msr_content);
+        if ( ret < 0 )
+            goto gpf;
+        else if ( ret )
+            break;
+
         if ( wrmsr_viridian_regs(msr, msr_content) )
             break;
 
@@ -1298,6 +1385,96 @@
     do_sched_op_compat(SCHEDOP_yield, 0);
 }
 
+static void
+svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
+                    struct vcpu *v, uint64_t vmcbaddr)
+{
+    if (!nestedhvm_enabled(v->domain)) {
+        gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n");
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
+        gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #UD\n");
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    vcpu_nestedhvm(v).nv_vmentry_pending = 1;
+    return;
+}
+
+static void
+svm_vmexit_do_vmload(struct vmcb_struct *vmcb,
+                     struct cpu_user_regs *regs,
+                     struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
+        return;
+
+    if (!nestedhvm_enabled(v->domain)) {
+        gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
+        gdprintk(XENLOG_ERR, "VMLOAD: mapping vmcb failed, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    svm_vmload(nv->nv_vvmcx);
+    /* State in L1 VMCB is stale now */
+    v->arch.hvm_svm.vmcb_in_sync = 0;
+
+    __update_guest_eip(regs, inst_len);
+    return;
+
+ inject:
+    hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0);
+    return;
+}
+
+static void
+svm_vmexit_do_vmsave(struct vmcb_struct *vmcb,
+                     struct cpu_user_regs *regs,
+                     struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
+        return;
+
+    if (!nestedhvm_enabled(v->domain)) {
+        gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
+        gdprintk(XENLOG_ERR, "VMSAVE: mapping vmcb failed, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    svm_vmsave(nv->nv_vvmcx);
+
+    __update_guest_eip(regs, inst_len);
+    return;
+
+ inject:
+    hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0);
+    return;
+}
+
 static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs)
 {
     struct hvm_emulate_ctxt ctxt;
@@ -1428,22 +1605,38 @@
     .msr_read_intercept   = svm_msr_read_intercept,
     .msr_write_intercept  = svm_msr_write_intercept,
     .invlpg_intercept     = svm_invlpg_intercept,
-    .set_rdtsc_exiting    = svm_set_rdtsc_exiting
+    .set_rdtsc_exiting    = svm_set_rdtsc_exiting,
+
+    .nhvm_vcpu_initialise = nsvm_vcpu_initialise,
+    .nhvm_vcpu_destroy = nsvm_vcpu_destroy,
+    .nhvm_vcpu_reset = nsvm_vcpu_reset,
+    .nhvm_vcpu_hostrestore = nsvm_vcpu_hostrestore,
+    .nhvm_vcpu_vmexit = nsvm_vcpu_vmexit_inject,
+    .nhvm_vcpu_vmexit_trap = nsvm_vcpu_vmexit_trap,
+    .nhvm_vcpu_guestcr3 = nsvm_vcpu_guestcr3,
+    .nhvm_vcpu_hostcr3 = nsvm_vcpu_hostcr3,
+    .nhvm_vcpu_asid = nsvm_vcpu_asid,
+    .nhvm_vmcx_guest_intercepts_trap = nsvm_vmcb_guest_intercepts_trap,
+    .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
 };
 
 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
 {
-    unsigned int exit_reason;
+    uint64_t exit_reason;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     eventinj_t eventinj;
     int inst_len, rc;
     vintr_t intr;
+    bool_t vcpu_guestmode = 0;
 
     if ( paging_mode_hap(v->domain) )
         v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
             vmcb_get_cr3(vmcb);
 
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
+        vcpu_guestmode = 1;
+
     /*
      * Before doing anything else, we need to sync up the VLAPIC's TPR with
      * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows)
@@ -1451,13 +1644,73 @@
      * NB. We need to preserve the low bits of the TPR to make checked builds
      * of Windows work, even though they don't actually do anything.
      */
-    intr = vmcb_get_vintr(vmcb);
-    vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
+    if ( !vcpu_guestmode ) {
+        intr = vmcb_get_vintr(vmcb);
+        vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
                    ((intr.fields.tpr & 0x0F) << 4) |
                    (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F));
+    }
 
     exit_reason = vmcb->exitcode;
 
+    if ( vcpu_guestmode ) {
+        enum nestedhvm_vmexits nsret;
+        struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+        struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+        uint64_t exitinfo1, exitinfo2;
+
+        /* Write real exitinfo1 back into virtual vmcb.
+         * nestedsvm_check_intercepts() expects to have the correct
+         * exitinfo1 value there.
+         */
+        exitinfo1 = ns_vmcb->exitinfo1;
+        ns_vmcb->exitinfo1 = vmcb->exitinfo1;
+        nsret = nestedsvm_check_intercepts(v, regs, exit_reason);
+        switch (nsret) {
+        case NESTEDHVM_VMEXIT_CONTINUE:
+            BUG();
+            break;
+        case NESTEDHVM_VMEXIT_HOST:
+            break;
+        case NESTEDHVM_VMEXIT_INJECT:
+            /* Switch vcpu from l2 to l1 guest. We must perform
+             * the switch here to have svm_do_resume() working
+             * as intended.
+             */
+            exitinfo1 = vmcb->exitinfo1;
+            exitinfo2 = vmcb->exitinfo2;
+            nv->nv_vmswitch_in_progress = 1;
+            nsret = nestedsvm_vmexit_n2n1(v, regs);
+            nv->nv_vmswitch_in_progress = 0;
+            switch (nsret) {
+            case NESTEDHVM_VMEXIT_DONE:
+                /* defer VMEXIT injection */
+                nestedsvm_vmexit_defer(v, exit_reason, exitinfo1, exitinfo2);
+                goto out;
+            case NESTEDHVM_VMEXIT_FATALERROR:
+                gdprintk(XENLOG_ERR, "unexpected nestedsvm_vmexit() error\n");
+                goto exit_and_crash;
+
+            default:
+                BUG();
+            case NESTEDHVM_VMEXIT_ERROR:
+                break;
+            }
+        case NESTEDHVM_VMEXIT_ERROR:
+            gdprintk(XENLOG_ERR,
+                "nestedsvm_check_intercepts() returned 
NESTEDHVM_VMEXIT_ERROR\n");
+            goto out;
+        case NESTEDHVM_VMEXIT_FATALERROR:
+            gdprintk(XENLOG_ERR,
+                "unexpected nestedsvm_check_intercepts() error\n");
+            goto exit_and_crash;
+        default:
+            gdprintk(XENLOG_INFO, "nestedsvm_check_intercepts() returned %i\n",
+                nsret);
+            goto exit_and_crash;
+        }
+    }
+
     if ( hvm_long_mode_enabled(v) )
         HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
                     (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
@@ -1469,7 +1722,7 @@
 
     if ( unlikely(exit_reason == VMEXIT_INVALID) )
     {
-        svm_dump_vmcb(__func__, vmcb);
+        svm_vmcb_dump(__func__, vmcb);
         goto exit_and_crash;
     }
 
@@ -1630,6 +1883,7 @@
     case VMEXIT_VMMCALL:
         if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
             break;
+        BUG_ON(vcpu_guestmode);
         HVMTRACE_1D(VMMCALL, regs->eax);
         rc = hvm_do_hypercall(regs);
         if ( rc != HVM_HCALL_preempted )
@@ -1662,9 +1916,18 @@
 
     case VMEXIT_MONITOR:
     case VMEXIT_MWAIT:
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        break;
+
     case VMEXIT_VMRUN:
+        svm_vmexit_do_vmrun(regs, v, regs->eax);
+        break;
     case VMEXIT_VMLOAD:
+        svm_vmexit_do_vmload(vmcb, regs, v, regs->eax);
+        break;
     case VMEXIT_VMSAVE:
+        svm_vmexit_do_vmsave(vmcb, regs, v, regs->eax);
+        break;
     case VMEXIT_STGI:
     case VMEXIT_CLGI:
     case VMEXIT_SKINIT:
@@ -1708,7 +1971,7 @@
 
     default:
     exit_and_crash:
-        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
+        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%"PRIx64", "
                  "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
                  exit_reason, 
                  (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
@@ -1716,6 +1979,11 @@
         break;
     }
 
+  out:
+    if ( vcpu_guestmode )
+        /* Don't clobber TPR of the nested guest. */
+        return;
+
     /* The exit may have updated the TPR: reflect this in the hardware vtpr */
     intr = vmcb_get_vintr(vmcb);
     intr.fields.tpr =
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/svmdebug.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/svm/svmdebug.c   Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,191 @@
+/*
+ * svmdebug.c: debug functions
+ * Copyright (c) 2011, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/msr-index.h>
+#include <asm/hvm/svm/svmdebug.h>
+
+static void svm_dump_sel(const char *name, svm_segment_register_t *s)
+{
+    printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", 
+           name, s->sel, s->attr.bytes, s->limit,
+           (unsigned long long)s->base);
+}
+
+/* This function can directly access fields which are covered by clean bits. */
+void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb)
+{
+    printk("Dumping guest's current state at %s...\n", from);
+    printk("Size of VMCB = %d, paddr = 0x%016lx, vaddr = %p\n",
+           (int) sizeof(struct vmcb_struct), virt_to_maddr(vmcb), vmcb);
+
+    printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x "
+           "exception_intercepts = 0x%08x\n", 
+           vmcb->_cr_intercepts, vmcb->_dr_intercepts, 
+           vmcb->_exception_intercepts);
+    printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", 
+           vmcb->_general1_intercepts, vmcb->_general2_intercepts);
+    printk("iopm_base_pa = 0x%016llx msrpm_base_pa = 0x%016llx tsc_offset = "
+            "0x%016llx\n", 
+           (unsigned long long)vmcb->_iopm_base_pa,
+           (unsigned long long)vmcb->_msrpm_base_pa,
+           (unsigned long long)vmcb->_tsc_offset);
+    printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = "
+            "0x%016llx\n", vmcb->tlb_control,
+           (unsigned long long)vmcb->_vintr.bytes,
+           (unsigned long long)vmcb->interrupt_shadow);
+    printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", 
+           (unsigned long long)vmcb->exitcode,
+           (unsigned long long)vmcb->exitintinfo.bytes);
+    printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n",
+           (unsigned long long)vmcb->exitinfo1,
+           (unsigned long long)vmcb->exitinfo2);
+    printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", 
+           (unsigned long long)vmcb->_np_enable, vmcb->_guest_asid);
+    printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", 
+           vmcb->_cpl, (unsigned long long)vmcb->_efer,
+           (unsigned long long)vmcb->star, (unsigned long long)vmcb->lstar);
+    printk("CR0 = 0x%016llx CR2 = 0x%016llx\n",
+           (unsigned long long)vmcb->_cr0, (unsigned long long)vmcb->_cr2);
+    printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", 
+           (unsigned long long)vmcb->_cr3, (unsigned long long)vmcb->_cr4);
+    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
+           (unsigned long long)vmcb->rsp, (unsigned long long)vmcb->rip);
+    printk("RAX = 0x%016llx  RFLAGS=0x%016llx\n",
+           (unsigned long long)vmcb->rax, (unsigned long long)vmcb->rflags);
+    printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", 
+           (unsigned long long)vmcb->_dr6, (unsigned long long)vmcb->_dr7);
+    printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n",
+           (unsigned long long)vmcb->cstar, 
+           (unsigned long long)vmcb->sfmask);
+    printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", 
+           (unsigned long long)vmcb->kerngsbase,
+           (unsigned long long)vmcb->_g_pat);
+    printk("H_CR3 = 0x%016llx CleanBits = 0x%08x\n",
+           (unsigned long long)vmcb->_h_cr3, vmcb->cleanbits.bytes);
+
+    /* print out all the selectors */
+    svm_dump_sel("CS", &vmcb->cs);
+    svm_dump_sel("DS", &vmcb->ds);
+    svm_dump_sel("SS", &vmcb->ss);
+    svm_dump_sel("ES", &vmcb->es);
+    svm_dump_sel("FS", &vmcb->fs);
+    svm_dump_sel("GS", &vmcb->gs);
+    svm_dump_sel("GDTR", &vmcb->gdtr);
+    svm_dump_sel("LDTR", &vmcb->ldtr);
+    svm_dump_sel("IDTR", &vmcb->idtr);
+    svm_dump_sel("TR", &vmcb->tr);
+}
+
+bool_t
+svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb,
+                 bool_t verbose)
+{
+    bool_t ret = 0; /* ok */
+
+#define PRINTF(...) \
+    if (verbose) { ret = 1; printk("%s: ", from); printk(__VA_ARGS__); \
+    } else return 1;
+
+    if ((vmcb->_efer & EFER_SVME) == 0) {
+        PRINTF("EFER: SVME bit not set (0x%"PRIx64")\n", vmcb->_efer);
+    }
+
+    if ((vmcb->_cr0 & X86_CR0_CD) == 0 && (vmcb->_cr0 & X86_CR0_NW) != 0) {
+        PRINTF("CR0: CD bit is zero and NW bit set (0x%"PRIx64")\n",
+                vmcb->_cr0);
+    }
+
+    if ((vmcb->_cr0 >> 32U) != 0) {
+        PRINTF("CR0: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->_cr0);
+    }
+
+    if ((vmcb->_cr3 & 0x7) != 0) {
+        PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->_cr3);
+    }
+    if ((vmcb->_efer & EFER_LMA) && (vmcb->_cr3 & 0xfe) != 0) {
+        PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->_cr3);
+    }
+
+    if ((vmcb->_cr4 >> 11U) != 0) {
+        PRINTF("CR4: bits [63:11] are not zero (0x%"PRIx64")\n",
+                vmcb->_cr4);
+    }
+
+    if ((vmcb->_dr6 >> 32U) != 0) {
+        PRINTF("DR6: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->_dr6);
+    }
+
+    if ((vmcb->_dr7 >> 32U) != 0) {
+        PRINTF("DR7: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->_dr7);
+    }
+
+    if ((vmcb->_efer >> 15U) != 0) {
+        PRINTF("EFER: bits [63:15] are not zero (0x%"PRIx64")\n",
+                vmcb->_efer);
+    }
+
+    if ((vmcb->_efer & EFER_LME) != 0 && ((vmcb->_cr0 & X86_CR0_PG) != 0)) {
+        if ((vmcb->_cr4 & X86_CR4_PAE) == 0) {
+            PRINTF("EFER_LME and CR0.PG are both set and CR4.PAE is zero.\n");
+        }
+        if ((vmcb->_cr0 & X86_CR0_PE) == 0) {
+            PRINTF("EFER_LME and CR0.PG are both set and CR0.PE is zero.\n");
+        }
+    }
+
+    if ((vmcb->_efer & EFER_LME) != 0
+        && (vmcb->_cr0 & X86_CR0_PG) != 0
+        && (vmcb->_cr4 & X86_CR4_PAE) != 0
+        && (vmcb->cs.attr.fields.l != 0)
+        && (vmcb->cs.attr.fields.db != 0))
+    {
+        PRINTF("EFER_LME, CR0.PG, CR4.PAE, CS.L and CS.D are all non-zero.\n");
+    }
+
+    if ((vmcb->_general2_intercepts & GENERAL2_INTERCEPT_VMRUN) == 0) {
+        PRINTF("GENERAL2_INTERCEPT: VMRUN intercept bit is clear (0x%"PRIx32")\n",
+            vmcb->_general2_intercepts);
+    }
+
+    if (vmcb->eventinj.fields.resvd1 != 0) {
+        PRINTF("eventinj: MBZ bits are set (0x%"PRIx64")\n",
+                vmcb->eventinj.bytes);
+    }
+
+    if (vmcb->_np_enable && vmcb->_h_cr3 == 0) {
+        PRINTF("nested paging enabled but host cr3 is 0\n");
+    }
+
+#undef PRINTF
+    return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
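
A note on the return convention of the new validator: svm_vmcb_isvalid() returns
0 when the VMCB passes all checks and 1 otherwise, with the verbose flag
controlling whether every failing check is printed. A hedged sketch of how a
VMRUN-emulation path might use it; the surrounding control flow and the
VMEXIT_INVALID reflection are illustrative assumptions, not code from this
changeset:

    if ( svm_vmcb_isvalid(__func__, ns_vmcb, 1 /* verbose */) )
    {
        /* Consistency checks failed: reflect VMEXIT_INVALID to the l1 guest. */
        nestedsvm_vmexit_defer(v, VMEXIT_INVALID, 0, 0);
        return;
    }
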
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c       Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c       Wed Mar 09 12:36:05 2011 +0100
@@ -33,6 +33,7 @@
 #include <asm/hvm/svm/svm.h>
 #include <asm/hvm/svm/intr.h>
 #include <asm/hvm/svm/asid.h>
+#include <asm/hvm/svm/svmdebug.h>
 #include <xen/event.h>
 #include <xen/kernel.h>
 #include <xen/domain_page.h>
@@ -40,9 +41,6 @@
 
 extern int svm_dbg_on;
 
-#define IOPM_SIZE   (12 * 1024)
-#define MSRPM_SIZE  (8  * 1024)
-
 struct vmcb_struct *alloc_vmcb(void) 
 {
     struct vmcb_struct *vmcb;
@@ -78,37 +76,6 @@
     return hsa;
 }
 
-void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable)
-{
-    unsigned long *msr_bitmap = v->arch.hvm_svm.msrpm;
-    unsigned long *msr_bit = NULL;
-
-    /*
-     * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
-     */
-    if ( msr <= 0x1fff )
-        msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
-    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
-        msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
-    else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
-        msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;
-
-    BUG_ON(msr_bit == NULL);
-
-    msr &= 0x1fff;
-
-    if ( enable )
-    {
-        __set_bit(msr * 2, msr_bit);
-        __set_bit(msr * 2 + 1, msr_bit);
-    }
-    else
-    {
-        __clear_bit(msr * 2, msr_bit);
-        __clear_bit(msr * 2 + 1, msr_bit);
-    }
-}
-
 /* This function can directly access fields which are covered by clean bits. */
 static int construct_vmcb(struct vcpu *v)
 {
@@ -257,7 +224,7 @@
 
     if ( cpu_has_pause_filter )
     {
-        vmcb->_pause_filter_count = 3000;
+        vmcb->_pause_filter_count = SVM_PAUSEFILTER_INIT;
         vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_PAUSE;
     }
 
@@ -268,34 +235,38 @@
 
 int svm_create_vmcb(struct vcpu *v)
 {
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
     struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
     int rc;
 
-    if ( (arch_svm->vmcb == NULL) &&
-         (arch_svm->vmcb = alloc_vmcb()) == NULL )
+    if ( (nv->nv_n1vmcx == NULL) &&
+         (nv->nv_n1vmcx = alloc_vmcb()) == NULL )
     {
         printk("Failed to create a new VMCB\n");
         return -ENOMEM;
     }
 
-    if ( (rc = construct_vmcb(v)) != 0 )
+    arch_svm->vmcb = nv->nv_n1vmcx;
+    rc = construct_vmcb(v);
+    if ( rc != 0 )
     {
-        free_vmcb(arch_svm->vmcb);
+        free_vmcb(nv->nv_n1vmcx);
+        nv->nv_n1vmcx = NULL;
         arch_svm->vmcb = NULL;
         return rc;
     }
 
-    arch_svm->vmcb_pa = virt_to_maddr(arch_svm->vmcb);
-
+    arch_svm->vmcb_pa = nv->nv_n1vmcx_pa = virt_to_maddr(arch_svm->vmcb);
     return 0;
 }
 
 void svm_destroy_vmcb(struct vcpu *v)
 {
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
     struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
 
-    if ( arch_svm->vmcb != NULL )
-        free_vmcb(arch_svm->vmcb);
+    if ( nv->nv_n1vmcx != NULL )
+        free_vmcb(nv->nv_n1vmcx);
 
     if ( arch_svm->msrpm != NULL )
     {
@@ -304,81 +275,11 @@
         arch_svm->msrpm = NULL;
     }
 
+    nv->nv_n1vmcx = NULL;
+    nv->nv_n1vmcx_pa = VMCX_EADDR;
     arch_svm->vmcb = NULL;
 }
 
-static void svm_dump_sel(char *name, svm_segment_register_t *s)
-{
-    printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", 
-           name, s->sel, s->attr.bytes, s->limit,
-           (unsigned long long)s->base);
-}
-
-/* This function can directly access fields which are covered by clean bits. */
-void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb)
-{
-    printk("Dumping guest's current state at %s...\n", from);
-    printk("Size of VMCB = %d, paddr = 0x%016lx, vaddr = %p\n",
-           (int) sizeof(struct vmcb_struct),  virt_to_maddr(vmcb), vmcb);
-
-    printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x "
-           "exception_intercepts = 0x%08x\n", 
-           vmcb->_cr_intercepts, vmcb->_dr_intercepts, 
-           vmcb->_exception_intercepts);
-    printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", 
-           vmcb->_general1_intercepts, vmcb->_general2_intercepts);
-    printk("iopm_base_pa = 0x%016llx msrpm_base_pa = 0x%016llx tsc_offset = "
-            "0x%016llx\n", 
-           (unsigned long long)vmcb->_iopm_base_pa,
-           (unsigned long long)vmcb->_msrpm_base_pa,
-           (unsigned long long)vmcb->_tsc_offset);
-    printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = "
-            "0x%016llx\n", vmcb->tlb_control,
-           (unsigned long long)vmcb->_vintr.bytes,
-           (unsigned long long)vmcb->interrupt_shadow);
-    printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", 
-           (unsigned long long)vmcb->exitcode,
-           (unsigned long long)vmcb->exitintinfo.bytes);
-    printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n",
-           (unsigned long long)vmcb->exitinfo1,
-           (unsigned long long)vmcb->exitinfo2);
-    printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", 
-           (unsigned long long)vmcb->_np_enable, vmcb->_guest_asid);
-    printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", 
-           vmcb->_cpl, (unsigned long long)vmcb->_efer,
-           (unsigned long long)vmcb->star, (unsigned long long)vmcb->lstar);
-    printk("CR0 = 0x%016llx CR2 = 0x%016llx\n",
-           (unsigned long long)vmcb->_cr0, (unsigned long long)vmcb->_cr2);
-    printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", 
-           (unsigned long long)vmcb->_cr3, (unsigned long long)vmcb->_cr4);
-    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
-           (unsigned long long)vmcb->rsp, (unsigned long long)vmcb->rip);
-    printk("RAX = 0x%016llx  RFLAGS=0x%016llx\n",
-           (unsigned long long)vmcb->rax, (unsigned long long)vmcb->rflags);
-    printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", 
-           (unsigned long long)vmcb->_dr6, (unsigned long long)vmcb->_dr7);
-    printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n",
-           (unsigned long long)vmcb->cstar, 
-           (unsigned long long)vmcb->sfmask);
-    printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", 
-           (unsigned long long)vmcb->kerngsbase,
-           (unsigned long long)vmcb->_g_pat);
-    printk("H_CR3 = 0x%016llx CleanBits = 0x%08x\n", 
-           (unsigned long long)vmcb->_h_cr3, vmcb->cleanbits.bytes);
-
-    /* print out all the selectors */
-    svm_dump_sel("CS", &vmcb->cs);
-    svm_dump_sel("DS", &vmcb->ds);
-    svm_dump_sel("SS", &vmcb->ss);
-    svm_dump_sel("ES", &vmcb->es);
-    svm_dump_sel("FS", &vmcb->fs);
-    svm_dump_sel("GS", &vmcb->gs);
-    svm_dump_sel("GDTR", &vmcb->gdtr);
-    svm_dump_sel("LDTR", &vmcb->ldtr);
-    svm_dump_sel("IDTR", &vmcb->idtr);
-    svm_dump_sel("TR", &vmcb->tr);
-}
-
 static void vmcb_dump(unsigned char ch)
 {
     struct domain *d;
@@ -396,7 +297,7 @@
         for_each_vcpu ( d, v )
         {
             printk("\tVCPU %d\n", v->vcpu_id);
-            svm_dump_vmcb("key_handler", v->arch.hvm_svm.vmcb);
+            svm_vmcb_dump("key_handler", v->arch.hvm_svm.vmcb);
         }
     }
 
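With this change the host (n1) VMCB is owned by the nested-HVM state:
svm_create_vmcb() allocates it into nv_n1vmcx and keeps arch_svm->vmcb pointing
at whichever VMCB is currently in use. The invariant directly after creation can
be written as follows (the ASSERTs are illustrative, not present in the patch):

    struct nestedvcpu *nv = &vcpu_nestedhvm(v);

    /* Holds right after a successful svm_create_vmcb(v). */
    ASSERT(v->arch.hvm_svm.vmcb == nv->nv_n1vmcx);
    ASSERT(v->arch.hvm_svm.vmcb_pa == nv->nv_n1vmcx_pa);
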
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/emulate.h
--- a/xen/include/asm-x86/hvm/svm/emulate.h     Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/svm/emulate.h     Wed Mar 09 12:36:05 2011 +0100
@@ -33,6 +33,11 @@
     INSTR_RDTSC,
     INSTR_PAUSE,
     INSTR_XSETBV,
+    INSTR_VMRUN,
+    INSTR_VMLOAD,
+    INSTR_VMSAVE,
+    INSTR_STGI,
+    INSTR_CLGI,
     INSTR_MAX_COUNT /* Must be last - Number of instructions supported */
 };
 
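The five new INSTR_* entries above pair with the MAKE_INSTR opcode tables added
to emulate.c, so the VMEXIT handlers can measure the intercepted instruction
before advancing RIP. A minimal sketch of that pattern, assuming the usual svm.c
helpers (__get_instruction_length() and __update_guest_eip()); the snippet is
illustrative and not part of this changeset:

    unsigned int inst_len;

    inst_len = __get_instruction_length(v, INSTR_VMRUN);
    if ( inst_len == 0 )
        return;                         /* decode failed; do not advance RIP */
    __update_guest_eip(regs, inst_len); /* skip the 3-byte 0f 01 d8 encoding */
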
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/nestedsvm.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/hvm/svm/nestedsvm.h   Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,129 @@
+/*
+ * nestedsvm.h: Nested Virtualization
+ * Copyright (c) 2011, Advanced Micro Devices, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_HVM_SVM_NESTEDSVM_H__
+#define __ASM_X86_HVM_SVM_NESTEDSVM_H__
+
+#include <asm/config.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/svm/vmcb.h>
+
+struct nestedsvm {
+    uint64_t ns_msr_hsavepa; /* MSR HSAVE_PA value */
+
+    /* l1 guest physical address of virtual vmcb used by prior VMRUN.
+     * Needed for VMCB Cleanbit emulation.
+     */
+    uint64_t ns_ovvmcb_pa;
+
+    /* Cached real intercepts of the l2 guest */
+    uint32_t ns_cr_intercepts;
+    uint32_t ns_dr_intercepts;
+    uint32_t ns_exception_intercepts;
+    uint32_t ns_general1_intercepts;
+    uint32_t ns_general2_intercepts;
+
+    /* Cached real lbr of the l2 guest */
+    lbrctrl_t ns_lbr_control;
+
+    /* Cached real MSR permission bitmaps of the l2 guest */
+    unsigned long *ns_cached_msrpm;
+    /* Merged MSR permission bitmap */
+    unsigned long *ns_merged_msrpm;
+
+    /* guest physical address of virtual io permission map */
+    paddr_t ns_iomap_pa, ns_oiomap_pa;
+    /* Shadow io permission map */
+    unsigned long *ns_iomap;
+
+    /* Cache guest cr3/host cr3 the guest sets up for the l2 guest.
+     * Used by Shadow-on-Shadow and Nested-on-Nested.
+     * ns_vmcb_guestcr3: in l2 guest physical address space and points to
+     *     the l2 guest page table
+     * ns_vmcb_hostcr3: in l1 guest physical address space and points to
+     *     the l1 guest nested page table
+     */
+    uint64_t ns_vmcb_guestcr3, ns_vmcb_hostcr3;
+    uint32_t ns_guest_asid;
+
+    bool_t ns_hap_enabled;
+
+    /* Only meaningful when vmexit_pending flag is set */
+    struct {
+        uint64_t exitcode;  /* native exitcode to inject into l1 guest */
+        uint64_t exitinfo1; /* additional information to the exitcode */
+        uint64_t exitinfo2; /* additional information to the exitcode */
+    } ns_vmexit;
+    union {
+        uint32_t bytes;
+        struct {
+            uint32_t rflagsif: 1;
+            uint32_t vintrmask: 1;
+            uint32_t reserved: 30;
+        } fields;
+    } ns_hostflags;
+};
+
+#define vcpu_nestedsvm(v) (vcpu_nestedhvm(v).u.nsvm)
+
+/* True when l1 guest enabled SVM in EFER */
+#define hvm_svm_enabled(v) \
+    (!!((v)->arch.hvm_vcpu.guest_efer & EFER_SVME))
+
+int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr);
+void nestedsvm_vmexit_defer(struct vcpu *v,
+    uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2);
+enum nestedhvm_vmexits
+nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs);
+enum nestedhvm_vmexits
+nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode);
+
+/* Interface methods */
+int nsvm_vcpu_destroy(struct vcpu *v);
+int nsvm_vcpu_initialise(struct vcpu *v);
+int nsvm_vcpu_reset(struct vcpu *v);
+int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs);
+int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs);
+int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode);
+int nsvm_vcpu_vmexit_trap(struct vcpu *v, unsigned int trapnr,
+                      int errcode, unsigned long cr2);
+uint64_t nsvm_vcpu_guestcr3(struct vcpu *v);
+uint64_t nsvm_vcpu_hostcr3(struct vcpu *v);
+uint32_t nsvm_vcpu_asid(struct vcpu *v);
+int nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
+    struct cpu_user_regs *regs, uint64_t exitcode);
+int nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr);
+bool_t nsvm_vmcb_hap_enabled(struct vcpu *v);
+
+/* MSRs */
+int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content);
+int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content);
+
+#endif /* __ASM_X86_HVM_SVM_NESTEDSVM_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
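
The ns_msr_hsavepa field above backs the emulation of the VM_HSAVE_PA MSR for
the l1 guest, gated on hvm_svm_enabled(). A minimal sketch of the read side; the
MSR constant and the caller's error label are assumptions here, not taken from
this patch:

    case MSR_K8_VM_HSAVE_PA:
        if ( !hvm_svm_enabled(v) )      /* l1 guest has not set EFER.SVME */
            goto gpf;                   /* raise #GP as on real hardware */
        *msr_content = vcpu_nestedsvm(v).ns_msr_hsavepa;
        break;
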
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/svm.h
--- a/xen/include/asm-x86/hvm/svm/svm.h Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/svm/svm.h Wed Mar 09 12:36:05 2011 +0100
@@ -29,8 +29,6 @@
 #include <asm/i387.h>
 #include <asm/hvm/vpmu.h>
 
-void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb);
-
 #define SVM_REG_EAX (0) 
 #define SVM_REG_ECX (1) 
 #define SVM_REG_EDX (2) 
@@ -62,6 +60,8 @@
         : : "a" (__pa(vmcb)) : "memory" );
 }
 
+unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr);
+
 extern u32 svm_feature_flags;
 
 #define SVM_FEATURE_NPT            0 /* Nested page table support */
@@ -82,4 +82,6 @@
 #define cpu_has_svm_cleanbits cpu_has_svm_feature(SVM_FEATURE_VMCBCLEAN)
 #define cpu_has_pause_filter  cpu_has_svm_feature(SVM_FEATURE_PAUSEFILTER)
 
+#define SVM_PAUSEFILTER_INIT    3000
+
 #endif /* __ASM_X86_HVM_SVM_H__ */
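
svm_msrbit() factors out the range lookup that used to live in
svm_intercept_msr() (removed from vmcb.c above). A sketch of the intended usage,
reconstructed from the removed code rather than from the new svm.c
implementation, so treat it as illustrative:

    unsigned long *msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr);

    BUG_ON(msr_bit == NULL);            /* MSR must fall in a mapped range */
    msr &= 0x1fff;
    __set_bit(msr * 2, msr_bit);        /* intercept reads */
    __set_bit(msr * 2 + 1, msr_bit);    /* intercept writes */
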
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/svmdebug.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/hvm/svm/svmdebug.h    Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,30 @@
+/*
+ * svmdebug.h: SVM related debug definitions
+ * Copyright (c) 2011, AMD Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef __ASM_X86_HVM_SVM_SVMDEBUG_H__
+#define __ASM_X86_HVM_SVM_SVMDEBUG_H__
+
+#include <asm/types.h>
+#include <asm/hvm/svm/vmcb.h>
+
+void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb);
+bool_t svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb,
+                        bool_t verbose);
+
+#endif /* __ASM_X86_HVM_SVM_SVMDEBUG_H__ */
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h        Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h        Wed Mar 09 12:36:05 2011 +0100
@@ -398,6 +398,9 @@
     } fields;
 } __attribute__ ((packed)) vmcbcleanbits_t;
 
+#define IOPM_SIZE   (12 * 1024)
+#define MSRPM_SIZE  (8  * 1024)
+
 struct vmcb_struct {
     u32 _cr_intercepts;         /* offset 0x00 - cleanbit 0 */
     u32 _dr_intercepts;         /* offset 0x04 - cleanbit 0 */
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h    Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/vcpu.h    Wed Mar 09 12:36:05 2011 +0100
@@ -25,6 +25,7 @@
 #include <asm/hvm/vlapic.h>
 #include <asm/hvm/vmx/vmcs.h>
 #include <asm/hvm/svm/vmcb.h>
+#include <asm/hvm/svm/nestedsvm.h>
 #include <asm/mtrr.h>
 
 enum hvm_io_state {
@@ -50,6 +51,7 @@
 
     /* SVM/VMX arch specific */
     union {
+        struct nestedsvm nsvm;
     } u;
 
     bool_t nv_flushp2m; /* True, when p2m table must be flushed */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
