# HG changeset patch
# User tdeegan@xxxxxxxxxxxxxxxxxxxxx
# Node ID 0f917d63e9608315e4a925109401e383fc895b2a
# Parent fda70200da01b89d5339342df6c0db372369a16d
Replace shadow pagetable code with shadow2.
---
xen/arch/x86/audit.c | 984 ------
xen/arch/x86/shadow.c | 4150 ----------------------------
xen/arch/x86/shadow32.c | 3782 --------------------------
xen/arch/x86/shadow_guest32.c | 16
xen/arch/x86/shadow_guest32pae.c | 16
xen/arch/x86/shadow_public.c | 2143 --------------
xen/include/asm-x86/shadow_64.h | 587 ----
xen/include/asm-x86/shadow_ops.h | 138
xen/include/asm-x86/shadow_public.h | 61
.hgtags | 10
tools/examples/xmexample.hvm | 4
tools/libxc/xc_domain.c | 13
tools/libxc/xc_hvm_build.c | 13
tools/libxc/xc_linux_build.c | 2
tools/libxc/xc_linux_save.c | 18
tools/libxc/xenctrl.h | 2
tools/misc/xc_shadow.c | 2
tools/python/xen/lowlevel/xc/xc.c | 69
tools/python/xen/xend/XendDomain.py | 24
tools/python/xen/xend/XendDomainInfo.py | 47
tools/python/xen/xend/image.py | 17
tools/python/xen/xm/create.py | 9
xen/arch/x86/Makefile | 16
xen/arch/x86/dom0_ops.c | 2
xen/arch/x86/domain.c | 106
xen/arch/x86/domain_build.c | 13
xen/arch/x86/hvm/hvm.c | 23
xen/arch/x86/hvm/platform.c | 9
xen/arch/x86/hvm/svm/svm.c | 265 -
xen/arch/x86/hvm/svm/vmcb.c | 4
xen/arch/x86/hvm/vlapic.c | 3
xen/arch/x86/hvm/vmx/vmcs.c | 15
xen/arch/x86/hvm/vmx/vmx.c | 228 -
xen/arch/x86/mm.c | 485 +--
xen/arch/x86/setup.c | 2
xen/arch/x86/shadow2-common.c | 3394 +++++++++++++++++++++++
xen/arch/x86/shadow2.c | 4469 +++++++++++++++++++++++++++++++
xen/arch/x86/smpboot.c | 2
xen/arch/x86/traps.c | 32
xen/arch/x86/x86_32/domain_page.c | 33
xen/arch/x86/x86_32/mm.c | 3
xen/arch/x86/x86_64/mm.c | 3
xen/arch/x86/x86_64/traps.c | 14
xen/common/acm_ops.c | 1
xen/common/grant_table.c | 4
xen/common/keyhandler.c | 19
xen/common/memory.c | 11
xen/drivers/char/console.c | 50
xen/include/asm-x86/bitops.h | 18
xen/include/asm-x86/config.h | 22
xen/include/asm-x86/domain.h | 99
xen/include/asm-x86/grant_table.h | 2
xen/include/asm-x86/hvm/hvm.h | 25
xen/include/asm-x86/hvm/support.h | 11
xen/include/asm-x86/hvm/vcpu.h | 6
xen/include/asm-x86/hvm/vmx/vmcs.h | 1
xen/include/asm-x86/hvm/vmx/vmx.h | 49
xen/include/asm-x86/mm.h | 140
xen/include/asm-x86/msr.h | 4
xen/include/asm-x86/page-guest32.h | 7
xen/include/asm-x86/page.h | 37
xen/include/asm-x86/perfc_defn.h | 53
xen/include/asm-x86/processor.h | 1
xen/include/asm-x86/shadow.h | 1791 ------------
xen/include/asm-x86/shadow2-multi.h | 116
xen/include/asm-x86/shadow2-private.h | 612 ++++
xen/include/asm-x86/shadow2-types.h | 705 ++++
xen/include/asm-x86/shadow2.h | 627 ++++
xen/include/asm-x86/x86_32/page-2level.h | 1
xen/include/asm-x86/x86_32/page-3level.h | 3
xen/include/asm-x86/x86_64/page.h | 5
xen/include/public/dom0_ops.h | 16
xen/include/xen/domain_page.h | 13
xen/include/xen/lib.h | 4
xen/include/xen/list.h | 10
xen/include/xen/sched.h | 5
76 files changed, 11147 insertions(+), 14549 deletions(-)
diff -r fda70200da01 -r 0f917d63e960 .hgtags
--- a/.hgtags Wed Aug 16 16:16:32 2006 +0100
+++ b/.hgtags Wed Aug 16 17:02:35 2006 +0100
@@ -15,3 +15,13 @@ c8fdb0caa77b429cf47f9707926e83947778cb48
c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0
af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc
d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched
+6e864d7de9db066f92bea505d256bfe286200fed last-code-review
+a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline
+bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review
+fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest patch to unstable
+8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline
+2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline
+0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline
+88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline
+5233c4b076b9aa073eff63508461b7bfa597737c mainline
+fda70200da01b89d5339342df6c0db372369a16d mainline
diff -r fda70200da01 -r 0f917d63e960 tools/examples/xmexample.hvm
--- a/tools/examples/xmexample.hvm Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/examples/xmexample.hvm Wed Aug 16 17:02:35 2006 +0100
@@ -26,6 +26,10 @@ builder='hvm'
# memory errors. The domain needs enough memory to boot kernel
# and modules. Allocating less than 32MBs is not recommended.
memory = 128
+
+# Shadow pagetable memory for the domain, in MB.
+# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu.
+shadow_memory = 8
# A name for your domain. All domains must have different names.
name = "ExampleHVMDomain"
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_domain.c Wed Aug 16 17:02:35 2006 +0100
@@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle,
unsigned int sop,
unsigned long *dirty_bitmap,
unsigned long pages,
- xc_shadow_control_stats_t *stats )
+ unsigned long *mb,
+ uint32_t mode,
+ xc_shadow_control_stats_t *stats)
{
int rc;
DECLARE_DOM0_OP;
op.cmd = DOM0_SHADOW_CONTROL;
op.u.shadow_control.domain = (domid_t)domid;
op.u.shadow_control.op = sop;
+ op.u.shadow_control.pages = pages;
+ op.u.shadow_control.mb = mb ? *mb : 0;
+ op.u.shadow_control.mode = mode;
set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
- op.u.shadow_control.pages = pages;
rc = do_dom0_op(xc_handle, &op);
if ( stats )
memcpy(stats, &op.u.shadow_control.stats,
sizeof(xc_shadow_control_stats_t));
+
+ if ( mb )
+ *mb = op.u.shadow_control.mb;
return (rc == 0) ? op.u.shadow_control.pages : rc;
}
@@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(in
if ( err > 0 )
{
- DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n",
+ DPRINTF("Failed allocation for dom %d: %ld pages order %d\n",
domid, nr_extents, extent_order);
errno = EBUSY;
err = -1;
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_hvm_build.c Wed Aug 16 17:02:35 2006 +0100
@@ -395,6 +395,19 @@ static int xc_hvm_build_internal(int xc_
PERROR("Could not get info on domain");
goto error_out;
}
+
+ /* HVM domains must be put into shadow2 mode at the start of day */
+ if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE,
+ NULL, 0, NULL,
+ DOM0_SHADOW2_CONTROL_FLAG_ENABLE
+ | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT
+ | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE
+ | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL,
+ NULL) )
+ {
+ PERROR("Could not enable shadow paging for domain.\n");
+ goto error_out;
+ }
memset(ctxt, 0, sizeof(*ctxt));
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_linux_build.c Wed Aug 16 17:02:35 2006 +0100
@@ -972,7 +972,7 @@ static int setup_guest(int xc_handle,
/* Enable shadow translate mode */
if ( xc_shadow_control(xc_handle, dom,
DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE,
- NULL, 0, NULL) < 0 )
+ NULL, 0, NULL, 0, NULL) < 0 )
{
PERROR("Could not enable translation mode");
goto error_out;
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_linux_save.c Wed Aug 16 17:02:35 2006 +0100
@@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle,
int i;
xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
- arr, max_pfn, NULL);
+ arr, max_pfn, NULL, 0, NULL);
DPRINTF("#Flush\n");
for ( i = 0; i < 40; i++ ) {
usleep(50000);
now = llgettimeofday();
xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
- NULL, 0, &stats);
+ NULL, 0, NULL, 0, &stats);
DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32
" dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
@@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_
if (xc_shadow_control(xc_handle, dom,
DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL ) < 0) {
+ NULL, 0, NULL, 0, NULL) < 0) {
ERR("Couldn't enable shadow mode");
goto out;
}
@@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_
but this is fast enough for the moment. */
if (!last_iter && xc_shadow_control(
xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
- to_skip, max_pfn, NULL) != max_pfn) {
+ to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
ERR("Error peeking shadow bitmap");
goto out;
}
@@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_
(unsigned long)ctxt.user_regs.edx);
}
- if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
- to_send, max_pfn, &stats ) != max_pfn) {
+ if (xc_shadow_control(xc_handle, dom,
+ DOM0_SHADOW_CONTROL_OP_CLEAN, to_send,
+ max_pfn, NULL, 0, &stats) != max_pfn) {
ERR("Error flushing shadow PT");
goto out;
}
@@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_
out:
if (live) {
- if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
- NULL, 0, NULL ) < 0) {
+ if(xc_shadow_control(xc_handle, dom,
+ DOM0_SHADOW_CONTROL_OP_OFF,
+ NULL, 0, NULL, 0, NULL) < 0) {
DPRINTF("Warning - couldn't disable shadow mode");
}
}
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xenctrl.h Wed Aug 16 17:02:35 2006 +0100
@@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle,
unsigned int sop,
unsigned long *dirty_bitmap,
unsigned long pages,
+ unsigned long *mb,
+ uint32_t mode,
xc_shadow_control_stats_t *stats);
int xc_bvtsched_global_set(int xc_handle,
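The widened xc_shadow_control() prototype above gains an in/out mb argument (shadow allocation in MB) and a mode word; the existing callers updated in this patch simply pass NULL and 0 for the new parameters. A minimal sketch of the new allocation operations, mirroring pyxc_shadow_mem_control() later in the patch -- the 16 MB figure and the error handling are illustrative assumptions, not taken from the patch:

    #include <stdio.h>
    #include <xenctrl.h>

    /* Sketch: query a domain's shadow allocation, then request 16 MB. */
    static int resize_shadow(int xc_handle, uint32_t domid)
    {
        unsigned long mb = 0;

        /* GET_ALLOCATION fills in *mb on return. */
        if ( xc_shadow_control(xc_handle, domid,
                               DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION,
                               NULL, 0, &mb, 0, NULL) < 0 )
            return -1;
        printf("domain %u: %lu MB of shadow memory\n", domid, mb);

        /* SET_ALLOCATION takes the requested size in *mb. */
        mb = 16;
        return xc_shadow_control(xc_handle, domid,
                                 DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION,
                                 NULL, 0, &mb, 0, NULL);
    }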
diff -r fda70200da01 -r 0f917d63e960 tools/misc/xc_shadow.c
--- a/tools/misc/xc_shadow.c Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/misc/xc_shadow.c Wed Aug 16 17:02:35 2006 +0100
@@ -60,6 +60,8 @@ int main(int argc, char *argv[])
mode,
NULL,
0,
+ NULL,
+ 0,
NULL) < 0 )
{
fprintf(stderr, "Error reseting performance counters: %d (%s)\n",
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 17:02:35 2006 +0100
@@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(Xc
"weight", weight);
}
+static PyObject *pyxc_shadow_control(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+
+ uint32_t dom;
+ int op=0;
+
+ static char *kwd_list[] = { "dom", "op", NULL };
+
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list,
+ &dom, &op) )
+ return NULL;
+
+ if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL)
+ < 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ Py_INCREF(zero);
+ return zero;
+}
+
+static PyObject *pyxc_shadow_mem_control(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+ int op;
+ uint32_t dom;
+ int mbarg = -1;
+ unsigned long mb;
+
+ static char *kwd_list[] = { "dom", "mb", NULL };
+
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list,
+ &dom, &mbarg) )
+ return NULL;
+
+ if ( mbarg < 0 )
+ op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION;
+ else
+ {
+ mb = mbarg;
+ op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION;
+ }
+ if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ mbarg = mb;
+ return Py_BuildValue("i", mbarg);
+}
+
static PyObject *pyxc_sched_credit_domain_set(XcObject *self,
PyObject *args,
PyObject *kwds)
@@ -1118,6 +1171,22 @@ static PyMethodDef pyxc_methods[] = {
"Get information about the Xen host\n"
"Returns [dict]: information about Xen"
" [None]: on failure.\n" },
+
+ { "shadow_control",
+ (PyCFunction)pyxc_shadow_control,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set parameter for shadow pagetable interface\n"
+ " dom [int]: Identifier of domain.\n"
+ " op [int, 0]: operation\n\n"
+ "Returns: [int] 0 on success; -1 on error.\n" },
+
+ { "shadow_mem_control",
+ (PyCFunction)pyxc_shadow_mem_control,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set or read shadow pagetable memory use\n"
+ " dom [int]: Identifier of domain.\n"
+ " mb [int, -1]: MB of shadow memory this domain should have.\n\n"
+ "Returns: [int] MB of shadow memory in use by this domain.\n" },
{ "domain_setmaxmem",
(PyCFunction)pyxc_domain_setmaxmem,
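From Python, the new bindings are used as, for example, xc.shadow_mem_control(dom) to read the current shadow allocation in MB, or xc.shadow_mem_control(dom, mb=8) to request 8 MB, while xc.shadow_control(dom, op) issues a raw shadow operation (these illustrative calls follow the docstrings above and the XendDomain.py usage below).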
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xend/XendDomain.py Wed Aug 16 17:02:35 2006 +0100
@@ -532,6 +532,30 @@ class XendDomain:
except Exception, ex:
raise XendError(str(ex))
+ def domain_shadow_control(self, domid, op):
+ """Shadow page control."""
+ dominfo = self.domain_lookup(domid)
+ try:
+ return xc.shadow_control(dominfo.getDomid(), op)
+ except Exception, ex:
+ raise XendError(str(ex))
+
+ def domain_shadow_mem_get(self, domid):
+ """Get shadow pagetable memory allocation."""
+ dominfo = self.domain_lookup(domid)
+ try:
+ return xc.shadow_mem_control(dominfo.getDomid())
+ except Exception, ex:
+ raise XendError(str(ex))
+
+ def domain_shadow_mem_set(self, domid, mb):
+ """Set shadow pagetable memory allocation."""
+ dominfo = self.domain_lookup(domid)
+ try:
+ return xc.shadow_mem_control(dominfo.getDomid(), mb=mb)
+ except Exception, ex:
+ raise XendError(str(ex))
+
def domain_sched_credit_get(self, domid):
"""Get credit scheduler parameters for a domain.
"""
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py Wed Aug 16 17:02:35 2006 +0100
@@ -30,6 +30,7 @@ import time
import time
import threading
import os
+import math
import xen.lowlevel.xc
from xen.util import asserts
@@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [
# don't come out of xc in the same form as they are specified in the config
# file, so those are handled separately.
ROUNDTRIPPING_CONFIG_ENTRIES = [
- ('uuid', str),
- ('vcpus', int),
- ('vcpu_avail', int),
- ('cpu_weight', float),
- ('memory', int),
- ('maxmem', int),
- ('bootloader', str),
+ ('uuid', str),
+ ('vcpus', int),
+ ('vcpu_avail', int),
+ ('cpu_weight', float),
+ ('memory', int),
+ ('shadow_memory', int),
+ ('maxmem', int),
+ ('bootloader', str),
('bootloader_args', str),
- ('features', str),
- ('localtime', int),
+ ('features', str),
+ ('localtime', int),
]
ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
@@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFI
# entries written to the store that cannot be reconfigured on-the-fly.
#
VM_STORE_ENTRIES = [
- ('uuid', str),
- ('vcpus', int),
- ('vcpu_avail', int),
- ('memory', int),
- ('maxmem', int),
- ('start_time', float),
+ ('uuid', str),
+ ('vcpus', int),
+ ('vcpu_avail', int),
+ ('memory', int),
+ ('shadow_memory', int),
+ ('maxmem', int),
+ ('start_time', float),
]
VM_STORE_ENTRIES += VM_CONFIG_PARAMS
@@ -572,6 +575,7 @@ class XendDomainInfo:
defaultInfo('vcpu_avail', lambda: (1 << self.info['vcpus']) - 1)
defaultInfo('memory', lambda: 0)
+ defaultInfo('shadow_memory', lambda: 0)
defaultInfo('maxmem', lambda: 0)
defaultInfo('bootloader', lambda: None)
defaultInfo('bootloader_args', lambda: None)
@@ -1280,7 +1284,18 @@ class XendDomainInfo:
xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024)
m = self.image.getDomainMemory(self.info['memory'] * 1024)
- balloon.free(m)
+
+ # get the domain's shadow memory requirement
+ sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0))
+ if self.info['shadow_memory'] > sm:
+ sm = self.info['shadow_memory']
+
+ # Make sure there's enough RAM available for the domain
+ balloon.free(m + sm * 1024)
+
+ # Set up the shadow memory
+ sm = xc.shadow_mem_control(self.domid, mb=sm)
+ self.info['shadow_memory'] = sm
init_reservation = self.info['memory'] * 1024
if os.uname()[4] in ('ia64', 'ppc64'):
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xend/image.py Wed Aug 16 17:02:35 2006 +0100
@@ -152,6 +152,12 @@ class ImageHandler:
if 'hvm' in xc.xeninfo()['xen_caps']:
mem_kb += 4*1024;
return mem_kb
+
+ def getDomainShadowMemory(self, mem_kb):
+ """@return The minimum shadow memory required, in KiB, for a domain
+ with mem_kb KiB of RAM."""
+ # PV domains don't need any shadow memory
+ return 0
def buildDomain(self):
"""Build the domain. Define in subclass."""
@@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler):
extra_pages = int( math.ceil( extra_mb*1024 / page_kb ))
return mem_kb + extra_pages * page_kb
+ def getDomainShadowMemory(self, mem_kb):
+ """@return The minimum shadow memory required, in KiB, for a domain
+ with mem_kb KiB of RAM."""
+ if os.uname()[4] in ('ia64', 'ppc64'):
+ # Explicit shadow memory is not a concept
+ return 0
+ else:
+ # 1MB per vcpu plus 4Kib/Mib of RAM. This is higher than
+ # the minimum that Xen would allocate if no value were given.
+ return 1024 * self.vm.getVCpuCount() + mem_kb / 256
+
def register_shutdown_watch(self):
""" add xen store watch on control/shutdown """
self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \
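For concreteness, the HVM sizing heuristic above (1 MiB of shadow per vcpu plus 4 KiB per MiB of guest RAM, i.e. mem_kb / 256) combines with the round-up to whole megabytes done in XendDomainInfo.py roughly as in this sketch; the 512 MB / 2-vcpu figures are assumptions for illustration only:

    #include <stdio.h>

    /* Heuristic from HVMImageHandler.getDomainShadowMemory(): result in KiB. */
    static unsigned long hvm_shadow_kb(unsigned long mem_kb, unsigned int vcpus)
    {
        return 1024UL * vcpus + mem_kb / 256;
    }

    int main(void)
    {
        unsigned long mem_kb = 512 * 1024;        /* assumed: 512 MB guest */
        unsigned int vcpus = 2;                   /* assumed: 2 vcpus */
        unsigned long kb = hvm_shadow_kb(mem_kb, vcpus);
        unsigned long mb = (kb + 1023) / 1024;    /* like math.ceil(kb / 1024.0) */

        printf("minimum shadow allocation: %lu KiB (%lu MB)\n", kb, mb);
        return 0;
    }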
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xm/create.py Wed Aug 16 17:02:35 2006 +0100
@@ -157,6 +157,10 @@ gopts.var('maxmem', val='MEMORY',
gopts.var('maxmem', val='MEMORY',
fn=set_int, default=None,
use="Maximum domain memory in MB.")
+
+gopts.var('shadow_memory', val='MEMORY',
+ fn=set_int, default=0,
+ use="Domain shadow memory in MB.")
gopts.var('cpu', val='CPU',
fn=set_int, default=None,
@@ -666,8 +670,9 @@ def make_config(vals):
if v:
config.append([n, v])
- map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff',
- 'on_reboot', 'on_crash', 'vcpus', 'features'])
+ map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
+ 'restart', 'on_poweroff', 'on_reboot', 'on_crash',
+ 'vcpus', 'features'])
if vals.uuid is not None:
config.append(['uuid', vals.uuid])
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/Makefile Wed Aug 16 17:02:35 2006 +0100
@@ -8,7 +8,6 @@ subdir-$(x86_64) += x86_64
subdir-$(x86_64) += x86_64
obj-y += apic.o
-obj-y += audit.o
obj-y += bitops.o
obj-y += compat.o
obj-y += delay.o
@@ -41,12 +40,21 @@ obj-y += x86_emulate.o
obj-y += x86_emulate.o
ifneq ($(pae),n)
-obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
else
-obj-$(x86_32) += shadow32.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
endif
-obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
+ shadow2_g2_on_s3.o
+
+guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
+ -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
+
+shadow2_%.o: shadow2.c $(HDRS) Makefile
+ $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
obj-$(crash_debug) += gdbstub.o
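Worked example of the pattern rule above, using only the Makefile text: for shadow2_g2_on_s3.o the helper functions strip the shadow2_ prefix and split on '_' and '.', so guest_levels yields 2 and shadow_levels yields 3, and the object is shadow2.c compiled with -DGUEST_PAGING_LEVELS=2 -DSHADOW_PAGING_LEVELS=3.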
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/dom0_ops.c Wed Aug 16 17:02:35 2006 +0100
@@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op,
d = find_domain_by_id(op->u.shadow_control.domain);
if ( d != NULL )
{
- ret = shadow_mode_control(d, &op->u.shadow_control);
+ ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op);
put_domain(d);
copy_to_guest(u_dom0_op, op, 1);
}
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/domain.c Wed Aug 16 17:02:35 2006 +0100
@@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct do
v->arch.perdomain_ptes =
d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
- v->arch.guest_vtable = __linear_l2_table;
- v->arch.shadow_vtable = __shadow_linear_l2_table;
-#if defined(__x86_64__)
- v->arch.guest_vl3table = __linear_l3_table;
- v->arch.guest_vl4table = __linear_l4_table;
-#endif
-
pae_l3_cache_init(&v->arch.pae_l3_cache);
return v;
@@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d)
{
l1_pgentry_t gdt_l1e;
int vcpuid, pdpt_order;
-#ifdef __x86_64__
int i;
-#endif
pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
@@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d)
#endif /* __x86_64__ */
- shadow_lock_init(d);
- INIT_LIST_HEAD(&d->arch.free_shadow_frames);
+ shadow2_lock_init(d);
+ for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
+ INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]);
+ INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist);
+ INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse);
+ INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows);
if ( !is_idle_domain(d) )
{
@@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d)
void arch_domain_destroy(struct domain *d)
{
+ shadow2_final_teardown(d);
+
free_xenheap_pages(
d->arch.mm_perdomain_pt,
get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
@@ -328,31 +325,35 @@ int arch_set_info_guest(
if ( !hvm_initialize_guest_resources(v) )
return -EINVAL;
}
- else if ( shadow_mode_refcounts(d) )
- {
- if ( !get_page(mfn_to_page(cr3_pfn), d) )
+ else
+ {
+ if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
+ PGT_base_page_table) )
{
destroy_gdt(v);
return -EINVAL;
}
- }
- else
- {
- if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
- PGT_base_page_table) )
- {
- destroy_gdt(v);
- return -EINVAL;
- }
- }
-
- update_pagetables(v);
+ }
+
+ /* Shadow2: make sure the domain has enough shadow memory to
+ * boot another vcpu */
+ if ( shadow2_mode_enabled(d)
+ && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) )
+ {
+ destroy_gdt(v);
+ return -ENOMEM;
+ }
if ( v->vcpu_id == 0 )
update_domain_wallclock_time(d);
/* Don't redo final setup */
set_bit(_VCPUF_initialised, &v->vcpu_flags);
+
+ if ( shadow2_mode_enabled(d) )
+ shadow2_update_paging_modes(v);
+
+ update_cr3(v);
return 0;
}
@@ -669,7 +670,6 @@ static void __context_switch(void)
loaddebug(&n->arch.guest_context, 6);
loaddebug(&n->arch.guest_context, 7);
}
-
n->arch.ctxt_switch_to(n);
}
@@ -927,29 +927,34 @@ void domain_relinquish_resources(struct
/* Drop the in-use references to page-table bases. */
for_each_vcpu ( d, v )
{
- if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
- {
- if ( !shadow_mode_refcounts(d) )
- put_page_type(mfn_to_page(pfn));
- put_page(mfn_to_page(pfn));
-
+ /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
+ * or sh2_update_paging_modes()) */
+ pfn = pagetable_get_pfn(v->arch.guest_table);
+ if ( pfn != 0 )
+ {
+ if ( shadow2_mode_refcounts(d) )
+ put_page(mfn_to_page(pfn));
+ else
+ put_page_and_type(mfn_to_page(pfn));
v->arch.guest_table = pagetable_null();
}
- if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
- {
- if ( !shadow_mode_refcounts(d) )
- put_page_type(mfn_to_page(pfn));
- put_page(mfn_to_page(pfn));
-
+#ifdef __x86_64__
+ /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+ pfn = pagetable_get_pfn(v->arch.guest_table_user);
+ if ( pfn != 0 )
+ {
+ put_page_and_type(mfn_to_page(pfn));
v->arch.guest_table_user = pagetable_null();
}
+#endif
}
if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) )
hvm_relinquish_guest_resources(d);
- shadow_mode_disable(d);
+ /* Tear down shadow mode stuff. */
+ shadow2_teardown(d);
/*
* Relinquish GDT mappings. No need for explicit unmapping of the LDT as
@@ -964,26 +969,23 @@ void domain_relinquish_resources(struct
/* Free page used by xen oprofile buffer */
free_xenoprof_pages(d);
-
}
void arch_dump_domain_info(struct domain *d)
{
- if ( shadow_mode_enabled(d) )
- {
- printk(" shadow mode: ");
- if ( shadow_mode_refcounts(d) )
+ if ( shadow2_mode_enabled(d) )
+ {
+ printk(" shadow2 mode: ");
+ if ( d->arch.shadow2_mode & SHM2_enable )
+ printk("enabled ");
+ if ( shadow2_mode_refcounts(d) )
printk("refcounts ");
- if ( shadow_mode_write_all(d) )
- printk("write_all ");
- if ( shadow_mode_log_dirty(d) )
+ if ( shadow2_mode_log_dirty(d) )
printk("log_dirty ");
- if ( shadow_mode_translate(d) )
+ if ( shadow2_mode_translate(d) )
printk("translate ");
- if ( shadow_mode_external(d) )
+ if ( shadow2_mode_external(d) )
printk("external ");
- if ( shadow_mode_wr_pt_pte(d) )
- printk("wr_pt_pte ");
printk("\n");
}
}
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/domain_build.c Wed Aug 16 17:02:35 2006 +0100
@@ -683,8 +683,11 @@ int construct_dom0(struct domain *d,
for ( i = 1; i < opt_dom0_max_vcpus; i++ )
(void)alloc_vcpu(d, i, i);
- /* Set up monitor table */
- update_pagetables(v);
+ /* Set up CR3 value for write_ptbase */
+ if ( shadow2_mode_enabled(v->domain) )
+ shadow2_update_paging_modes(v);
+ else
+ update_cr3(v);
/* Install the new page tables. */
local_irq_disable();
@@ -796,10 +799,8 @@ int construct_dom0(struct domain *d,
new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
if ( opt_dom0_shadow )
- {
- shadow_mode_enable(d, SHM_enable);
- update_pagetables(v);
- }
+ if ( shadow2_test_enable(d) == 0 )
+ shadow2_update_paging_modes(v);
if ( supervisor_mode_kernel )
{
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/hvm.c Wed Aug 16 17:02:35 2006 +0100
@@ -30,6 +30,7 @@
#include <xen/hypercall.h>
#include <xen/guest_access.h>
#include <xen/event.h>
+#include <xen/shadow.h>
#include <asm/current.h>
#include <asm/e820.h>
#include <asm/io.h>
@@ -42,10 +43,6 @@
#include <asm/spinlock.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <public/version.h>
@@ -61,7 +58,7 @@ static void hvm_zap_mmio_range(
static void hvm_zap_mmio_range(
struct domain *d, unsigned long pfn, unsigned long nr_pfn)
{
- unsigned long i, val = INVALID_MFN;
+ unsigned long i;
ASSERT(d == current->domain);
@@ -70,7 +67,8 @@ static void hvm_zap_mmio_range(
if ( pfn + i >= 0xfffff )
break;
- __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
+ if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) )
+ guest_remove_page(d, pfn + i);
}
}
@@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d
if ( !hvm_guest(v) || (v->vcpu_id != 0) )
return;
+#if 0 /* SHADOW2 does not have this */
if ( shadow_direct_map_init(d) == 0 )
{
printk("Can not allocate shadow direct map for HVM domain.\n");
domain_crash_synchronous();
}
+#endif
hvm_zap_iommu_pages(d);
@@ -380,6 +380,8 @@ void hvm_hlt(unsigned long rflags)
*/
int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
{
+ struct vcpu *v = current;
+ unsigned long gfn;
unsigned long mfn;
char *addr;
int count;
@@ -389,10 +391,9 @@ int hvm_copy(void *buf, unsigned long va
if (count > size)
count = size;
- if (hvm_paging_enabled(current))
- mfn = gva_to_mfn(vaddr);
- else
- mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT);
+ gfn = shadow2_gva_to_gfn(v, vaddr);
+ mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
+
if (mfn == INVALID_MFN)
return 0;
@@ -545,7 +546,7 @@ void hvm_do_hypercall(struct cpu_user_re
return;
}
- if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 )
+ if ( current->arch.shadow2->guest_levels == 4 )
{
pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
pregs->rsi,
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/platform.c
--- a/xen/arch/x86/hvm/platform.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/platform.c Wed Aug 16 17:02:35 2006 +0100
@@ -21,7 +21,7 @@
#include <xen/config.h>
#include <xen/types.h>
#include <xen/mm.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
#include <xen/domain_page.h>
#include <asm/page.h>
#include <xen/event.h>
@@ -35,9 +35,6 @@
#include <xen/lib.h>
#include <xen/sched.h>
#include <asm/current.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
#define DECODE_success 1
#define DECODE_failure 0
@@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *
if (pvalid) {
if (hvm_paging_enabled(current))
- p->u.pdata = (void *) gva_to_gpa(value);
+ p->u.data = shadow2_gva_to_gpa(current, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
@@ -774,7 +771,7 @@ void send_mmio_req(
if (pvalid) {
if (hvm_paging_enabled(v))
- p->u.pdata = (void *) gva_to_gpa(value);
+ p->u.data = shadow2_gva_to_gpa(v, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c Wed Aug 16 17:02:35 2006 +0100
@@ -26,9 +26,10 @@
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/hypercall.h>
+#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/io.h>
-#include <asm/shadow.h>
+#include <asm/shadow2.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -43,10 +44,6 @@
#include <asm/hvm/svm/emulate.h>
#include <asm/hvm/svm/vmmcall.h>
#include <asm/hvm/svm/intr.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
#include <public/sched.h>
#define SVM_EXTRA_DEBUG
@@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v)
return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
}
-static int svm_instruction_length(struct vcpu *v)
+int svm_guest_x86_mode(struct vcpu *v)
{
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
@@ -423,10 +420,20 @@ static int svm_instruction_length(struct
mode = vmcb->cs.attributes.fields.l ? 8 : 4;
else
mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
- return svm_instrlen(guest_cpu_user_regs(), mode);
-}
-
-static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+ return mode;
+}
+
+int svm_instruction_length(struct vcpu *v)
+{
+ return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v));
+}
+
+void svm_update_host_cr3(struct vcpu *v)
+{
+ /* SVM doesn't have a HOST_CR3 equivalent to update. */
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
{
switch ( num )
{
@@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(st
return v->arch.hvm_svm.cpu_cr2;
case 3:
return v->arch.hvm_svm.cpu_cr3;
+ case 4:
+ return v->arch.hvm_svm.cpu_shadow_cr4;
default:
BUG();
}
@@ -524,8 +533,6 @@ static void svm_init_hypercall_page(stru
/* Don't support HYPERVISOR_iret at the moment */
*(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
}
-
-
int svm_dbg_on = 0;
@@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs(
svm_load_cpu_user_regs(v, regs);
}
+int svm_long_mode_enabled(struct vcpu *v)
+{
+ return SVM_LONG_GUEST(v);
+}
+
static void arch_svm_do_launch(struct vcpu *v)
@@ -726,7 +738,6 @@ static void svm_final_setup_guest(struct
static void svm_final_setup_guest(struct vcpu *v)
{
struct domain *d = v->domain;
- struct vcpu *vc;
v->arch.schedule_tail = arch_svm_do_launch;
v->arch.ctxt_switch_from = svm_ctxt_switch_from;
@@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct
if ( v != d->vcpu[0] )
return;
- /* Initialize monitor page table */
- for_each_vcpu( d, vc )
- vc->arch.monitor_table = pagetable_null();
+ if ( !shadow2_mode_external(d) )
+ {
+ DPRINTK("Can't init HVM for dom %u vcpu %u: "
+ "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
+ domain_crash(d);
+ }
/*
* Required to do this once per domain
@@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct
*/
memset(&d->shared_info->evtchn_mask[0], 0xff,
sizeof(d->shared_info->evtchn_mask));
-
- /*
- * Put the domain in shadow mode even though we're going to be using
- * the shared 1:1 page table initially. It shouldn't hurt
- */
- shadow_mode_enable(d, SHM_enable|SHM_refcounts|
- SHM_translate|SHM_external|SHM_wr_pt_pte);
}
@@ -809,9 +816,13 @@ int start_svm(void)
hvm_funcs.realmode = svm_realmode;
hvm_funcs.paging_enabled = svm_paging_enabled;
+ hvm_funcs.long_mode_enabled = svm_long_mode_enabled;
+ hvm_funcs.guest_x86_mode = svm_guest_x86_mode;
hvm_funcs.instruction_length = svm_instruction_length;
hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
+ hvm_funcs.update_host_cr3 = svm_update_host_cr3;
+
hvm_funcs.stts = svm_stts;
hvm_funcs.set_tsc_offset = svm_set_tsc_offset;
@@ -834,7 +845,6 @@ static void svm_relinquish_guest_resourc
continue;
destroy_vmcb(&v->arch.hvm_svm);
- free_monitor_pagetable(v);
kill_timer(&v->arch.hvm_vcpu.hlt_timer);
if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
{
@@ -851,8 +861,6 @@ static void svm_relinquish_guest_resourc
if ( d->arch.hvm_domain.buffered_io_va )
unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
- shadow_direct_map_clean(d);
}
@@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned lo
{
struct vcpu *v = current;
unsigned long eip;
- unsigned long gpa; /* FIXME: PAE */
int result;
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned lo
va, eip, (unsigned long)regs->error_code);
//#endif
- if ( !svm_paging_enabled(v) )
- {
- if ( shadow_direct_map_fault(va, regs) )
- return 1;
-
- handle_mmio(va, va);
- return 1;
- }
-
-
- gpa = gva_to_gpa(va);
-
- /* Use 1:1 page table to identify MMIO address space */
- if (mmio_space(gpa))
- {
- /* No support for APIC */
- if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
- {
- int inst_len;
- inst_len = svm_instruction_length(v);
- if (inst_len == -1)
- {
- printf("%s: INST_LEN - Unable to decode properly\n", __func__);
- domain_crash_synchronous();
- }
-
- __update_guest_eip(vmcb, inst_len);
-
- return 1;
- }
-
- handle_mmio(va, gpa);
-
- return 1;
- }
-
- result = shadow_fault(va, regs);
+ result = shadow2_fault(va, regs);
if( result ) {
/* Let's make sure that the Guest TLB is flushed */
@@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct v
clear_bit(X86_FEATURE_APIC, &edx);
}
-#if CONFIG_PAGING_LEVELS < 3
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
+#endif
+ clear_bit(X86_FEATURE_PAE, &edx);
clear_bit(X86_FEATURE_PSE36, &edx);
-#else
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
- }
-#endif
+
/* Clear out reserved bits. */
ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
@@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct v
clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
#endif
-#if CONFIG_PAGING_LEVELS < 3
- clear_bit(X86_FEATURE_NX & 31, &edx);
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
+#endif
+ clear_bit(X86_FEATURE_PAE, &edx);
clear_bit(X86_FEATURE_PSE36, &edx);
-#else
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
- {
- clear_bit(X86_FEATURE_NX & 31, &edx);
- clear_bit(X86_FEATURE_PAE, &edx);
- }
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
- }
-#endif
/* Make SVM feature invisible to the guest. */
clear_bit(X86_FEATURE_SVME & 31, &ecx);
@@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long val
unsigned long mfn;
int paging_enabled;
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+ unsigned long old_base_mfn;
ASSERT(vmcb);
@@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long val
set_bit(SVM_CPU_STATE_LMA_ENABLED,
&v->arch.hvm_svm.cpu_state);
vmcb->efer |= (EFER_LMA | EFER_LME);
- if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
- }
- else
+ }
#endif /* __x86_64__ */
- {
-#if CONFIG_PAGING_LEVELS >= 3
- /* seems it's a 32-bit or 32-bit PAE guest */
- if ( test_bit(SVM_CPU_STATE_PAE_ENABLED,
- &v->arch.hvm_svm.cpu_state) )
- {
- /* The guest enables PAE first and then it enables PG, it is
- * really a PAE guest */
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
- else
- {
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
- }
-#endif
- }
/* Now arch.guest_table points to machine physical. */
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if ( old_base_mfn )
+ put_page(mfn_to_page(old_base_mfn));
+ shadow2_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
-
- /* arch->shadow_table should hold the next CR3 for shadow */
- HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n",
- v->arch.hvm_svm.cpu_cr3, mfn);
-
- return 1;
}
if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
@@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long val
svm_inject_exception(v, TRAP_gp_fault, 1, 0);
return 0;
}
-
- clear_all_shadow_status( v->domain );
+ shadow2_update_paging_modes(v);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
}
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
/* we should take care of this kind of situation */
- clear_all_shadow_status(v->domain);
+ shadow2_update_paging_modes(v);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
}
return 1;
@@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr,
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow_sync_all(v->domain);
+ shadow2_update_cr3(v);
}
else
{
@@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr,
/*
* arch.shadow_table should now hold the next CR3 for shadow
*/
-#if CONFIG_PAGING_LEVELS >= 3
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
- shadow_sync_all(v->domain);
-#endif
v->arch.hvm_svm.cpu_cr3 = value;
- update_pagetables(v);
+ update_cr3(v);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
- vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
}
break;
}
@@ -1838,12 +1754,6 @@ static int mov_to_cr(int gpreg, int cr,
/* The guest is a 32-bit PAE guest. */
#if CONFIG_PAGING_LEVELS >= 3
unsigned long mfn, old_base_mfn;
-
- if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) ||
@@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr,
domain_crash_synchronous(); /* need to take a clean path */
}
- old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
- if ( old_base_mfn )
- put_page(mfn_to_page(old_base_mfn));
-
/*
* Now arch.guest_table points to machine physical.
*/
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if ( old_base_mfn )
+ put_page(mfn_to_page(old_base_mfn));
+ shadow2_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
- vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
/*
* arch->shadow_table should hold the next CR3 for shadow
@@ -1876,33 +1785,6 @@ static int mov_to_cr(int gpreg, int cr,
HVM_DBG_LOG(DBG_LEVEL_VMMU,
"Update CR3 value = %lx, mfn = %lx",
v->arch.hvm_svm.cpu_cr3, mfn);
-#endif
- }
- else
- {
- /* The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
- if ( (v->domain->arch.ops != NULL) &&
- v->domain->arch.ops->guest_paging_levels == PAGING_L2)
- {
- /* Seems the guest first enables PAE without enabling PG,
- * it must enable PG after that, and it is a 32-bit PAE
- * guest */
-
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3))
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
- else
- {
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4))
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
#endif
}
}
@@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr,
if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
{
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow_sync_all(v->domain);
+ shadow2_update_paging_modes(v);
}
break;
}
@@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlp
/* Overkill, we may not this */
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow_invlpg(v, g_vaddr);
+ shadow2_invlpg(v, g_vaddr);
}
@@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned l
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
unsigned long gpa;
- gpa = gva_to_gpa( gva );
+ gpa = shadow2_gva_to_gpa(current, gva);
printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
if( !svm_paging_enabled(v) || mmio_space(gpa) )
return;
@@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned l
__copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ],
sizeof(gpte) );
printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
- __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
+
+ BUG(); // need to think about this, and convert usage of
+ // phys_to_machine_mapping to use pagetable format...
+ __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
sizeof(spte) );
+
printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
}
#endif /* SVM_WALK_GUEST_PAGES */
@@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struc
if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF)
{
- if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+ if (svm_paging_enabled(v) &&
+ !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
{
printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, "
"gpa=%llx\n", intercepts_counter,
@@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struc
(unsigned long long) vmcb->exitinfo1,
(unsigned long long) vmcb->exitinfo2,
(unsigned long long) vmcb->exitintinfo.bytes,
- (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) );
+ (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
}
else
{
@@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struc
&& ( ( vmcb->exitinfo2 == vmcb->rip )
|| vmcb->exitintinfo.bytes) )
{
- if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+ if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
walk_shadow_and_guest_pt( vmcb->exitinfo2 );
}
#endif
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c Wed Aug 16 17:02:35 2006 +0100
@@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v)
printk("%s: phys_table = %lx\n", __func__, pt);
}
- /* At launch we always use the phys_table */
- vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
+ /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
if (svm_dbg_on)
{
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/vlapic.c Wed Aug 16 17:02:35 2006 +0100
@@ -21,7 +21,8 @@
#include <xen/types.h>
#include <xen/mm.h>
#include <xen/xmalloc.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
+#include <xen/domain_page.h>
#include <asm/page.h>
#include <xen/event.h>
#include <xen/trace.h>
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Wed Aug 16 17:02:35 2006 +0100
@@ -34,12 +34,8 @@
#include <asm/flushtlb.h>
#include <xen/event.h>
#include <xen/kernel.h>
-#include <asm/shadow.h>
#include <xen/keyhandler.h>
-
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
static int vmcs_size;
static int vmcs_order;
@@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu
static void vmx_do_launch(struct vcpu *v)
{
-/* Update CR3, GDT, LDT, TR */
+/* Update CR3, CR0, CR4, GDT, LDT, TR */
unsigned int error = 0;
unsigned long cr0, cr4;
@@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v
error |= __vmwrite(GUEST_TR_BASE, 0);
error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
- __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table));
+ shadow2_update_paging_modes(v);
+ printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
+ __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+ __vmwrite(HOST_CR3, v->arch.cr3);
v->arch.schedule_tail = arch_vmx_do_resume;
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Aug 16 17:02:35 2006 +0100
@@ -26,9 +26,9 @@
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/hypercall.h>
+#include <xen/perfc.h>
#include <asm/current.h>
#include <asm/io.h>
-#include <asm/shadow.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -40,10 +40,7 @@
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/vmx/cpu.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <asm/hvm/vpic.h>
@@ -69,11 +66,16 @@ static int vmx_initialize_guest_resource
if ( v->vcpu_id != 0 )
return 1;
+ if ( !shadow2_mode_external(d) )
+ {
+ DPRINTK("Can't init HVM for dom %u vcpu %u: "
+ "not in shadow2 external mode\n",
+ d->domain_id, v->vcpu_id);
+ domain_crash(d);
+ }
+
for_each_vcpu ( d, vc )
{
- /* Initialize monitor page table */
- vc->arch.monitor_table = pagetable_null();
-
memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
if ( (rc = vmx_create_vmcs(vc)) != 0 )
@@ -107,6 +109,7 @@ static int vmx_initialize_guest_resource
vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
+
}
/*
@@ -116,11 +119,6 @@ static int vmx_initialize_guest_resource
memset(&d->shared_info->evtchn_mask[0], 0xff,
sizeof(d->shared_info->evtchn_mask));
- /* Put the domain in shadow mode even though we're going to be using
- * the shared 1:1 page table initially. It shouldn't hurt */
- shadow_mode_enable(
- d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte);
-
return 1;
}
@@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resourc
vmx_destroy_vmcs(v);
if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
continue;
- free_monitor_pagetable(v);
kill_timer(&v->arch.hvm_vcpu.hlt_timer);
if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
{
@@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resourc
if ( d->arch.hvm_domain.buffered_io_va )
unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
- shadow_direct_map_clean(d);
}
#ifdef __x86_64__
@@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(stru
vmx_vmcs_exit(v);
}
-static int vmx_realmode(struct vcpu *v)
-{
- unsigned long rflags;
-
- __vmread(GUEST_RFLAGS, &rflags);
- return rflags & X86_EFLAGS_VM;
-}
-
static int vmx_instruction_length(struct vcpu *v)
{
unsigned long inst_len;
@@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(st
return v->arch.hvm_vmx.cpu_cr2;
case 3:
return v->arch.hvm_vmx.cpu_cr3;
+ case 4:
+ return v->arch.hvm_vmx.cpu_shadow_cr4;
default:
BUG();
}
@@ -753,8 +742,12 @@ static void vmx_setup_hvm_funcs(void)
hvm_funcs.realmode = vmx_realmode;
hvm_funcs.paging_enabled = vmx_paging_enabled;
+ hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
+ hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
hvm_funcs.instruction_length = vmx_instruction_length;
hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
+
+ hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
hvm_funcs.stts = vmx_stts;
hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
@@ -855,53 +848,25 @@ static void inline __update_guest_eip(un
__vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
}
-
static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
{
- unsigned long gpa; /* FIXME: PAE */
int result;
#if 0 /* keep for debugging */
{
- unsigned long eip;
-
+ unsigned long eip, cs;
+
+ __vmread(GUEST_CS_BASE, &cs);
__vmread(GUEST_RIP, &eip);
HVM_DBG_LOG(DBG_LEVEL_VMMU,
- "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
- va, eip, (unsigned long)regs->error_code);
+ "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
+ "eip = %lx, error_code = %lx\n",
+ va, cs, eip, (unsigned long)regs->error_code);
}
#endif
- if ( !vmx_paging_enabled(current) )
- {
- /* construct 1-to-1 direct mapping */
- if ( shadow_direct_map_fault(va, regs) )
- return 1;
-
- handle_mmio(va, va);
- TRACE_VMEXIT (2,2);
- return 1;
- }
- gpa = gva_to_gpa(va);
-
- /* Use 1:1 page table to identify MMIO address space */
- if ( mmio_space(gpa) ){
- struct vcpu *v = current;
- /* No support for APIC */
- if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) {
- u32 inst_len;
- __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
- __update_guest_eip(inst_len);
- return 1;
- }
- TRACE_VMEXIT (2,2);
- /* in the case of MMIO, we are more interested in gpa than in va */
- TRACE_VMEXIT (4,gpa);
- handle_mmio(va, gpa);
- return 1;
- }
-
- result = shadow_fault(va, regs);
+ result = shadow2_fault(va, regs);
+
TRACE_VMEXIT (2,result);
#if 0
if ( !result )
@@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct c
clear_bit(X86_FEATURE_APIC, &edx);
}
-#if CONFIG_PAGING_LEVELS < 3
- edx &= ~(bitmaskof(X86_FEATURE_PAE) |
- bitmaskof(X86_FEATURE_PSE) |
- bitmaskof(X86_FEATURE_PSE36));
-#else
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
- clear_bit(X86_FEATURE_PSE36, &edx);
- else
- {
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
- }
- }
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
+ clear_bit(X86_FEATURE_PAE, &edx);
+ clear_bit(X86_FEATURE_PSE36, &edx);
ebx &= NUM_THREADS_RESET_MASK;
@@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne
* We do the safest things first, then try to update the shadow
* copying from guest
*/
- shadow_invlpg(v, va);
+ shadow2_invlpg(v, va);
}
@@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct
error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
- if (!vmx_paging_enabled(v)) {
- HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+ if (!vmx_paging_enabled(v))
goto skip_cr3;
- }
if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
/*
@@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct
domain_crash_synchronous();
return 0;
}
- shadow_sync_all(v->domain);
} else {
/*
* If different, make a shadow. Check if the PDBR is valid
@@ -1348,12 +1297,16 @@ vmx_world_restore(struct vcpu *v, struct
* arch.shadow_table should now hold the next CR3 for shadow
*/
v->arch.hvm_vmx.cpu_cr3 = c->cr3;
- update_pagetables(v);
+ }
+
+ skip_cr3:
+
+ shadow2_update_paging_modes(v);
+ if (!vmx_paging_enabled(v))
+ HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+ else
HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
- }
-
- skip_cr3:
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
error |= __vmread(CR4_READ_SHADOW, &old_cr4);
error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
@@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long val
int paging_enabled;
unsigned long vm_entry_value;
unsigned long old_cr0;
+ unsigned long old_base_mfn;
/*
* CR0: We don't want to lose PE and PG.
@@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long val
v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
!get_page(mfn_to_page(mfn), v->domain) )
{
- printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
+ printk("Invalid CR3 value = %lx (mfn=%lx)\n",
+ v->arch.hvm_vmx.cpu_cr3, mfn);
domain_crash_synchronous(); /* need to take a clean path */
}
@@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long val
__vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
__vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
- }
- else
-#endif /* __x86_64__ */
- {
-#if CONFIG_PAGING_LEVELS >= 3
- /* seems it's a 32-bit or 32-bit PAE guest */
-
- if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
- &v->arch.hvm_vmx.cpu_state) )
- {
- /* The guest enables PAE first and then it enables PG, it is
- * really a PAE guest */
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
- else
- {
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
- }
+ }
#endif
- }
/*
* Now arch.guest_table points to machine physical.
*/
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if (old_base_mfn)
+ put_page(mfn_to_page(old_base_mfn));
+ shadow2_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
/*
* arch->shadow_table should hold the next CR3 for shadow
*/
@@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long val
}
}
- clear_all_shadow_status(v->domain);
if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
__vmread(GUEST_RIP, &eip);
@@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long val
}
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
- /* we should take care of this kind of situation */
- clear_all_shadow_status(v->domain);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+ shadow2_update_paging_modes(v);
}
return 1;
@@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow_sync_all(v->domain);
+ shadow2_update_cr3(v);
} else {
/*
* If different, make a shadow. Check if the PDBR is valid
@@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, str
/*
* arch.shadow_table should now hold the next CR3 for shadow
*/
-#if CONFIG_PAGING_LEVELS >= 3
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
- shadow_sync_all(v->domain);
-#endif
-
v->arch.hvm_vmx.cpu_cr3 = value;
- update_pagetables(v);
+ update_cr3(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
value);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
}
break;
}
@@ -1785,12 +1704,6 @@ static int mov_to_cr(int gp, int cr, str
/* The guest is a 32-bit PAE guest. */
#if CONFIG_PAGING_LEVELS >= 3
unsigned long mfn, old_base_mfn;
-
- if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
@@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, str
domain_crash_synchronous(); /* need to take a clean path */
}
- old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
- if ( old_base_mfn )
- put_page(mfn_to_page(old_base_mfn));
/*
* Now arch.guest_table points to machine physical.
*/
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if ( old_base_mfn )
+ put_page(mfn_to_page(old_base_mfn));
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
-            __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
/*
* arch->shadow_table should hold the next CR3 for shadow
@@ -1822,27 +1734,6 @@ static int mov_to_cr(int gp, int cr, str
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
v->arch.hvm_vmx.cpu_cr3, mfn);
-#endif
- }
- else
- {
- /* The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
- if ( (v->domain->arch.ops != NULL) &&
- v->domain->arch.ops->guest_paging_levels == PAGING_L2)
- {
- /* Seems the guest first enables PAE without enabling PG,
- * it must enable PG after that, and it is a 32-bit PAE
- * guest */
-
- if ( !shadow_set_guest_paging_levels(v->domain,
- PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- /* need to take a clean path */
- domain_crash_synchronous();
- }
- }
#endif
}
}
@@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, str
* all TLB entries except global entries.
*/
if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- shadow_sync_all(v->domain);
-
+ shadow2_update_paging_modes(v);
break;
}
default:
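
The CR0/CR3 paths above repeatedly use the same ordering when retargeting
v->arch.guest_table: read the old base MFN, install the new one, then drop the
reference on the old page, so the table is never unreferenced while still
reachable. A minimal stand-alone sketch of that pattern (the struct and helpers
below are illustrative stand-ins, not Xen's types):

    /* Reference-safe base-pointer switch: remember old, install new, put old. */
    struct page { unsigned long refs; };

    static void get_ref(struct page *p) { if (p) p->refs++; }
    static void put_ref(struct page *p) { if (p) p->refs--; }

    static void switch_base(struct page **slot, struct page *new_page)
    {
        struct page *old = *slot;   /* remember the old base first */
        get_ref(new_page);          /* new base is referenced before it is visible */
        *slot = new_page;           /* install the new base */
        if (old)
            put_ref(old);           /* only now may the old base be dropped */
    }

    int main(void)
    {
        struct page a = { 1 }, b = { 0 };
        struct page *base = &a;
        switch_base(&base, &b);     /* base -> &b; a.refs == 0, b.refs == 1 */
        return (a.refs + b.refs == 1) ? 0 : 1;
    }
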
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/mm.c Wed Aug 16 17:02:35 2006 +0100
@@ -137,7 +137,7 @@ static void free_l1_table(struct page_in
static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
unsigned long type);
-static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
+static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
/* Used to defer flushing of memory structures. */
struct percpu_mm_info {
@@ -274,9 +274,9 @@ void share_xen_page_with_privileged_gues
#else
/*
* In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
- * We cannot safely shadow the idle page table, nor shadow-mode page tables
+ * We cannot safely shadow the idle page table, nor shadow (v1) page tables
* (detected by lack of an owning domain). As required for correctness, we
- * always shadow PDPTs aboive 4GB.
+ * always shadow PDPTs above 4GB.
*/
#define l3tab_needs_shadow(mfn) \
(((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
@@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_addre
}
__initcall(cache_pae_fixmap_address);
-static void __write_ptbase(unsigned long mfn)
+static DEFINE_PER_CPU(u32, make_cr3_timestamp);
+
+void make_cr3(struct vcpu *v, unsigned long mfn)
+/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
+ * necessary, and sets v->arch.cr3 to the value to load in CR3. */
{
l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
-    struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
+ struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
unsigned int cpu = smp_processor_id();
- /* Fast path 1: does this mfn need a shadow at all? */
+ /* Fast path: does this mfn need a shadow at all? */
if ( !l3tab_needs_shadow(mfn) )
{
- write_cr3(mfn << PAGE_SHIFT);
- /* Cache is no longer in use or valid (/after/ write to %cr3). */
+ v->arch.cr3 = mfn << PAGE_SHIFT;
+ /* Cache is no longer in use or valid */
cache->high_mfn = 0;
return;
}
@@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long
/* Caching logic is not interrupt safe. */
ASSERT(!in_irq());
- /* Fast path 2: is this mfn already cached? */
- if ( cache->high_mfn == mfn )
- {
- write_cr3(__pa(cache->table[cache->inuse_idx]));
- return;
- }
-
/* Protects against pae_flush_pgd(). */
spin_lock(&cache->lock);
@@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long
/* Map the guest L3 table and copy to the chosen low-memory cache. */
*(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
+ /* First check the previous high mapping can't be in the TLB.
+ * (i.e. have we loaded CR3 since we last did this?) */
+ if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
+ local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
lowmem_l3tab = cache->table[cache->inuse_idx];
memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
*(fix_pae_highmem_pl1e - cpu) = l1e_empty();
-
- /* Install the low-memory L3 table in CR3. */
- write_cr3(__pa(lowmem_l3tab));
+ this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
+
+ v->arch.cr3 = __pa(lowmem_l3tab);
spin_unlock(&cache->lock);
}
#else /* !CONFIG_X86_PAE */
-static void __write_ptbase(unsigned long mfn)
-{
- write_cr3(mfn << PAGE_SHIFT);
+void make_cr3(struct vcpu *v, unsigned long mfn)
+{
+ v->arch.cr3 = mfn << PAGE_SHIFT;
}
#endif /* !CONFIG_X86_PAE */
void write_ptbase(struct vcpu *v)
{
- __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
+ write_cr3(v->arch.cr3);
}
void invalidate_shadow_ldt(struct vcpu *v)
@@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off
BUG_ON(unlikely(in_irq()));
- shadow_sync_va(v, gva);
-
TOGGLE_MODE();
__copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
sizeof(l1e));
@@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- if ( !res && unlikely(shadow_mode_refcounts(d)) )
- {
- shadow_lock(d);
- shadow_remove_all_write_access(d, gmfn, mfn);
+ if ( !res && unlikely(shadow2_mode_refcounts(d)) )
+ {
+ shadow2_lock(d);
+ shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- shadow_unlock(d);
+ shadow2_unlock(d);
}
if ( unlikely(!res) )
@@ -513,7 +512,7 @@ get_linear_pagetable(
struct page_info *page;
unsigned long pfn;
- ASSERT( !shadow_mode_refcounts(d) );
+ ASSERT( !shadow2_mode_refcounts(d) );
if ( (root_get_flags(re) & _PAGE_RW) )
{
@@ -576,7 +575,8 @@ get_page_from_l1e(
if ( !iomem_access_permitted(d, mfn, mfn) )
{
- MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
+ MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
+ d->domain_id, mfn);
return 0;
}
@@ -587,9 +587,14 @@ get_page_from_l1e(
d = dom_io;
}
- okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
- get_page_and_type(page, d, PGT_writable_page) :
- get_page(page, d));
+ /* Foreign mappings into guests in shadow2 external mode don't
+ * contribute to writeable mapping refcounts. (This allows the
+ * qemu-dm helper process in dom0 to map the domain's memory without
+ * messing up the count of "real" writable mappings.) */
+ okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
+ !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
+ ? get_page_and_type(page, d, PGT_writable_page)
+ : get_page(page, d));
if ( !okay )
{
MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -609,8 +614,6 @@ get_page_from_l2e(
struct domain *d, unsigned long vaddr)
{
int rc;
-
- ASSERT(!shadow_mode_refcounts(d));
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
return 1;
@@ -641,8 +644,6 @@ get_page_from_l3e(
{
int rc;
- ASSERT(!shadow_mode_refcounts(d));
-
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
return 1;
@@ -668,8 +669,6 @@ get_page_from_l4e(
struct domain *d, unsigned long vaddr)
{
int rc;
-
- ASSERT( !shadow_mode_refcounts(d) );
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
return 1;
@@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e,
domain_crash(d);
}
- if ( l1e_get_flags(l1e) & _PAGE_RW )
+ /* Remember we didn't take a type-count of foreign writable mappings
+ * to shadow2 external domains */
+ if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
+ !(unlikely((e != d) && shadow2_mode_external(e))) )
{
put_page_and_type(page);
}
@@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_in
l1_pgentry_t *pl1e;
int i;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
pl1e = map_domain_page(pfn);
@@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pg
* 2. Cannot appear in another page table's L3:
* a. alloc_l3_table() calls this function and this check will fail
* b. mod_l3_entry() disallows updates to slot 3 in an existing table
+ *
+ * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
*/
page = l3e_get_page(l3e3);
BUG_ON(page->u.inuse.type_info & PGT_pinned);
@@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_in
l2_pgentry_t *pl2e;
int i;
- /* See the code in shadow_promote() to understand why this is here. */
- if ( (PGT_base_page_table == PGT_l2_page_table) &&
- unlikely(shadow_mode_refcounts(d)) )
- return 1;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
pl2e = map_domain_page(pfn);
@@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_in
l3_pgentry_t *pl3e;
int i;
- /* See the code in shadow_promote() to understand why this is here. */
- if ( (PGT_base_page_table == PGT_l3_page_table) &&
- shadow_mode_refcounts(d) )
- return 1;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
#ifdef CONFIG_X86_PAE
/*
@@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_in
unsigned long vaddr;
int i;
- /* See the code in shadow_promote() to understand why this is here. */
- if ( (PGT_base_page_table == PGT_l4_page_table) &&
- shadow_mode_refcounts(d) )
- return 1;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
{
@@ -1183,51 +1175,61 @@ static void free_l4_table(struct page_in
static inline int update_l1e(l1_pgentry_t *pl1e,
l1_pgentry_t ol1e,
- l1_pgentry_t nl1e)
-{
+ l1_pgentry_t nl1e,
+ unsigned long gl1mfn,
+ struct vcpu *v)
+{
+ int rv = 1;
+ if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ shadow2_lock(v->domain);
#ifndef PTE_UPDATE_WITH_CMPXCHG
- return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
+ rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
#else
- intpte_t o = l1e_get_intpte(ol1e);
- intpte_t n = l1e_get_intpte(nl1e);
-
- for ( ; ; )
- {
- if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
- {
- MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
- ": saw %" PRIpte,
- l1e_get_intpte(ol1e),
- l1e_get_intpte(nl1e),
- o);
- return 0;
- }
-
- if ( o == l1e_get_intpte(ol1e) )
- break;
-
- /* Allowed to change in Accessed/Dirty flags only. */
- BUG_ON((o ^ l1e_get_intpte(ol1e)) &
- ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
- ol1e = l1e_from_intpte(o);
- }
-
- return 1;
+ {
+ intpte_t o = l1e_get_intpte(ol1e);
+ intpte_t n = l1e_get_intpte(nl1e);
+
+ for ( ; ; )
+ {
+ if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
+ {
+ MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
+ ": saw %" PRIpte,
+ l1e_get_intpte(ol1e),
+ l1e_get_intpte(nl1e),
+ o);
+ rv = 0;
+ break;
+ }
+
+ if ( o == l1e_get_intpte(ol1e) )
+ break;
+
+ /* Allowed to change in Accessed/Dirty flags only. */
+ BUG_ON((o ^ l1e_get_intpte(ol1e)) &
+ ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
+ ol1e = l1e_from_intpte(o);
+ }
+ }
#endif
+ if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ {
+ shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
+ shadow2_unlock(v->domain);
+ }
+ return rv;
}
/* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
+ unsigned long gl1mfn)
{
l1_pgentry_t ol1e;
struct domain *d = current->domain;
if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
return 0;
-
- if ( unlikely(shadow_mode_refcounts(d)) )
- return update_l1e(pl1e, ol1e, nl1e);
if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
{
@@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl
}
/* Fast path for identical mapping, r/w and presence. */
- if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
- return update_l1e(pl1e, ol1e, nl1e);
+ if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
+ return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
return 0;
- if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
{
put_page_from_l1e(nl1e, d);
return 0;
@@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
}
else
{
- if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
return 0;
}
@@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl
}
#ifndef PTE_UPDATE_WITH_CMPXCHG
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
#else
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ \
for ( ; ; ) \
{ \
intpte_t __o = cmpxchg((intpte_t *)(_p), \
@@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl
} \
1; })
#endif
+#define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \
+ int rv; \
+ if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
+ shadow2_lock(current->domain); \
+ rv = _UPDATE_ENTRY(_t, _p, _o, _n); \
+ if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
+ { \
+ shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \
+ shadow2_unlock(current->domain); \
+ } \
+ rv; \
+})
/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
static int mod_l2_entry(l2_pgentry_t *pl2e,
@@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl
/* Fast path for identical mapping and presence. */
if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
- return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
+ return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
return 0;
- if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+ if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
{
put_page_from_l2e(nl2e, pfn);
return 0;
}
}
- else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+ else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
{
return 0;
}
@@ -1329,7 +1343,6 @@ static int mod_l2_entry(l2_pgentry_t *pl
put_page_from_l2e(ol2e, pfn);
return 1;
}
-
#if CONFIG_PAGING_LEVELS >= 3
@@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
*/
if ( pgentry_ptr_to_slot(pl3e) >= 3 )
return 0;
-#endif
+#endif
if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
return 0;
@@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
/* Fast path for identical mapping and presence. */
if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
- return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
+ return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
#if CONFIG_PAGING_LEVELS >= 4
if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
@@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl
<< L3_PAGETABLE_SHIFT;
if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
return 0;
-#endif
-
- if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+#endif
+
+ if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
{
put_page_from_l3e(nl3e, pfn);
return 0;
}
}
- else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+ else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
{
return 0;
}
@@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl
/* Fast path for identical mapping and presence. */
if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
- return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
+ return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
return 0;
- if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+ if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
{
put_page_from_l4e(nl4e, pfn);
return 0;
}
}
- else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+ else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
{
return 0;
}
@@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *pa
*/
this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
- if ( unlikely(shadow_mode_enabled(owner)) )
+ if ( unlikely(shadow2_mode_enabled(owner)
+ && !shadow2_lock_is_acquired(owner)) )
{
/* Raw page tables are rewritten during save/restore. */
- if ( !shadow_mode_translate(owner) )
+ if ( !shadow2_mode_translate(owner) )
mark_dirty(owner, page_to_mfn(page));
- if ( shadow_mode_refcounts(owner) )
+ if ( shadow2_mode_refcounts(owner) )
return;
gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
ASSERT(VALID_M2P(gmfn));
- remove_shadow(owner, gmfn, type & PGT_type_mask);
+ shadow2_lock(owner);
+ shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+ shadow2_unlock(owner);
}
}
@@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *pag
if ( unlikely((nx & PGT_count_mask) == 0) )
{
- /* Record TLB information for flush later. Races are harmless. */
- page->tlbflush_timestamp = tlbflush_current_time();
-
if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
likely(nx & PGT_validated) )
{
@@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *pag
x &= ~PGT_validated;
nx &= ~PGT_validated;
}
+
+ /* Record TLB information for flush later. */
+ page->tlbflush_timestamp = tlbflush_current_time();
}
else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
(PGT_pinned|PGT_l1_page_table|1)) )
@@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page
#endif
/* Fixme: add code to propagate va_unknown to subtables. */
if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
- !shadow_mode_refcounts(page_get_owner(page)) )
+ !shadow2_mode_refcounts(page_get_owner(page)) )
return 0;
/* This table is possibly mapped at multiple locations. */
nx &= ~PGT_va_mask;
@@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn)
int okay;
unsigned long old_base_mfn;
- if ( shadow_mode_refcounts(d) )
+ if ( hvm_guest(v) && !hvm_paging_enabled(v) )
+ domain_crash_synchronous();
+
+ if ( shadow2_mode_refcounts(d) )
{
okay = get_page_from_pagenr(mfn, d);
if ( unlikely(!okay) )
@@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn)
MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_null();
- update_pagetables(v);
+ update_cr3(v);
write_cr3(__pa(idle_pg_table));
if ( old_base_mfn != 0 )
put_page_and_type(mfn_to_page(old_base_mfn));
@@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn)
invalidate_shadow_ldt(v);
old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v); /* update shadow_table and monitor_table */
+ update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
write_ptbase(v);
if ( likely(old_base_mfn != 0) )
{
- if ( shadow_mode_refcounts(d) )
+ if ( shadow2_mode_refcounts(d) )
put_page(mfn_to_page(old_base_mfn));
else
put_page_and_type(mfn_to_page(old_base_mfn));
}
- /* CR3 also holds a ref to its shadow... */
- if ( shadow_mode_enabled(d) )
- {
- if ( v->arch.monitor_shadow_ref )
- put_shadow_ref(v->arch.monitor_shadow_ref);
- v->arch.monitor_shadow_ref =
- pagetable_get_pfn(v->arch.monitor_table);
- ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
- get_shadow_ref(v->arch.monitor_shadow_ref);
- }
-
return 1;
}
@@ -1807,8 +1816,6 @@ static void process_deferred_ops(void)
if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
{
- if ( shadow_mode_enabled(d) )
- shadow_sync_all(d);
if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
flush_tlb_mask(d->domain_dirty_cpumask);
else
@@ -1974,7 +1981,7 @@ int do_mmuext_op(
type = PGT_root_page_table;
pin_page:
- if ( shadow_mode_refcounts(FOREIGNDOM) )
+ if ( shadow2_mode_refcounts(FOREIGNDOM) )
break;
okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
@@ -1996,7 +2003,7 @@ int do_mmuext_op(
break;
case MMUEXT_UNPIN_TABLE:
- if ( shadow_mode_refcounts(d) )
+ if ( shadow2_mode_refcounts(d) )
break;
if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
@@ -2009,6 +2016,12 @@ int do_mmuext_op(
{
put_page_and_type(page);
put_page(page);
+ if ( shadow2_mode_enabled(d) )
+ {
+ shadow2_lock(d);
+ shadow2_remove_all_shadows(v, _mfn(mfn));
+ shadow2_unlock(d);
+ }
}
else
{
@@ -2050,9 +2063,9 @@ int do_mmuext_op(
break;
case MMUEXT_INVLPG_LOCAL:
- if ( shadow_mode_enabled(d) )
- shadow_invlpg(v, op.arg1.linear_addr);
- local_flush_tlb_one(op.arg1.linear_addr);
+ if ( !shadow2_mode_enabled(d)
+ || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
+ local_flush_tlb_one(op.arg1.linear_addr);
break;
case MMUEXT_TLB_FLUSH_MULTI:
@@ -2098,7 +2111,7 @@ int do_mmuext_op(
unsigned long ptr = op.arg1.linear_addr;
unsigned long ents = op.arg2.nr_ents;
- if ( shadow_mode_external(d) )
+ if ( shadow2_mode_external(d) )
{
MEM_LOG("ignoring SET_LDT hypercall from external "
"domain %u", d->domain_id);
@@ -2171,9 +2184,6 @@ int do_mmu_update(
LOCK_BIGLOCK(d);
- if ( unlikely(shadow_mode_enabled(d)) )
- check_pagetable(v, "pre-mmu"); /* debug */
-
if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
{
count &= ~MMU_UPDATE_PREEMPTED;
@@ -2248,7 +2258,12 @@ int do_mmu_update(
case PGT_l3_page_table:
case PGT_l4_page_table:
{
- ASSERT(!shadow_mode_refcounts(d));
+ if ( shadow2_mode_refcounts(d) )
+ {
+                DPRINTK("mmu update on shadow-refcounted domain!\n");
+ break;
+ }
+
if ( unlikely(!get_page_type(
page, type_info & (PGT_type_mask|PGT_va_mask))) )
goto not_a_pt;
@@ -2258,10 +2273,7 @@ int do_mmu_update(
case PGT_l1_page_table:
{
l1_pgentry_t l1e = l1e_from_intpte(req.val);
- okay = mod_l1_entry(va, l1e);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l1_normal_pt_update(
- d, req.ptr, l1e, &sh_mapcache);
+ okay = mod_l1_entry(va, l1e, mfn);
}
break;
case PGT_l2_page_table:
@@ -2269,9 +2281,6 @@ int do_mmu_update(
l2_pgentry_t l2e = l2e_from_intpte(req.val);
okay = mod_l2_entry(
(l2_pgentry_t *)va, l2e, mfn, type_info);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l2_normal_pt_update(
- d, req.ptr, l2e, &sh_mapcache);
}
break;
#if CONFIG_PAGING_LEVELS >= 3
@@ -2279,9 +2288,6 @@ int do_mmu_update(
{
l3_pgentry_t l3e = l3e_from_intpte(req.val);
okay = mod_l3_entry(va, l3e, mfn, type_info);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l3_normal_pt_update(
- d, req.ptr, l3e, &sh_mapcache);
}
break;
#endif
@@ -2290,9 +2296,6 @@ int do_mmu_update(
{
l4_pgentry_t l4e = l4e_from_intpte(req.val);
okay = mod_l4_entry(va, l4e, mfn, type_info);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l4_normal_pt_update(
- d, req.ptr, l4e, &sh_mapcache);
}
break;
#endif
@@ -2308,19 +2311,17 @@ int do_mmu_update(
if ( unlikely(!get_page_type(page, PGT_writable_page)) )
break;
- if ( shadow_mode_enabled(d) )
- {
- shadow_lock(d);
- __mark_dirty(d, mfn);
- if ( page_is_page_table(page) && !page_out_of_sync(page) )
- shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
- }
+ if ( unlikely(shadow2_mode_enabled(d)) )
+ shadow2_lock(d);
*(intpte_t *)va = req.val;
okay = 1;
- if ( shadow_mode_enabled(d) )
- shadow_unlock(d);
+ if ( unlikely(shadow2_mode_enabled(d)) )
+ {
+ shadow2_validate_guest_entry(v, _mfn(mfn), va);
+ shadow2_unlock(d);
+ }
put_page_type(page);
}
@@ -2333,12 +2334,6 @@ int do_mmu_update(
break;
case MMU_MACHPHYS_UPDATE:
-
- if ( shadow_mode_translate(FOREIGNDOM) )
- {
- MEM_LOG("can't mutate m2p table of translate mode guest");
- break;
- }
mfn = req.ptr >> PAGE_SHIFT;
gpfn = req.val;
@@ -2349,9 +2344,13 @@ int do_mmu_update(
break;
}
- set_gpfn_from_mfn(mfn, gpfn);
+ if ( shadow2_mode_translate(FOREIGNDOM) )
+ shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
+ else
+ set_gpfn_from_mfn(mfn, gpfn);
okay = 1;
+ // Mark the new gfn dirty...
mark_dirty(FOREIGNDOM, mfn);
put_page(mfn_to_page(mfn));
@@ -2381,9 +2380,6 @@ int do_mmu_update(
done += i;
if ( unlikely(!guest_handle_is_null(pdone)) )
copy_to_guest(pdone, &done, 1);
-
- if ( unlikely(shadow_mode_enabled(d)) )
- check_pagetable(v, "post-mmu"); /* debug */
UNLOCK_BIGLOCK(d);
return rc;
@@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping(
struct domain *d = v->domain;
ASSERT(spin_is_locked(&d->big_lock));
- ASSERT(!shadow_mode_refcounts(d));
gmfn = pte_addr >> PAGE_SHIFT;
mfn = gmfn_to_mfn(d, gmfn);
@@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping(
page = mfn_to_page(mfn);
type_info = page->u.inuse.type_info;
- if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
+ if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
!get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
{
MEM_LOG("Grant map attempted to update a non-L1 page");
@@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping(
}
ol1e = *(l1_pgentry_t *)va;
- if ( !update_l1e(va, ol1e, _nl1e) )
+ if ( !update_l1e(va, ol1e, _nl1e, mfn, v) )
{
put_page_type(page);
rc = GNTST_general_error;
goto failed;
}
- put_page_from_l1e(ol1e, d);
-
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- struct domain_mmap_cache sh_mapcache;
- domain_mmap_cache_init(&sh_mapcache);
- shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
- domain_mmap_cache_destroy(&sh_mapcache);
- }
+ if ( !shadow2_mode_refcounts(d) )
+ put_page_from_l1e(ol1e, d);
put_page_type(page);
failed:
unmap_domain_page(va);
put_page(page);
+
return rc;
}
@@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping(
u32 type_info;
l1_pgentry_t ol1e;
- ASSERT(!shadow_mode_refcounts(d));
-
gmfn = addr >> PAGE_SHIFT;
mfn = gmfn_to_mfn(d, gmfn);
@@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping(
}
/* Delete pagetable entry. */
- if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
+ if ( unlikely(!update_l1e(
+ (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
+ d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
{
MEM_LOG("Cannot delete PTE entry at %p", va);
put_page_type(page);
@@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping(
goto failed;
}
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- struct domain_mmap_cache sh_mapcache;
- domain_mmap_cache_init(&sh_mapcache);
- shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
- domain_mmap_cache_destroy(&sh_mapcache);
- }
-
put_page_type(page);
failed:
@@ -2536,31 +2517,22 @@ static int create_grant_va_mapping(
struct domain *d = v->domain;
ASSERT(spin_is_locked(&d->big_lock));
- ASSERT(!shadow_mode_refcounts(d));
-
- /*
- * This is actually overkill - we don't need to sync the L1 itself,
- * just everything involved in getting to this L1 (i.e. we need
- * linear_pg_table[l1_linear_offset(va)] to be in sync)...
- */
- __shadow_sync_va(v, va);
pl1e = &linear_pg_table[l1_linear_offset(va)];
if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
- !update_l1e(pl1e, ol1e, _nl1e) )
+ !update_l1e(pl1e, ol1e, _nl1e,
+ l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
return GNTST_general_error;
- put_page_from_l1e(ol1e, d);
-
- if ( unlikely(shadow_mode_enabled(d)) )
- shadow_do_update_va_mapping(va, _nl1e, v);
+ if ( !shadow2_mode_refcounts(d) )
+ put_page_from_l1e(ol1e, d);
return GNTST_okay;
}
static int destroy_grant_va_mapping(
- unsigned long addr, unsigned long frame)
+ unsigned long addr, unsigned long frame, struct domain *d)
{
l1_pgentry_t *pl1e, ol1e;
@@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping(
}
/* Delete pagetable entry. */
- if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
+ if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(),
+ l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
+ d->vcpu[0] /* Change for per-vcpu shadows */)) )
{
MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
return GNTST_general_error;
}
-
+
return 0;
}
@@ -2597,7 +2571,7 @@ int create_grant_host_mapping(
unsigned long addr, unsigned long frame, unsigned int flags)
{
l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
-
+
if ( (flags & GNTMAP_application_map) )
l1e_add_flags(pte,_PAGE_USER);
if ( !(flags & GNTMAP_readonly) )
@@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping(
{
if ( flags & GNTMAP_contains_pte )
return destroy_grant_pte_mapping(addr, frame, current->domain);
- return destroy_grant_va_mapping(addr, frame);
+ return destroy_grant_va_mapping(addr, frame, current->domain);
}
int steal_page(
@@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long v
perfc_incrc(calls_to_update_va);
- if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
+ if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
return -EINVAL;
+ if ( unlikely(shadow2_mode_refcounts(d)) )
+ {
+        DPRINTK("Update-VA-mapping on a shadow-refcounted domain\n");
+ return -EINVAL;
+ }
+
LOCK_BIGLOCK(d);
- if ( unlikely(shadow_mode_enabled(d)) )
- check_pagetable(v, "pre-va"); /* debug */
-
- if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
- val)) )
- rc = -EINVAL;
-
- if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
+ if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
{
if ( unlikely(this_cpu(percpu_mm_info).foreign &&
- (shadow_mode_translate(d) ||
- shadow_mode_translate(
+ (shadow2_mode_translate(d) ||
+ shadow2_mode_translate(
this_cpu(percpu_mm_info).foreign))) )
{
/*
* The foreign domain's pfn's are in a different namespace. There's
- * not enough information in just a gpte to figure out how to
+ * not enough information in just a gpte to figure out how to
* (re-)shadow this entry.
*/
domain_crash(d);
}
+ }
+
+ if ( unlikely(!mod_l1_entry(
+ &linear_pg_table[l1_linear_offset(va)], val,
+ l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
+ rc = -EINVAL;
- rc = shadow_do_update_va_mapping(va, val, v);
-
- check_pagetable(v, "post-va"); /* debug */
- }
-
switch ( flags & UVMF_FLUSHTYPE_MASK )
{
case UVMF_TLB_FLUSH:
switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
{
case UVMF_LOCAL:
- if ( unlikely(shadow_mode_enabled(d)) )
- shadow_sync_all(d);
local_flush_tlb();
break;
case UVMF_ALL:
@@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long v
switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
{
case UVMF_LOCAL:
- if ( unlikely(shadow_mode_enabled(d)) )
- shadow_invlpg(current, va);
- local_flush_tlb_one(va);
+ if ( !shadow2_mode_enabled(d)
+ || (shadow2_invlpg(current, va) != 0) )
+ local_flush_tlb_one(va);
break;
case UVMF_ALL:
flush_tlb_one_mask(d->domain_dirty_cpumask, va);
@@ -2807,8 +2779,6 @@ long set_gdt(struct vcpu *v,
if ( entries > FIRST_RESERVED_GDT_ENTRY )
return -EINVAL;
-
- shadow_sync_all(d);
/* Check the pages in the new GDT. */
for ( i = 0; i < nr_pages; i++ ) {
@@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 de
break;
}
- if ( shadow_mode_enabled(dom) )
- {
- shadow_lock(dom);
-
- __mark_dirty(dom, mfn);
-
- if ( page_is_page_table(page) && !page_out_of_sync(page) )
- shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
- }
+ mark_dirty(dom, mfn);
/* All is good so make the update. */
gdt_pent = map_domain_page(mfn);
memcpy(&gdt_pent[offset], &d, 8);
unmap_domain_page(gdt_pent);
- if ( shadow_mode_enabled(dom) )
- shadow_unlock(dom);
-
put_page_type(page);
ret = 0; /* success */
@@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
default:
break;
}
-
- if ( !shadow_mode_translate(d) || (mfn == 0) )
+
+ if ( !shadow2_mode_translate(d) || (mfn == 0) )
{
put_domain(d);
return -EINVAL;
@@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
guest_physmap_add_page(d, xatp.gpfn, mfn);
UNLOCK_BIGLOCK(d);
-
+
put_domain(d);
break;
@@ -3136,7 +3095,8 @@ static int ptwr_emulated_update(
unsigned long pfn;
struct page_info *page;
l1_pgentry_t pte, ol1e, nl1e, *pl1e;
- struct domain *d = current->domain;
+ struct vcpu *v = current;
+ struct domain *d = v->domain;
/* Aligned access only, thank you. */
if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
@@ -3196,25 +3156,36 @@ static int ptwr_emulated_update(
return X86EMUL_UNHANDLEABLE;
}
+
/* Checked successfully: do the update (write or cmpxchg). */
pl1e = map_domain_page(page_to_mfn(page));
pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
if ( do_cmpxchg )
{
+ if ( shadow2_mode_enabled(d) )
+ shadow2_lock(d);
ol1e = l1e_from_intpte(old);
if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
{
+ if ( shadow2_mode_enabled(d) )
+ shadow2_unlock(d);
unmap_domain_page(pl1e);
put_page_from_l1e(nl1e, d);
return X86EMUL_CMPXCHG_FAILED;
}
+ if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ {
+ shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
+ shadow2_unlock(v->domain);
+ }
}
else
{
ol1e = *pl1e;
- if ( !update_l1e(pl1e, ol1e, nl1e) )
+ if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
BUG();
}
+
unmap_domain_page(pl1e);
/* Finally, drop the old PTE. */
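
The recurring shape in the reworked update_l1e()/UPDATE_ENTRY paths above is:
take the shadow2 lock around the guest PTE write, then tell the shadow code
which entry changed (shadow2_validate_guest_entry) before unlocking. A rough
stand-alone sketch of that shape, with stub names standing in for the real
shadow2 calls:

    typedef unsigned long pte_t;

    struct dom {
        int shadow_enabled;     /* stand-in for shadow2_mode_enabled() */
        int shadow_lock_held;
    };

    static void shadow_lock(struct dom *d)   { d->shadow_lock_held = 1; }
    static void shadow_unlock(struct dom *d) { d->shadow_lock_held = 0; }

    /* Stand-in for shadow2_validate_guest_entry(): resync the shadow copy
     * of the entry that the guest table at table_mfn now contains. */
    static void shadow_validate(struct dom *d, unsigned long table_mfn, pte_t *e)
    {
        (void)d; (void)table_mfn; (void)e;
    }

    static int update_pte(struct dom *d, unsigned long table_mfn,
                          pte_t *entry, pte_t new_value)
    {
        if (d->shadow_enabled)
            shadow_lock(d);                  /* all shadow changes happen locked */

        *entry = new_value;                  /* the real code uses cmpxchg here */

        if (d->shadow_enabled) {
            shadow_validate(d, table_mfn, entry);
            shadow_unlock(d);
        }
        return 1;
    }

    int main(void)
    {
        struct dom d = { 1, 0 };
        pte_t pte = 0;
        return update_pte(&d, 0x1000, &pte, 0x2003) ? 0 : 1;
    }
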
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/setup.c Wed Aug 16 17:02:35 2006 +0100
@@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t
if ( opt_watchdog )
watchdog_enable();
- shadow_mode_init();
-
/* initialize access control security module */
acm_init(&initrdidx, mbi, initial_images_start);
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/smpboot.c Wed Aug 16 17:02:35 2006 +0100
@@ -896,7 +896,7 @@ static int __devinit do_boot_cpu(int api
v = alloc_idle_vcpu(cpu);
BUG_ON(v == NULL);
- v->arch.monitor_table = pagetable_from_paddr(__pa(idle_pg_table));
+ v->arch.cr3 = __pa(idle_pg_table);
/* start_eip had better be page-aligned! */
start_eip = setup_trampoline();
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/traps.c Wed Aug 16 17:02:35 2006 +0100
@@ -277,6 +277,21 @@ void show_stack(struct cpu_user_regs *re
show_trace(regs);
}
+void show_xen_trace(void)
+{
+ struct cpu_user_regs regs;
+#ifdef __x86_64
+ __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
+ __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
+ __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
+#else
+ __asm__("movl %%esp,%0" : "=m" (regs.esp));
+ __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
+ __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
+#endif
+    show_trace(&regs);
+}
+
void show_stack_overflow(unsigned long esp)
{
#ifdef MEMORY_GUARD
@@ -861,8 +876,8 @@ static int fixup_page_fault(unsigned lon
if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
{
- if ( shadow_mode_external(d) && guest_mode(regs) )
- return shadow_fault(addr, regs);
+ if ( shadow2_mode_external(d) && guest_mode(regs) )
+ return shadow2_fault(addr, regs);
if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
return handle_gdt_ldt_mapping_fault(
addr - GDT_LDT_VIRT_START, regs);
@@ -873,14 +888,14 @@ static int fixup_page_fault(unsigned lon
return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
}
- if ( unlikely(shadow_mode_enabled(d)) )
- return shadow_fault(addr, regs);
-
if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) &&
guest_kernel_mode(v, regs) &&
((regs->error_code & (PGERR_write_access|PGERR_page_present)) ==
(PGERR_write_access|PGERR_page_present)) )
return ptwr_do_page_fault(d, addr, regs) ? EXCRET_fault_fixed : 0;
+
+ if ( shadow2_mode_enabled(d) )
+ return shadow2_fault(addr, regs);
return 0;
}
@@ -905,6 +920,13 @@ asmlinkage int do_page_fault(struct cpu_
DEBUGGER_trap_entry(TRAP_page_fault, regs);
perfc_incrc(page_faults);
+
+ if ( shadow2_mode_enabled(current->domain) )
+ debugtrace_printk("%s %s %d dom=%d eip=%p cr2=%p code=%d cs=%x\n",
+ __func__, __FILE__, __LINE__,
+ current->domain->domain_id,
+ (void *)regs->eip, (void *)addr, regs->error_code,
+ regs->cs);
if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
return rc;
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_32/domain_page.c
--- a/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 17:02:35 2006 +0100
@@ -15,6 +15,7 @@
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/hardirq.h>
+#include <asm/hvm/support.h>
static inline struct vcpu *mapcache_current_vcpu(void)
{
@@ -58,10 +59,10 @@ void *map_domain_page(unsigned long pfn)
cache = &v->domain->arch.mapcache;
hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)];
- if ( hashent->pfn == pfn )
- {
- idx = hashent->idx;
+ if ( hashent->pfn == pfn && (idx = hashent->idx) != MAPHASHENT_NOTINUSE )
+ {
hashent->refcnt++;
+ ASSERT(idx < MAPCACHE_ENTRIES);
ASSERT(hashent->refcnt != 0);
ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn);
goto out;
@@ -178,6 +179,30 @@ void mapcache_init(struct domain *d)
MAPHASHENT_NOTINUSE;
}
+paddr_t mapped_domain_page_to_maddr(void *va)
+/* Convert a pointer in a mapped domain page to a machine address.
+ * Takes any pointer that's valid for use in unmap_domain_page() */
+{
+ unsigned int idx;
+ struct vcpu *v;
+ struct mapcache *cache;
+ unsigned long pfn;
+
+ ASSERT(!in_irq());
+
+ ASSERT((void *)MAPCACHE_VIRT_START <= va);
+ ASSERT(va < (void *)MAPCACHE_VIRT_END);
+
+ v = mapcache_current_vcpu();
+
+ cache = &v->domain->arch.mapcache;
+
+ idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
+ pfn = l1e_get_pfn(cache->l1tab[idx]);
+ return ((paddr_t) pfn << PAGE_SHIFT
+ | ((unsigned long) va & ~PAGE_MASK));
+}
+
#define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT))
static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)];
static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)];
@@ -233,6 +258,8 @@ void unmap_domain_page_global(void *va)
l1_pgentry_t *pl1e;
unsigned int idx;
+ ASSERT((__va >= IOREMAP_VIRT_START) && (__va <= (IOREMAP_VIRT_END - 1)));
+
/* /First/, we zap the PTE. */
pl2e = virt_to_xen_l2e(__va);
pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va);
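
mapped_domain_page_to_maddr() above recovers a machine address purely by
arithmetic: the mapcache slot index comes from the virtual address, the frame
number from that slot's L1 entry, and the byte offset from the low bits of the
pointer. A stand-alone sketch of the same arithmetic (the base address and the
l1tab lookup below are simplified placeholders):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT          12
    #define PAGE_SIZE           (1UL << PAGE_SHIFT)
    #define PAGE_MASK           (~(PAGE_SIZE - 1))
    #define MAPCACHE_VIRT_START 0xffc00000UL     /* illustrative value only */

    /* Stand-in for l1e_get_pfn(cache->l1tab[idx]). */
    static unsigned long l1tab_pfn(unsigned int idx)
    {
        return 0x12340UL + idx;
    }

    static uint64_t mapped_va_to_maddr(unsigned long va)
    {
        unsigned int idx = (va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
        unsigned long pfn = l1tab_pfn(idx);
        return ((uint64_t)pfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
    }

    int main(void)
    {
        unsigned long va = MAPCACHE_VIRT_START + 3 * PAGE_SIZE + 0x2a;
        printf("maddr = %#llx\n", (unsigned long long)mapped_va_to_maddr(va));
        return 0;
    }
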
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_32/mm.c
--- a/xen/arch/x86/x86_32/mm.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_32/mm.c Wed Aug 16 17:02:35 2006 +0100
@@ -75,8 +75,7 @@ void __init paging_init(void)
printk("PAE disabled.\n");
#endif
- idle_vcpu[0]->arch.monitor_table =
- pagetable_from_paddr(__pa(idle_pg_table));
+ idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
if ( cpu_has_pge )
{
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_64/mm.c
--- a/xen/arch/x86/x86_64/mm.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_64/mm.c Wed Aug 16 17:02:35 2006 +0100
@@ -81,8 +81,7 @@ void __init paging_init(void)
l2_pgentry_t *l2_ro_mpt;
struct page_info *pg;
- idle_vcpu[0]->arch.monitor_table =
- pagetable_from_paddr(__pa(idle_pg_table));
+ idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
/* Create user-accessible L2 directory to map the MPT for guests. */
l3_ro_mpt = alloc_xenheap_page();
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_64/traps.c Wed Aug 16 17:02:35 2006 +0100
@@ -84,7 +84,8 @@ void show_page_walk(unsigned long addr)
l4e = l4t[l4_table_offset(addr)];
mfn = l4e_get_pfn(l4e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L4 = %"PRIpte" %016lx\n", l4e_get_intpte(l4e), pfn);
+ printk(" L4[0x%lx] = %"PRIpte" %016lx\n",
+ l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
return;
@@ -92,7 +93,8 @@ void show_page_walk(unsigned long addr)
l3e = l3t[l3_table_offset(addr)];
mfn = l3e_get_pfn(l3e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L3 = %"PRIpte" %016lx\n", l3e_get_intpte(l3e), pfn);
+ printk(" L3[0x%lx] = %"PRIpte" %016lx\n",
+ l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
return;
@@ -100,7 +102,8 @@ void show_page_walk(unsigned long addr)
l2e = l2t[l2_table_offset(addr)];
mfn = l2e_get_pfn(l2e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L2 = %"PRIpte" %016lx %s\n", l2e_get_intpte(l2e), pfn,
+ printk(" L2[0x%lx] = %"PRIpte" %016lx %s\n",
+ l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
(l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
(l2e_get_flags(l2e) & _PAGE_PSE) )
@@ -110,7 +113,8 @@ void show_page_walk(unsigned long addr)
l1e = l1t[l1_table_offset(addr)];
mfn = l1e_get_pfn(l1e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
+ printk(" L1[0x%lx] = %"PRIpte" %016lx\n",
+ l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
}
asmlinkage void double_fault(void);
@@ -162,7 +166,7 @@ void toggle_guest_mode(struct vcpu *v)
{
v->arch.flags ^= TF_kernel_mode;
__asm__ __volatile__ ( "swapgs" );
- update_pagetables(v);
+ update_cr3(v);
write_ptbase(v);
}
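
The extra indices that show_page_walk() now prints are just the 9-bit slices of
the faulting address above the 12-bit page offset, one slice per paging level.
A small sketch of that extraction, assuming the 4-level x86-64 layout:

    #include <stdio.h>

    #define PT_INDEX_BITS 9
    #define PT_INDEX_MASK ((1UL << PT_INDEX_BITS) - 1)

    /* Index into the level-N table (1 = L1 ... 4 = L4) for an address. */
    static unsigned long pt_index(unsigned long addr, int level)
    {
        return (addr >> (12 + PT_INDEX_BITS * (level - 1))) & PT_INDEX_MASK;
    }

    int main(void)
    {
        unsigned long addr = 0xffff828000123456UL;
        for (int level = 4; level >= 1; level--)
            printf("L%d[0x%lx]\n", level, pt_index(addr, level));
        return 0;
    }
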
diff -r fda70200da01 -r 0f917d63e960 xen/common/acm_ops.c
--- a/xen/common/acm_ops.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/acm_ops.c Wed Aug 16 17:02:35 2006 +0100
@@ -26,7 +26,6 @@
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/guest_access.h>
-#include <asm/shadow.h>
#include <public/sched_ctl.h>
#include <acm/acm_hooks.h>
diff -r fda70200da01 -r 0f917d63e960 xen/common/grant_table.c
--- a/xen/common/grant_table.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/grant_table.c Wed Aug 16 17:02:35 2006 +0100
@@ -434,7 +434,7 @@ __gnttab_unmap_grant_ref(
/* If just unmapped a writable mapping, mark as dirtied */
if ( !(flags & GNTMAP_readonly) )
- gnttab_log_dirty(rd, frame);
+ gnttab_mark_dirty(rd, frame);
if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) &&
!(flags & GNTMAP_readonly) )
@@ -731,7 +731,7 @@ __release_grant_for_copy(
const unsigned long r_frame = act->frame;
if ( !readonly )
- gnttab_log_dirty(rd, r_frame);
+ gnttab_mark_dirty(rd, r_frame);
spin_lock(&rd->grant_table->lock);
if ( readonly )
diff -r fda70200da01 -r 0f917d63e960 xen/common/keyhandler.c
--- a/xen/common/keyhandler.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/keyhandler.c Wed Aug 16 17:02:35 2006 +0100
@@ -241,9 +241,6 @@ static void read_clocks(unsigned char ke
}
extern void dump_runq(unsigned char key);
-#ifndef NDEBUG
-extern void audit_domains_key(unsigned char key);
-#endif
#ifdef PERF_COUNTERS
extern void perfc_printall(unsigned char key);
@@ -261,10 +258,16 @@ static void do_debug_key(unsigned char k
#ifndef NDEBUG
static void debugtrace_key(unsigned char key)
{
- debugtrace_send_to_console = !debugtrace_send_to_console;
- debugtrace_dump();
- printk("debugtrace_printk now writing to %s.\n",
- debugtrace_send_to_console ? "console" : "buffer");
+ debugtrace_toggle();
+}
+
+static void shadow2_audit_key(unsigned char key)
+{
+ extern int shadow2_audit_enable;
+
+ shadow2_audit_enable = !shadow2_audit_enable;
+ printk("%s shadow2_audit_enable=%d\n",
+ __func__, shadow2_audit_enable);
}
#endif
@@ -288,7 +291,7 @@ void initialize_keytable(void)
#ifndef NDEBUG
register_keyhandler(
- 'o', audit_domains_key, "audit domains >0 EXPERIMENTAL");
+ 'O', shadow2_audit_key, "toggle shadow2 audits");
register_keyhandler(
'T', debugtrace_key, "toggle debugtrace to console/buffer");
#endif
diff -r fda70200da01 -r 0f917d63e960 xen/common/memory.c
--- a/xen/common/memory.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/memory.c Wed Aug 16 17:02:35 2006 +0100
@@ -126,6 +126,11 @@ populate_physmap(
for ( j = 0; j < (1 << extent_order); j++ )
guest_physmap_add_page(d, gpfn + j, mfn + j);
}
+ else if ( unlikely(shadow2_mode_translate(d)) )
+ {
+ for ( j = 0; j < (1 << extent_order); j++ )
+ shadow2_guest_physmap_add_page(d, gpfn + j, mfn + j);
+ }
else
{
for ( j = 0; j < (1 << extent_order); j++ )
@@ -153,7 +158,7 @@ guest_remove_page(
if ( unlikely(!mfn_valid(mfn)) )
{
DPRINTK("Domain %u page number %lx invalid\n",
- d->domain_id, mfn);
+ d->domain_id, gmfn);
return 0;
}
@@ -179,7 +184,7 @@ guest_remove_page(
(unsigned long)page->count_info, page->u.inuse.type_info);
}
- guest_physmap_remove_page(d, gmfn, mfn);
+ shadow2_guest_physmap_remove_page(d, gmfn, mfn);
put_page(page);
@@ -250,7 +255,7 @@ translate_gpfn_list(
if ( (d = find_domain_by_id(op.domid)) == NULL )
return -ESRCH;
- if ( !shadow_mode_translate(d) )
+ if ( !(shadow_mode_translate(d) || shadow2_mode_translate(d)) )
{
put_domain(d);
return -EINVAL;
diff -r fda70200da01 -r 0f917d63e960 xen/drivers/char/console.c
--- a/xen/drivers/char/console.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/drivers/char/console.c Wed Aug 16 17:02:35 2006 +0100
@@ -569,7 +569,7 @@ int console_getc(void)
#ifndef NDEBUG
/* Send output direct to console, or buffer it? */
-int debugtrace_send_to_console;
+static volatile int debugtrace_send_to_console;
static char *debugtrace_buf; /* Debug-trace buffer */
static unsigned int debugtrace_prd; /* Producer index */
@@ -578,16 +578,10 @@ static DEFINE_SPINLOCK(debugtrace_lock);
static DEFINE_SPINLOCK(debugtrace_lock);
integer_param("debugtrace", debugtrace_kilobytes);
-void debugtrace_dump(void)
-{
- unsigned long flags;
-
+static void debugtrace_dump_worker(void)
+{
if ( (debugtrace_bytes == 0) || !debugtrace_used )
return;
-
- watchdog_disable();
-
- spin_lock_irqsave(&debugtrace_lock, flags);
printk("debugtrace_dump() starting\n");
@@ -602,15 +596,47 @@ void debugtrace_dump(void)
memset(debugtrace_buf, '\0', debugtrace_bytes);
printk("debugtrace_dump() finished\n");
+}
+
+void debugtrace_toggle(void)
+{
+ unsigned long flags;
+
+ watchdog_disable();
+ spin_lock_irqsave(&debugtrace_lock, flags);
+
+ // dump the buffer *before* toggling, in case the act of dumping the
+ // buffer itself causes more printk's...
+ //
+ printk("debugtrace_printk now writing to %s.\n",
+ !debugtrace_send_to_console ? "console": "buffer");
+ if ( !debugtrace_send_to_console )
+ debugtrace_dump_worker();
+
+ debugtrace_send_to_console = !debugtrace_send_to_console;
spin_unlock_irqrestore(&debugtrace_lock, flags);
-
watchdog_enable();
+
+}
+
+void debugtrace_dump(void)
+{
+ unsigned long flags;
+
+ watchdog_disable();
+ spin_lock_irqsave(&debugtrace_lock, flags);
+
+ debugtrace_dump_worker();
+
+ spin_unlock_irqrestore(&debugtrace_lock, flags);
+ watchdog_enable();
}
void debugtrace_printk(const char *fmt, ...)
{
static char buf[1024];
+ static u32 count;
va_list args;
char *p;
@@ -625,8 +651,10 @@ void debugtrace_printk(const char *fmt,
ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0);
+ sprintf(buf, "%u ", ++count);
+
va_start(args, fmt);
- (void)vsnprintf(buf, sizeof(buf), fmt, args);
+ (void)vsnprintf(buf + strlen(buf), sizeof(buf), fmt, args);
va_end(args);
if ( debugtrace_send_to_console )
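
debugtrace_printk() now prepends a sequence number before formatting the
caller's message into the same static buffer, so the space handed to
vsnprintf() has to shrink by the prefix length. A stand-alone sketch of that
pattern:

    #include <stdarg.h>
    #include <stdio.h>
    #include <string.h>

    static char buf[1024];
    static unsigned int count;

    static void trace_printf(const char *fmt, ...)
    {
        va_list args;
        size_t len;

        snprintf(buf, sizeof(buf), "%u ", ++count);  /* sequence prefix */
        len = strlen(buf);

        va_start(args, fmt);
        vsnprintf(buf + len, sizeof(buf) - len, fmt, args); /* leave room for prefix */
        va_end(args);

        fputs(buf, stdout);
    }

    int main(void)
    {
        trace_printf("page fault at %#lx\n", 0xdeadbeefUL);
        trace_printf("second event\n");
        return 0;
    }
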
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/bitops.h
--- a/xen/include/asm-x86/bitops.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/bitops.h Wed Aug 16 17:02:35 2006 +0100
@@ -75,6 +75,24 @@ static __inline__ void clear_bit(int nr,
:"=m" (ADDR)
:"dIr" (nr));
}
+
+/**
+ * __clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Unlike clear_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+ __asm__(
+ "btrl %1,%0"
+ :"=m" (ADDR)
+ :"dIr" (nr));
+}
+
#define smp_mb__before_clear_bit() barrier()
#define smp_mb__after_clear_bit() barrier()
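
The new __clear_bit() is the non-atomic sibling of clear_bit(): the same btrl
instruction but without the lock prefix, so it is only safe on data that no
other CPU can touch concurrently (per-CPU state, or data already protected by a
lock). A plain-C sketch of the semantics, not the inline-asm version:

    #include <stdio.h>

    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    /* Non-atomic bit clear: a read-modify-write with no locking. */
    static void clear_bit_nonatomic(int nr, unsigned long *addr)
    {
        addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
    }

    int main(void)
    {
        unsigned long flags = 0xffUL;
        clear_bit_nonatomic(3, &flags);   /* fine: 'flags' is private to us */
        printf("flags = %#lx\n", flags);  /* prints 0xf7 */
        return 0;
    }
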
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/config.h Wed Aug 16 17:02:35 2006 +0100
@@ -79,9 +79,14 @@
#ifndef __ASSEMBLY__
extern unsigned long _end; /* standard ELF symbol */
+
+static inline void FORCE_CRASH(void) __attribute__((noreturn,always_inline));
+static inline void FORCE_CRASH(void)
+{
+ __asm__ __volatile__ ( "ud2" );
+ while(1);
+}
#endif /* __ASSEMBLY__ */
-
-#define FORCE_CRASH() __asm__ __volatile__ ( "ud2" )
#if defined(__x86_64__)
@@ -149,9 +154,14 @@ extern unsigned long _end; /* standard E
/* Slot 256: read-only guest-accessible machine-to-phys translation table. */
#define RO_MPT_VIRT_START (PML4_ADDR(256))
#define RO_MPT_VIRT_END (RO_MPT_VIRT_START + PML4_ENTRY_BYTES/2)
+
+// currently unused?
+#if 0
/* Slot 257: read-only guest-accessible linear page table. */
#define RO_LINEAR_PT_VIRT_START (PML4_ADDR(257))
#define RO_LINEAR_PT_VIRT_END (RO_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
+#endif
+
/* Slot 258: linear page table (guest table). */
#define LINEAR_PT_VIRT_START (PML4_ADDR(258))
#define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
@@ -175,7 +185,7 @@ extern unsigned long _end; /* standard E
#define DIRECTMAP_VIRT_START (PML4_ADDR(262))
#define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2)
-#define PGT_base_page_table PGT_l4_page_table
+#define PGT_base_page_table PGT_l4_page_table
#define __HYPERVISOR_CS64 0xe010
#define __HYPERVISOR_CS32 0xe008
@@ -274,9 +284,9 @@ extern unsigned long _end; /* standard E
(L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1)
#ifdef CONFIG_X86_PAE
-# define PGT_base_page_table PGT_l3_page_table
-#else
-# define PGT_base_page_table PGT_l2_page_table
+# define PGT_base_page_table PGT_l3_page_table
+#else
+# define PGT_base_page_table PGT_l2_page_table
#endif
#define __HYPERVISOR_CS 0xe008
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/domain.h Wed Aug 16 17:02:35 2006 +0100
@@ -73,42 +73,42 @@ struct arch_domain
/* I/O-port admin-specified access capabilities. */
struct rangeset *ioport_caps;
- /* Shadow mode status and controls. */
- struct shadow_ops *ops;
- unsigned int shadow_mode; /* flags to control shadow table operation */
- unsigned int shadow_nest; /* Recursive depth of shadow_lock() nesting */
-
- /* shadow hashtable */
- struct shadow_status *shadow_ht;
- struct shadow_status *shadow_ht_free;
- struct shadow_status *shadow_ht_extras; /* extra allocation units */
- unsigned int shadow_extras_count;
-
- /* shadow dirty bitmap */
+ /* HVM stuff */
+ struct hvm_domain hvm_domain;
+
+ /* Shadow-translated guest: Pseudophys base address of reserved area. */
+ unsigned long first_reserved_pfn;
+
+ /* Shadow2 stuff */
+ u32 shadow2_mode; /* flags to control shadow operation */
+ spinlock_t shadow2_lock; /* shadow2 domain lock */
+ int shadow2_locker; /* processor which holds the lock */
+ const char *shadow2_locker_function; /* Func that took it */
+ struct list_head shadow2_freelists[SHADOW2_MAX_ORDER + 1];
+ struct list_head shadow2_p2m_freelist;
+ struct list_head shadow2_p2m_inuse;
+ struct list_head shadow2_toplevel_shadows;
+ unsigned int shadow2_total_pages; /* number of pages allocated */
+ unsigned int shadow2_free_pages; /* number of pages on freelists */
+ unsigned int shadow2_p2m_pages; /* number of pages in p2m map */
+
+ /* Shadow2 hashtable */
+ struct shadow2_hash_entry *shadow2_hash_table;
+ struct shadow2_hash_entry *shadow2_hash_freelist;
+ struct shadow2_hash_entry *shadow2_hash_allocations;
+ int shadow2_hash_walking; /* Some function is walking the hash table */
+
+ /* Shadow log-dirty bitmap */
unsigned long *shadow_dirty_bitmap;
unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */
- /* shadow mode stats */
- unsigned int shadow_page_count;
- unsigned int hl2_page_count;
- unsigned int snapshot_page_count;
-
+ /* Shadow log-dirty mode stats */
unsigned int shadow_fault_count;
unsigned int shadow_dirty_count;
- /* full shadow mode */
- struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
- struct out_of_sync_entry *out_of_sync_free;
- struct out_of_sync_entry *out_of_sync_extras;
- unsigned int out_of_sync_extras_count;
-
- struct list_head free_shadow_frames;
-
- pagetable_t phys_table; /* guest 1:1 pagetable */
- struct hvm_domain hvm_domain;
-
- /* Shadow-translated guest: Pseudophys base address of reserved area. */
- unsigned long first_reserved_pfn;
+ /* Shadow translated domain: P2M mapping */
+ pagetable_t phys_table;
+
} __cacheline_aligned;
#ifdef CONFIG_X86_PAE
@@ -166,25 +166,34 @@ struct arch_vcpu
*/
l1_pgentry_t *perdomain_ptes;
- pagetable_t guest_table_user; /* x86/64: user-space pagetable. */
- pagetable_t guest_table; /* (MA) guest notion of cr3 */
- pagetable_t shadow_table; /* (MA) shadow of guest */
- pagetable_t monitor_table; /* (MA) used in hypervisor */
-
- l2_pgentry_t *guest_vtable; /* virtual address of pagetable */
- l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */
-    l2_pgentry_t *monitor_vtable;       /* virtual address of monitor_table */
-    l1_pgentry_t  *hl2_vtable;          /* virtual address of hl2_table */
-
#ifdef CONFIG_X86_64
- l3_pgentry_t *guest_vl3table;
- l4_pgentry_t *guest_vl4table;
-#endif
-
- unsigned long monitor_shadow_ref;
+ pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */
+#endif
+ pagetable_t guest_table; /* (MFN) guest notion of cr3 */
+ /* guest_table holds a ref to the page, and also a type-count unless
+ * shadow refcounts are in use */
+ pagetable_t shadow_table; /* (MFN) shadow of guest */
+ pagetable_t monitor_table; /* (MFN) hypervisor PT (for HVM) */
+    unsigned long cr3;                  /* (MA) value to install in HW CR3 */
+
+ void *guest_vtable; /* virtual address of pagetable */
+ void *shadow_vtable; /* virtual address of shadow_table */
+    root_pgentry_t *monitor_vtable;     /* virtual address of monitor_table */
/* Current LDT details. */
unsigned long shadow_ldt_mapcnt;
+
+ /* Shadow2 stuff */
+ /* -- pointers to mode-specific entry points */
+ struct shadow2_entry_points *shadow2;
+ unsigned long last_emulated_mfn; /* last mfn we emulated a write to */
+ u8 shadow2_propagate_fault; /* emulated fault needs to be */
+ /* propagated to guest */
+#if CONFIG_PAGING_LEVELS >= 3
+ u8 shadow2_pae_flip_pending; /* shadow update requires this PAE cpu
+ * to recopy/install its L3 table.
+ */
+#endif
} __cacheline_aligned;
/* shorthands to improve code legibility */
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/grant_table.h
--- a/xen/include/asm-x86/grant_table.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/grant_table.h Wed Aug 16 17:02:35 2006 +0100
@@ -31,7 +31,7 @@ int destroy_grant_host_mapping(
#define gnttab_shared_gmfn(d, t, i) \
(mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i)))
-#define gnttab_log_dirty(d, f) mark_dirty((d), (f))
+#define gnttab_mark_dirty(d, f) mark_dirty((d), (f))
static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr)
{
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h Wed Aug 16 17:02:35 2006 +0100
@@ -56,8 +56,15 @@ struct hvm_function_table {
*/
int (*realmode)(struct vcpu *v);
int (*paging_enabled)(struct vcpu *v);
+ int (*long_mode_enabled)(struct vcpu *v);
+ int (*guest_x86_mode)(struct vcpu *v);
int (*instruction_length)(struct vcpu *v);
unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
+
+ /*
+ * Re-set the value of CR3 that Xen runs on when handling VM exits
+ */
+ void (*update_host_cr3)(struct vcpu *v);
/*
* Update specifics of the guest state:
@@ -134,9 +141,27 @@ hvm_paging_enabled(struct vcpu *v)
}
static inline int
+hvm_long_mode_enabled(struct vcpu *v)
+{
+ return hvm_funcs.long_mode_enabled(v);
+}
+
+static inline int
+hvm_guest_x86_mode(struct vcpu *v)
+{
+ return hvm_funcs.guest_x86_mode(v);
+}
+
+static inline int
hvm_instruction_length(struct vcpu *v)
{
return hvm_funcs.instruction_length(v);
+}
+
+static inline void
+hvm_update_host_cr3(struct vcpu *v)
+{
+ hvm_funcs.update_host_cr3(v);
}
void hvm_hypercall_page_initialise(struct domain *d,
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/support.h Wed Aug 16 17:02:35 2006 +0100
@@ -116,10 +116,13 @@ enum hval_bitmaps {
#define DBG_LEVEL_IOAPIC (1 << 9)
extern unsigned int opt_hvm_debug_level;
-#define HVM_DBG_LOG(level, _f, _a...) \
- if ( (level) & opt_hvm_debug_level ) \
- printk("[HVM:%d.%d] <%s> " _f "\n", \
- current->domain->domain_id, current->vcpu_id, __func__, ## _a)
+#define HVM_DBG_LOG(level, _f, _a...) \
+ do { \
+ if ( (level) & opt_hvm_debug_level ) \
+ printk("[HVM:%d.%d] <%s> " _f "\n", \
+ current->domain->domain_id, current->vcpu_id, __func__, \
+ ## _a); \
+ } while (0)
#else
#define HVM_DBG_LOG(level, _f, _a...)
#endif
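
Wrapping HVM_DBG_LOG() in do { ... } while (0) makes the macro expand to a
single statement, so it composes correctly with a following else. A minimal
illustration of the hazard the wrapper avoids:

    #include <stdio.h>

    /* Bare-if macro: an else after it binds to the macro's own if. */
    #define LOG_BAD(msg)  if (1) printf("%s\n", msg)
    /* Wrapped macro: behaves like one statement wherever it is used. */
    #define LOG_GOOD(msg) do { if (1) printf("%s\n", msg); } while (0)

    int main(void)
    {
        int enabled = 0;

        if (enabled)
            LOG_GOOD("enabled");   /* pairs correctly with the else below */
        else
            printf("disabled\n");

        /* Using LOG_BAD above would leave a stray ';' before the else and
         * fail to compile (or silently rebind the else in other shapes). */
        return 0;
    }
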
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/vcpu.h Wed Aug 16 17:02:35 2006 +0100
@@ -29,6 +29,7 @@
#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1
struct hvm_vcpu {
+ unsigned long hw_cr3; /* value we give to HW to use */
unsigned long ioflags;
struct hvm_io_op io_op;
struct vlapic *vlapic;
@@ -39,6 +40,11 @@ struct hvm_vcpu {
unsigned long init_sipi_sipi_state;
int xen_port;
+
+#if CONFIG_PAGING_LEVELS >= 3
+ l3_pgentry_t hvm_lowmem_l3tab[4]
+ __attribute__((__aligned__(32)));
+#endif
/* Flags */
int flag_dr_dirty;
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Wed Aug 16 17:02:35 2006 +0100
@@ -87,6 +87,7 @@ struct arch_vmx_struct {
unsigned long cpu_cr0; /* copy of guest CR0 */
unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+ unsigned long cpu_shadow_cr4; /* copy of guest read shadow CR4 */
unsigned long cpu_cr2; /* save CR2 */
unsigned long cpu_cr3;
unsigned long cpu_state;
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 17:02:35 2006 +0100
@@ -298,6 +298,9 @@ static always_inline void __vmwrite_vcpu
case GUEST_CR0:
v->arch.hvm_vmx.cpu_cr0 = value;
break;
+ case CR4_READ_SHADOW:
+ v->arch.hvm_vmx.cpu_shadow_cr4 = value;
+ break;
case CPU_BASED_VM_EXEC_CONTROL:
v->arch.hvm_vmx.cpu_based_exec_control = value;
break;
@@ -317,11 +320,14 @@ static always_inline void __vmread_vcpu(
case GUEST_CR0:
*value = v->arch.hvm_vmx.cpu_cr0;
break;
+ case CR4_READ_SHADOW:
+ *value = v->arch.hvm_vmx.cpu_shadow_cr4;
+ break;
case CPU_BASED_VM_EXEC_CONTROL:
*value = v->arch.hvm_vmx.cpu_based_exec_control;
break;
default:
- printk("__vmread_cpu: invalid field %lx\n", field);
+ printk("__vmread_vcpu: invalid field %lx\n", field);
break;
}
}
@@ -342,6 +348,7 @@ static inline int __vmwrite(unsigned lon
switch ( field ) {
case CR0_READ_SHADOW:
case GUEST_CR0:
+ case CR4_READ_SHADOW:
case CPU_BASED_VM_EXEC_CONTROL:
__vmwrite_vcpu(v, field, value);
break;
@@ -402,6 +409,46 @@ static inline int vmx_paging_enabled(str
__vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_long_mode_enabled(struct vcpu *v)
+{
+ ASSERT(v == current);
+ return VMX_LONG_GUEST(current);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_realmode(struct vcpu *v)
+{
+ unsigned long rflags;
+ ASSERT(v == current);
+
+ __vmread(GUEST_RFLAGS, &rflags);
+ return rflags & X86_EFLAGS_VM;
+}
+
+/* Works only for vcpu == current */
+static inline void vmx_update_host_cr3(struct vcpu *v)
+{
+ ASSERT(v == current);
+ __vmwrite(HOST_CR3, v->arch.cr3);
+}
+
+static inline int vmx_guest_x86_mode(struct vcpu *v)
+{
+ unsigned long cs_ar_bytes;
+ ASSERT(v == current);
+
+ if ( vmx_long_mode_enabled(v) )
+ {
+ __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+ return (cs_ar_bytes & (1u<<13)) ? 8 : 4;
+ }
+ if ( vmx_realmode(v) )
+ return 2;
+ __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+ return (cs_ar_bytes & (1u<<14)) ? 4 : 2;
}
static inline int vmx_pgbit_test(struct vcpu *v)
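
vmx_guest_x86_mode() decodes the guest's execution mode from the CS access-rights field in the VMCS: bit 13 is CS.L (long-mode code segment) and bit 14 is CS.D/B (default operation size), while the EFLAGS.VM check in vmx_realmode() covers virtual-8086. The return value is the operand size in bytes (8, 4 or 2). A standalone sketch of the same decoding logic (illustrative only; the real code reads GUEST_CS_AR_BYTES with __vmread):

    #include <stdio.h>

    /* 8 = 64-bit, 4 = 32-bit, 2 = 16-bit/real/VM86, as in vmx_guest_x86_mode(). */
    static int guest_x86_mode(int long_mode_enabled, int realmode,
                              unsigned long cs_ar_bytes)
    {
        if (long_mode_enabled)
            /* CS.L (bit 13) set: 64-bit code segment; clear: compatibility mode */
            return (cs_ar_bytes & (1u << 13)) ? 8 : 4;
        if (realmode)
            return 2;
        /* CS.D/B (bit 14) selects 32-bit vs 16-bit default operand size */
        return (cs_ar_bytes & (1u << 14)) ? 4 : 2;
    }

    int main(void)
    {
        printf("%d\n", guest_x86_mode(1, 0, 1u << 13)); /* 8: 64-bit */
        printf("%d\n", guest_x86_mode(0, 0, 1u << 14)); /* 4: 32-bit protected */
        printf("%d\n", guest_x86_mode(0, 1, 0));        /* 2: real / VM86 */
        return 0;
    }
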
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/mm.h Wed Aug 16 17:02:35 2006 +0100
@@ -20,7 +20,11 @@ struct page_info
struct page_info
{
/* Each frame can be threaded onto a doubly-linked list. */
- struct list_head list;
+ union {
+ struct list_head list;
+ /* Shadow2 uses this field as an up-pointer in lower-level shadows */
+ paddr_t up;
+ };
/* Reference count and various PGC_xxx flags and fields. */
u32 count_info;
@@ -46,8 +50,20 @@ struct page_info
} u;
- /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
- u32 tlbflush_timestamp;
+ union {
+ /* Timestamp from 'TLB clock', used to reduce need for safety
+ * flushes. Only valid on a) free pages, and b) guest pages with a
+ * zero type count. */
+ u32 tlbflush_timestamp;
+
+ /* Only used on guest pages with a shadow.
+ * Guest pages with a shadow must have a non-zero type count, so this
+ * does not conflict with the tlbflush timestamp. */
+ u32 shadow2_flags;
+
+    // XXX -- we expect to add another field here, to be used for min/max
+    // tracking; it is only needed for shadow pages.
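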
+ };
};
/* The following page types are MUTUALLY EXCLUSIVE. */
@@ -60,6 +76,7 @@ struct page_info
#define PGT_ldt_page (6U<<29) /* using this page in an LDT? */
#define PGT_writable_page (7U<<29) /* has writable mappings of this page? */
+#ifndef SHADOW2
#define PGT_l1_shadow PGT_l1_page_table
#define PGT_l2_shadow PGT_l2_page_table
#define PGT_l3_shadow PGT_l3_page_table
@@ -69,14 +86,16 @@ struct page_info
#define PGT_writable_pred (7U<<29) /* predicted gpfn with writable ref */
#define PGT_fl1_shadow (5U<<29)
+#endif
+
#define PGT_type_mask (7U<<29) /* Bits 29-31. */
+ /* Owning guest has pinned this page to its current type? */
+#define _PGT_pinned 28
+#define PGT_pinned (1U<<_PGT_pinned)
/* Has this page been validated for use as its current type? */
-#define _PGT_validated 28
+#define _PGT_validated 27
#define PGT_validated (1U<<_PGT_validated)
- /* Owning guest has pinned this page to its current type? */
-#define _PGT_pinned 27
-#define PGT_pinned (1U<<_PGT_pinned)
#if defined(__i386__)
/* The 11 most significant bits of virt address if this is a page table. */
#define PGT_va_shift 16
@@ -98,6 +117,7 @@ struct page_info
/* 16-bit count of uses of this frame as its current type. */
#define PGT_count_mask ((1U<<16)-1)
+#ifndef SHADOW2
#ifdef __x86_64__
#define PGT_high_mfn_shift 52
#define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift)
@@ -112,19 +132,53 @@ struct page_info
#define PGT_score_shift 23
#define PGT_score_mask (((1U<<4)-1)<<PGT_score_shift)
#endif
+#endif /* SHADOW2 */
/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated 31
#define PGC_allocated (1U<<_PGC_allocated)
- /* Set when fullshadow mode marks a page out-of-sync */
+ /* Set on a *guest* page to mark it out-of-sync with its shadow */
#define _PGC_out_of_sync 30
#define PGC_out_of_sync (1U<<_PGC_out_of_sync)
- /* Set when fullshadow mode is using a page as a page table */
+ /* Set when a page is in use as a page table */
#define _PGC_page_table 29
#define PGC_page_table (1U<<_PGC_page_table)
/* 29-bit count of references to this frame. */
#define PGC_count_mask ((1U<<29)-1)
+/* shadow2 uses the count_info on shadow pages somewhat differently */
+/* NB: please coordinate any changes here with the SH2F's in shadow2.h */
+#define PGC_SH2_none (0U<<28) /* on the shadow2 free list */
+#define PGC_SH2_min_shadow (1U<<28)
+#define PGC_SH2_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */
+#define PGC_SH2_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */
+#define PGC_SH2_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */
+#define PGC_SH2_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */
+#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
+#define PGC_SH2_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */
+#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
+#define PGC_SH2_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */
+#define PGC_SH2_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */
+#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
+#define PGC_SH2_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */
+#define PGC_SH2_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */
+#define PGC_SH2_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */
+#define PGC_SH2_max_shadow (13U<<28)
+#define PGC_SH2_p2m_table (14U<<28) /* in use as the p2m table */
+#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */
+#define PGC_SH2_unused (15U<<28)
+
+#define PGC_SH2_type_mask (15U<<28)
+#define PGC_SH2_type_shift 28
+
+#define PGC_SH2_pinned (1U<<27)
+
+#define _PGC_SH2_log_dirty 26
+#define PGC_SH2_log_dirty (1U<<26)
+
+/* 26 bit ref count for shadow pages */
+#define PGC_SH2_count_mask ((1U<<26) - 1)
+
/* We trust the slab allocator in slab.c, and our use of it. */
#define PageSlab(page) (1)
#define PageSetSlab(page) ((void)0)
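
On shadow pages, shadow2 repartitions count_info as defined above: bits 31:28 hold the shadow type, bit 27 the pinned flag, bit 26 the log-dirty flag, and bits 25:0 a reference count. A standalone sketch of how such a field decomposes, reusing the same bit layout (the example value is arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    #define SH2_TYPE_SHIFT  28
    #define SH2_TYPE_MASK   (15u << SH2_TYPE_SHIFT)  /* PGC_SH2_type_mask  */
    #define SH2_PINNED      (1u << 27)               /* PGC_SH2_pinned     */
    #define SH2_LOG_DIRTY   (1u << 26)               /* PGC_SH2_log_dirty  */
    #define SH2_COUNT_MASK  ((1u << 26) - 1)         /* PGC_SH2_count_mask */

    int main(void)
    {
        /* e.g. a PAE L2-low shadow (type 6), pinned, with 3 outstanding refs */
        uint32_t count_info = (6u << SH2_TYPE_SHIFT) | SH2_PINNED | 3;

        printf("type=%u pinned=%d log_dirty=%d refs=%u\n",
               (unsigned)((count_info & SH2_TYPE_MASK) >> SH2_TYPE_SHIFT),
               !!(count_info & SH2_PINNED),
               !!(count_info & SH2_LOG_DIRTY),
               (unsigned)(count_info & SH2_COUNT_MASK));
        return 0;
    }
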
@@ -134,14 +188,22 @@ struct page_info
#if defined(__i386__)
#define pickle_domptr(_d) ((u32)(unsigned long)(_d))
-#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d))
+static inline struct domain *unpickle_domptr(u32 _domain)
+{ return (_domain & 1) ? NULL : (void *)_domain; }
#define PRtype_info "08lx" /* should only be used for printk's */
#elif defined(__x86_64__)
static inline struct domain *unpickle_domptr(u32 _domain)
-{ return (_domain == 0) ? NULL : __va(_domain); }
+{ return ((_domain == 0) || (_domain & 1)) ? NULL : __va(_domain); }
static inline u32 pickle_domptr(struct domain *domain)
{ return (domain == NULL) ? 0 : (u32)__pa(domain); }
#define PRtype_info "016lx"/* should only be used for printk's */
+#endif
+
+/* The order of the largest allocation unit we use for shadow pages */
+#if CONFIG_PAGING_LEVELS == 2
+#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */
+#else
+#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
#endif
#define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain))
@@ -165,7 +227,7 @@ extern int shadow_remove_all_write_acces
extern int shadow_remove_all_write_access(
struct domain *d, unsigned long gmfn, unsigned long mfn);
extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow_mode_refcounts(struct domain *d);
+extern int _shadow2_mode_refcounts(struct domain *d);
static inline void put_page(struct page_info *page)
{
@@ -197,8 +259,8 @@ static inline int get_page(struct page_i
unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
unlikely(d != _domain) ) /* Wrong owner? */
{
- if ( !_shadow_mode_refcounts(domain) )
- DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
+ if ( !_shadow2_mode_refcounts(domain) )
+ DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
PRtype_info "\n",
page_to_mfn(page), domain, unpickle_domptr(d),
x, page->u.inuse.type_info);
@@ -254,6 +316,16 @@ static inline int page_is_removable(stru
ASSERT(((_p)->count_info & PGC_count_mask) != 0); \
ASSERT(page_get_owner(_p) == (_d))
+// Quick test for whether a given page can be represented directly in CR3.
+//
+#if CONFIG_PAGING_LEVELS == 3
+#define MFN_FITS_IN_CR3(_MFN) !(mfn_x(_MFN) >> 20)
+
+/* returns a lowmem machine address of the copied L3 root table */
+unsigned long
+pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
int check_descriptor(struct desc_struct *d);
/*
@@ -271,29 +343,44 @@ int check_descriptor(struct desc_struct
#define set_gpfn_from_mfn(mfn, pfn) (machine_to_phys_mapping[(mfn)] = (pfn))
#define get_gpfn_from_mfn(mfn) (machine_to_phys_mapping[(mfn)])
+
+#define mfn_to_gmfn(_d, mfn) \
+ ( (shadow2_mode_translate(_d)) \
+ ? get_gpfn_from_mfn(mfn) \
+ : (mfn) )
+
+#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn))
+
+
/*
* The phys_to_machine_mapping is the reversed mapping of MPT for full
* virtualization. It is only used by shadow_mode_translate()==true
* guests, so we steal the address space that would have normally
* been used by the read-only MPT map.
*/
-#define phys_to_machine_mapping ((unsigned long *)RO_MPT_VIRT_START)
-#define NR_P2M_TABLE_ENTRIES ((unsigned long *)RO_MPT_VIRT_END \
- - phys_to_machine_mapping)
+#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START)
#define INVALID_MFN (~0UL)
#define VALID_MFN(_mfn) (!((_mfn) & (1U<<31)))
-#define set_mfn_from_gpfn(pfn, mfn) (phys_to_machine_mapping[(pfn)] = (mfn))
static inline unsigned long get_mfn_from_gpfn(unsigned long pfn)
{
- unsigned long mfn;
-
- if ( unlikely(pfn >= NR_P2M_TABLE_ENTRIES) ||
- unlikely(__copy_from_user(&mfn, &phys_to_machine_mapping[pfn],
- sizeof(mfn))) )
- mfn = INVALID_MFN;
-
- return mfn;
+ l1_pgentry_t l1e = l1e_empty();
+ int ret;
+
+#if CONFIG_PAGING_LEVELS > 2
+ if ( pfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof (l1_pgentry_t) )
+ /* This pfn is higher than the p2m map can hold */
+ return INVALID_MFN;
+#endif
+
+ ret = __copy_from_user(&l1e,
+ &phys_to_machine_mapping[pfn],
+ sizeof(l1e));
+
+ if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) )
+ return l1e_get_pfn(l1e);
+
+ return INVALID_MFN;
}
#ifdef MEMORY_GUARD
@@ -333,6 +420,7 @@ void audit_domains(void);
#endif
int new_guest_cr3(unsigned long pfn);
+void make_cr3(struct vcpu *v, unsigned long mfn);
void propagate_page_fault(unsigned long addr, u16 error_code);
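
The rewritten get_mfn_from_gpfn() above treats the read-only p2m area as an array of l1 page-table entries rather than raw MFNs: it range-checks the pfn, copies one entry, and only returns a frame number if the present bit is set. A rough standalone sketch of that shape (hypothetical types; the real code copies from phys_to_machine_mapping with __copy_from_user and uses Xen's l1e accessors):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define PAGE_PRESENT  0x1u
    #define INVALID_MFN   (~0UL)

    typedef struct { uint64_t e; } l1e_t;   /* stand-in for l1_pgentry_t */

    /* Range-check, copy one p2m entry, trust it only if it is present. */
    static unsigned long p2m_lookup(const l1e_t *p2m, unsigned long nr_entries,
                                    unsigned long pfn)
    {
        l1e_t l1e;

        if (pfn >= nr_entries)                   /* beyond what the map can hold */
            return INVALID_MFN;

        memcpy(&l1e, &p2m[pfn], sizeof(l1e));    /* real code: __copy_from_user() */

        if (l1e.e & PAGE_PRESENT)
            return (unsigned long)(l1e.e >> 12); /* the entry's frame number */

        return INVALID_MFN;
    }

    int main(void)
    {
        l1e_t p2m[4] = { { (0x1234ULL << 12) | PAGE_PRESENT }, { 0 }, { 0 }, { 0 } };

        printf("pfn 0 -> %#lx\n", p2m_lookup(p2m, 4, 0));
        printf("pfn 1 -> %#lx\n", p2m_lookup(p2m, 4, 1));   /* not present */
        return 0;
    }
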
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/msr.h
--- a/xen/include/asm-x86/msr.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/msr.h Wed Aug 16 17:02:35 2006 +0100
@@ -112,6 +112,10 @@ static inline void wrmsrl(unsigned int m
#define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483
#define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484
#define MSR_IA32_VMX_MISC_MSR 0x485
+#define MSR_IA32_VMX_CR0_FIXED0 0x486
+#define MSR_IA32_VMX_CR0_FIXED1 0x487
+#define MSR_IA32_VMX_CR4_FIXED0 0x488
+#define MSR_IA32_VMX_CR4_FIXED1 0x489
#define IA32_FEATURE_CONTROL_MSR 0x3a
#define IA32_FEATURE_CONTROL_MSR_LOCK 0x1
#define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON 0x4
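
The four new MSR numbers describe the CR0/CR4 constraints that apply while VMX is active: bits set in a FIXED0 MSR must be set in the corresponding control register, and bits clear in FIXED1 must be clear. A hedged sketch of the conventional way such a pair is applied to a desired value (this is the usual VMX recipe, not code from this changeset; the sample MSR contents are made up):

    #include <stdio.h>

    /* Clamp a desired CR value to what the VMX fixed MSRs allow. */
    static unsigned long apply_vmx_fixed(unsigned long desired,
                                         unsigned long fixed0,
                                         unsigned long fixed1)
    {
        return (desired | fixed0) & fixed1;
    }

    int main(void)
    {
        unsigned long cr0_fixed0 = 0x80000021UL;  /* example: PG | NE | PE forced on */
        unsigned long cr0_fixed1 = 0xFFFFFFFFUL;  /* example: nothing forced off */

        printf("cr0 = %#lx\n", apply_vmx_fixed(0x1UL, cr0_fixed0, cr0_fixed1));
        return 0;
    }
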
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/page-guest32.h
--- a/xen/include/asm-x86/page-guest32.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/page-guest32.h Wed Aug 16 17:02:35 2006 +0100
@@ -89,15 +89,8 @@ static inline l2_pgentry_32_t l2e_from_p
#define linear_l1_table_32 \
((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table_32 \
- ((l2_pgentry_32_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0))))
#define linear_pg_table_32 linear_l1_table_32
-#define linear_l2_table_32(_ed) ((_ed)->arch.guest_vtable)
-
-#define va_to_l1mfn_32(_ed, _va) \
- (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
#endif /* __X86_PAGE_GUEST_H__ */
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/page.h Wed Aug 16 17:02:35 2006 +0100
@@ -233,26 +233,18 @@ typedef struct { u64 pfn; } pagetable_t;
+ DOMAIN_ENTRIES_PER_L4_PAGETABLE)
#endif
-#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK)
-#define linear_l1_table \
- ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0))))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1))))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2))))
-
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
+#define linear_l1_table __linear_l1_table
#define linear_pg_table linear_l1_table
-#define linear_l2_table(v) ((v)->arch.guest_vtable)
-#define linear_l3_table(v) ((v)->arch.guest_vl3table)
-#define linear_l4_table(v) ((v)->arch.guest_vl4table)
+#define linear_l2_table(v) ((l2_pgentry_t *)(v)->arch.guest_vtable)
#ifndef __ASSEMBLY__
#if CONFIG_PAGING_LEVELS == 3
@@ -294,6 +286,7 @@ extern void paging_init(void);
#define _PAGE_AVAIL1 0x400U
#define _PAGE_AVAIL2 0x800U
#define _PAGE_AVAIL 0xE00U
+#define _PAGE_PSE_PAT 0x1000U
/*
* Debug option: Ensure that granted mappings are not implicitly unmapped.
@@ -307,9 +300,9 @@ extern void paging_init(void);
#endif
/*
- * Disallow unused flag bits plus PAT, PSE and GLOBAL. Also disallow GNTTAB
- * if we are using it for grant-table debugging. Permit the NX bit if the
- * hardware supports it.
+ * Disallow unused flag bits plus PAT, PSE and GLOBAL.
+ * Also disallow GNTTAB if we are using it for grant-table debugging.
+ * Permit the NX bit if the hardware supports it.
*/
#define BASE_DISALLOW_MASK ((0xFFFFF180U | _PAGE_GNTTAB) & ~_PAGE_NX)
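
The new __linear_lN_table macros above build each level of the recursive (linear) page-table mapping by indexing the previous level's array with the offset of LINEAR_PT_VIRT_START at that level; the entry-size scaling comes from ordinary pointer arithmetic. A toy standalone sketch of the same arithmetic, using made-up two-level (non-PAE) numbers rather than the real Xen constants:

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative only: 4KB pages, 4-byte entries, invented self-map base. */
    #define PAGE_SHIFT            12
    #define PTE_SIZE              4u
    #define l1_linear_offset(va)  ((va) >> PAGE_SHIFT)

    int main(void)
    {
        uint32_t linear_pt = 0xFC400000u;   /* hypothetical LINEAR_PT_VIRT_START */

        /* The whole L1 array appears at the self-map base itself. */
        uint32_t l1_array = linear_pt;

        /* __linear_l2_table's recipe: index the L1 array with the self-map's
         * own L1 offset (the macro's pointer arithmetic scales by entry size). */
        uint32_t l2_array = l1_array + l1_linear_offset(linear_pt) * PTE_SIZE;

        printf("l1 array at %#x, l2 array at %#x\n",
               (unsigned)l1_array, (unsigned)l2_array);
        return 0;
    }
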
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/perfc_defn.h Wed Aug 16 17:02:35 2006 +0100
@@ -144,4 +144,57 @@ PERFCOUNTER_CPU(remove_write_bad_predict
PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction")
PERFCOUNTER_CPU(update_hl2e_invlpg, "update_hl2e calls invlpg")
+/* Shadow2 counters */
+PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc")
+PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs")
+PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use")
+PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free")
+PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows")
+PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows")
+PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map")
+PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update")
+PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update")
+PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn")
+PERFCOUNTER_CPU(shadow2_fault_bail_not_present,
+ "shadow2_fault guest not-present")
+PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor,
+ "shadow2_fault guest U/S fault")
+PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read")
+PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write")
+PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails")
+PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio")
+PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault")
+PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate")
+PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e")
+PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e")
+PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e")
+PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e")
+PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup")
+PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head")
+PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses")
+PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status")
+PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert")
+PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete")
+PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access")
+PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low")
+PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force")
+PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings")
+PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force")
+PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit")
+PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit")
+PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 unshadows a page")
+PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer")
+PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force")
+PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed")
+PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables")
+PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits")
+PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses")
+
+
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/processor.h Wed Aug 16 17:02:35 2006 +0100
@@ -545,6 +545,7 @@ extern always_inline void prefetchw(cons
#endif
void show_stack(struct cpu_user_regs *regs);
+void show_xen_trace(void);
void show_stack_overflow(unsigned long esp);
void show_registers(struct cpu_user_regs *regs);
void show_execution_state(struct cpu_user_regs *regs);
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/shadow.h Wed Aug 16 17:02:35 2006 +0100
@@ -1,8 +1,7 @@
/******************************************************************************
* include/asm-x86/shadow.h
*
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
+ * Copyright (c) 2006 by XenSource Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,1782 +21,28 @@
#ifndef _XEN_SHADOW_H
#define _XEN_SHADOW_H
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/perfc.h>
-#include <xen/sched.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/current.h>
-#include <asm/flushtlb.h>
-#include <asm/processor.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/support.h>
-#include <asm/regs.h>
-#include <public/dom0_ops.h>
-#include <asm/shadow_public.h>
-#include <asm/page-guest32.h>
-#include <asm/shadow_ops.h>
+/* This file is just a wrapper around the new Shadow2 header,
+ * providing names that must be defined in any shadow implementation. */
-/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+#include <asm/shadow2.h>
-#define SHM_enable (1<<0) /* we're in one of the shadow modes */
-#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of
- guest tables */
-#define SHM_write_all (1<<2) /* allow write access to all guest pt pages,
- regardless of pte write permissions */
-#define SHM_log_dirty (1<<3) /* enable log dirty mode */
-#define SHM_translate (1<<4) /* Xen does p2m translation, not guest */
-#define SHM_external (1<<5) /* Xen does not steal address space from the
- domain for its own booking; requires VT or
- similar mechanisms */
-#define SHM_wr_pt_pte (1<<6) /* guest allowed to set PAGE_RW bit in PTEs which
- point to page table pages. */
+/* How to make sure a page is not referred to in a shadow PT */
+/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */
+#define shadow_drop_references(_d, _p) \
+ shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+#define shadow_sync_and_drop_references(_d, _p) \
+ shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
-#define shadow_mode_enabled(_d) ((_d)->arch.shadow_mode)
-#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts)
-#define shadow_mode_write_l1(_d) (VM_ASSIST(_d, VMASST_TYPE_writable_pagetables))
-#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all)
-#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
-#define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
-#define shadow_mode_external(_d) ((_d)->arch.shadow_mode & SHM_external)
-#define shadow_mode_wr_pt_pte(_d) ((_d)->arch.shadow_mode & SHM_wr_pt_pte)
+/* Whether we are translating the domain's frame numbers for it */
+#define shadow_mode_translate(d) shadow2_mode_translate(d)
-#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
- (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-#define shadow_linear_l2_table(_v) ((_v)->arch.shadow_vtable)
+/* ...and if so, how to add and remove entries in the mapping */
+#define guest_physmap_add_page(_d, _p, _m) \
+ shadow2_guest_physmap_add_page((_d), (_p), (_m))
+#define guest_physmap_remove_page(_d, _p, _m ) \
+ shadow2_guest_physmap_remove_page((_d), (_p), (_m))
-// easy access to the hl2 table (for translated but not external modes only)
-#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-
-/*
- * For now we use the per-domain BIGLOCK rather than a shadow-specific lock.
- * We usually have the BIGLOCK already acquired anyway, so this is unlikely
- * to cause much unnecessary extra serialisation. Also it's a recursive
- * lock, and there are some code paths containing nested shadow_lock().
- * The #if0'ed code below is therefore broken until such nesting is removed.
- */
-#if 0
-#define shadow_lock_init(_d) \
- spin_lock_init(&(_d)->arch.shadow_lock)
-#define shadow_lock_is_acquired(_d) \
- spin_is_locked(&(_d)->arch.shadow_lock)
-#define shadow_lock(_d) \
-do { \
- ASSERT(!shadow_lock_is_acquired(_d)); \
- spin_lock(&(_d)->arch.shadow_lock); \
-} while (0)
-#define shadow_unlock(_d) \
-do { \
- ASSERT(!shadow_lock_is_acquired(_d)); \
- spin_unlock(&(_d)->arch.shadow_lock); \
-} while (0)
-#else
-#define shadow_lock_init(_d) \
- ((_d)->arch.shadow_nest = 0)
-#define shadow_lock_is_acquired(_d) \
- (spin_is_locked(&(_d)->big_lock) && ((_d)->arch.shadow_nest != 0))
-#define shadow_lock(_d) \
-do { \
- LOCK_BIGLOCK(_d); \
- (_d)->arch.shadow_nest++; \
-} while (0)
-#define shadow_unlock(_d) \
-do { \
- ASSERT(shadow_lock_is_acquired(_d)); \
- (_d)->arch.shadow_nest--; \
- UNLOCK_BIGLOCK(_d); \
-} while (0)
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-static inline u64 get_cr3_idxval(struct vcpu *v)
-{
- u64 pae_cr3;
-
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 &&
- !shadow_mode_log_dirty(v->domain) )
- {
- pae_cr3 = hvm_get_guest_ctrl_reg(v, 3); /* get CR3 */
- return (pae_cr3 >> PAE_CR3_ALIGN) & PAE_CR3_IDX_MASK;
- }
- else
- return 0;
-}
-
-#define shadow_key_t u64
-#define index_to_key(x) ((x) << 32)
-#else
-#define get_cr3_idxval(v) (0)
-#define shadow_key_t unsigned long
-#define index_to_key(x) (0)
-#endif
-
-
-#define SHADOW_ENCODE_MIN_MAX(_min, _max) ((((GUEST_L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min))
-#define SHADOW_MIN(_encoded) ((_encoded) & ((1u<<16) - 1))
-#define SHADOW_MAX(_encoded) ((GUEST_L1_PAGETABLE_ENTRIES - 1) - ((_encoded) >> 16))
-extern void shadow_direct_map_clean(struct domain *d);
-extern int shadow_direct_map_init(struct domain *d);
-extern int shadow_direct_map_fault(
- unsigned long vpa, struct cpu_user_regs *regs);
-extern void shadow_mode_init(void);
-extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
-extern int shadow_fault(unsigned long va, struct cpu_user_regs *regs);
-extern int shadow_mode_enable(struct domain *p, unsigned int mode);
-extern void shadow_invlpg(struct vcpu *, unsigned long);
-extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
- struct vcpu *v, unsigned long gpfn, unsigned long mfn);
-extern void free_monitor_pagetable(struct vcpu *v);
-extern void __shadow_sync_all(struct domain *d);
-extern int __shadow_out_of_sync(struct vcpu *v, unsigned long va);
-extern int set_p2m_entry(
- struct domain *d, unsigned long pfn, unsigned long mfn,
- struct domain_mmap_cache *l2cache,
- struct domain_mmap_cache *l1cache);
-extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype);
-
-extern void free_shadow_page(unsigned long smfn);
-
-extern void shadow_l1_normal_pt_update(struct domain *d,
- paddr_t pa, l1_pgentry_t l1e,
- struct domain_mmap_cache *cache);
-extern void shadow_l2_normal_pt_update(struct domain *d,
- paddr_t pa, l2_pgentry_t l2e,
- struct domain_mmap_cache *cache);
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/page-guest32.h>
-/*
- * va_mask cannot be used because it's used by the shadow hash.
- * Use the score area for for now.
- */
-#define is_xen_l2_slot(t,s)                                                 \
-    ( ((((t) & PGT_score_mask) >> PGT_score_shift) == 3) &&                 \
-      ((s) >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES - 1))) )
-
-extern unsigned long gva_to_gpa(unsigned long gva);
-extern void shadow_l3_normal_pt_update(struct domain *d,
- paddr_t pa, l3_pgentry_t l3e,
- struct domain_mmap_cache *cache);
-#endif
-#if CONFIG_PAGING_LEVELS >= 4
-extern void shadow_l4_normal_pt_update(struct domain *d,
- paddr_t pa, l4_pgentry_t l4e,
- struct domain_mmap_cache *cache);
-#endif
-extern int shadow_do_update_va_mapping(unsigned long va,
- l1_pgentry_t val,
- struct vcpu *v);
-
-
-static inline unsigned long __shadow_status(
- struct domain *d, unsigned long gpfn, unsigned long stype);
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void update_hl2e(struct vcpu *v, unsigned long va);
-#endif
-
-static inline int page_is_page_table(struct page_info *page)
-{
- struct domain *owner = page_get_owner(page);
- u32 type_info;
-
- if ( owner && shadow_mode_refcounts(owner) )
- return page->count_info & PGC_page_table;
-
- type_info = page->u.inuse.type_info & PGT_type_mask;
- return type_info && (type_info <= PGT_l4_page_table);
-}
-
-static inline int mfn_is_page_table(unsigned long mfn)
-{
- if ( !mfn_valid(mfn) )
- return 0;
-
- return page_is_page_table(mfn_to_page(mfn));
-}
-
-static inline int page_out_of_sync(struct page_info *page)
-{
- return page->count_info & PGC_out_of_sync;
-}
-
-static inline int mfn_out_of_sync(unsigned long mfn)
-{
- if ( !mfn_valid(mfn) )
- return 0;
-
- return page_out_of_sync(mfn_to_page(mfn));
-}
-
-
-/************************************************************************/
-
-static void inline
-__shadow_sync_mfn(struct domain *d, unsigned long mfn)
-{
- if ( d->arch.out_of_sync )
- {
- // XXX - could be smarter
- //
- __shadow_sync_all(d);
- }
-}
-
-static void inline
-__shadow_sync_va(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
-
- if ( d->arch.out_of_sync && __shadow_out_of_sync(v, va) )
- {
- perfc_incrc(shadow_sync_va);
-
- // XXX - could be smarter
- //
- __shadow_sync_all(v->domain);
- }
-#if CONFIG_PAGING_LEVELS <= 2
- // Also make sure the HL2 is up-to-date for this address.
- //
- if ( unlikely(shadow_mode_translate(v->domain)) )
- update_hl2e(v, va);
-#endif
-}
-
-static void inline
-shadow_sync_all(struct domain *d)
-{
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- shadow_lock(d);
-
- if ( d->arch.out_of_sync )
- __shadow_sync_all(d);
-
- ASSERT(d->arch.out_of_sync == NULL);
-
- shadow_unlock(d);
- }
-}
-
-// SMP BUG: This routine can't ever be used properly in an SMP context.
-// It should be something like get_shadow_and_sync_va().
-// This probably shouldn't exist.
-//
-static void inline
-shadow_sync_va(struct vcpu *v, unsigned long gva)
-{
- struct domain *d = v->domain;
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- shadow_lock(d);
- __shadow_sync_va(v, gva);
- shadow_unlock(d);
- }
-}
-
-extern void __shadow_mode_disable(struct domain *d);
-static inline void shadow_mode_disable(struct domain *d)
-{
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- shadow_lock(d);
- __shadow_mode_disable(d);
- shadow_unlock(d);
- }
-}
-
-/************************************************************************/
-
-#define mfn_to_gmfn(_d, mfn) \
- ( (shadow_mode_translate(_d)) \
- ? get_gpfn_from_mfn(mfn) \
- : (mfn) )
-
-#define gmfn_to_mfn(_d, gpfn) \
- ({ \
- unlikely(shadow_mode_translate(_d)) \
- ? (likely(current->domain == (_d)) \
- ? get_mfn_from_gpfn(gpfn) \
- : get_mfn_from_gpfn_foreign(_d, gpfn)) \
- : (gpfn); \
- })
-
-extern unsigned long get_mfn_from_gpfn_foreign(
- struct domain *d, unsigned long gpfn);
-
-/************************************************************************/
-
-struct shadow_status {
- struct shadow_status *next; /* Pull-to-front list per hash bucket. */
- shadow_key_t gpfn_and_flags; /* Guest pfn plus flags. */
- unsigned long smfn; /* Shadow mfn. */
-};
-
-#define shadow_ht_extra_size 128
-#define shadow_ht_buckets 256
-
-struct out_of_sync_entry {
- struct out_of_sync_entry *next;
- struct vcpu *v;
- unsigned long gpfn; /* why is this here? */
- unsigned long gmfn;
- unsigned long snapshot_mfn;
- paddr_t writable_pl1e; /* NB: this is a machine address */
- unsigned long va;
-};
-
-#define out_of_sync_extra_size 127
-
-#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
-
-/************************************************************************/
-#define SHADOW_DEBUG 0
-#define SHADOW_VERBOSE_DEBUG 0
-#define SHADOW_VVERBOSE_DEBUG 0
-#define SHADOW_VVVERBOSE_DEBUG 0
-#define SHADOW_HASH_DEBUG 0
-#define FULLSHADOW_DEBUG 0
-
-#if SHADOW_DEBUG
-extern int shadow_status_noswap;
-#define SHADOW_REFLECTS_SNAPSHOT _PAGE_AVAIL0
-#endif
-
-#if SHADOW_VERBOSE_DEBUG
-#define SH_LOG(_f, _a...) \
- printk("DOM%uP%u: SH_LOG(%d): " _f "\n", \
- current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a )
-#define SH_VLOG(_f, _a...) \
- printk("DOM%uP%u: SH_VLOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_LOG(_f, _a...) ((void)0)
-#define SH_VLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVERBOSE_DEBUG
-#define SH_VVLOG(_f, _a...) \
- printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVVERBOSE_DEBUG
-#define SH_VVVLOG(_f, _a...) \
- printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVVLOG(_f, _a...) ((void)0)
-#endif
-
-#if FULLSHADOW_DEBUG
-#define FSH_LOG(_f, _a...) \
- printk("DOM%uP%u: FSH_LOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define FSH_LOG(_f, _a...) ((void)0)
-#endif
-
-
-/************************************************************************/
-
-static inline int
-shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
- l1_pgentry_t nl1e;
- int res;
- unsigned long mfn;
- struct domain *owner;
-
- ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
-
- if ( !shadow_mode_refcounts(d) )
- return 1;
-
- nl1e = l1e;
- l1e_remove_flags(nl1e, _PAGE_GLOBAL);
-
- if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
- return 0;
-
- res = get_page_from_l1e(nl1e, d);
-
- if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) &&
- !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) &&
- (mfn = l1e_get_pfn(nl1e)) &&
- mfn_valid(mfn) &&
- (owner = page_get_owner(mfn_to_page(mfn))) &&
- (d != owner) )
- {
- res = get_page_from_l1e(nl1e, owner);
- printk("tried to map mfn %lx from domain %d into shadow page tables "
- "of domain %d; %s\n",
- mfn, owner->domain_id, d->domain_id,
- res ? "success" : "failed");
- }
-
- if ( unlikely(!res) )
- {
- perfc_incrc(shadow_get_page_fail);
- FSH_LOG("%s failed to get ref l1e=%" PRIpte "\n",
- __func__, l1e_get_intpte(l1e));
- }
-
- return res;
-}
-
-static inline void
-shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
-
- put_page_from_l1e(l1e, d);
-}
-
-static inline void
-shadow_put_page_type(struct domain *d, struct page_info *page)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
-
- put_page_type(page);
-}
-
-static inline int shadow_get_page(struct domain *d,
- struct page_info *page,
- struct domain *owner)
-{
- if ( !shadow_mode_refcounts(d) )
- return 1;
- return get_page(page, owner);
-}
-
-static inline void shadow_put_page(struct domain *d,
- struct page_info *page)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
- put_page(page);
-}
-
-/************************************************************************/
-
-static inline void __mark_dirty(struct domain *d, unsigned long mfn)
-{
- unsigned long pfn;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- if ( likely(!shadow_mode_log_dirty(d)) || !VALID_MFN(mfn) )
- return;
-
- ASSERT(d->arch.shadow_dirty_bitmap != NULL);
-
- /* We /really/ mean PFN here, even for non-translated guests. */
- pfn = get_gpfn_from_mfn(mfn);
-
- /*
- * Values with the MSB set denote MFNs that aren't really part of the
- * domain's pseudo-physical memory map (e.g., the shared info frame).
- * Nothing to do here...
- */
- if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) )
- return;
-
- /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
- if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) &&
- !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
- {
- d->arch.shadow_dirty_count++;
- }
-#ifndef NDEBUG
- else if ( mfn_valid(mfn) )
- {
- SH_VLOG("mark_dirty OOR! mfn=%lx pfn=%lx max=%x (dom %p)",
- mfn, pfn, d->arch.shadow_dirty_bitmap_size, d);
- SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info,
- page_get_owner(mfn_to_page(mfn)),
- mfn_to_page(mfn)->count_info,
- mfn_to_page(mfn)->u.inuse.type_info );
- }
-#endif
-}
-
-
-static inline void mark_dirty(struct domain *d, unsigned int mfn)
-{
- if ( unlikely(shadow_mode_log_dirty(d)) )
- {
- shadow_lock(d);
- __mark_dirty(d, mfn);
- shadow_unlock(d);
- }
-}
-
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void
-__shadow_get_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
- ASSERT(shadow_mode_enabled(v->domain));
-
- *psl2e = v->arch.shadow_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__shadow_set_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
- ASSERT(shadow_mode_enabled(v->domain));
-
- v->arch.shadow_vtable[l2_table_offset(va)] = value;
-}
-
-static inline void
-__guest_get_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t *pl2e)
-{
- *pl2e = v->arch.guest_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__guest_set_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
- struct domain *d = v->domain;
-
- v->arch.guest_vtable[l2_table_offset(va)] = value;
-
- if ( unlikely(shadow_mode_translate(d)) )
- update_hl2e(v, va);
-
- __mark_dirty(d, pagetable_get_pfn(v->arch.guest_table));
-}
-
-static inline void
-__direct_get_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
- l2_pgentry_t *phys_vtable;
-
- ASSERT(shadow_mode_enabled(v->domain));
-
- phys_vtable = map_domain_page(
- pagetable_get_pfn(v->domain->arch.phys_table));
-
- *psl2e = phys_vtable[l2_table_offset(va)];
-
- unmap_domain_page(phys_vtable);
-}
-
-static inline void
-__direct_set_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
- l2_pgentry_t *phys_vtable;
-
- ASSERT(shadow_mode_enabled(v->domain));
-
- phys_vtable = map_domain_page(
- pagetable_get_pfn(v->domain->arch.phys_table));
-
- phys_vtable[l2_table_offset(va)] = value;
-
- unmap_domain_page(phys_vtable);
-}
-
-static inline void
-update_hl2e(struct vcpu *v, unsigned long va)
-{
- int index = l2_table_offset(va);
- unsigned long mfn;
- l2_pgentry_t gl2e = v->arch.guest_vtable[index];
- l1_pgentry_t old_hl2e, new_hl2e;
- int need_flush = 0;
-
- ASSERT(shadow_mode_translate(v->domain));
-
- old_hl2e = v->arch.hl2_vtable[index];
-
- if ( (l2e_get_flags(gl2e) & _PAGE_PRESENT) &&
- VALID_MFN(mfn = get_mfn_from_gpfn(l2e_get_pfn(gl2e))) )
- new_hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
- else
- new_hl2e = l1e_empty();
-
- // only do the ref counting if something has changed.
- //
- if ( (l1e_has_changed(old_hl2e, new_hl2e, PAGE_FLAG_MASK)) )
- {
- if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
- !shadow_get_page(v->domain, mfn_to_page(l1e_get_pfn(new_hl2e)),
- v->domain) )
- new_hl2e = l1e_empty();
- if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
- {
- shadow_put_page(v->domain, mfn_to_page(l1e_get_pfn(old_hl2e)));
- need_flush = 1;
- }
-
- v->arch.hl2_vtable[l2_table_offset(va)] = new_hl2e;
-
- if ( need_flush )
- {
- perfc_incrc(update_hl2e_invlpg);
- flush_tlb_one_mask(v->domain->domain_dirty_cpumask,
- &linear_pg_table[l1_linear_offset(va)]);
- }
- }
-}
-
-static inline void shadow_drop_references(
- struct domain *d, struct page_info *page)
-{
- if ( likely(!shadow_mode_refcounts(d)) ||
- ((page->u.inuse.type_info & PGT_count_mask) == 0) )
- return;
-
- /* XXX This needs more thought... */
- printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n",
- __func__, page_to_mfn(page));
- printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
- page->count_info, page->u.inuse.type_info);
-
- shadow_lock(d);
- shadow_remove_all_access(d, page_to_mfn(page));
- shadow_unlock(d);
-
- printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
- page->count_info, page->u.inuse.type_info);
-}
-
-/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
-static inline void shadow_sync_and_drop_references(
- struct domain *d, struct page_info *page)
-{
- if ( likely(!shadow_mode_refcounts(d)) )
- return;
-
- if ( page_out_of_sync(page) )
- __shadow_sync_mfn(d, page_to_mfn(page));
-
- shadow_remove_all_access(d, page_to_mfn(page));
-}
-#endif
-
-/************************************************************************/
-
-/*
- * Add another shadow reference to smfn.
- */
-static inline int
-get_shadow_ref(unsigned long smfn)
-{
- u32 x, nx;
-
- ASSERT(mfn_valid(smfn));
-
- x = mfn_to_page(smfn)->count_info;
- nx = x + 1;
-
- if ( unlikely(nx == 0) )
- {
- printk("get_shadow_ref overflow, gmfn=%" PRtype_info " smfn=%lx\n",
- mfn_to_page(smfn)->u.inuse.type_info & PGT_mfn_mask,
- smfn);
- BUG();
- }
-
- // Guarded by the shadow lock...
- //
- mfn_to_page(smfn)->count_info = nx;
-
- return 1;
-}
-
-/*
- * Drop a shadow reference to smfn.
- */
-static inline void
-put_shadow_ref(unsigned long smfn)
-{
- u32 x, nx;
-
- ASSERT(mfn_valid(smfn));
-
- x = mfn_to_page(smfn)->count_info;
- nx = x - 1;
-
- if ( unlikely(x == 0) )
- {
- printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%"
- PRtype_info "\n",
- smfn,
- mfn_to_page(smfn)->count_info,
- mfn_to_page(smfn)->u.inuse.type_info);
- BUG();
- }
-
- // Guarded by the shadow lock...
- //
- mfn_to_page(smfn)->count_info = nx;
-
- if ( unlikely(nx == 0) )
- {
- free_shadow_page(smfn);
- }
-}
-
-static inline void
-shadow_pin(unsigned long smfn)
-{
- ASSERT( !(mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
- mfn_to_page(smfn)->u.inuse.type_info |= PGT_pinned;
- if ( unlikely(!get_shadow_ref(smfn)) )
- BUG();
-}
-
-static inline void
-shadow_unpin(unsigned long smfn)
-{
- ASSERT( (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
- mfn_to_page(smfn)->u.inuse.type_info &= ~PGT_pinned;
- put_shadow_ref(smfn);
-}
-
-/*
- * SMP issue. The following code assumes the shadow lock is held. Re-visit
- * when working on finer-gained locks for shadow.
- */
-static inline void set_guest_back_ptr(
- struct domain *d, l1_pgentry_t spte,
- unsigned long smfn, unsigned int index)
-{
- struct page_info *gpage;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- if ( !shadow_mode_external(d) ||
- ((l1e_get_flags(spte) & (_PAGE_PRESENT|_PAGE_RW)) !=
- (_PAGE_PRESENT|_PAGE_RW)) )
- return;
-
- gpage = l1e_get_page(spte);
-
- ASSERT(smfn != 0);
- ASSERT(page_to_mfn(gpage) != 0);
-
- gpage->tlbflush_timestamp = smfn;
- gpage->u.inuse.type_info &= ~PGT_va_mask;
- gpage->u.inuse.type_info |= (unsigned long)index << PGT_va_shift;
-}
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_mark_va_out_of_sync(
- struct vcpu *v, unsigned long gpfn, unsigned long mfn,
- unsigned long va);
-
-static inline int l1pte_write_fault(
- struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
- unsigned long va)
-{
- struct domain *d = v->domain;
- l1_pgentry_t gpte = *gpte_p;
- l1_pgentry_t spte;
- unsigned long gpfn = l1e_get_pfn(gpte);
- unsigned long gmfn = gmfn_to_mfn(d, gpfn);
-
- //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
-
- if ( unlikely(!VALID_MFN(gmfn)) )
- {
- SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
- *spte_p = l1e_empty();
- return 0;
- }
-
- ASSERT(l1e_get_flags(gpte) & _PAGE_RW);
- l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
- spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
- SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
- l1e_get_intpte(spte), l1e_get_intpte(gpte));
-
- __mark_dirty(d, gmfn);
-
- if ( mfn_is_page_table(gmfn) )
- shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
-
- *gpte_p = gpte;
- *spte_p = spte;
-
- return 1;
-}
-
-static inline int l1pte_read_fault(
- struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
-{
- l1_pgentry_t gpte = *gpte_p;
- l1_pgentry_t spte = *spte_p;
- unsigned long pfn = l1e_get_pfn(gpte);
- unsigned long mfn = gmfn_to_mfn(d, pfn);
-
- if ( unlikely(!VALID_MFN(mfn)) )
- {
- SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
- *spte_p = l1e_empty();
- return 0;
- }
-
- l1e_add_flags(gpte, _PAGE_ACCESSED);
- spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
- if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) ||
- mfn_is_page_table(mfn) )
- {
- l1e_remove_flags(spte, _PAGE_RW);
- }
-
- SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
- l1e_get_intpte(spte), l1e_get_intpte(gpte));
- *gpte_p = gpte;
- *spte_p = spte;
-
- return 1;
-}
-#endif
-
-static inline void l1pte_propagate_from_guest(
- struct domain *d, guest_l1_pgentry_t gpte, l1_pgentry_t *spte_p)
-{
- unsigned long mfn;
- l1_pgentry_t spte;
-
- spte = l1e_empty();
-
- if ( ((guest_l1e_get_flags(gpte) & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
- (_PAGE_PRESENT|_PAGE_ACCESSED)) &&
- VALID_MFN(mfn = gmfn_to_mfn(d, l1e_get_pfn(gpte))) )
- {
- spte = l1e_from_pfn(
- mfn, guest_l1e_get_flags(gpte) & ~(_PAGE_GLOBAL | _PAGE_AVAIL));
-
- if ( shadow_mode_log_dirty(d) ||
- !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
- mfn_is_page_table(mfn) )
- {
- l1e_remove_flags(spte, _PAGE_RW);
- }
- }
-
- if ( l1e_get_intpte(spte) || l1e_get_intpte(gpte) )
- SH_VVVLOG("%s: gpte=%" PRIpte ", new spte=%" PRIpte,
- __func__, l1e_get_intpte(gpte), l1e_get_intpte(spte));
-
- *spte_p = spte;
-}
-
-static inline void hl2e_propagate_from_guest(
- struct domain *d, l2_pgentry_t gpde, l1_pgentry_t *hl2e_p)
-{
- unsigned long pfn = l2e_get_pfn(gpde);
- unsigned long mfn;
- l1_pgentry_t hl2e;
-
- hl2e = l1e_empty();
-
- if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
- {
- mfn = gmfn_to_mfn(d, pfn);
- if ( VALID_MFN(mfn) && mfn_valid(mfn) )
- hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
- }
-
- if ( l1e_get_intpte(hl2e) || l2e_get_intpte(gpde) )
- SH_VVLOG("%s: gpde=%" PRIpte " hl2e=%" PRIpte, __func__,
- l2e_get_intpte(gpde), l1e_get_intpte(hl2e));
-
- *hl2e_p = hl2e;
-}
-
-static inline void l2pde_general(
- struct domain *d,
- guest_l2_pgentry_t *gpde_p,
- l2_pgentry_t *spde_p,
- unsigned long sl1mfn)
-{
- guest_l2_pgentry_t gpde = *gpde_p;
- l2_pgentry_t spde;
-
- spde = l2e_empty();
- if ( (guest_l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) )
- {
- spde = l2e_from_pfn(
- sl1mfn,
-            (guest_l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL);
-
- /* N.B. PDEs do not have a dirty bit. */
- guest_l2e_add_flags(gpde, _PAGE_ACCESSED);
-
- *gpde_p = gpde;
- }
-
- if ( l2e_get_intpte(spde) || l2e_get_intpte(gpde) )
- SH_VVLOG("%s: gpde=%" PRIpte ", new spde=%" PRIpte, __func__,
- l2e_get_intpte(gpde), l2e_get_intpte(spde));
-
- *spde_p = spde;
-}
-
-static inline void l2pde_propagate_from_guest(
- struct domain *d, guest_l2_pgentry_t *gpde_p, l2_pgentry_t *spde_p)
-{
- guest_l2_pgentry_t gpde = *gpde_p;
- unsigned long sl1mfn = 0;
-
- if ( guest_l2e_get_flags(gpde) & _PAGE_PRESENT )
- sl1mfn = __shadow_status(d, l2e_get_pfn(gpde), PGT_l1_shadow);
- l2pde_general(d, gpde_p, spde_p, sl1mfn);
-}
-
-/************************************************************************/
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pte_change(
- struct domain *d,
- guest_l1_pgentry_t new_pte,
- l1_pgentry_t *shadow_pte_p)
-{
- l1_pgentry_t old_spte, new_spte;
- int need_flush = 0;
-
- perfc_incrc(validate_pte_calls);
-
- l1pte_propagate_from_guest(d, new_pte, &new_spte);
-
- if ( shadow_mode_refcounts(d) )
- {
- old_spte = *shadow_pte_p;
-
- if ( l1e_get_intpte(old_spte) == l1e_get_intpte(new_spte) )
- {
- // No accounting required...
- //
- perfc_incrc(validate_pte_changes1);
- }
-        else if ( l1e_get_intpte(old_spte) == (l1e_get_intpte(new_spte)|_PAGE_RW) )
- {
- // Fast path for PTEs that have merely been write-protected
- // (e.g., during a Unix fork()). A strict reduction in privilege.
- //
- perfc_incrc(validate_pte_changes2);
- if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) )
- shadow_put_page_type(d, mfn_to_page(l1e_get_pfn(new_spte)));
- }
- else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) &
- _PAGE_PRESENT ) &&
-                  l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
- {
- // only do the ref counting if something important changed.
- //
- perfc_incrc(validate_pte_changes3);
-
- if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
- {
- shadow_put_page_from_l1e(old_spte, d);
- need_flush = 1;
- }
- if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(new_spte, d) ) {
- new_spte = l1e_empty();
- need_flush = -1; /* need to unshadow the page */
- }
- }
- else
- {
- perfc_incrc(validate_pte_changes4);
- }
- }
-
- *shadow_pte_p = new_spte;
-
- return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_hl2e_change(
- struct domain *d,
- l2_pgentry_t new_gpde,
- l1_pgentry_t *shadow_hl2e_p)
-{
- l1_pgentry_t old_hl2e, new_hl2e;
- int need_flush = 0;
-
- perfc_incrc(validate_hl2e_calls);
-
- old_hl2e = *shadow_hl2e_p;
- hl2e_propagate_from_guest(d, new_gpde, &new_hl2e);
-
- // Only do the ref counting if something important changed.
- //
-    if ( ((l1e_get_flags(old_hl2e) | l1e_get_flags(new_hl2e)) & _PAGE_PRESENT) &&
- l1e_has_changed(old_hl2e, new_hl2e, _PAGE_PRESENT) )
- {
- perfc_incrc(validate_hl2e_changes);
-
- if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
- !get_page(mfn_to_page(l1e_get_pfn(new_hl2e)), d) )
- new_hl2e = l1e_empty();
- if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
- {
- put_page(mfn_to_page(l1e_get_pfn(old_hl2e)));
- need_flush = 1;
- }
- }
-
- *shadow_hl2e_p = new_hl2e;
-
- return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pde_change(
- struct domain *d,
- guest_l2_pgentry_t new_gpde,
- l2_pgentry_t *shadow_pde_p)
-{
- l2_pgentry_t old_spde, new_spde;
- int need_flush = 0;
-
- perfc_incrc(validate_pde_calls);
-
- old_spde = *shadow_pde_p;
- l2pde_propagate_from_guest(d, &new_gpde, &new_spde);
-
- // Only do the ref counting if something important changed.
- //
-    if ( ((l2e_get_intpte(old_spde) | l2e_get_intpte(new_spde)) & _PAGE_PRESENT) &&
- l2e_has_changed(old_spde, new_spde, _PAGE_PRESENT) )
- {
- perfc_incrc(validate_pde_changes);
-
- if ( (l2e_get_flags(new_spde) & _PAGE_PRESENT) &&
- !get_shadow_ref(l2e_get_pfn(new_spde)) )
- BUG();
- if ( l2e_get_flags(old_spde) & _PAGE_PRESENT )
- {
- put_shadow_ref(l2e_get_pfn(old_spde));
- need_flush = 1;
- }
- }
-
- *shadow_pde_p = new_spde;
-
- return need_flush;
-}
-
-/*********************************************************************/
-
-#if SHADOW_HASH_DEBUG
-
-static void shadow_audit(struct domain *d, int print)
-{
- int live = 0, free = 0, j = 0, abs;
- struct shadow_status *a;
-
- for ( j = 0; j < shadow_ht_buckets; j++ )
- {
- a = &d->arch.shadow_ht[j];
- if ( a->gpfn_and_flags )
- {
- live++;
- ASSERT(a->smfn);
- }
- else
- ASSERT(!a->next);
-
- a = a->next;
- while ( a && (live < 9999) )
- {
- live++;
- if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
- {
- printk("XXX live=%d gpfn+flags=%lx sp=%lx next=%p\n",
- live, a->gpfn_and_flags, a->smfn, a->next);
- BUG();
- }
- ASSERT(a->smfn);
- a = a->next;
- }
- ASSERT(live < 9999);
- }
-
- for ( a = d->arch.shadow_ht_free; a != NULL; a = a->next )
- free++;
-
- if ( print )
- printk("Xlive=%d free=%d\n", live, free);
-
- // BUG: this only works if there's only a single domain which is
- // using shadow tables.
- //
- abs = (
- perfc_value(shadow_l1_pages) +
- perfc_value(shadow_l2_pages) +
- perfc_value(hl2_table_pages) +
- perfc_value(snapshot_pages) +
- perfc_value(writable_pte_predictions)
- ) - live;
-#ifdef PERF_COUNTERS
- if ( (abs < -1) || (abs > 1) )
- {
- printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d
writable_ptes=%d\n",
- live, free,
- perfc_value(shadow_l1_pages),
- perfc_value(shadow_l2_pages),
- perfc_value(hl2_table_pages),
- perfc_value(snapshot_pages),
- perfc_value(writable_pte_predictions));
- BUG();
- }
-#endif
-
- // XXX ought to add some code to audit the out-of-sync entries, too.
- //
-}
-#else
-#define shadow_audit(p, print) ((void)0)
-#endif
-
-
-static inline struct shadow_status *hash_bucket(
- struct domain *d, unsigned int gpfn)
-{
- return &d->arch.shadow_ht[gpfn % shadow_ht_buckets];
-}
-
-
-/*
- * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
- * which, depending on full shadow mode, may or may not equal
- * its mfn).
- * It returns the shadow's mfn, or zero if it doesn't exist.
- */
-static inline unsigned long __shadow_status(
- struct domain *d, unsigned long gpfn, unsigned long stype)
-{
- struct shadow_status *p, *x, *head;
- shadow_key_t key;
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( d->arch.ops->guest_paging_levels == PAGING_L3 && stype == PGT_l4_shadow )
- key = gpfn | stype | index_to_key(get_cr3_idxval(current));
- else
-#endif
- key = gpfn | stype;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(gpfn == (gpfn & PGT_mfn_mask));
- ASSERT(stype && !(stype & ~PGT_type_mask));
-
- perfc_incrc(shadow_status_calls);
-
- x = head = hash_bucket(d, gpfn);
- p = NULL;
-
- shadow_audit(d, 0);
-
- do
- {
- ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
-
- if ( x->gpfn_and_flags == key )
- {
-#if SHADOW_DEBUG
- if ( unlikely(shadow_status_noswap) )
- return x->smfn;
-#endif
- /* Pull-to-front if 'x' isn't already the head item. */
- if ( unlikely(x != head) )
- {
- /* Delete 'x' from list and reinsert immediately after head. */
- p->next = x->next;
- x->next = head->next;
- head->next = x;
-
- /* Swap 'x' contents with head contents. */
- SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
- SWAP(head->smfn, x->smfn);
- }
- else
- {
- perfc_incrc(shadow_status_hit_head);
- }
-
- return head->smfn;
- }
-
- p = x;
- x = x->next;
- }
- while ( x != NULL );
-
- perfc_incrc(shadow_status_miss);
- return 0;
-}
-
-/*
- * Not clear if pull-to-front is worth while for this or not,
- * as it generally needs to scan the entire bucket anyway.
- * Much simpler without.
- *
- * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
- */
-static inline u32
-shadow_max_pgtable_type(struct domain *d, unsigned long gpfn,
- unsigned long *smfn)
-{
- struct shadow_status *x;
- u32 pttype = PGT_none, type;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-
- perfc_incrc(shadow_max_type);
-
- x = hash_bucket(d, gpfn);
-
- while ( x && x->gpfn_and_flags )
- {
- if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
- {
- type = x->gpfn_and_flags & PGT_type_mask;
-
- switch ( type )
- {
- case PGT_hl2_shadow:
- // Treat an HL2 as if it's an L1
- //
- type = PGT_l1_shadow;
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
-            // Ignore snapshots -- they don't in and of themselves constitute
- // treating a page as a page table
- //
- goto next;
- case PGT_base_page_table:
- // Early exit if we found the max possible value
- //
- return type;
- default:
- break;
- }
-
- if ( type > pttype )
- {
- pttype = type;
- if ( smfn )
- *smfn = x->smfn;
- }
- }
- next:
- x = x->next;
- }
-
- return pttype;
-}
-
-static inline void delete_shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned int stype, u64 index)
-{
- struct shadow_status *p, *x, *n, *head;
-
- shadow_key_t key = gpfn | stype | index_to_key(index);
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(!(gpfn & ~PGT_mfn_mask));
- ASSERT(stype && !(stype & ~PGT_type_mask));
-
- head = hash_bucket(d, gpfn);
-
- SH_VLOG("delete gpfn=%lx t=%08x bucket=%p", gpfn, stype, head);
- shadow_audit(d, 0);
-
- /* Match on head item? */
- if ( head->gpfn_and_flags == key )
- {
- if ( (n = head->next) != NULL )
- {
- /* Overwrite head with contents of following node. */
- head->gpfn_and_flags = n->gpfn_and_flags;
- head->smfn = n->smfn;
-
- /* Delete following node. */
- head->next = n->next;
-
- /* Add deleted node to the free list. */
- n->gpfn_and_flags = 0;
- n->smfn = 0;
- n->next = d->arch.shadow_ht_free;
- d->arch.shadow_ht_free = n;
- }
- else
- {
- /* This bucket is now empty. Initialise the head node. */
- head->gpfn_and_flags = 0;
- head->smfn = 0;
- }
-
- goto found;
- }
-
- p = head;
- x = head->next;
-
- do
- {
- if ( x->gpfn_and_flags == key )
- {
- /* Delete matching node. */
- p->next = x->next;
-
- /* Add deleted node to the free list. */
- x->gpfn_and_flags = 0;
- x->smfn = 0;
- x->next = d->arch.shadow_ht_free;
- d->arch.shadow_ht_free = x;
-
- goto found;
- }
-
- p = x;
- x = x->next;
- }
- while ( x != NULL );
-
- /* If we got here, it wasn't in the list! */
- BUG();
-
- found:
- // release ref to page
- if ( stype != PGT_writable_pred )
- put_page(mfn_to_page(gmfn));
-
- shadow_audit(d, 0);
-}
-
-static inline void set_shadow_status(
- struct domain *d, unsigned long gpfn, unsigned long gmfn,
- unsigned long smfn, unsigned long stype, u64 index)
-{
- struct shadow_status *x, *head, *extra;
- int i;
-
- shadow_key_t key = gpfn | stype | index_to_key(index);
-
- SH_VVLOG("set gpfn=%lx gmfn=%lx smfn=%lx t=%lx", gpfn, gmfn, smfn, stype);
-
- ASSERT(shadow_lock_is_acquired(d));
-
- ASSERT(shadow_mode_translate(d) || gpfn);
- ASSERT(!(gpfn & ~PGT_mfn_mask));
-
- // XXX - need to be more graceful.
- ASSERT(VALID_MFN(gmfn));
-
- ASSERT(stype && !(stype & ~PGT_type_mask));
-
- x = head = hash_bucket(d, gpfn);
-
- SH_VLOG("set gpfn=%lx smfn=%lx t=%lx bucket=%p(%p)",
- gpfn, smfn, stype, x, x->next);
- shadow_audit(d, 0);
-
- // grab a reference to the guest page to represent the entry in the shadow
- // hash table
- //
- // XXX - Should PGT_writable_pred grab a page ref?
- // - Who/how are these hash table entry refs flushed if/when a page
- // is given away by the domain?
- //
- if ( stype != PGT_writable_pred )
- get_page(mfn_to_page(gmfn), d);
-
- /*
- * STEP 1. If page is already in the table, update it in place.
- */
- do
- {
- if ( unlikely(x->gpfn_and_flags == key) )
- {
- if ( stype != PGT_writable_pred )
- BUG(); // we should never replace entries into the hash table
- x->smfn = smfn;
- if ( stype != PGT_writable_pred )
- put_page(mfn_to_page(gmfn)); // already had a ref...
- goto done;
- }
-
- x = x->next;
- }
- while ( x != NULL );
-
- /*
- * STEP 2. The page must be inserted into the table.
- */
-
- /* If the bucket is empty then insert the new page as the head item. */
- if ( head->gpfn_and_flags == 0 )
- {
- head->gpfn_and_flags = key;
- head->smfn = smfn;
- ASSERT(head->next == NULL);
- goto done;
- }
-
- /* We need to allocate a new node. Ensure the quicklist is non-empty. */
- if ( unlikely(d->arch.shadow_ht_free == NULL) )
- {
- SH_VLOG("Allocate more shadow hashtable blocks.");
-
- extra = xmalloc_bytes(
- sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
- /* XXX Should be more graceful here. */
- if ( extra == NULL )
- BUG();
-
- memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
- /* Record the allocation block so it can be correctly freed later. */
- d->arch.shadow_extras_count++;
- *((struct shadow_status **)&extra[shadow_ht_extra_size]) =
- d->arch.shadow_ht_extras;
- d->arch.shadow_ht_extras = &extra[0];
-
- /* Thread a free chain through the newly-allocated nodes. */
- for ( i = 0; i < (shadow_ht_extra_size - 1); i++ )
- extra[i].next = &extra[i+1];
- extra[i].next = NULL;
-
- /* Add the new nodes to the free list. */
- d->arch.shadow_ht_free = &extra[0];
- }
-
- /* Allocate a new node from the quicklist. */
- x = d->arch.shadow_ht_free;
- d->arch.shadow_ht_free = x->next;
-
- /* Initialise the new node and insert directly after the head item. */
- x->gpfn_and_flags = key;
- x->smfn = smfn;
- x->next = head->next;
- head->next = x;
-
- done:
- shadow_audit(d, 0);
-
- if ( stype <= PGT_l4_shadow )
- {
- // add to front of list of pages to check when removing write
- // permissions for a page...
- //
- }
-}
-
-/************************************************************************/
-
-static inline void guest_physmap_add_page(
- struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
- struct domain_mmap_cache c1, c2;
-
- if ( likely(!shadow_mode_translate(d)) )
- return;
-
- domain_mmap_cache_init(&c1);
- domain_mmap_cache_init(&c2);
- shadow_lock(d);
- shadow_sync_and_drop_references(d, mfn_to_page(mfn));
- set_p2m_entry(d, gpfn, mfn, &c1, &c2);
- set_gpfn_from_mfn(mfn, gpfn);
- shadow_unlock(d);
- domain_mmap_cache_destroy(&c1);
- domain_mmap_cache_destroy(&c2);
-}
-
-static inline void guest_physmap_remove_page(
- struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
- struct domain_mmap_cache c1, c2;
- unsigned long type;
-
- if ( likely(!shadow_mode_translate(d)) )
- return;
-
- domain_mmap_cache_init(&c1);
- domain_mmap_cache_init(&c2);
- shadow_lock(d);
- shadow_sync_and_drop_references(d, mfn_to_page(mfn));
- while ( (type = shadow_max_pgtable_type(d, gpfn, NULL)) != PGT_none )
- free_shadow_page(__shadow_status(d, gpfn, type));
- set_p2m_entry(d, gpfn, -1, &c1, &c2);
- set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
- shadow_unlock(d);
- domain_mmap_cache_destroy(&c1);
- domain_mmap_cache_destroy(&c2);
-}
-
-/************************************************************************/
-
-void static inline
-shadow_update_min_max(unsigned long smfn, int index)
-{
- struct page_info *sl1page = mfn_to_page(smfn);
- u32 min_max = sl1page->tlbflush_timestamp;
- int min = SHADOW_MIN(min_max);
- int max = SHADOW_MAX(min_max);
- int update = 0;
-
- if ( index < min )
- {
- min = index;
- update = 1;
- }
- if ( index > max )
- {
- max = index;
- update = 1;
- }
- if ( update )
- sl1page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max);
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_map_l1_into_current_l2(unsigned long va);
-
-void static inline
-shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l2_pgentry_t sl2e = {0};
-
- __shadow_get_l2e(v, va, &sl2e);
- if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
- {
- /*
- * Either the L1 is not shadowed, or the shadow isn't linked into
- * the current shadow L2.
- */
- if ( create_l1_shadow )
- {
- perfc_incrc(shadow_set_l1e_force_map);
- shadow_map_l1_into_current_l2(va);
- }
- else /* check to see if it exists; if so, link it in */
- {
- l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)];
- unsigned long gl1pfn = l2e_get_pfn(gpde);
- unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
-
- ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT );
-
- if ( sl1mfn )
- {
- perfc_incrc(shadow_set_l1e_unlinked);
- if ( !get_shadow_ref(sl1mfn) )
- BUG();
- l2pde_general(d, &gpde, &sl2e, sl1mfn);
- __guest_set_l2e(v, va, gpde);
- __shadow_set_l2e(v, va, sl2e);
- }
- else
- {
- // no shadow exists, so there's nothing to do.
- perfc_incrc(shadow_set_l1e_fail);
- return;
- }
- }
- }
-
- __shadow_get_l2e(v, va, &sl2e);
-
- if ( shadow_mode_refcounts(d) )
- {
- l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)];
-
- // only do the ref counting if something important changed.
- //
- if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
- {
- if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(new_spte, d) )
- new_spte = l1e_empty();
- if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
- shadow_put_page_from_l1e(old_spte, d);
- }
-
- }
-
- set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
- shadow_linear_pg_table[l1_linear_offset(va)] = new_spte;
- shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
-}
-#endif
-/************************************************************************/
-
-static inline int
-shadow_mode_page_writable(unsigned long va, struct cpu_user_regs *regs, unsigned long gpfn)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- unsigned long mfn = gmfn_to_mfn(d, gpfn);
- u32 type = mfn_to_page(mfn)->u.inuse.type_info & PGT_type_mask;
-
- if ( shadow_mode_refcounts(d) &&
- (type == PGT_writable_page) )
- type = shadow_max_pgtable_type(d, gpfn, NULL);
-
- // Strange but true: writable page tables allow kernel-mode access
- // to L1 page table pages via write-protected PTEs... Similarly, write
- // access to all page table pages is granted for shadow_mode_write_all
- // clients.
- //
- if ( ((shadow_mode_write_l1(d) && (type == PGT_l1_page_table)) ||
- (shadow_mode_write_all(d) && type && (type <= PGT_l4_page_table))) &&
- ((va < HYPERVISOR_VIRT_START)
-#if defined(__x86_64__)
- || (va >= HYPERVISOR_VIRT_END)
-#endif
- ) &&
- guest_kernel_mode(v, regs) )
- return 1;
-
- return 0;
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline l1_pgentry_t gva_to_gpte(unsigned long gva)
-{
- l2_pgentry_t gpde;
- l1_pgentry_t gpte;
- struct vcpu *v = current;
-
- ASSERT( shadow_mode_translate(current->domain) );
-
- __guest_get_l2e(v, gva, &gpde);
- if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
- return l1e_empty();;
-
- // This is actually overkill - we only need to make sure the hl2
- // is in-sync.
- //
- shadow_sync_va(v, gva);
-
- if ( unlikely(__copy_from_user(&gpte,
- &linear_pg_table[gva >> PAGE_SHIFT],
- sizeof(gpte))) )
- {
- FSH_LOG("gva_to_gpte got a fault on gva=%lx", gva);
- return l1e_empty();
- }
-
- return gpte;
-}
-
-static inline unsigned long gva_to_gpa(unsigned long gva)
-{
- l1_pgentry_t gpte;
-
- gpte = gva_to_gpte(gva);
- if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
- return 0;
-
- return l1e_get_paddr(gpte) + (gva & ~PAGE_MASK);
-}
-#endif
-
-static inline unsigned long gva_to_mfn(unsigned long gva)
-{
- unsigned long gpa = gva_to_gpa(gva);
- return get_mfn_from_gpfn(gpa >> PAGE_SHIFT);
-}
-
-/************************************************************************/
-
-extern void __update_pagetables(struct vcpu *v);
-static inline void update_pagetables(struct vcpu *v)
-{
- struct domain *d = v->domain;
- int paging_enabled;
-
- if ( hvm_guest(v) )
- paging_enabled = hvm_paging_enabled(v);
- else
- // HACK ALERT: there's currently no easy way to figure out if a domU
- // has set its arch.guest_table to zero, vs not yet initialized it.
- //
- paging_enabled = !!pagetable_get_paddr(v->arch.guest_table);
-
- /*
- * We don't call __update_pagetables() when hvm guest paging is
- * disabled as we want the linear_pg_table to be inaccessible so that
- * we bail out early of shadow_fault() if the hvm guest tries illegal
- * accesses while it thinks paging is turned off.
- */
- if ( unlikely(shadow_mode_enabled(d)) && paging_enabled )
- {
- shadow_lock(d);
- __update_pagetables(v);
- shadow_unlock(d);
- }
-
- if ( likely(!shadow_mode_external(d)) )
- {
- if ( shadow_mode_enabled(d) )
- v->arch.monitor_table = v->arch.shadow_table;
- else
-#if CONFIG_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) )
- v->arch.monitor_table = v->arch.guest_table_user;
- else
-#endif
- v->arch.monitor_table = v->arch.guest_table;
- }
-}
-
-void clear_all_shadow_status(struct domain *d);
-
-#if SHADOW_DEBUG
-extern int _check_pagetable(struct vcpu *v, char *s);
-extern int _check_all_pagetables(struct vcpu *v, char *s);
-
-#define check_pagetable(_v, _s) _check_pagetable(_v, _s)
-//#define check_pagetable(_v, _s) _check_all_pagetables(_v, _s)
-
-#else
-#define check_pagetable(_v, _s) ((void)0)
-#endif
-
-#endif /* XEN_SHADOW_H */
+#endif /* _XEN_SHADOW_H */
/*
* Local variables:
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/x86_32/page-2level.h
--- a/xen/include/asm-x86/x86_32/page-2level.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/x86_32/page-2level.h Wed Aug 16 17:02:35 2006 +0100
@@ -46,6 +46,7 @@ typedef l2_pgentry_t root_pgentry_t;
* 12-bit flags = (pte[11:0])
*/
+#define _PAGE_NX_BIT 0U
#define _PAGE_NX 0U
/* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/x86_32/page-3level.h
--- a/xen/include/asm-x86/x86_32/page-3level.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/x86_32/page-3level.h Wed Aug 16 17:02:35 2006 +0100
@@ -59,7 +59,8 @@ typedef l3_pgentry_t root_pgentry_t;
* 32-bit flags = (pte[63:44],pte[11:0])
*/
-#define _PAGE_NX (cpu_has_nx ? (1<<31) : 0)
+#define _PAGE_NX_BIT (1U<<31)
+#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0)
/* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */
#define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/x86_64/page.h
--- a/xen/include/asm-x86/x86_64/page.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/x86_64/page.h Wed Aug 16 17:02:35 2006 +0100
@@ -44,6 +44,8 @@ typedef l4_pgentry_t root_pgentry_t;
/* Given a virtual address, get an entry offset into a linear page table. */
#define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT)
#define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT)
+#define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT)
+#define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT)
#define is_guest_l1_slot(_s) (1)
#define is_guest_l2_slot(_t, _s) (1)
@@ -70,7 +72,8 @@ typedef l4_pgentry_t root_pgentry_t;
#define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF))
/* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
-#define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U)
+#define _PAGE_NX_BIT (1U<<23)
+#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0U)
#define L1_DISALLOW_MASK BASE_DISALLOW_MASK
#define L2_DISALLOW_MASK BASE_DISALLOW_MASK
diff -r fda70200da01 -r 0f917d63e960 xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/public/dom0_ops.h Wed Aug 16 17:02:35 2006 +0100
@@ -262,6 +262,18 @@ DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t)
#define DOM0_SHADOW_CONTROL_OP_CLEAN 11
#define DOM0_SHADOW_CONTROL_OP_PEEK 12
+/* Shadow2 operations */
+#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION 30
+#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION 31
+#define DOM0_SHADOW2_CONTROL_OP_ENABLE 32
+
+/* Mode flags for Shadow2 enable op */
+#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE (1 << 0)
+#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT (1 << 1)
+#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2)
+#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3)
+#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL (1 << 4)
+
struct dom0_shadow_control_stats {
uint32_t fault_count;
uint32_t dirty_count;
@@ -277,7 +289,9 @@ struct dom0_shadow_control {
uint32_t op;
XEN_GUEST_HANDLE(ulong) dirty_bitmap;
/* IN/OUT variables. */
- uint64_t pages; /* size of buffer, updated with actual size */
+ uint64_t pages; /* size of buffer, updated with actual size */
+ uint32_t mb; /* Shadow2 memory allocation in MB */
+ uint32_t mode; /* Shadow2 mode to enable */
/* OUT variables. */
struct dom0_shadow_control_stats stats;
};
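
As an illustration of how the new Shadow2 control fields fit together, a caller picks one of the DOM0_SHADOW2_CONTROL_OP_* values and, for the enable op, ORs together the mode flags defined above. The sketch below is a minimal, hypothetical fragment; the flag combination and the surrounding hypercall plumbing are assumptions, not something this header mandates.

    /* Hypothetical sketch: request Shadow2 in a fully-virtualised mode.
     * The particular flag combination is an assumption for illustration. */
    struct dom0_shadow_control sc = { 0 };
    sc.op   = DOM0_SHADOW2_CONTROL_OP_ENABLE;
    sc.mode = DOM0_SHADOW2_CONTROL_FLAG_ENABLE    |
              DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT  |
              DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE |
              DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL;
    /* sc is then handed to Xen through the existing dom0_op path. */
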
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/domain_page.h
--- a/xen/include/xen/domain_page.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/domain_page.h Wed Aug 16 17:02:35 2006 +0100
@@ -25,6 +25,13 @@ extern void *map_domain_page(unsigned lo
* currently-executing VCPU via a call to map_domain_pages().
*/
extern void unmap_domain_page(void *va);
+
+/*
+ * Convert a VA (within a page previously mapped in the context of the
+ * currently-executing VCPU via a call to map_domain_pages()) to a machine
+ * address
+ */
+extern paddr_t mapped_domain_page_to_maddr(void *va);
/*
* Similar to the above calls, except the mapping is accessible in all
@@ -98,6 +105,7 @@ domain_mmap_cache_destroy(struct domain_
#define map_domain_page(pfn) maddr_to_virt((pfn)<<PAGE_SHIFT)
#define unmap_domain_page(va) ((void)(va))
+#define mapped_domain_page_to_maddr(va) (virt_to_maddr(va))
#define map_domain_page_global(pfn) maddr_to_virt((pfn)<<PAGE_SHIFT)
#define unmap_domain_page_global(va) ((void)(va))
@@ -112,4 +120,9 @@ struct domain_mmap_cache {
#endif /* !CONFIG_DOMAIN_PAGE */
+#define HERE_I_AM \
+do { \
+ printk("HERE I AM: %s %s %d\n", __func__, __FILE__, __LINE__); \
+} while (0)
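
A quick usage sketch of the debugging macro above; the function name is hypothetical and only shows where the macro would be dropped in during bring-up.

    static void sh2_example_probe(void)  /* hypothetical function, illustration only */
    {
        HERE_I_AM;  /* prints "HERE I AM: sh2_example_probe <file> <line>" */
    }
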
+
#endif /* __XEN_DOMAIN_PAGE_H__ */
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/lib.h Wed Aug 16 17:02:35 2006 +0100
@@ -18,7 +18,7 @@ extern void __bug(char *file, int line)
#ifndef NDEBUG
#define ASSERT(_p) \
do { \
- if ( !(_p) ) \
+ if ( unlikely(!(_p)) ) \
{ \
printk("Assertion '%s' failed, line %d, file %s\n", #_p , \
__LINE__, __FILE__); \
@@ -41,7 +41,7 @@ void cmdline_parse(char *cmdline);
void cmdline_parse(char *cmdline);
#ifndef NDEBUG
-extern int debugtrace_send_to_console;
+extern void debugtrace_toggle(void);
extern void debugtrace_dump(void);
extern void debugtrace_printk(const char *fmt, ...);
#else
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/list.h
--- a/xen/include/xen/list.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/list.h Wed Aug 16 17:02:35 2006 +0100
@@ -160,6 +160,16 @@ static __inline__ void list_splice(struc
#define list_for_each_safe(pos, n, head) \
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)
+
+/**
+ * list_for_each_backwards_safe - iterate backwards over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop counter.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_backwards_safe(pos, n, head) \
+ for (pos = (head)->prev, n = pos->prev; pos != (head); \
+ pos = n, n = pos->prev)
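
A usage sketch of the new macro, mirroring how the shadow2 code walks its list of top-level shadows from the tail; deleting pos inside the body is safe because n was captured before the body runs.

    struct list_head *pos, *n;

    list_for_each_backwards_safe(pos, n, &d->arch.shadow2_toplevel_shadows)
    {
        struct page_info *pg = list_entry(pos, struct page_info, list);
        /* ... it is safe to list_del(pos) here; 'n' already points at
         * the previous element, so iteration continues correctly ... */
    }
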
/**
* list_for_each_entry - iterate over list of given type
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/sched.h Wed Aug 16 17:02:35 2006 +0100
@@ -376,9 +376,12 @@ extern struct domain *domain_list;
/* VCPU is paused by the hypervisor? */
#define _VCPUF_paused 11
#define VCPUF_paused (1UL<<_VCPUF_paused)
- /* VCPU is blocked awaiting an event to be consumed by Xen. */
+/* VCPU is blocked awaiting an event to be consumed by Xen. */
#define _VCPUF_blocked_in_xen 12
#define VCPUF_blocked_in_xen (1UL<<_VCPUF_blocked_in_xen)
+ /* HVM vcpu thinks CR0.PG == 0 */
+#define _VCPUF_shadow2_translate 13
+#define VCPUF_shadow2_translate (1UL<<_VCPUF_shadow2_translate)
/*
* Per-domain flags (domain_flags).
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/shadow2-common.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/shadow2-common.c Wed Aug 16 17:02:35 2006 +0100
@@ -0,0 +1,3394 @@
+/******************************************************************************
+ * arch/x86/shadow2-common.c
+ *
+ * Shadow2 code that does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+
+#if SHADOW2_AUDIT
+int shadow2_audit_enable = 0;
+#endif
+
+static void sh2_free_log_dirty_bitmap(struct domain *d);
+
+int _shadow2_mode_refcounts(struct domain *d)
+{
+ return shadow2_mode_refcounts(d);
+}
+
+
+/**************************************************************************/
+/* x86 emulator support for the shadow2 code
+ */
+
+static int
+sh2_x86_emulate_read_std(unsigned long addr,
+ unsigned long *val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+ if ( hvm_guest(v) )
+ {
+ *val = 0;
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that is only a user vs supervisor access check.
+ //
+ if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
+ {
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id,
+ addr, *val, bytes);
+#endif
+ return X86EMUL_CONTINUE;
+ }
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating. */
+ SHADOW2_PRINTK("read failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_write_std(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that includes user vs supervisor, and
+ // write access.
+ //
+ if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
+ return X86EMUL_CONTINUE;
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating,
+ * which should be handled by sh2_x86_emulate_write_emulated. */
+ SHADOW2_PRINTK("write failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_write_emulated(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt);
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
+ unsigned long old,
+ unsigned long new,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new,
+ bytes, ctxt);
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
+ v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
+ new_hi, new_lo);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
+ new_lo, new_hi, ctxt);
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+
+struct x86_emulate_ops shadow2_emulator_ops = {
+ .read_std = sh2_x86_emulate_read_std,
+ .write_std = sh2_x86_emulate_write_std,
+ .read_emulated = sh2_x86_emulate_read_std,
+ .write_emulated = sh2_x86_emulate_write_emulated,
+ .cmpxchg_emulated = sh2_x86_emulate_cmpxchg_emulated,
+ .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
+};
+
+
+/**************************************************************************/
+/* Code for "promoting" a guest page to the point where the shadow code is
+ * willing to let it be treated as a guest page table. This generally
+ * involves making sure there are no writable mappings available to the guest
+ * for this page.
+ */
+void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ unsigned long type_info;
+
+ ASSERT(valid_mfn(gmfn));
+
+ /* We should never try to promote a gmfn that has writeable mappings */
+ ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
+
+ // Is the page already shadowed?
+ if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+ {
+ // No prior shadow exists...
+
+ // Grab a type-ref. We don't really care if we are racing with another
+ // vcpu or not, or even what kind of type we get; we just want the type
+ // count to be > 0.
+ //
+ do {
+ type_info =
+ page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
+ } while ( !get_page_type(page, type_info) );
+
+ // Now that the type ref is non-zero, we can safely use the
+ // shadow2_flags.
+ //
+ page->shadow2_flags = 0;
+ }
+
+ ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+ set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+}
+
+void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+
+ ASSERT(test_bit(_PGC_page_table, &page->count_info));
+ ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+
+ clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+
+ if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
+ {
+ // release the extra type ref
+ put_page_type(page);
+
+ // clear the is-a-page-table bit.
+ clear_bit(_PGC_page_table, &page->count_info);
+ }
+}
+
+/**************************************************************************/
+/* Validate a pagetable change from the guest and update the shadows.
+ * Returns a bitmask of SHADOW2_SET_* flags. */
+
+static int
+__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+{
+ int result = 0;
+ struct page_info *page = mfn_to_page(gmfn);
+
+ sh2_mark_dirty(v->domain, gmfn);
+
+ // Determine which types of shadows are affected, and update each.
+ //
+ // Always validate L1s before L2s to prevent another cpu with a linear
+ // mapping of this gmfn from seeing a walk that results from
+ // using the new L2 value and the old L1 value. (It is OK for such a
+ // guest to see a walk that uses the old L2 value with the new L1 value,
+ // as hardware could behave this way if one level of the pagewalk occurs
+ // before the store, and the next level of the pagewalk occurs after the
+ // store.)
+ //
+ // Ditto for L2s before L3s, etc.
+ //
+
+ if ( !(page->count_info & PGC_page_table) )
+ return 0; /* Not shadowed at all */
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow2_flags & SH2F_L1_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow2_flags & SH2F_L1_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow2_flags & SH2F_L2_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow2_flags & SH2F_L2_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( page->shadow2_flags & SH2F_L1_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L2_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L2H_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L3_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
+ (v, gmfn, entry, size);
+#else /* 32-bit non-PAE hypervisor does not support PAE guests */
+ ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( page->shadow2_flags & SH2F_L1_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L2_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L3_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L4_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
+ (v, gmfn, entry, size);
+#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
+ ASSERT((page->shadow2_flags
+ & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
+#endif
+
+ return result;
+}
+
+
+int
+shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
+/* This is the entry point from hypercalls. It returns a bitmask of all the
+ * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
+{
+ int rc;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
+ shadow2_audit_tables(v);
+ return rc;
+}
+
+void
+shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+/* This is the entry point for emulated writes to pagetables in HVM guests */
+{
+ struct domain *d = v->domain;
+ int rc;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
+ if ( rc & SHADOW2_SET_FLUSH )
+ {
+ // Flush everyone except the local processor, which will flush when it
+ // re-enters the HVM guest.
+ //
+ cpumask_t mask = d->domain_dirty_cpumask;
+ cpu_clear(v->processor, mask);
+ flush_tlb_mask(mask);
+ }
+ if ( rc & SHADOW2_SET_ERROR )
+ {
+ /* This page is probably not a pagetable any more: tear it out of the
+ * shadows, along with any tables that reference it */
+ shadow2_remove_all_shadows_and_parents(v, gmfn);
+ }
+ /* We ignore the other bits: since we are about to change CR3 on
+ * VMENTER we don't need to do any extra TLB flushes. */
+}
+
+
+/**************************************************************************/
+/* Memory management for shadow pages. */
+
+/* Meaning of the count_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * A count of all references to this page from other shadow pages and
+ * guest CR3s (a.k.a. v->arch.shadow_table).
+ *
+ * The top bits hold the shadow type and the pinned bit. Top-level
+ * shadows are pinned so that they don't disappear when not in a CR3
+ * somewhere.
+ *
+ * We don't need to use get|put_page for this as the updates are all
+ * protected by the shadow lock. We can't use get|put_page for this
+ * as the size of the count on shadow pages is different from that on
+ * normal guest pages.
+ */
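
As a small illustration of the count_info layout just described, the shadow type can be recovered with the PGC_SH2_* mask and shift that the rest of this file uses; the helper name below is hypothetical.

    /* Hypothetical helper, for illustration only. */
    static inline u32 sh2_page_shadow_type(struct page_info *pg)
    {
        /* The shadow type (and the pinned bit) live in the top bits of
         * count_info; the low bits hold the reference count. */
        return (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
    }
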
+
+/* Meaning of the type_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * type_info use depends on the shadow type (from count_info)
+ *
+ * PGC_SH2_none : This page is in the shadow2 free pool. type_info holds
+ * the chunk order for our freelist allocator.
+ *
+ * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info
+ * holds the mfn of the guest page being shadowed,
+ *
+ * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
+ * type_info holds the gfn being shattered.
+ *
+ * PGC_SH2_monitor_table : This page is part of a monitor table.
+ * type_info is not used.
+ */
+
+/* Meaning of the _domain field in shadow pages
+ * --------------------------------------------
+ *
+ * In shadow pages, this field will always have its least significant bit
+ * set. This ensures that all attempts to get_page() will fail (as all
+ * valid pickled domain pointers have a zero for their least significant bit).
+ * Instead, the remaining upper bits are used to record the shadow generation
+ * counter when the shadow was created.
+ */
+
+/* Meaning of the shadow2_flags field
+ * ----------------------------------
+ *
+ * In guest pages that are shadowed, one bit for each kind of shadow they have.
+ *
+ * In shadow pages, will be used for holding a representation of the populated
+ * entries in this shadow (either a min/max, or a bitmap, or ...)
+ *
+ * In monitor-table pages, holds the level of the particular page (to save
+ * spilling the shadow types into an extra bit by having three types of monitor
+ * page).
+ */
+
+/* Meaning of the list_head struct in shadow pages
+ * -----------------------------------------------
+ *
+ * In free shadow pages, this is used to hold the free-lists of chunks.
+ *
+ * In top-level shadow tables, this holds a linked-list of all top-level
+ * shadows (used for recovering memory and destroying shadows).
+ *
+ * In lower-level shadows, this holds the physical address of a higher-level
+ * shadow entry that holds a reference to this shadow (or zero).
+ */
+
+/* Allocating shadow pages
+ * -----------------------
+ *
+ * Most shadow pages are allocated singly, but there are two cases where we
+ * need to allocate multiple pages together.
+ *
+ * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
+ * A 32-bit guest l1 table covers 4MB of virtual address space,
+ * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
+ * of virtual address space each). Similarly, a 32-bit guest l2 table
+ * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
+ * each). These multi-page shadows are contiguous and aligned;
+ * functions for handling offsets into them are defined in shadow2.c
+ * (shadow_l1_index() etc.)
+ *
+ * 2: Shadowing PAE top-level pages. Each guest page that contains
+ * any PAE top-level pages requires two shadow pages to shadow it.
+ * They contain alternating l3 tables and pae_l3_bookkeeping structs.
+ *
+ * This table shows the allocation behaviour of the different modes:
+ *
+ * Xen paging 32b pae pae 64b 64b 64b
+ * Guest paging 32b 32b pae 32b pae 64b
+ * PV or HVM * HVM * HVM HVM *
+ * Shadow paging 32b pae pae pae pae 64b
+ *
+ * sl1 size 4k 8k 4k 8k 4k 4k
+ * sl2 size 4k 16k 4k 16k 4k 4k
+ * sl3 size - - 8k - 8k 4k
+ * sl4 size - - - - - 4k
+ *
+ * We allocate memory from xen in four-page units and break them down
+ * with a simple buddy allocator. Can't use the xen allocator to handle
+ * this as it only works for contiguous zones, and a domain's shadow
+ * pool is made of fragments.
+ *
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide
+ * a function for the p2m management to steal pages, in max-order chunks, from
+ * the free pool. We don't provide for giving them back, yet.
+ */
+
+/* Figure out the least acceptable quantity of shadow memory.
+ * The minimum memory requirement for always being able to free up a
+ * chunk of memory is very small -- only three max-order chunks per
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.
+ *
+ * But for a guest to be guaranteed to successfully execute a single
+ * instruction, we must be able to map a large number (about thirty) VAs
+ * at the same time, which means that to guarantee progress, we must
+ * allow for more than ninety allocated pages per vcpu. We round that
+ * up to 128 pages, or half a megabyte per vcpu. */
+unsigned int shadow2_min_acceptable_pages(struct domain *d)
+{
+ u32 vcpu_count = 0;
+ struct vcpu *v;
+
+ for_each_vcpu(d, v)
+ vcpu_count++;
+
+ return (vcpu_count * 128);
+}
+
+/* Using the type_info field to store freelist order */
+#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
+#define SH2_SET_PFN_ORDER(_p, _o) \
+ do { (_p)->u.inuse.type_info = (_o); } while (0)
+
+
+/* Figure out the order of allocation needed for a given shadow type */
+static inline u32
+shadow_order(u32 shadow_type)
+{
+#if CONFIG_PAGING_LEVELS > 2
+ static const u32 type_to_order[16] = {
+ 0, /* PGC_SH2_none */
+ 1, /* PGC_SH2_l1_32_shadow */
+ 1, /* PGC_SH2_fl1_32_shadow */
+ 2, /* PGC_SH2_l2_32_shadow */
+ 0, /* PGC_SH2_l1_pae_shadow */
+ 0, /* PGC_SH2_fl1_pae_shadow */
+ 0, /* PGC_SH2_l2_pae_shadow */
+ 0, /* PGC_SH2_l2h_pae_shadow */
+ 1, /* PGC_SH2_l3_pae_shadow */
+ 0, /* PGC_SH2_l1_64_shadow */
+ 0, /* PGC_SH2_fl1_64_shadow */
+ 0, /* PGC_SH2_l2_64_shadow */
+ 0, /* PGC_SH2_l3_64_shadow */
+ 0, /* PGC_SH2_l4_64_shadow */
+ 2, /* PGC_SH2_p2m_table */
+ 0 /* PGC_SH2_monitor_table */
+ };
+ u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
+ return type_to_order[type];
+#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
+ return 0;
+#endif
+}
+
+
+/* Do we have a free chunk of at least this order? */
+static inline int chunk_is_available(struct domain *d, int order)
+{
+ int i;
+
+ for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+ return 1;
+ return 0;
+}
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
+ {
+ case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
+#else
+ SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
+#endif
+ break;
+#if CONFIG_PAGING_LEVELS >= 3
+ case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
+ break;
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
+ break;
+#endif
+ default:
+ SHADOW2_PRINTK("top-level shadow has bad type %08lx\n",
+ (unsigned long)((pg->count_info & PGC_SH2_type_mask)
+ >> PGC_SH2_type_shift));
+ BUG();
+ }
+}
+
+
+/* Make sure there is at least one chunk of the required order available
+ * in the shadow page pool. This must be called before any calls to
+ * shadow2_alloc(). Since this will free existing shadows to make room,
+ * it must be called early enough to avoid freeing shadows that the
+ * caller is currently working on. */
+void shadow2_prealloc(struct domain *d, unsigned int order)
+{
+ /* Need a vcpu for calling unpins; for now, since we don't have
+ * per-vcpu shadows, any will do */
+ struct vcpu *v = d->vcpu[0];
+ struct list_head *l, *t;
+ struct page_info *pg;
+ mfn_t smfn;
+
+ if ( chunk_is_available(d, order) ) return;
+
+ /* Stage one: walk the list of top-level pages, unpinning them */
+ perfc_incrc(shadow2_prealloc_1);
+ list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
+ {
+ /* For PAE, we need to unpin each subshadow on this shadow */
+ SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
+ }
+ else
+#endif /* 32-bit code always takes this branch */
+ {
+ /* Unpin this top-level shadow */
+ sh2_unpin(v, smfn);
+ }
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Stage two: all shadow pages are in use in hierarchies that are
+ * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
+ * mappings. */
+ perfc_incrc(shadow2_prealloc_2);
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+ /* Walk the list from the tail: recently used toplevels have been pulled
+ * to the head */
+ list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+ shadow2_unhook_mappings(v, smfn);
+
+ /* Need to flush TLB if we've altered our own tables */
+ if ( !shadow2_mode_external(d)
+ && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
+ local_flush_tlb();
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Nothing more we can do: all remaining shadows are of pages that
+ * hold Xen mappings for some vcpu. This should never happen. */
+ SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
+ " shadow pages total = %u, free = %u, p2m=%u\n",
+ 1 << order,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ BUG();
+}
+
+
+/* Allocate another shadow's worth of (contiguous, aligned) pages,
+ * and fill in the type and backpointer fields of their page_infos.
+ * Never fails to allocate. */
+mfn_t shadow2_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer)
+{
+ struct page_info *pg = NULL;
+ unsigned int order = shadow_order(shadow_type);
+ cpumask_t mask;
+ void *p;
+ int i;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(order <= SHADOW2_MAX_ORDER);
+ ASSERT(shadow_type != PGC_SH2_none);
+ perfc_incrc(shadow2_alloc);
+
+ /* Find smallest order which can satisfy the request. */
+ for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+ {
+ pg = list_entry(d->arch.shadow2_freelists[i].next,
+ struct page_info, list);
+ list_del(&pg->list);
+
+ /* We may have to halve the chunk a number of times. */
+ while ( i != order )
+ {
+ i--;
+ SH2_SET_PFN_ORDER(pg, i);
+ list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]);
+ pg += 1 << i;
+ }
+ d->arch.shadow2_free_pages -= 1 << order;
+
+ /* Init page info fields and clear the pages */
+ for ( i = 0; i < 1<<order ; i++ )
+ {
+ pg[i].u.inuse.type_info = backpointer;
+ pg[i].count_info = shadow_type;
+ pg[i].shadow2_flags = 0;
+ INIT_LIST_HEAD(&pg[i].list);
+ /* Before we overwrite the old contents of this page,
+ * we need to be sure that no TLB holds a pointer to it. */
+ mask = d->domain_dirty_cpumask;
+ tlbflush_filter(mask, pg[i].tlbflush_timestamp);
+ if ( unlikely(!cpus_empty(mask)) )
+ {
+ perfc_incrc(shadow2_alloc_tlbflush);
+ flush_tlb_mask(mask);
+ }
+ /* Now safe to clear the page for reuse */
+ p = sh2_map_domain_page(page_to_mfn(pg+i));
+ ASSERT(p != NULL);
+ clear_page(p);
+ sh2_unmap_domain_page(p);
+ perfc_incr(shadow2_alloc_count);
+ }
+ return page_to_mfn(pg);
+ }
+
+ /* If we get here, we failed to allocate. This should never happen.
+ * It means that we didn't call shadow2_prealloc() correctly before
+ * we allocated. We can't recover by calling prealloc here, because
+ * we might free up higher-level pages that the caller is working on. */
+ SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
+ BUG();
+}
+
+
+/* Return some shadow pages to the pool. */
+void shadow2_free(struct domain *d, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ u32 shadow_type;
+ unsigned long order;
+ unsigned long mask;
+ int i;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ perfc_incrc(shadow2_free);
+
+ shadow_type = pg->count_info & PGC_SH2_type_mask;
+ ASSERT(shadow_type != PGC_SH2_none);
+ ASSERT(shadow_type != PGC_SH2_p2m_table);
+ order = shadow_order(shadow_type);
+
+ d->arch.shadow2_free_pages += 1 << order;
+
+ for ( i = 0; i < 1<<order; i++ )
+ {
+ /* Strip out the type: this is now a free shadow page */
+ pg[i].count_info = 0;
+ /* Remember the TLB timestamp so we will know whether to flush
+ * TLBs when we reuse the page. Because the destructors leave the
+ * contents of the pages in place, we can delay TLB flushes until
+ * just before the allocator hands the page out again. */
+ pg[i].tlbflush_timestamp = tlbflush_current_time();
+ perfc_decr(shadow2_alloc_count);
+ }
+
+ /* Merge chunks as far as possible. */
+ while ( order < SHADOW2_MAX_ORDER )
+ {
+ mask = 1 << order;
+ if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
+ /* Merge with predecessor block? */
+ if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGT_none)
+ || (SH2_PFN_ORDER(pg-mask) != order) )
+ break;
+ list_del(&(pg-mask)->list);
+ pg -= mask;
+ } else {
+ /* Merge with successor block? */
+ if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGT_none)
+ || (SH2_PFN_ORDER(pg+mask) != order) )
+ break;
+ list_del(&(pg+mask)->list);
+ }
+ order++;
+ }
+
+ SH2_SET_PFN_ORDER(pg, order);
+ list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]);
+}
+
+/* Divert some memory from the pool to be used by the p2m mapping.
+ * This action is irreversible: the p2m mapping only ever grows.
+ * That's OK because the p2m table only exists for external domains,
+ * and those domains can't ever turn off shadow mode.
+ * Also, we only ever allocate a max-order chunk, so as to preserve
+ * the invariant that shadow2_prealloc() always works.
+ * Returns 0 iff it can't get a chunk (the caller should then
+ * free up some pages in domheap and call set_sh2_allocation);
+ * returns non-zero on success.
+ */
+static int
+shadow2_alloc_p2m_pages(struct domain *d)
+{
+ struct page_info *pg;
+ u32 i;
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ if ( d->arch.shadow2_total_pages
+ < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
+ return 0; /* Not enough shadow memory: need to increase it first */
+
+ pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
+ d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER);
+ d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER);
+ for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
+ {
+ /* Unlike shadow pages, mark p2m pages as owned by the domain */
+ page_set_owner(&pg[i], d);
+ list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist);
+ }
+ return 1;
+}
+
+// Returns 0 if no memory is available...
+mfn_t
+shadow2_alloc_p2m_page(struct domain *d)
+{
+ struct list_head *entry;
+ mfn_t mfn;
+ void *p;
+
+ if ( list_empty(&d->arch.shadow2_p2m_freelist) &&
+ !shadow2_alloc_p2m_pages(d) )
+ return _mfn(0);
+ entry = d->arch.shadow2_p2m_freelist.next;
+ list_del(entry);
+ list_add_tail(entry, &d->arch.shadow2_p2m_inuse);
+ mfn = page_to_mfn(list_entry(entry, struct page_info, list));
+ sh2_get_ref(mfn, 0);
+ p = sh2_map_domain_page(mfn);
+ clear_page(p);
+ sh2_unmap_domain_page(p);
+
+ return mfn;
+}
+
+#if CONFIG_PAGING_LEVELS == 3
+static void p2m_install_entry_in_monitors(struct domain *d,
+ l3_pgentry_t *l3e)
+/* Special case, only used for external-mode domains on PAE hosts:
+ * update the mapping of the p2m table. Once again, this is trivial in
+ * other paging modes (one top-level entry points to the top-level p2m,
+ * no maintenance needed), but PAE makes life difficult by needing a
+ * copy the eight l3es of the p2m table in eight l2h slots in the
+ * monitor table. This function makes fresh copies when a p2m l3e
+ * changes. */
+{
+ l2_pgentry_t *ml2e;
+ struct vcpu *v;
+ unsigned int index;
+
+ index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
+ ASSERT(index < MACHPHYS_MBYTES>>1);
+
+ for_each_vcpu(d, v)
+ {
+ if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ continue;
+ ASSERT(shadow2_mode_external(v->domain));
+
+ SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
+ d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
+
+ if ( v == current ) /* OK to use linear map of monitor_table */
+ ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
+ else
+ {
+ l3_pgentry_t *ml3e;
+ ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
+ ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
+ ml2e += l2_table_offset(RO_MPT_VIRT_START);
+ sh2_unmap_domain_page(ml3e);
+ }
+ ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
+ if ( v != current )
+ sh2_unmap_domain_page(ml2e);
+ }
+}
+#endif
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// Returns NULL on error.
+//
+static l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, u32 shift, u32 max)
+{
+ u32 index;
+
+ index = *gfn_remainder >> shift;
+ if ( index >= max )
+ {
+ SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
+ "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+ gfn, *gfn_remainder, shift, index, max);
+ return NULL;
+ }
+ *gfn_remainder &= (1 << shift) - 1;
+ return (l1_pgentry_t *)table + index;
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+static int
+p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
+ unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
+ u32 max, unsigned long type)
+{
+ l1_pgentry_t *p2m_entry;
+ void *next;
+
+ if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+ shift, max)) )
+ return 0;
+
+ if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+ {
+ mfn_t mfn = shadow2_alloc_p2m_page(d);
+ if ( mfn_x(mfn) == 0 )
+ return 0;
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
+ mfn_to_page(mfn)->count_info = 1;
+#if CONFIG_PAGING_LEVELS == 3
+ if (type == PGT_l2_page_table)
+ {
+ /* We have written to the p2m l3: need to sync the per-vcpu
+ * copies of it in the monitor tables */
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
+ }
+#endif
+ /* The P2M can be shadowed: keep the shadows synced */
+ if ( d->vcpu[0] )
+ (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+ }
+ *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+ next = sh2_map_domain_page(*table_mfn);
+ sh2_unmap_domain_page(*table);
+ *table = next;
+
+ return 1;
+}
+
+// Returns 0 on error (out of memory)
+int
+shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+ // XXX -- this might be able to be faster iff current->domain == d
+ mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
+ void *table = sh2_map_domain_page(table_mfn);
+ unsigned long gfn_remainder = gfn;
+ l1_pgentry_t *p2m_entry;
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+ return 0;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ // When using PAE Xen, we only allow 33 bits of pseudo-physical
+ // address in translated guests (i.e. 8 GBytes). This restriction
+ // comes from wanting to map the P2M table into the 16MB RO_MPT hole
+ // in Xen's address space for translated PV guests.
+ //
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ (CONFIG_PAGING_LEVELS == 3
+ ? 8
+ : L3_PAGETABLE_ENTRIES),
+ PGT_l2_page_table) )
+ return 0;
+#endif
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+ return 0;
+
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ 0, L1_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+ if ( valid_mfn(mfn) )
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ else
+ *p2m_entry = l1e_empty();
+
+ /* The P2M can be shadowed: keep the shadows synced */
+ (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+
+ sh2_unmap_domain_page(table);
+
+ return 1;
+}
+
+// Allocate a new p2m table for a domain.
+//
+// The structure of the p2m table is that of a pagetable for xen (i.e. it is
+// controlled by CONFIG_PAGING_LEVELS).
+//
+// Returns 0 if p2m table could not be initialized
+//
+static int
+shadow2_alloc_p2m_table(struct domain *d)
+{
+ mfn_t p2m_top;
+ struct list_head *entry;
+ unsigned int page_count = 0;
+
+ SHADOW2_PRINTK("allocating p2m table\n");
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
+
+ p2m_top = shadow2_alloc_p2m_page(d);
+ if ( mfn_x(p2m_top) == 0 )
+ return 0;
+
+ mfn_to_page(p2m_top)->count_info = 1;
+ mfn_to_page(p2m_top)->u.inuse.type_info =
+#if CONFIG_PAGING_LEVELS == 4
+ PGT_l4_page_table
+#elif CONFIG_PAGING_LEVELS == 3
+ PGT_l3_page_table
+#elif CONFIG_PAGING_LEVELS == 2
+ PGT_l2_page_table
+#endif
+ | 1 | PGT_validated;
+
+ d->arch.phys_table = pagetable_from_mfn(p2m_top);
+
+ SHADOW2_PRINTK("populating p2m table\n");
+
+ for ( entry = d->page_list.next;
+ entry != &d->page_list;
+ entry = entry->next )
+ {
+ struct page_info *page = list_entry(entry, struct page_info, list);
+ mfn_t mfn = page_to_mfn(page);
+ unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
+ page_count++;
+ if (
+#ifdef __x86_64__
+ (gfn != 0x5555555555555555L)
+#else
+ (gfn != 0x55555555L)
+#endif
+ && gfn != INVALID_M2P_ENTRY
+ && !shadow2_set_p2m_entry(d, gfn, mfn) )
+ {
+ SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
SH2_PRI_mfn "\n",
+ gfn, mfn_x(mfn));
+ return 0;
+ }
+ }
+
+ SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
+ return 1;
+}
+
+mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+/* Read another domain's p2m entries */
+{
+ mfn_t mfn;
+ unsigned long addr = gpfn << PAGE_SHIFT;
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+
+ ASSERT(shadow2_mode_translate(d));
+ mfn = pagetable_get_mfn(d->arch.phys_table);
+
+
+#if CONFIG_PAGING_LEVELS > 2
+ if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
+ /* This pfn is higher than the p2m map can hold */
+ return _mfn(INVALID_MFN);
+#endif
+
+
+#if CONFIG_PAGING_LEVELS >= 4
+ {
+ l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
+ l4e += l4_table_offset(addr);
+ if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l4e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l4e_get_pfn(*l4e));
+ sh2_unmap_domain_page(l4e);
+ }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ {
+ l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
+ l3e += l3_table_offset(addr);
+ if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l3e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l3e_get_pfn(*l3e));
+ sh2_unmap_domain_page(l3e);
+ }
+#endif
+
+ l2e = sh2_map_domain_page(mfn);
+ l2e += l2_table_offset(addr);
+ if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l2e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l2e_get_pfn(*l2e));
+ sh2_unmap_domain_page(l2e);
+
+ l1e = sh2_map_domain_page(mfn);
+ l1e += l1_table_offset(addr);
+ if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l1e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l1e_get_pfn(*l1e));
+ sh2_unmap_domain_page(l1e);
+
+ return mfn;
+}
+
+unsigned long
+shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
+{
+ return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
+}
+
+
+static void shadow2_p2m_teardown(struct domain *d)
+/* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+{
+ struct list_head *entry, *n;
+ struct page_info *pg;
+
+ d->arch.phys_table = pagetable_null();
+
+ list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse)
+ {
+ pg = list_entry(entry, struct page_info, list);
+ list_del(entry);
+ /* Should have just the one ref we gave it in alloc_p2m_page() */
+ if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
+ {
+ SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+ pg->count_info, pg->u.inuse.type_info);
+ }
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation, since
+ * these pages were allocated without an owner. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow2_p2m_pages--;
+ perfc_decr(shadow2_alloc_count);
+ }
+ list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist)
+ {
+ list_del(entry);
+ pg = list_entry(entry, struct page_info, list);
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow2_p2m_pages--;
+ perfc_decr(shadow2_alloc_count);
+ }
+ ASSERT(d->arch.shadow2_p2m_pages == 0);
+}
+
+/* Set the pool of shadow pages to the required number of pages.
+ * Input will be rounded up to at least shadow2_min_acceptable_pages(),
+ * plus space for the p2m table.
+ * Returns 0 for success, non-zero for failure. */
+static unsigned int set_sh2_allocation(struct domain *d,
+ unsigned int pages,
+ int *preempted)
+{
+ struct page_info *pg;
+ unsigned int lower_bound;
+ int j;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ /* Don't allocate less than the minimum acceptable, plus one page per
+ * megabyte of RAM (for the p2m table) */
+ lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
+ if ( pages > 0 && pages < lower_bound )
+ pages = lower_bound;
+ /* Round up to largest block size */
+ pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
+
+ SHADOW2_PRINTK("current %i target %i\n",
+ d->arch.shadow2_total_pages, pages);
+
+ while ( d->arch.shadow2_total_pages != pages )
+ {
+ if ( d->arch.shadow2_total_pages < pages )
+ {
+ /* Need to allocate more memory from domheap */
+ pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0);
+ if ( pg == NULL )
+ {
+ SHADOW2_PRINTK("failed to allocate shadow pages.\n");
+ return -ENOMEM;
+ }
+ d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER;
+ d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER;
+ for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ )
+ {
+ pg[j].u.inuse.type_info = 0; /* Free page */
+ pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
+ }
+ SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
+ list_add_tail(&pg->list,
+ &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]);
+ }
+ else if ( d->arch.shadow2_total_pages > pages )
+ {
+ /* Need to return memory to domheap */
+ shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+ ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]));
+ pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next,
+ struct page_info, list);
+ list_del(&pg->list);
+ d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER;
+ d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER;
+ free_domheap_pages(pg, SHADOW2_MAX_ORDER);
+ }
+
+ /* Check to see if we need to yield and try again */
+ if ( preempted && hypercall_preempt_check() )
+ {
+ *preempted = 1;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+unsigned int shadow2_set_allocation(struct domain *d,
+ unsigned int megabytes,
+ int *preempted)
+/* Hypercall interface to set the shadow memory allocation */
+{
+ unsigned int rv;
+ shadow2_lock(d);
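+    /* The control interface specifies megabytes; convert to pages:
+     * MB << 20 is bytes, and >> PAGE_SHIFT turns bytes into pages. */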
+ rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
+ SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ shadow2_get_allocation(d));
+ shadow2_unlock(d);
+ return rv;
+}
+
+/**************************************************************************/
+/* Hash table for storing the guest->shadow mappings */
+
+/* Hash function that takes a gfn or mfn, plus another byte of type info */
+typedef u32 key_t;
+static inline key_t sh2_hash(unsigned long n, u8 t)
+{
+ unsigned char *p = (unsigned char *)&n;
+ key_t k = t;
+ int i;
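+    /* sdbm-style byte hash, seeded with the type byte: each step computes
+     * k = byte + (k<<6) + (k<<16) - k, i.e. k = k * 65599 + byte. */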
+ for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
+ return k;
+}
+
+#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
+
+/* Before we get to the mechanism, define a pair of audit functions
+ * that sanity-check the contents of the hash table. */
+static void sh2_hash_audit_bucket(struct domain *d, int bucket)
+/* Audit one bucket of the hash table */
+{
+ struct shadow2_hash_entry *e, *x;
+ struct page_info *pg;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+ e = &d->arch.shadow2_hash_table[bucket];
+ if ( e->t == 0 ) return; /* Bucket is empty */
+ while ( e )
+ {
+ /* Empty link? */
+ BUG_ON( e->t == 0 );
+ /* Bogus type? */
+ BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
+ /* Wrong bucket? */
+ BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket );
+ /* Duplicate entry? */
+ for ( x = e->next; x; x = x->next )
+ BUG_ON( x->n == e->n && x->t == e->t );
+ /* Bogus MFN? */
+ BUG_ON( !valid_mfn(e->smfn) );
+ pg = mfn_to_page(e->smfn);
+ /* Not a shadow? */
+ BUG_ON( page_get_owner(pg) != 0 );
+ /* Wrong kind of shadow? */
+ BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift
+ != e->t );
+ /* Bad backlink? */
+ BUG_ON( pg->u.inuse.type_info != e->n );
+ if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+ && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+ && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
+ {
+ /* Bad shadow flags on guest page? */
+ BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
+ }
+ /* That entry was OK; on we go */
+ e = e->next;
+ }
+}
+
+#else
+#define sh2_hash_audit_bucket(_d, _b)
+#endif /* Hashtable bucket audit */
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
+
+static void sh2_hash_audit(struct domain *d)
+/* Full audit: audit every bucket in the table */
+{
+ int i;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+ for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
+ {
+ sh2_hash_audit_bucket(d, i);
+ }
+}
+
+#else
+#define sh2_hash_audit(_d)
+#endif /* Hashtable bucket audit */
+
+/* Memory management interface for bucket allocation.
+ * These ought to come out of shadow memory, but at least on 32-bit
+ * machines we are forced to allocate them from xenheap so that we can
+ * address them. */
+static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
+{
+ struct shadow2_hash_entry *extra, *x;
+ int i;
+
+ /* We need to allocate a new node. Ensure the free list is not empty.
+ * Allocate new entries in units the same size as the original table. */
+ if ( unlikely(d->arch.shadow2_hash_freelist == NULL) )
+ {
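+        /* Each allocation block is SHADOW2_HASH_BUCKETS entries followed by a
+         * single pointer, used to chain blocks together so that they can all
+         * be freed again in shadow2_hash_teardown(). */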
+ size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
+ extra = xmalloc_bytes(sz);
+
+ if ( extra == NULL )
+ {
+ /* No memory left! */
+ SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
+ domain_crash_synchronous();
+ }
+ memset(extra, 0, sz);
+
+ /* Record the allocation block so it can be correctly freed later. */
+ *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) =
+ d->arch.shadow2_hash_allocations;
+ d->arch.shadow2_hash_allocations = &extra[0];
+
+ /* Thread a free chain through the newly-allocated nodes. */
+ for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
+ extra[i].next = &extra[i+1];
+ extra[i].next = NULL;
+
+ /* Add the new nodes to the free list. */
+ d->arch.shadow2_hash_freelist = &extra[0];
+ }
+
+ /* Allocate a new node from the free list. */
+ x = d->arch.shadow2_hash_freelist;
+ d->arch.shadow2_hash_freelist = x->next;
+ return x;
+}
+
+static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
+{
+ /* Mark the bucket as empty and return it to the free list */
+ e->t = 0;
+ e->next = d->arch.shadow2_hash_freelist;
+ d->arch.shadow2_hash_freelist = e;
+}
+
+
+/* Allocate and initialise the table itself.
+ * Returns 0 for success, 1 for error. */
+static int shadow2_hash_alloc(struct domain *d)
+{
+ struct shadow2_hash_entry *table;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(!d->arch.shadow2_hash_table);
+
+ table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
+ if ( !table ) return 1;
+ memset(table, 0,
+ SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry));
+ d->arch.shadow2_hash_table = table;
+ return 0;
+}
+
+/* Tear down the hash table and return all memory to Xen.
+ * This function does not care whether the table is populated. */
+static void shadow2_hash_teardown(struct domain *d)
+{
+ struct shadow2_hash_entry *a, *n;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+
+ /* Return the table itself */
+ xfree(d->arch.shadow2_hash_table);
+ d->arch.shadow2_hash_table = NULL;
+
+ /* Return any extra allocations */
+ a = d->arch.shadow2_hash_allocations;
+ while ( a )
+ {
+ /* We stored a linked-list pointer at the end of each allocation */
+ n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
+ xfree(a);
+ a = n;
+ }
+ d->arch.shadow2_hash_allocations = NULL;
+ d->arch.shadow2_hash_freelist = NULL;
+}
+
+
+mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
+/* Find an entry in the hash table. Returns the MFN of the shadow,
+ * or INVALID_MFN if it doesn't exist */
+{
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *p, *x, *head;
+ key_t key;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+ ASSERT(t);
+
+ sh2_hash_audit(d);
+
+ perfc_incrc(shadow2_hash_lookups);
+ key = sh2_hash(n, t);
+
+ x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+ p = NULL;
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+ do
+ {
+ ASSERT(x->t || ((x == head) && (x->next == NULL)));
+
+ if ( x->n == n && x->t == t )
+ {
+ /* Pull-to-front if 'x' isn't already the head item */
+ if ( unlikely(x != head) )
+ {
+ if ( unlikely(d->arch.shadow2_hash_walking != 0) )
+ /* Can't reorder: someone is walking the hash chains */
+ return x->smfn;
+ else
+ {
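+                    /* The head entry is embedded in the hash table array, so
+                     * it cannot simply be relinked; instead, splice 'x' in
+                     * just after the head and swap their contents. */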
+ /* Delete 'x' from list and reinsert after head. */
+ p->next = x->next;
+ x->next = head->next;
+ head->next = x;
+
+ /* Swap 'x' contents with head contents. */
+ SWAP(head->n, x->n);
+ SWAP(head->t, x->t);
+ SWAP(head->smfn, x->smfn);
+ }
+ }
+ else
+ {
+ perfc_incrc(shadow2_hash_lookup_head);
+ }
+ return head->smfn;
+ }
+
+ p = x;
+ x = x->next;
+ }
+ while ( x != NULL );
+
+ perfc_incrc(shadow2_hash_lookup_miss);
+ return _mfn(INVALID_MFN);
+}
+
+void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Put a mapping (n,t)->smfn into the hash table */
+{
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *x, *head;
+ key_t key;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+ ASSERT(t);
+
+ sh2_hash_audit(d);
+
+ perfc_incrc(shadow2_hash_inserts);
+ key = sh2_hash(n, t);
+
+ head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+ /* If the bucket is empty then insert the new page as the head item. */
+ if ( head->t == 0 )
+ {
+ head->n = n;
+ head->t = t;
+ head->smfn = smfn;
+ ASSERT(head->next == NULL);
+ }
+ else
+ {
+ /* Insert a new entry directly after the head item. */
+ x = sh2_alloc_hash_entry(d);
+ x->n = n;
+ x->t = t;
+ x->smfn = smfn;
+ x->next = head->next;
+ head->next = x;
+ }
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Excise the mapping (n,t)->smfn from the hash table */
+{
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *p, *x, *head;
+ key_t key;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+ ASSERT(t);
+
+ sh2_hash_audit(d);
+
+ perfc_incrc(shadow2_hash_deletes);
+ key = sh2_hash(n, t);
+
+ head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+ /* Match on head item? */
+ if ( head->n == n && head->t == t )
+ {
+ if ( (x = head->next) != NULL )
+ {
+ /* Overwrite head with contents of following node. */
+ head->n = x->n;
+ head->t = x->t;
+ head->smfn = x->smfn;
+
+ /* Delete following node. */
+ head->next = x->next;
+ sh2_free_hash_entry(d, x);
+ }
+ else
+ {
+ /* This bucket is now empty. Initialise the head node. */
+ head->t = 0;
+ }
+ }
+ else
+ {
+ /* Not at the head; need to walk the chain */
+ p = head;
+ x = head->next;
+
+ while(1)
+ {
+ ASSERT(x); /* We can't have hit the end, since our target is
+                        * still in the chain somewhere... */
+ if ( x->n == n && x->t == t )
+ {
+ /* Delete matching node. */
+ p->next = x->next;
+ sh2_free_hash_entry(d, x);
+ break;
+ }
+ p = x;
+ x = x->next;
+ }
+ }
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+
+static void hash_foreach(struct vcpu *v,
+ unsigned int callback_mask,
+ hash_callback_t callbacks[],
+ mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and
+ * calling the appropriate callback function for each entry.
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan.
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they
+ * then return non-zero to terminate the scan. */
+{
+ int i, done = 0;
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *x;
+
+ /* Say we're here, to stop hash-lookups reordering the chains */
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_walking == 0);
+ d->arch.shadow2_hash_walking = 1;
+
+ callback_mask &= ~1; /* Never attempt to call back on empty buckets */
+ for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
+ {
+ /* WARNING: This is not safe against changes to the hash table.
+ * The callback *must* return non-zero if it has inserted or
+ * deleted anything from the hash (lookups are OK, though). */
+ for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next )
+ {
+ if ( callback_mask & (1 << x->t) )
+ {
+ ASSERT(x->t <= 15);
+ ASSERT(callbacks[x->t] != NULL);
+ if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
+ break;
+ }
+ }
+ if ( done ) break;
+ }
+ d->arch.shadow2_hash_walking = 0;
+}
+
+
+/**************************************************************************/
+/* Destroy a shadow page: simple dispatcher to call the per-type destructor
+ * which will decrement refcounts appropriately and return memory to the
+ * free pool. */
+
+void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ u32 t = pg->count_info & PGC_SH2_type_mask;
+
+
+ SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn));
+
+ /* Double-check, if we can, that the shadowed page belongs to this
+ * domain, (by following the back-pointer). */
+ ASSERT(t == PGC_SH2_fl1_32_shadow ||
+ t == PGC_SH2_fl1_pae_shadow ||
+ t == PGC_SH2_fl1_64_shadow ||
+ t == PGC_SH2_monitor_table ||
+ (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info)))
+ == v->domain));
+
+ /* The down-shifts here are so that the switch statement is on nice
+ * small numbers that the compiler will enjoy */
+ switch ( t >> PGC_SH2_type_shift )
+ {
+#if CONFIG_PAGING_LEVELS == 2
+ case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn);
+ break;
+ case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn);
+ break;
+#else /* PAE or 64bit */
+ case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn);
+ break;
+ case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn);
+ break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn);
+ break;
+ case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn);
+ break;
+ case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn);
+ break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn);
+ break;
+#endif
+ default:
+ SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n",
+ (unsigned long)t);
+ BUG();
+ }
+}
+
+/**************************************************************************/
+/* Remove all writeable mappings of a guest frame from the shadow tables
+ * Returns non-zero if we need to flush TLBs.
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access.*/
+
+int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn,
+ unsigned int level,
+ unsigned long fault_addr)
+{
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32 */
+#endif
+ NULL, /* l2_32 */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */
+#else
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#endif
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64 */
+#else
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#endif
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ static unsigned int callback_mask =
+ 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+ ;
+ struct page_info *pg = mfn_to_page(gmfn);
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ /* Only remove writable mappings if we are doing shadow refcounts.
+ * In guest refcounting, we trust Xen to already be restricting
+ * all the writes to the guest page tables, so we do not need to
+ * do more. */
+ if ( !shadow2_mode_refcounts(v->domain) )
+ return 0;
+
+ /* Early exit if it's already a pagetable, or otherwise not writeable */
+ if ( sh2_mfn_is_a_page_table(gmfn)
+ || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
+ return 0;
+
+ perfc_incrc(shadow2_writeable);
+
+ /* If this isn't a "normal" writeable page, the domain is trying to
+ * put pagetables in special memory of some kind. We can't allow that. */
+ if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
+ {
+ SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %"
+ PRtype_info "\n",
+ mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
+ domain_crash(v->domain);
+ }
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+ if ( v == current && level != 0 )
+ {
+ unsigned long gfn;
+ /* Heuristic: there is likely to be only one writeable mapping,
+ * and that mapping is likely to be in the current pagetable,
+ * either in the guest's linear map (linux, windows) or in a
+         * magic slot used to map high memory regions (linux HIGHPTE) */
+
+#define GUESS(_a, _h) do { \
+ if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) ) \
+ perfc_incrc(shadow2_writeable_h_ ## _h); \
+ if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
+ return 1; \
+ } while (0)
+
+
+ /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
+ if ( v == current
+ && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
+ GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
+
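+        /* For the guesses below, the guest PTE mapping fault_addr lives in the
+         * guest's linear pagetable area at (fault_addr >> PAGE_SHIFT) * entry
+         * size from the base: 4-byte entries give the >>10 shift, 8-byte
+         * PAE/64-bit entries give >>9, and each higher level adds 9 more. */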
+ if ( v->arch.shadow2->guest_levels == 2 )
+ {
+ if ( level == 1 )
+ /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
+ GUESS(0xC0000000UL + (fault_addr >> 10), 1);
+ }
+#if CONFIG_PAGING_LEVELS >= 3
+ else if ( v->arch.shadow2->guest_levels == 3 )
+ {
+ /* 32bit PAE w2k3: linear map at 0xC0000000 */
+ switch ( level )
+ {
+ case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
+ case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
+ }
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( v->arch.shadow2->guest_levels == 4 )
+ {
+ /* 64bit w2k3: linear map at 0x0000070000000000 */
+ switch ( level )
+ {
+ case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
+ case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
+ case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
+ }
+ }
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS >= 3 */
+
+#undef GUESS
+
+ }
+#endif
+
+ /* Brute-force search of all the shadows, by walking the hash */
+ perfc_incrc(shadow2_writeable_bf);
+ hash_foreach(v, callback_mask, callbacks, gmfn);
+
+ /* If that didn't catch the mapping, something is very wrong */
+ if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: "
+ "%lu left\n", mfn_x(gmfn),
+ (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
+ domain_crash(v->domain);
+ }
+
+ /* We killed at least one writeable mapping, so must flush TLBs. */
+ return 1;
+}
+
+
+
+/**************************************************************************/
+/* Remove all mappings of a guest frame from the shadow tables.
+ * Returns non-zero if we need to flush TLBs. */
+
+int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ int expected_count;
+
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32 */
+#endif
+ NULL, /* l2_32 */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */
+#else
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#endif
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64 */
+#else
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#endif
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ static unsigned int callback_mask =
+ 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+ ;
+
+ perfc_incrc(shadow2_mappings);
+ if ( (page->count_info & PGC_count_mask) == 0 )
+ return 0;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ /* XXX TODO:
+ * Heuristics for finding the (probably) single mapping of this gmfn */
+
+ /* Brute-force search of all the shadows, by walking the hash */
+ perfc_incrc(shadow2_mappings_bf);
+ hash_foreach(v, callback_mask, callbacks, gmfn);
+
+ /* If that didn't catch the mapping, something is very wrong */
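+    /* The allocation itself holds one general reference, flagged by
+     * PGC_allocated, so that single reference is all we expect to remain. */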
+ expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
+ if ( (page->count_info & PGC_count_mask) != expected_count )
+ {
+ /* Don't complain if we're in HVM and there's one extra mapping:
+ * The qemu helper process has an untyped mapping of this dom's RAM */
+ if ( !(shadow2_mode_external(v->domain)
+ && (page->count_info & PGC_count_mask) <= 2
+ && (page->u.inuse.type_info & PGT_count_mask) == 0) )
+ {
+ SHADOW2_ERROR("can't find all mappings of mfn %lx: "
+ "c=%08x t=%08lx\n", mfn_x(gmfn),
+ page->count_info, page->u.inuse.type_info);
+ }
+ }
+
+ /* We killed at least one mapping, so must flush TLBs. */
+ return 1;
+}
+
+
+/**************************************************************************/
+/* Remove all shadows of a guest frame from the shadow tables */
+
+static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+/* Follow this shadow's up-pointer, if it has one, and remove the reference
+ * found there. Returns 1 if that was the only reference to this shadow */
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ mfn_t pmfn;
+ void *vaddr;
+ int rc;
+
+ ASSERT((pg->count_info & PGC_SH2_type_mask) > 0);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow);
+
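+    /* pg->up packs the parent shadow's MFN in the upper bits and the byte
+     * offset of the referencing entry in the low PAGE_SHIFT bits. */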
+ if (pg->up == 0) return 0;
+ pmfn = _mfn(pg->up >> PAGE_SHIFT);
+ ASSERT(valid_mfn(pmfn));
+ vaddr = sh2_map_domain_page(pmfn);
+ ASSERT(vaddr);
+ vaddr += pg->up & (PAGE_SIZE-1);
+ ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
+
+ /* Is this the only reference to this shadow? */
+ rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0;
+
+ /* Blank the offending entry */
+ switch ((pg->count_info & PGC_SH2_type_mask))
+ {
+ case PGC_SH2_l1_32_shadow:
+ case PGC_SH2_l2_32_shadow:
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn);
+#else
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn);
+#endif
+ break;
+#if CONFIG_PAGING_LEVELS >=3
+ case PGC_SH2_l1_pae_shadow:
+ case PGC_SH2_l2_pae_shadow:
+ case PGC_SH2_l2h_pae_shadow:
+ case PGC_SH2_l3_pae_shadow:
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn);
+ break;
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH2_l1_64_shadow:
+ case PGC_SH2_l2_64_shadow:
+ case PGC_SH2_l3_64_shadow:
+ case PGC_SH2_l4_64_shadow:
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn);
+ break;
+#endif
+#endif
+    default: BUG(); /* Some weird unknown shadow type */
+ }
+
+ sh2_unmap_domain_page(vaddr);
+ if ( rc )
+ perfc_incrc(shadow2_up_pointer);
+ else
+ perfc_incrc(shadow2_unshadow_bf);
+
+ return rc;
+}
+
+void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
+/* Remove the shadows of this guest page.
+ * If all != 0, find all shadows, if necessary by walking the tables.
+ * Otherwise, just try the (much faster) heuristics, which will remove
+ * at most one reference to each shadow of the page. */
+{
+ struct page_info *pg;
+ mfn_t smfn;
+ u32 sh_flags;
+ unsigned char t;
+
+ /* Dispatch table for getting per-type functions: each level must
+ * be called with the function to remove a lower-level shadow. */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+ NULL, /* l1_32 */
+ NULL, /* fl1_32 */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32 */
+#endif
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae */
+#else
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#endif
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64 */
+#else
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+#endif
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ /* Another lookup table, for choosing which mask to use */
+ static unsigned int masks[16] = {
+ 0, /* none */
+ 1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32 */
+ 0, /* fl1_32 */
+ 0, /* l2_32 */
+ ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift))
+ | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae */
+ 0, /* fl1_pae */
+ 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae */
+ 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae */
+ 0, /* l3_pae */
+ 1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64 */
+ 0, /* fl1_64 */
+ 1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64 */
+ 1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64 */
+ 0, /* l4_64 */
+ 0, /* p2m */
+ 0 /* unused */
+ };
+
+ SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ pg = mfn_to_page(gmfn);
+
+ /* Bale out now if the page is not shadowed */
+ if ( (pg->count_info & PGC_page_table) == 0 )
+ return;
+
+ /* Search for this shadow in all appropriate shadows */
+ perfc_incrc(shadow2_unshadow);
+ sh_flags = pg->shadow2_flags;
+
+ /* Lower-level shadows need to be excised from upper-level shadows.
+ * This call to hash_foreach() looks dangerous but is in fact OK: each
+ * call will remove at most one shadow, and terminate immediately when
+ * it does remove it, so we never walk the hash after doing a deletion. */
+#define DO_UNSHADOW(_type) do { \
+ t = (_type) >> PGC_SH2_type_shift; \
+ smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
+ if ( !sh2_remove_shadow_via_pointer(v, smfn) && all ) \
+ hash_foreach(v, masks[t], callbacks, smfn); \
+} while (0)
+
+ /* Top-level shadows need to be unpinned */
+#define DO_UNPIN(_type) do { \
+ t = (_type) >> PGC_SH2_type_shift; \
+ smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
+ if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned ) \
+ sh2_unpin(v, smfn); \
+ if ( (_type) == PGC_SH2_l3_pae_shadow ) \
+ SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \
+} while (0)
+
+ if ( sh_flags & SH2F_L1_32 ) DO_UNSHADOW(PGC_SH2_l1_32_shadow);
+ if ( sh_flags & SH2F_L2_32 ) DO_UNPIN(PGC_SH2_l2_32_shadow);
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( sh_flags & SH2F_L1_PAE ) DO_UNSHADOW(PGC_SH2_l1_pae_shadow);
+ if ( sh_flags & SH2F_L2_PAE ) DO_UNSHADOW(PGC_SH2_l2_pae_shadow);
+ if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow);
+ if ( sh_flags & SH2F_L3_PAE ) DO_UNPIN(PGC_SH2_l3_pae_shadow);
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( sh_flags & SH2F_L1_64 ) DO_UNSHADOW(PGC_SH2_l1_64_shadow);
+ if ( sh_flags & SH2F_L2_64 ) DO_UNSHADOW(PGC_SH2_l2_64_shadow);
+ if ( sh_flags & SH2F_L3_64 ) DO_UNSHADOW(PGC_SH2_l3_64_shadow);
+ if ( sh_flags & SH2F_L4_64 ) DO_UNPIN(PGC_SH2_l4_64_shadow);
+#endif
+#endif
+
+#undef DO_UNSHADOW
+#undef DO_UNPIN
+
+
+#if CONFIG_PAGING_LEVELS > 2
+ /* We may have caused some PAE l3 entries to change: need to
+ * fix up the copies of them in various places */
+ if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) )
+ sh2_pae_recopy(v->domain);
+#endif
+
+ /* If that didn't catch the shadows, something is wrong */
+ if ( all && (pg->count_info & PGC_page_table) )
+ {
+        SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n",
+ mfn_x(gmfn), pg->shadow2_flags);
+ domain_crash(v->domain);
+ }
+}
+
+void
+shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+/* Even harsher: this is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+{
+ shadow2_remove_all_shadows(v, gmfn);
+ /* XXX TODO:
+ * Rework this hashtable walker to return a linked-list of all
+ * the shadows it modified, then do breadth-first recursion
+ * to find the way up to higher-level tables and unshadow them too.
+ *
+ * The current code (just tearing down each page's shadows as we
+ * detect that it is not a pagetable) is correct, but very slow.
+ * It means extra emulated writes and slows down removal of mappings. */
+}
+
+/**************************************************************************/
+
+void sh2_update_paging_modes(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct shadow2_entry_points *old_entries = v->arch.shadow2;
+ mfn_t old_guest_table;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ // Valid transitions handled by this function:
+ // - For PV guests:
+ // - after a shadow mode has been changed
+ // - For HVM guests:
+ // - after a shadow mode has been changed
+ // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
+ //
+
+ // Avoid determining the current shadow2 mode for uninitialized CPUs, as
+ // we can not yet determine whether it is an HVM or PV domain.
+ //
+ if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+ {
+ printk("%s: postponing determination of shadow2 mode\n", __func__);
+ return;
+ }
+
+ // First, tear down any old shadow tables held by this vcpu.
+ //
+ if ( v->arch.shadow2 )
+ shadow2_detach_old_tables(v);
+
+ if ( !hvm_guest(v) )
+ {
+ ///
+ /// PV guest
+ ///
+#if CONFIG_PAGING_LEVELS == 4
+ if ( pv_32bit_guest(v) )
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3);
+ else
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+#elif CONFIG_PAGING_LEVELS == 3
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#elif CONFIG_PAGING_LEVELS == 2
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#else
+#error unexpected paging mode
+#endif
+ }
+ else
+ {
+ ///
+ /// HVM guest
+ ///
+ ASSERT(shadow2_mode_translate(d));
+ ASSERT(shadow2_mode_external(d));
+
+ if ( !hvm_paging_enabled(v) )
+ {
+ // paging disabled...
+ clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+
+ /* Set v->arch.guest_table to use the p2m map, and choose
+ * the appropriate shadow mode */
+ old_guest_table = pagetable_get_mfn(v->arch.guest_table);
+#if CONFIG_PAGING_LEVELS == 2
+ v->arch.guest_table =
+ pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2);
+#elif CONFIG_PAGING_LEVELS == 3
+ v->arch.guest_table =
+ pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#else /* CONFIG_PAGING_LEVELS == 4 */
+ {
+ l4_pgentry_t *l4e;
+ /* Use the start of the first l3 table as a PAE l3 */
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+                l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+ v->arch.guest_table =
+ pagetable_from_pfn(l4e_get_pfn(l4e[0]));
+ sh2_unmap_domain_page(l4e);
+ }
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#endif
+ /* Fix up refcounts on guest_table */
+ get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
+ if ( mfn_x(old_guest_table) != 0 )
+ put_page(mfn_to_page(old_guest_table));
+ }
+ else
+ {
+ set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+
+#ifdef __x86_64__
+ if ( hvm_long_mode_enabled(v) )
+ {
+ // long mode guest...
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+ }
+ else
+#endif
+ if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
+ {
+#if CONFIG_PAGING_LEVELS >= 3
+ // 32-bit PAE mode guest...
+                v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#else
+ SHADOW2_ERROR("PAE not supported in 32-bit Xen\n");
+ domain_crash(d);
+ return;
+#endif
+ }
+ else
+ {
+ // 32-bit 2 level guest...
+#if CONFIG_PAGING_LEVELS >= 3
+                v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2);
+#else
+                v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#endif
+ }
+ }
+
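+        /* External-mode (HVM) vcpus need a monitor pagetable for Xen to run
+         * on; build one the first time through. */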
+ if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ {
+ mfn_t mmfn = shadow2_make_monitor_table(v);
+ v->arch.monitor_table = pagetable_from_mfn(mmfn);
+ v->arch.monitor_vtable = sh2_map_domain_page(mmfn);
+ }
+
+ if ( v->arch.shadow2 != old_entries )
+ {
+ SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
+ "(was g=%u s=%u)\n",
+ d->domain_id, v->vcpu_id,
+ v->arch.shadow2->guest_levels,
+ v->arch.shadow2->shadow_levels,
+ old_entries ? old_entries->guest_levels : 0,
+ old_entries ? old_entries->shadow_levels : 0);
+ if ( old_entries &&
+ (v->arch.shadow2->shadow_levels !=
+ old_entries->shadow_levels) )
+ {
+ /* Need to make a new monitor table for the new mode */
+ mfn_t new_mfn, old_mfn;
+
+ if ( v != current )
+ {
+ SHADOW2_ERROR("Some third party (d=%u v=%u) is changing "
+ "this HVM vcpu's (d=%u v=%u) paging mode!\n",
+ current->domain->domain_id, current->vcpu_id,
+ v->domain->domain_id, v->vcpu_id);
+ domain_crash(v->domain);
+ return;
+ }
+
+ sh2_unmap_domain_page(v->arch.monitor_vtable);
+ old_mfn = pagetable_get_mfn(v->arch.monitor_table);
+ v->arch.monitor_table = pagetable_null();
+ new_mfn = v->arch.shadow2->make_monitor_table(v);
+ v->arch.monitor_table = pagetable_from_mfn(new_mfn);
+ v->arch.monitor_vtable = sh2_map_domain_page(new_mfn);
+ SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n",
+ mfn_x(new_mfn));
+
+ /* Don't be running on the old monitor table when we
+ * pull it down! Switch CR3, and warn the HVM code that
+ * its host cr3 has changed. */
+ make_cr3(v, mfn_x(new_mfn));
+ write_ptbase(v);
+ hvm_update_host_cr3(v);
+ old_entries->destroy_monitor_table(v, old_mfn);
+ }
+ }
+
+ // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
+ // These are HARD: think about the case where two CPU's have
+ // different values for CR4.PSE and CR4.PGE at the same time.
+ // This *does* happen, at least for CR4.PGE...
+ }
+
+ v->arch.shadow2->update_cr3(v);
+}
+
+/**************************************************************************/
+/* Turning on and off shadow2 features */
+
+static void sh2_new_mode(struct domain *d, u32 new_mode)
+/* Inform all the vcpus that the shadow mode has been changed */
+{
+ struct vcpu *v;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d != current->domain);
+ d->arch.shadow2_mode = new_mode;
+ if ( new_mode & SHM2_translate )
+ shadow2_audit_p2m(d);
+ for_each_vcpu(d, v)
+ sh2_update_paging_modes(v);
+}
+
+static int shadow2_enable(struct domain *d, u32 mode)
+/* Turn on "permanent" shadow features: external, translate, refcount.
+ * Can only be called once on a domain, and these features cannot be
+ * disabled.
+ * Returns 0 for success, -errno for failure. */
+{
+ unsigned int old_pages;
+ int rv = 0;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ /* Sanity check the arguments */
+ if ( d == current->domain
+ || shadow2_mode_enabled(d)
+ || !(mode & SHM2_enable)
+ || ((mode & SHM2_external) && !(mode & SHM2_translate)) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ // XXX -- eventually would like to require that all memory be allocated
+ // *after* shadow2_enabled() is called... So here, we would test to make
+ // sure that d->page_list is empty.
+#if 0
+ spin_lock(&d->page_alloc_lock);
+ if ( !list_empty(&d->page_list) )
+ {
+ spin_unlock(&d->page_alloc_lock);
+ rv = -EINVAL;
+ goto out;
+ }
+ spin_unlock(&d->page_alloc_lock);
+#endif
+
+ /* Init the shadow memory allocation if the user hasn't done so */
+ old_pages = d->arch.shadow2_total_pages;
+ if ( old_pages == 0 )
+ if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
+ {
+ set_sh2_allocation(d, 0, NULL);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Init the hash table */
+ if ( shadow2_hash_alloc(d) != 0 )
+ {
+ set_sh2_allocation(d, old_pages, NULL);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Init the P2M table */
+ if ( mode & SHM2_translate )
+ if ( !shadow2_alloc_p2m_table(d) )
+ {
+ shadow2_hash_teardown(d);
+ set_sh2_allocation(d, old_pages, NULL);
+ shadow2_p2m_teardown(d);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Update the bits */
+ sh2_new_mode(d, mode);
+ shadow2_audit_p2m(d);
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+    return rv;
+}
+
+void shadow2_teardown(struct domain *d)
+/* Destroy the shadow pagetables of this domain and free its shadow memory.
+ * Should only be called for dying domains. */
+{
+ struct vcpu *v;
+ mfn_t mfn;
+
+ ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
+ ASSERT(d != current->domain);
+
+ if ( !shadow2_lock_is_acquired(d) )
+ shadow2_lock(d); /* Keep various asserts happy */
+
+ if ( shadow2_mode_enabled(d) )
+ {
+ /* Release the shadow and monitor tables held by each vcpu */
+ for_each_vcpu(d, v)
+ {
+ if ( v->arch.shadow2 )
+ shadow2_detach_old_tables(v);
+ if ( shadow2_mode_external(d) )
+ {
+ mfn = pagetable_get_mfn(v->arch.monitor_table);
+ if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
+ shadow2_destroy_monitor_table(v, mfn);
+ v->arch.monitor_table = pagetable_null();
+ }
+ }
+ }
+
+ if ( d->arch.shadow2_total_pages != 0 )
+ {
+ SHADOW2_PRINTK("teardown of domain %u starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ /* Destroy all the shadows and release memory to domheap */
+ set_sh2_allocation(d, 0, NULL);
+ /* Release the hash table back to xenheap */
+ if (d->arch.shadow2_hash_table)
+ shadow2_hash_teardown(d);
+ /* Release the log-dirty bitmap of dirtied pages */
+ sh2_free_log_dirty_bitmap(d);
+ /* Should not have any more memory held */
+ SHADOW2_PRINTK("teardown done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ ASSERT(d->arch.shadow2_total_pages == 0);
+ }
+
+ /* We leave the "permanent" shadow modes enabled, but clear the
+ * log-dirty mode bit. We don't want any more mark_dirty()
+ * calls now that we've torn down the bitmap */
+ d->arch.shadow2_mode &= ~SHM2_log_dirty;
+
+ shadow2_unlock(d);
+}
+
+void shadow2_final_teardown(struct domain *d)
+/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
+{
+
+ SHADOW2_PRINTK("dom %u final teardown starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+
+ /* Double-check that the domain didn't have any shadow memory.
+ * It is possible for a domain that never got domain_kill()ed
+ * to get here with its shadow allocation intact. */
+ if ( d->arch.shadow2_total_pages != 0 )
+ shadow2_teardown(d);
+
+ /* It is now safe to pull down the p2m map. */
+ if ( d->arch.shadow2_p2m_pages != 0 )
+ shadow2_p2m_teardown(d);
+
+ SHADOW2_PRINTK("dom %u final teardown done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+}
+
+static int shadow2_one_bit_enable(struct domain *d, u32 mode)
+/* Turn on a single shadow mode feature */
+{
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ /* Sanity check the call */
+ if ( d == current->domain || (d->arch.shadow2_mode & mode) )
+ {
+ return -EINVAL;
+ }
+
+ if ( d->arch.shadow2_mode == 0 )
+ {
+ /* Init the shadow memory allocation and the hash table */
+ if ( set_sh2_allocation(d, 1, NULL) != 0
+ || shadow2_hash_alloc(d) != 0 )
+ {
+ set_sh2_allocation(d, 0, NULL);
+ return -ENOMEM;
+ }
+ }
+
+ /* Update the bits */
+ sh2_new_mode(d, d->arch.shadow2_mode | mode);
+
+ return 0;
+}
+
+static int shadow2_one_bit_disable(struct domain *d, u32 mode)
+/* Turn off a single shadow mode feature */
+{
+ struct vcpu *v;
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ /* Sanity check the call */
+ if ( d == current->domain || !(d->arch.shadow2_mode & mode) )
+ {
+ return -EINVAL;
+ }
+
+ /* Update the bits */
+ sh2_new_mode(d, d->arch.shadow2_mode & ~mode);
+ if ( d->arch.shadow2_mode == 0 )
+ {
+ /* Get this domain off shadows */
+ SHADOW2_PRINTK("un-shadowing of domain %u starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ for_each_vcpu(d, v)
+ {
+ if ( v->arch.shadow2 )
+ shadow2_detach_old_tables(v);
+#if CONFIG_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
+ else
+#endif
+ make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
+
+ }
+
+ /* Pull down the memory allocation */
+ if ( set_sh2_allocation(d, 0, NULL) != 0 )
+ {
+ // XXX - How can this occur?
+ // Seems like a bug to return an error now that we've
+ // disabled the relevant shadow mode.
+ //
+ return -ENOMEM;
+ }
+ shadow2_hash_teardown(d);
+ SHADOW2_PRINTK("un-shadowing of domain %u done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ }
+
+ return 0;
+}
+
+/* Enable/disable ops for the "test" and "log-dirty" modes */
+int shadow2_test_enable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ if ( shadow2_mode_enabled(d) )
+ {
+ SHADOW2_ERROR("Don't support enabling test mode"
+                      " on already shadowed doms\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = shadow2_one_bit_enable(d, SHM2_enable);
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+int shadow2_test_disable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+ ret = shadow2_one_bit_disable(d, SHM2_enable);
+ shadow2_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+static int
+sh2_alloc_log_dirty_bitmap(struct domain *d)
+{
+ ASSERT(d->arch.shadow_dirty_bitmap == NULL);
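+    /* One bit per guest pfn, rounded up to a whole number of unsigned longs. */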
+ d->arch.shadow_dirty_bitmap_size =
+ (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
+ ~(BITS_PER_LONG - 1);
+ d->arch.shadow_dirty_bitmap =
+ xmalloc_array(unsigned long,
+ d->arch.shadow_dirty_bitmap_size / BITS_PER_LONG);
+ if ( d->arch.shadow_dirty_bitmap == NULL )
+ {
+ d->arch.shadow_dirty_bitmap_size = 0;
+ return -ENOMEM;
+ }
+ memset(d->arch.shadow_dirty_bitmap, 0, d->arch.shadow_dirty_bitmap_size/8);
+
+ return 0;
+}
+
+static void
+sh2_free_log_dirty_bitmap(struct domain *d)
+{
+ d->arch.shadow_dirty_bitmap_size = 0;
+ if ( d->arch.shadow_dirty_bitmap )
+ {
+ xfree(d->arch.shadow_dirty_bitmap);
+ d->arch.shadow_dirty_bitmap = NULL;
+ }
+}
+
+static int shadow2_log_dirty_enable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ if ( shadow2_mode_log_dirty(d) )
+ {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ( shadow2_mode_enabled(d) )
+ {
+ SHADOW2_ERROR("Don't (yet) support enabling log-dirty"
+                      " on already shadowed doms\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sh2_alloc_log_dirty_bitmap(d);
+ if ( ret != 0 )
+ {
+ sh2_free_log_dirty_bitmap(d);
+ goto out;
+ }
+
+ ret = shadow2_one_bit_enable(d, SHM2_log_dirty);
+ if ( ret != 0 )
+ sh2_free_log_dirty_bitmap(d);
+
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+ return ret;
+}
+
+static int shadow2_log_dirty_disable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+ ret = shadow2_one_bit_disable(d, SHM2_log_dirty);
+ if ( !shadow2_mode_log_dirty(d) )
+ sh2_free_log_dirty_bitmap(d);
+ shadow2_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+/**************************************************************************/
+/* P2M map manipulations */
+
+static void
+sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+ struct vcpu *v;
+
+ if ( !shadow2_mode_translate(d) )
+ return;
+
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+
+
+ SHADOW2_PRINTK("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+ ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn);
+ //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn);
+
+ shadow2_remove_all_shadows_and_parents(v, _mfn(mfn));
+ if ( shadow2_remove_all_mappings(v, _mfn(mfn)) )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
+ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+}
+
+void
+shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn)
+{
+ shadow2_lock(d);
+ shadow2_audit_p2m(d);
+ sh2_p2m_remove_page(d, gfn, mfn);
+ shadow2_audit_p2m(d);
+ shadow2_unlock(d);
+}
+
+void
+shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn)
+{
+ struct vcpu *v;
+ unsigned long ogfn;
+ mfn_t omfn;
+
+ if ( !shadow2_mode_translate(d) )
+ return;
+
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+
+ shadow2_lock(d);
+ shadow2_audit_p2m(d);
+
+ SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+ omfn = sh2_gfn_to_mfn(d, gfn);
+ if ( valid_mfn(omfn) )
+ {
+ /* Get rid of the old mapping, especially any shadows */
+ shadow2_remove_all_shadows_and_parents(v, omfn);
+ if ( shadow2_remove_all_mappings(v, omfn) )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+ }
+
+ ogfn = sh2_mfn_to_gfn(d, _mfn(mfn));
+ if (
+#ifdef __x86_64__
+ (ogfn != 0x5555555555555555L)
+#else
+ (ogfn != 0x55555555L)
+#endif
+ && (ogfn != INVALID_M2P_ENTRY)
+ && (ogfn != gfn) )
+ {
+ /* This machine frame is already mapped at another physical address */
+ SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
+ mfn, ogfn, gfn);
+ if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) )
+ {
+ SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
+ ogfn , mfn_x(omfn));
+ if ( mfn_x(omfn) == mfn )
+ sh2_p2m_remove_page(d, ogfn, mfn);
+ }
+ }
+
+ shadow2_set_p2m_entry(d, gfn, _mfn(mfn));
+ set_gpfn_from_mfn(mfn, gfn);
+ shadow2_audit_p2m(d);
+ shadow2_unlock(d);
+}
+
+/**************************************************************************/
+/* Log-dirty mode support */
+
+/* Convert a shadow to log-dirty mode. */
+void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
+{
+ BUG();
+}
+
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats as well. */
+static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc)
+{
+ int i, rv = 0, clean = 0;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN
+ || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH )
+ clean = 1;
+ else
+ ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK);
+
+ SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+ (clean) ? "clean" : "peek",
+ d->domain_id,
+ d->arch.shadow_fault_count,
+ d->arch.shadow_dirty_count);
+
+ sc->stats.fault_count = d->arch.shadow_fault_count;
+ sc->stats.dirty_count = d->arch.shadow_dirty_count;
+
+ if ( clean )
+ {
+ struct list_head *l, *t;
+ struct page_info *pg;
+
+ /* Need to revoke write access to the domain's pages again.
+ * In future, we'll have a less heavy-handed approach to this,
+ * but for now, we just unshadow everything except Xen. */
+ list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
+ }
+
+ d->arch.shadow_fault_count = 0;
+ d->arch.shadow_dirty_count = 0;
+ }
+
+ if ( guest_handle_is_null(sc->dirty_bitmap) ||
+ (d->arch.shadow_dirty_bitmap == NULL) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
+ sc->pages = d->arch.shadow_dirty_bitmap_size;
+
+#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
+ for ( i = 0; i < sc->pages; i += CHUNK )
+ {
+ int bytes = ((((sc->pages - i) > CHUNK)
+ ? CHUNK
+ : (sc->pages - i)) + 7) / 8;
+
+ if ( copy_to_guest_offset(
+ sc->dirty_bitmap,
+ i/(8*sizeof(unsigned long)),
+ d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ if ( clean )
+ memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ 0, bytes);
+ }
+#undef CHUNK
+
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+    return rv;
+}
+
+
+/* Mark a page as dirty */
+void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+ unsigned long pfn;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(shadow2_mode_log_dirty(d));
+
+ if ( !valid_mfn(gmfn) )
+ return;
+
+ ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+ /*
+ * Values with the MSB set denote MFNs that aren't really part of the
+ * domain's pseudo-physical memory map (e.g., the shared info frame).
+ * Nothing to do here...
+ */
+ if ( unlikely(!VALID_M2P(pfn)) )
+ return;
+
+ /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */
+ if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) )
+ {
+ if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
+ {
+ SHADOW2_DEBUG(LOGDIRTY,
+ "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n",
+ mfn_x(gmfn), pfn, d->domain_id);
+ d->arch.shadow_dirty_count++;
+ }
+ }
+ else
+ {
+ SHADOW2_PRINTK("mark_dirty OOR! "
+ "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
+ "owner=%d c=%08x t=%" PRtype_info "\n",
+ mfn_x(gmfn),
+ pfn,
+ d->arch.shadow_dirty_bitmap_size,
+ d->domain_id,
+ (page_get_owner(mfn_to_page(gmfn))
+ ? page_get_owner(mfn_to_page(gmfn))->domain_id
+ : -1),
+ mfn_to_page(gmfn)->count_info,
+ mfn_to_page(gmfn)->u.inuse.type_info);
+ }
+}
+
+
+/**************************************************************************/
+/* Shadow-control DOM0_OP dispatcher */
+
+int shadow2_control_op(struct domain *d,
+ dom0_shadow_control_t *sc,
+ XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
+{
+ int rc, preempted = 0;
+
+ if ( unlikely(d == current->domain) )
+ {
+ DPRINTK("Don't try to do a shadow op on yourself!\n");
+ return -EINVAL;
+ }
+
+ switch ( sc->op )
+ {
+ case DOM0_SHADOW_CONTROL_OP_OFF:
+ if ( shadow2_mode_log_dirty(d) )
+ if ( (rc = shadow2_log_dirty_disable(d)) != 0 )
+ return rc;
+ if ( d->arch.shadow2_mode & SHM2_enable )
+ if ( (rc = shadow2_test_disable(d)) != 0 )
+ return rc;
+ return 0;
+
+ case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+ return shadow2_test_enable(d);
+
+ case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+ return shadow2_log_dirty_enable(d);
+
+ case DOM0_SHADOW_CONTROL_OP_FLUSH:
+ case DOM0_SHADOW_CONTROL_OP_CLEAN:
+ case DOM0_SHADOW_CONTROL_OP_PEEK:
+ return shadow2_log_dirty_op(d, sc);
+
+
+
+ case DOM0_SHADOW2_CONTROL_OP_ENABLE:
+ return shadow2_enable(d, sc->mode << SHM2_shift);
+
+ case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION:
+ sc->mb = shadow2_get_allocation(d);
+ return 0;
+
+ case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION:
+ rc = shadow2_set_allocation(d, sc->mb, &preempted);
+ if ( preempted )
+ /* Not finished. Set up to re-run the call. */
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_dom0_op, "h", u_dom0_op);
+ else
+ /* Finished. Return the new allocation */
+ sc->mb = shadow2_get_allocation(d);
+ return rc;
+
+
+ default:
+ SHADOW2_ERROR("Bad shadow op %u\n", sc->op);
+ return -EINVAL;
+ }
+}
+
+
+/**************************************************************************/
+/* Auditing shadow tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
+
+void shadow2_audit_tables(struct vcpu *v)
+{
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2), /* l2_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2), /* l2_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3), /* l1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2h_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3), /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4), /* l1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4), /* l2_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4), /* l3_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4), /* l4_64 */
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS > 2 */
+ NULL /* All the rest */
+ };
+ unsigned int mask;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL )
+ mask = ~1; /* Audit every table in the system */
+ else
+ {
+ /* Audit only the current mode's tables */
+ switch (v->arch.shadow2->guest_levels)
+ {
+ case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break;
+ case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE
+ |SH2F_L2H_PAE|SH2F_L3_PAE); break;
+ case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64
+ |SH2F_L3_64|SH2F_L4_64); break;
+ default: BUG();
+ }
+ }
+
+    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
+}
+
+#endif /* Shadow audit */
+
+
+/**************************************************************************/
+/* Auditing p2m tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
+
+void shadow2_audit_p2m(struct domain *d)
+{
+ struct list_head *entry;
+ struct page_info *page;
+ struct domain *od;
+ unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+ mfn_t p2mfn;
+ unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+ int test_linear;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) )
+ return;
+
+ //SHADOW2_PRINTK("p2m audit starts\n");
+
+ test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
+ if ( test_linear )
+ local_flush_tlb();
+
+ /* Audit part one: walk the domain's page allocation list, checking
+ * the m2p entries. */
+ for ( entry = d->page_list.next;
+ entry != &d->page_list;
+ entry = entry->next )
+ {
+ page = list_entry(entry, struct page_info, list);
+ mfn = mfn_x(page_to_mfn(page));
+
+ // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+ od = page_get_owner(page);
+
+ if ( od != d )
+ {
+ SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+ mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+ continue;
+ }
+
+ gfn = get_gpfn_from_mfn(mfn);
+ if ( gfn == INVALID_M2P_ENTRY )
+ {
+ orphans_i++;
+ //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+ // mfn);
+ continue;
+ }
+
+ if ( gfn == 0x55555555 )
+ {
+ orphans_d++;
+ //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+ // mfn);
+ continue;
+ }
+
+ p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
+ if ( mfn_x(p2mfn) != mfn )
+ {
+ mpbad++;
+ SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+ " (-> gfn %#lx)\n",
+ mfn, gfn, mfn_x(p2mfn),
+ (mfn_valid(p2mfn)
+ ? get_gpfn_from_mfn(mfn_x(p2mfn))
+ : -1u));
+ /* This m2p entry is stale: the domain has another frame in
+ * this physical slot. No great disaster, but for neatness,
+ * blow away the m2p entry. */
+ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+ }
+
+ if ( test_linear )
+ {
+ lp2mfn = get_mfn_from_gpfn(gfn);
+ if ( lp2mfn != mfn_x(p2mfn) )
+ {
+ SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                               "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+ }
+ }
+
+ // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+ // mfn, gfn, p2mfn, lp2mfn);
+ }
+
+ /* Audit part two: walk the domain's p2m table, checking the entries. */
+ if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+ {
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+ int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+ l4_pgentry_t *l4e;
+ l3_pgentry_t *l3e;
+ int i3, i4;
+ l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+ l3_pgentry_t *l3e;
+ int i3;
+ l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+ l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+ gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+ for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+ {
+ if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+ for ( i3 = 0;
+ i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+ i3++ )
+ {
+ if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+ {
+ if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+
+ for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+ {
+ if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+ continue;
+ mfn = l1e_get_pfn(l1e[i1]);
+ ASSERT(valid_mfn(_mfn(mfn)));
+ m2pfn = get_gpfn_from_mfn(mfn);
+ if ( m2pfn != gfn )
+ {
+ pmbad++;
+ SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn, mfn, m2pfn);
+ BUG();
+ }
+ }
+ sh2_unmap_domain_page(l1e);
+ }
+#if CONFIG_PAGING_LEVELS >= 3
+ sh2_unmap_domain_page(l2e);
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ sh2_unmap_domain_page(l3e);
+ }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+ sh2_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+ sh2_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+ sh2_unmap_domain_page(l2e);
+#endif
+
+ }
+
+ //SHADOW2_PRINTK("p2m audit complete\n");
+ //if ( orphans_i | orphans_d | mpbad | pmbad )
+ // SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+ // orphans_i + orphans_d, orphans_i, orphans_d,
+ if ( mpbad | pmbad )
+ SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+ pmbad, mpbad);
+}
+
+#endif /* p2m audit */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/shadow2.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/shadow2.c Wed Aug 16 17:02:35 2006 +0100
@@ -0,0 +1,4469 @@
+/******************************************************************************
+ * arch/x86/shadow2.c
+ *
+ * Simple, mostly-synchronous shadow page tables.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+// backpointer
+// - independent shadows of 32-byte chunks make it non-obvious how to quickly
+// figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+// space for both PV and HVM guests.
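+//   (For scale: 16MB of 8-byte PAE entries is 2^21 entries, and 2^21
+//   4kB frames cover exactly 8GB of pseudo-physical space.)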
+//
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+#include <asm/shadow2-types.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode).
+ *
+ * THINGS TO DO LATER:
+ *
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE. Should return a gfn instead.
+ *
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's
+ * shadows.  When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory, we want to drop those shadows.  A page that
+ * has shadows in two different modes is a useful hint, but beware of
+ * clever tricks like reusing a pagetable for both PAE and 64-bit
+ * during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps. Add appropriate unmap_l*e calls in the users.
+ * Then we can test the speed difference made by linear maps. If the
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
+ * to share l2h pages again.
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an
+ * entry in it, and every time we change CR3. We copy it for the linear
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3. Maybe we can avoid some of this recopying
+ * by using the shadow directly in some places.
+ * Also, for SMP, we need to actually respond to seeing
+ * shadow2_pae_flip_pending.
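+ * (A PAE L3 is 4 entries of 8 bytes each, hence the 32 bytes.)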
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable. Should coalesce the flushes to the end,
+ * and if we do flush, re-do the walk. If anything has changed, then
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to
+ * deal with shadows made in one mode and used in another.
+ */
+
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND 2
+#define FETCH_TYPE_WRITE 4
+typedef enum {
+ ft_prefetch = FETCH_TYPE_PREFETCH,
+ ft_demand_read = FETCH_TYPE_DEMAND,
+ ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
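+/* Note that ft_demand_write has both the DEMAND and WRITE bits set, so
+ * callers that only care whether a fetch is a write can simply test
+ * (fetch_type & FETCH_TYPE_WRITE). */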
+
+#ifndef NDEBUG
+static char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab,
+                                       int clear_res);
+#endif
+static inline void sh2_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
+ * shadow L1 which maps its "splinters".
+ * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ * PAE L3 info page for that CR3 value.
+ */
+
+static inline mfn_t
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look for FL1 shadows in the hash table */
+{
+ mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn),
+ PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH2_log_dirty) )
+ shadow2_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline mfn_t
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look for shadows in the hash table */
+{
+ mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn),
+ shadow_type >> PGC_SH2_type_shift);
+ perfc_incrc(shadow2_get_shadow_status);
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH2_log_dirty) )
+ shadow2_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline void
+set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Put an FL1 shadow into the hash table */
+{
+ SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+ gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain)) )