# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxxxx
# Node ID e1ae7b3cb5b73f11bed3a51a7f4ded85c30cffd8
# Parent 05ab081f3c67cc4a4b3139090914ad9be5a0a100
[XEN] Make the spurious page-fault detection logic
more robust. In particular it must be able to handle
spurious write faults on mappings that have been
changed from read-only to writable. If a CPU has a stale
read-only entry in its TLB, it is allowed to fault on
the next write access without re-walking the page table.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
xen/arch/x86/traps.c | 210 ++++++++++++++++++++++++++++------------
xen/arch/x86/x86_32/traps.c | 34 ------
xen/arch/x86/x86_64/traps.c | 34 ------
xen/include/asm-x86/processor.h | 8 +
4 files changed, 155 insertions(+), 131 deletions(-)
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/traps.c Fri Jun 16 18:18:55 2006 +0100
@@ -511,9 +511,9 @@ void propagate_page_fault(unsigned long
v->vcpu_info->arch.cr2 = addr;
/* Re-set error_code.user flag appropriately for the guest. */
- error_code &= ~4;
+ error_code &= ~PGERR_user_mode;
if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
- error_code |= 4;
+ error_code |= PGERR_user_mode;
ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
@@ -578,54 +578,91 @@ static int handle_gdt_ldt_mapping_fault(
(((va) >= HYPERVISOR_VIRT_START))
#endif
-static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+static int __spurious_page_fault(
+ unsigned long addr, struct cpu_user_regs *regs)
+{
+ unsigned long mfn = read_cr3() >> PAGE_SHIFT;
+#if CONFIG_PAGING_LEVELS >= 4
+ l4_pgentry_t l4e, *l4t;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ l3_pgentry_t l3e, *l3t;
+#endif
+ l2_pgentry_t l2e, *l2t;
+ l1_pgentry_t l1e, *l1t;
+ unsigned int required_flags, disallowed_flags;
+
+ required_flags = _PAGE_PRESENT;
+ if ( regs->error_code & PGERR_write_access )
+ required_flags |= _PAGE_RW;
+ if ( regs->error_code & PGERR_user_mode )
+ required_flags |= _PAGE_USER;
+
+ disallowed_flags = 0;
+ if ( regs->error_code & PGERR_instr_fetch )
+ disallowed_flags |= _PAGE_NX;
+
+#if CONFIG_PAGING_LEVELS >= 4
+ l4t = map_domain_page(mfn);
+ l4e = l4t[l4_table_offset(addr)];
+ mfn = l4e_get_pfn(l4e);
+ unmap_domain_page(l4t);
+ if ( !(l4e_get_flags(l4e) & required_flags) ||
+ (l4e_get_flags(l4e) & disallowed_flags) )
+ return 0;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ l3t = map_domain_page(mfn);
+ l3e = l3t[l3_table_offset(addr)];
+ mfn = l3e_get_pfn(l3e);
+ unmap_domain_page(l3t);
+#ifdef CONFIG_X86_PAE
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+ return 0;
+#else
+ if ( !(l3e_get_flags(l3e) & required_flags) ||
+ (l3e_get_flags(l3e) & disallowed_flags) )
+ return 0;
+#endif
+#endif
+
+ l2t = map_domain_page(mfn);
+ l2e = l2t[l2_table_offset(addr)];
+ mfn = l2e_get_pfn(l2e);
+ unmap_domain_page(l2t);
+ if ( !(l2e_get_flags(l2e) & required_flags) ||
+ (l2e_get_flags(l2e) & disallowed_flags) )
+ return 0;
+ if ( l2e_get_flags(l2e) & _PAGE_PSE )
+ return 1;
+
+ l1t = map_domain_page(mfn);
+ l1e = l1t[l1_table_offset(addr)];
+ mfn = l1e_get_pfn(l1e);
+ unmap_domain_page(l1t);
+ if ( !(l1e_get_flags(l1e) & required_flags) ||
+ (l1e_get_flags(l1e) & disallowed_flags) )
+ return 0;
+ return 1;
+}
+
+static int spurious_page_fault(
+ unsigned long addr, struct cpu_user_regs *regs)
{
struct vcpu *v = current;
struct domain *d = v->domain;
-
- if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
- {
- if ( shadow_mode_external(d) && guest_mode(regs) )
- return shadow_fault(addr, regs);
- if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
- return handle_gdt_ldt_mapping_fault(
- addr - GDT_LDT_VIRT_START, regs);
- }
- else if ( unlikely(shadow_mode_enabled(d)) )
- {
- return shadow_fault(addr, regs);
- }
- else if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
- {
- LOCK_BIGLOCK(d);
- if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
- unlikely(l2_linear_offset(addr) ==
- d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
- {
- ptwr_flush(d, PTWR_PT_ACTIVE);
- UNLOCK_BIGLOCK(d);
- return EXCRET_fault_fixed;
- }
-
- if ( guest_kernel_mode(v, regs) &&
- /* Protection violation on write? No reserved-bit violation? */
- ((regs->error_code & 0xb) == 0x3) &&
- ptwr_do_page_fault(d, addr, regs) )
- {
- UNLOCK_BIGLOCK(d);
- return EXCRET_fault_fixed;
- }
- UNLOCK_BIGLOCK(d);
- }
-
- return 0;
-}
-
-static int spurious_page_fault(unsigned long addr, struct cpu_user_regs *regs)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- int rc;
+ int is_spurious;
+
+ /* Reserved bit violations are never spurious faults. */
+ if ( regs->error_code & PGERR_reserved_bit )
+ return 0;
+
+ LOCK_BIGLOCK(d);
+
+ is_spurious = __spurious_page_fault(addr, regs);
+ if ( is_spurious )
+ goto out;
/*
* The only possible reason for a spurious page fault not to be picked
@@ -635,10 +672,8 @@ static int spurious_page_fault(unsigned
if ( is_idle_domain(d) || /* no ptwr in idle domain */
IN_HYPERVISOR_RANGE(addr) || /* no ptwr on hypervisor addrs */
shadow_mode_enabled(d) || /* no ptwr logic in shadow mode */
- ((regs->error_code & 0x1d) != 0) ) /* simple not-present fault? */
- return 0;
-
- LOCK_BIGLOCK(d);
+ (regs->error_code & PGERR_page_present) ) /* not-present fault? */
+ goto out;
/*
* The page directory could have been detached again while we weren't
@@ -649,16 +684,67 @@ static int spurious_page_fault(unsigned
d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
{
ptwr_flush(d, PTWR_PT_ACTIVE);
- rc = 1;
- }
- else
- {
- /* Okay, walk the page tables. Only check for not-present faults.*/
- rc = __spurious_page_fault(addr);
- }
-
+ is_spurious = 1;
+ }
+
+ out:
UNLOCK_BIGLOCK(d);
- return rc;
+ return is_spurious;
+}
+
+static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+{
+ struct vcpu *v = current;
+ struct domain *d = v->domain;
+ int rc;
+
+ if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
+ {
+ if ( shadow_mode_external(d) && guest_mode(regs) )
+ return shadow_fault(addr, regs);
+ if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
+ return handle_gdt_ldt_mapping_fault(
+ addr - GDT_LDT_VIRT_START, regs);
+ /*
+ * Do not propagate spurious faults in the hypervisor area to the
+ * guest. It cannot fix them up.
+ */
+ LOCK_BIGLOCK(d);
+ rc = __spurious_page_fault(addr, regs);
+ UNLOCK_BIGLOCK(d);
+ return rc;
+ }
+
+ if ( unlikely(shadow_mode_enabled(d)) )
+ return shadow_fault(addr, regs);
+
+ if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
+ {
+ LOCK_BIGLOCK(d);
+ if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
+ unlikely(l2_linear_offset(addr) ==
+ d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
+ {
+ ptwr_flush(d, PTWR_PT_ACTIVE);
+ UNLOCK_BIGLOCK(d);
+ return EXCRET_fault_fixed;
+ }
+
+ if ( guest_kernel_mode(v, regs) &&
+ /* Protection violation on write? No reserved-bit violation? */
+ ((regs->error_code & (PGERR_page_present |
+ PGERR_write_access |
+ PGERR_reserved_bit)) ==
+ (PGERR_page_present | PGERR_write_access)) &&
+ ptwr_do_page_fault(d, addr, regs) )
+ {
+ UNLOCK_BIGLOCK(d);
+ return EXCRET_fault_fixed;
+ }
+ UNLOCK_BIGLOCK(d);
+ }
+
+ return 0;
}
/*
@@ -784,8 +870,8 @@ static inline int admin_io_okay(
(admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
/* Propagate a fault back to the guest kernel. */
-#define USER_READ_FAULT 4 /* user mode, read fault */
-#define USER_WRITE_FAULT 6 /* user mode, write fault */
+#define USER_READ_FAULT (PGERR_user_mode)
+#define USER_WRITE_FAULT (PGERR_user_mode | PGERR_write_access)
#define PAGE_FAULT(_faultaddr, _errcode) \
({ propagate_page_fault(_faultaddr, _errcode); \
return EXCRET_fault_fixed; \
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/x86_32/traps.c Fri Jun 16 18:18:55 2006 +0100
@@ -113,40 +113,6 @@ void show_page_walk(unsigned long addr)
unmap_domain_page(l1t);
}
-int __spurious_page_fault(unsigned long addr)
-{
- unsigned long mfn = read_cr3() >> PAGE_SHIFT;
-#ifdef CONFIG_X86_PAE
- l3_pgentry_t l3e, *l3t;
-#endif
- l2_pgentry_t l2e, *l2t;
- l1_pgentry_t l1e, *l1t;
-
-#ifdef CONFIG_X86_PAE
- l3t = map_domain_page(mfn);
- l3e = l3t[l3_table_offset(addr)];
- mfn = l3e_get_pfn(l3e);
- unmap_domain_page(l3t);
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- return 0;
-#endif
-
- l2t = map_domain_page(mfn);
- l2e = l2t[l2_table_offset(addr)];
- mfn = l2e_get_pfn(l2e);
- unmap_domain_page(l2t);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return 0;
- if ( l2e_get_flags(l2e) & _PAGE_PSE )
- return 1;
-
- l1t = map_domain_page(mfn);
- l1e = l1t[l1_table_offset(addr)];
- mfn = l1e_get_pfn(l1e);
- unmap_domain_page(l1t);
- return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
#define DOUBLEFAULT_STACK_SIZE 1024
static struct tss_struct doublefault_tss;
static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/x86_64/traps.c Fri Jun 16 18:18:55 2006 +0100
@@ -115,40 +115,6 @@ void show_page_walk(unsigned long addr)
printk(" L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
}
-int __spurious_page_fault(unsigned long addr)
-{
- unsigned long mfn = read_cr3() >> PAGE_SHIFT;
- l4_pgentry_t l4e, *l4t;
- l3_pgentry_t l3e, *l3t;
- l2_pgentry_t l2e, *l2t;
- l1_pgentry_t l1e, *l1t;
-
- l4t = mfn_to_virt(mfn);
- l4e = l4t[l4_table_offset(addr)];
- mfn = l4e_get_pfn(l4e);
- if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
- return 0;
-
- l3t = mfn_to_virt(mfn);
- l3e = l3t[l3_table_offset(addr)];
- mfn = l3e_get_pfn(l3e);
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- return 0;
-
- l2t = mfn_to_virt(mfn);
- l2e = l2t[l2_table_offset(addr)];
- mfn = l2e_get_pfn(l2e);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return 0;
- if ( l2e_get_flags(l2e) & _PAGE_PSE )
- return 1;
-
- l1t = mfn_to_virt(mfn);
- l1e = l1t[l1_table_offset(addr)];
- mfn = l1e_get_pfn(l1e);
- return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
asmlinkage void double_fault(void);
asmlinkage void do_double_fault(struct cpu_user_regs *regs)
{
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/include/asm-x86/processor.h Fri Jun 16 18:18:55 2006 +0100
@@ -128,6 +128,13 @@
/* 'arch_vcpu' flags values */
#define _TF_kernel_mode 0
#define TF_kernel_mode (1<<_TF_kernel_mode)
+
+/* #PF error code values. */
+#define PGERR_page_present (1U<<0)
+#define PGERR_write_access (1U<<1)
+#define PGERR_user_mode (1U<<2)
+#define PGERR_reserved_bit (1U<<3)
+#define PGERR_instr_fetch (1U<<4)
#ifndef __ASSEMBLY__
@@ -524,7 +531,6 @@ void show_stack(struct cpu_user_regs *re
void show_stack(struct cpu_user_regs *regs);
void show_registers(struct cpu_user_regs *regs);
void show_page_walk(unsigned long addr);
-int __spurious_page_fault(unsigned long addr);
asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs);
extern void mtrr_ap_init(void);
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog