WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 4 of 13] Allow a kernel to not be in ring 0

To: Andrew Morton <akpm@xxxxxxxx>
Subject: [Xen-devel] [PATCH 4 of 13] Allow a kernel to not be in ring 0
From: Jeremy Fitzhardinge <jeremy@xxxxxxxxxxxxx>
Date: Tue, 01 Aug 2006 13:00:42 -0700
Cc: Virtualization <virtualization@xxxxxxxx>, Zachary Amsden <zach@xxxxxxxxxx>, Xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxx>, Jeremy Fitzhardinge <jeremy@xxxxxxxxxxxxx>, Hollis Blanchard <hollisb@xxxxxxxxxx>, Rusty Russell <rusty@xxxxxxxxxxxxxxx>, Linux Kernel <linux-kernel@xxxxxxxxxxxxxxx>, Chris Wright <chrisw@xxxxxxxxxxxx>, Ian Pratt <ian.pratt@xxxxxxxxxxxxx>, "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>, Gerd Hoffmann <kraxel@xxxxxxx>, Christian Limpach <Christian.Limpach@xxxxxxxxxxxx>, Christoph Lameter <clameter@xxxxxxx>
Delivery-date: Tue, 01 Aug 2006 13:15:29 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <patchbomb.1154462438@ezr>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
6 files changed, 22 insertions(+), 13 deletions(-)
arch/i386/kernel/entry.S   |    5 +++--
arch/i386/kernel/process.c |    2 +-
arch/i386/mm/extable.c     |    2 +-
arch/i386/mm/fault.c       |   11 ++++-------
include/asm-i386/ptrace.h  |    5 +++--
include/asm-i386/segment.h |   10 ++++++++++


We allow for the fact that the guest kernel may not run in ring 0.
This requires some abstraction in a few places when setting %cs or
checking privilege level (user vs kernel).

This is Chris' [RFC PATCH 15/33] move segment checks to subarch,
except rather than using #define USER_MODE_MASK which depends on a
config option, we use Zach's more flexible approach of assuming ring 3
== userspace.  I also used "get_kernel_rpl()" over "get_kernel_cs()"
because I think it reads better in the code...

1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3
2) Add a get_kernel_rpl() macro, and don't assume it's zero.

Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
Signed-off-by: Zachary Amsden <zach@xxxxxxxxxx>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xxxxxxxxxxxxx>



===================================================================

diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S  Tue Aug 01 01:32:00 2006 -0700
+++ b/arch/i386/kernel/entry.S  Tue Aug 01 01:32:00 2006 -0700
@@ -229,8 +229,9 @@ check_userspace:
 check_userspace:
        movl EFLAGS(%esp), %eax         # mix EFLAGS and CS
        movb CS(%esp), %al
-       testl $(VM_MASK | 3), %eax
-       jz resume_kernel
+       andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+       cmpl $SEGMENT_RPL_MASK, %eax
+       jb resume_kernel                # not returning to v8086 or userspace
 ENTRY(resume_userspace)
        cli                             # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/kernel/process.c
--- a/arch/i386/kernel/process.c        Tue Aug 01 01:32:00 2006 -0700
+++ b/arch/i386/kernel/process.c        Tue Aug 01 01:32:00 2006 -0700
@@ -346,7 +346,7 @@ int kernel_thread(int (*fn)(void *), voi
        regs.xes = __USER_DS;
        regs.orig_eax = -1;
        regs.eip = (unsigned long) kernel_thread_helper;
-       regs.xcs = __KERNEL_CS;
+       regs.xcs = __KERNEL_CS | get_kernel_rpl();
        regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
        /* Ok, create the new process.. */
diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/mm/extable.c
--- a/arch/i386/mm/extable.c    Tue Aug 01 01:32:00 2006 -0700
+++ b/arch/i386/mm/extable.c    Tue Aug 01 01:32:00 2006 -0700
@@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs
        const struct exception_table_entry *fixup;
 
 #ifdef CONFIG_PNPBIOS
-       if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3)))
+       if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
        {
                extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
                extern u32 pnp_bios_is_utter_crap;
diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/mm/fault.c
--- a/arch/i386/mm/fault.c      Tue Aug 01 01:32:00 2006 -0700
+++ b/arch/i386/mm/fault.c      Tue Aug 01 01:32:00 2006 -0700
@@ -27,6 +27,7 @@
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/kdebug.h>
+#include <asm/segment.h>
 
 extern void die(const char *,struct pt_regs *,long);
 
@@ -119,10 +120,10 @@ static inline unsigned long get_segment_
        }
 
        /* The standard kernel/user address space limit. */
-       *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
+       *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
        
        /* By far the most common cases. */
-       if (likely(seg == __USER_CS || seg == __KERNEL_CS))
+       if (likely(SEGMENT_IS_FLAT_CODE(seg)))
                return eip;
 
        /* Check the segment exists, is within the current LDT/GDT size,
@@ -436,11 +437,7 @@ good_area:
        write = 0;
        switch (error_code & 3) {
                default:        /* 3: write, present */
-#ifdef TEST_VERIFY_AREA
-                       if (regs->cs == KERNEL_CS)
-                               printk("WP fault at %08lx\n", regs->eip);
-#endif
-                       /* fall through */
+                               /* fall through */
                case 2:         /* write, not present */
                        if (!(vma->vm_flags & VM_WRITE))
                                goto bad_area;
diff -r 85e7eadfaea1 -r cf6767d9babb include/asm-i386/ptrace.h
--- a/include/asm-i386/ptrace.h Tue Aug 01 01:32:00 2006 -0700
+++ b/include/asm-i386/ptrace.h Tue Aug 01 01:32:00 2006 -0700
@@ -60,6 +60,7 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #include <asm/vm86.h>
+#include <asm/segment.h>
 
 struct task_struct;
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int 
error_code);
@@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_str
  */
 static inline int user_mode(struct pt_regs *regs)
 {
-       return (regs->xcs & 3) != 0;
+       return (regs->xcs & SEGMENT_RPL_MASK) == 3;
 }
 static inline int user_mode_vm(struct pt_regs *regs)
 {
-       return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0;
+       return (((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= 
3);
 }
 #define instruction_pointer(regs) ((regs)->eip)
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
diff -r 85e7eadfaea1 -r cf6767d9babb include/asm-i386/segment.h
--- a/include/asm-i386/segment.h        Tue Aug 01 01:32:00 2006 -0700
+++ b/include/asm-i386/segment.h        Tue Aug 01 01:32:00 2006 -0700
@@ -83,6 +83,12 @@
 
 #define GDT_SIZE (GDT_ENTRIES * 8)
 
+/*
+ * Some tricky tests to match code segments after a fault
+ */
+#define SEGMENT_IS_FLAT_CODE(x)  (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
+#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+
 /* Simple and small GDT entries for booting only */
 
 #define GDT_ENTRY_BOOT_CS              2
@@ -112,4 +118,8 @@
  */
 #define IDT_ENTRIES 256
 
+/* Bottom three bits of xcs give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+
+#define get_kernel_rpl()  0
 #endif



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel