[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] i386: adjustment to segment fixup code



Since SLES9SP4 is unfortunately expected to ship with a glibc that does
use negative offsets into the TLS area, I looked more closely at the
fixup code in Xen. Besides lacking support for SIB addressing and many
general purpose instructions, I realized that the non-atomic updates of
descriptor table entries provide a hole (albeit very small) for attacks
on the hypervisor address space: Accessing negative offsets from a
descriptor set up to have a base of e.g. 0xF0000000 (and its limit
therefore truncated by Xen) would result in the low 16 bits of the limit
in the descriptor being updated first (thus bumping the limit to the 4Gb
boundary), and setting the expand down bit only in a subsequent memory
access. Thus, if a guest tried hard enough it might be possible to load
the %gs selector register at exactly the right point to access this
partially updated descriptor. The kernel of that guest would then, by
avoiding a reload of the selector register, be able to access Xen's
private space.

Even worse - since the code in xen/arch/x86/x86_32/seg_fixup.c doesn't
appear to be serialized in any way, under SMP there was a small potential
for producing permanently corrupted descriptors.

Remains one question: What does BIGLOCK protect in do_update_descriptor()?
If all it's about is to avoid racing updates, I would think that its use
could be dropped with this patch applied.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>

Index: 2007-11-13/xen/arch/x86/mm.c
===================================================================
--- 2007-11-13.orig/xen/arch/x86/mm.c   2007-11-13 15:50:56.000000000 +0100
+++ 2007-11-13/xen/arch/x86/mm.c        2007-11-13 15:18:35.000000000 +0100
@@ -3113,7 +3113,7 @@ long do_update_descriptor(u64 pa, u64 de
 
     /* All is good so make the update. */
     gdt_pent = map_domain_page(mfn);
-    memcpy(&gdt_pent[offset], &d, 8);
+    write_guest_descriptor(gdt_pent + offset, d);
     unmap_domain_page(gdt_pent);
 
     put_page_type(page);
Index: 2007-11-13/xen/arch/x86/x86_32/seg_fixup.c
===================================================================
--- 2007-11-13.orig/xen/arch/x86/x86_32/seg_fixup.c     2007-11-13 
15:50:56.000000000 +0100
+++ 2007-11-13/xen/arch/x86/x86_32/seg_fixup.c  2007-11-13 15:25:48.000000000 
+0100
@@ -42,7 +42,7 @@
 #define O  OPCODE_BYTE
 #define M  HAS_MODRM
 
-static unsigned char insn_decode[256] = {
+static const unsigned char insn_decode[256] = {
     /* 0x00 - 0x0F */
     O|M, O|M, O|M, O|M, X, X, X, X,
     O|M, O|M, O|M, O|M, X, X, X, X,
@@ -69,7 +69,7 @@ static unsigned char insn_decode[256] = 
     X, X, X, X, X, X, X, X,
     /* 0x80 - 0x8F */
     O|M|1, O|M|4, O|M|1, O|M|1, O|M, O|M, O|M, O|M,
-    O|M, O|M, O|M, O|M, O|M, O|M, O|M, X,
+    O|M, O|M, O|M, O|M, O|M, X|M, O|M, O|M,
     /* 0x90 - 0x9F */
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
@@ -89,17 +89,17 @@ static unsigned char insn_decode[256] = 
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
     /* 0xF0 - 0xFF */
-    X, X, X, X, X, X, X, X,
+    X, X, X, X, X, X, O|M, O|M,
     X, X, X, X, X, X, O|M, O|M
 };
 
-static unsigned char twobyte_decode[256] = {
+static const unsigned char twobyte_decode[256] = {
     /* 0x00 - 0x0F */
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
     /* 0x10 - 0x1F */
     X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    O|M, X, X, X, X, X, X, X,
     /* 0x20 - 0x2F */
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
@@ -122,16 +122,16 @@ static unsigned char twobyte_decode[256]
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
     /* 0x90 - 0x9F */
-    X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M,
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M,
     /* 0xA0 - 0xAF */
-    X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    X, X, X, O|M, O|M|1, O|M, O|M, X,
+    X, X, X, O|M, O|M|1, O|M, X, O|M,
     /* 0xB0 - 0xBF */
-    X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    X, X, X, O|M, X, X, O|M, O|M,
+    X, X, O|M|1, O|M, O|M, O|M, O|M, O|M,
     /* 0xC0 - 0xCF */
-    X, X, X, X, X, X, X, X,
+    O|M, O|M, X, O|M, X, X, X, O|M,
     X, X, X, X, X, X, X, X,
     /* 0xD0 - 0xDF */
     X, X, X, X, X, X, X, X,
@@ -153,41 +153,40 @@ static unsigned char twobyte_decode[256]
  *  @base  (OUT): Decoded linear base address.
  *  @limit (OUT): Decoded segment limit, in bytes. 0 == unlimited (4GB).
  */
-int get_baselimit(u16 seg, unsigned long *base, unsigned long *limit)
+static int get_baselimit(u16 seg, unsigned long *base, unsigned long *limit)
 {
-    struct vcpu *d = current;
-    unsigned long *table, a, b;
+    struct vcpu   *curr = current;
+    struct desc_struct *table, d;
     int            ldt = !!(seg & 4);
     int            idx = (seg >> 3) & 8191;
 
     /* Get base and check limit. */
     if ( ldt )
     {
-        table = (unsigned long *)LDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.ldt_ents )
+        table = (void *)LDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.ldt_ents )
             goto fail;
     }
     else /* gdt */
     {
-        table = (unsigned long *)GDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.gdt_ents )
+        table = (void *)GDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.gdt_ents )
             goto fail;
     }
 
     /* Grab the segment descriptor. */
-    if ( __get_user(a, &table[2*idx+0]) ||
-         __get_user(b, &table[2*idx+1]) )
+    if ( read_guest_descriptor(table + idx, &d) )
         goto fail; /* Barking up the wrong tree. Decode needs a page fault.*/
 
     /* We only parse 32-bit code and data segments. */
-    if ( (b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB)) != 
+    if ( (d.b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB)) !=
          (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB) )
         goto fail;
 
     /* Decode base and limit. */
-    *base  = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16);
-    *limit = ((b & 0xf0000) | (a & 0x0ffff)) + 1;
-    if ( (b & _SEGMENT_G) )
+    *base  = (d.b&(0xff<<24)) | ((d.b&0xff)<<16) | (d.a>>16);
+    *limit = ((d.b & 0xf0000) | (d.a & 0x0ffff)) + 1;
+    if ( (d.b & _SEGMENT_G) )
         *limit <<= 12;
 
     /*
@@ -204,7 +203,7 @@ int get_baselimit(u16 seg, unsigned long
 }
 
 /* Turn a segment+offset into a linear address. */
-int linearise_address(u16 seg, unsigned long off, unsigned long *linear)
+static int linearise_address(u16 seg, unsigned long off, unsigned long *linear)
 {
     unsigned long base, limit;
 
@@ -219,57 +218,57 @@ int linearise_address(u16 seg, unsigned 
     return 1;
 }
 
-int fixup_seg(u16 seg, unsigned long offset)
+static int fixup_seg(u16 seg, unsigned long offset)
 {
-    struct vcpu *d = current;
-    unsigned long *table, a, b, base, limit;
+    struct vcpu   *curr = current;
+    struct desc_struct *table, d;
+    unsigned long  base, limit;
     int            ldt = !!(seg & 4);
     int            idx = (seg >> 3) & 8191;
 
     /* Get base and check limit. */
     if ( ldt )
     {
-        table = (unsigned long *)LDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.ldt_ents )
+        table = (void *)LDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.ldt_ents )
         {
             dprintk(XENLOG_DEBUG, "Segment %04x out of LDT range (%ld)\n",
-                    seg, d->arch.guest_context.ldt_ents);
+                    seg, curr->arch.guest_context.ldt_ents);
             goto fail;
         }
     }
     else /* gdt */
     {
-        table = (unsigned long *)GDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.gdt_ents )
+        table = (void *)GDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.gdt_ents )
         {
             dprintk(XENLOG_DEBUG, "Segment %04x out of GDT range (%ld)\n",
-                    seg, d->arch.guest_context.gdt_ents);
+                    seg, curr->arch.guest_context.gdt_ents);
             goto fail;
         }
     }
 
     /* Grab the segment descriptor. */
-    if ( __get_user(a, &table[2*idx+0]) ||
-         __get_user(b, &table[2*idx+1]) )
+    if ( read_guest_descriptor(table + idx, &d) )
     {
         dprintk(XENLOG_DEBUG, "Fault while reading segment %04x\n", seg);
         goto fail; /* Barking up the wrong tree. Decode needs a page fault.*/
     }
 
     /* We only parse 32-bit page-granularity non-privileged data segments. */
-    if ( (b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB|
-               _SEGMENT_G|_SEGMENT_CODE|_SEGMENT_DPL)) != 
+    if ( (d.b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB|
+                 _SEGMENT_G|_SEGMENT_CODE|_SEGMENT_DPL)) !=
          (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB|_SEGMENT_G|_SEGMENT_DPL) )
     {
-        dprintk(XENLOG_DEBUG, "Bad segment %08lx:%08lx\n", a, b);
+        dprintk(XENLOG_DEBUG, "Bad segment %08x:%08x\n", d.a, d.b);
         goto fail;
     }
 
     /* Decode base and limit. */
-    base  = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16);
-    limit = (((b & 0xf0000) | (a & 0x0ffff)) + 1) << 12;
+    base  = (d.b&(0xff<<24)) | ((d.b&0xff)<<16) | (d.a>>16);
+    limit = (((d.b & 0xf0000) | (d.a & 0x0ffff)) + 1) << 12;
 
-    if ( b & _SEGMENT_EC )
+    if ( d.b & _SEGMENT_EC )
     {
         /* Expands-down: All the way to zero? Assume 4GB if so. */
         if ( ((base + limit) < PAGE_SIZE) && (offset <= limit)  )
@@ -292,20 +291,19 @@ int fixup_seg(u16 seg, unsigned long off
     }
 
     dprintk(XENLOG_DEBUG, "None of the above! "
-            "(%08lx:%08lx, %08lx, %08lx, %08lx)\n",
-            a, b, base, limit, base+limit);
+            "(%08x:%08x, %08lx, %08lx, %08lx)\n",
+            d.a, d.b, base, limit, base+limit);
 
  fail:
     return 0;
 
  flip:
     limit = (limit >> 12) - 1;
-    a &= ~0x0ffff; a |= limit & 0x0ffff;
-    b &= ~0xf0000; b |= limit & 0xf0000;
-    b ^= _SEGMENT_EC; /* grows-up <-> grows-down */
-    /* NB. These can't fault. Checked readable above; must also be writable. */
-    table[2*idx+0] = a;
-    table[2*idx+1] = b;
+    d.a &= ~0x0ffff; d.a |= limit & 0x0ffff;
+    d.b &= ~0xf0000; d.b |= limit & 0xf0000;
+    d.b ^= _SEGMENT_EC; /* grows-up <-> grows-down */
+    /* NB. This can't fault. Checked readable above; must also be writable. */
+    write_guest_descriptor(table + idx, d);
     return 1;
 }
 
@@ -315,18 +313,15 @@ int fixup_seg(u16 seg, unsigned long off
  */
 int gpf_emulate_4gb(struct cpu_user_regs *regs)
 {
-    struct vcpu *d = current;
-    struct trap_info   *ti;
-    struct trap_bounce *tb;
-    u8            modrm, mod, reg, rm, decode;
-    void         *memreg;
-    unsigned long offset;
-    u8            disp8;
-    u32           disp32 = 0;
+    struct vcpu   *curr = current;
+    u8             modrm, mod, rm, decode;
+    const u32     *base, *index = NULL;
+    unsigned long  offset;
+    s8             disp8;
+    s32            disp32 = 0;
     u8            *eip;         /* ptr to instruction start */
     u8            *pb, b;       /* ptr into instr. / current instr. byte */
-    int            gs_override = 0;
-    int            twobyte = 0;
+    int            gs_override = 0, scale = 0, twobyte = 0;
 
     /* WARNING: We only work for ring-3 segments. */
     if ( unlikely(vm86_mode(regs)) || unlikely(!ring_3(regs)) )
@@ -357,6 +352,9 @@ int gpf_emulate_4gb(struct cpu_user_regs
             goto fail;
         }
 
+        if ( twobyte )
+            break;
+
         switch ( b )
         {
         case 0x67: /* Address-size override */
@@ -375,6 +373,9 @@ int gpf_emulate_4gb(struct cpu_user_regs
         case 0x65: /* GS override */
             gs_override = 1;
             break;
+        case 0x0f: /* Not really a prefix byte */
+            twobyte = 1;
+            break;
         default: /* Not a prefix byte */
             goto done_prefix;
         }
@@ -387,32 +388,10 @@ int gpf_emulate_4gb(struct cpu_user_regs
         goto fail;
     }
 
-    decode = insn_decode[b]; /* opcode byte */
+    decode = (!twobyte ? insn_decode : twobyte_decode)[b];
     pb++;
-    if ( decode == 0 && b == 0x0f )
-    {
-        twobyte = 1;
 
-        if ( get_user(b, pb) )
-        {
-            dprintk(XENLOG_DEBUG,
-                    "Fault while accessing byte %ld of instruction\n",
-                    (long)(pb-eip));
-            goto page_fault;
-        }
-
-        if ( (pb - eip) >= 15 )
-        {
-            dprintk(XENLOG_DEBUG, "Too many opcode bytes for a "
-                    "legal instruction\n");
-            goto fail;
-        }
-
-        decode = twobyte_decode[b];
-        pb++;
-    }
-
-    if ( decode == 0 )
+    if ( !(decode & OPCODE_BYTE) )
     {
         dprintk(XENLOG_DEBUG, "Unsupported %sopcode %02x\n",
                 twobyte ? "two byte " : "", b);
@@ -422,12 +401,12 @@ int gpf_emulate_4gb(struct cpu_user_regs
     if ( !(decode & HAS_MODRM) )
     {
         /* Must be a <disp32>, or bail. */
-        if ( (decode & 7) != 4 )
+        if ( (decode & INSN_SUFFIX_BYTES) != 4 )
             goto fail;
 
         if ( get_user(offset, (u32 *)pb) )
         {
-            dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n");
+            dprintk(XENLOG_DEBUG, "Fault while extracting <moffs32>.\n");
             goto page_fault;
         }
         pb += 4;
@@ -448,29 +427,39 @@ int gpf_emulate_4gb(struct cpu_user_regs
     pb++;
 
     mod = (modrm >> 6) & 3;
-    reg = (modrm >> 3) & 7;
     rm  = (modrm >> 0) & 7;
 
     if ( rm == 4 )
     {
-        dprintk(XENLOG_DEBUG, "FIXME: Add decoding for the SIB byte.\n");
-        goto fixme;
+        u8 sib;
+
+        if ( get_user(sib, pb) )
+        {
+            dprintk(XENLOG_DEBUG, "Fault while extracting sib byte\n");
+            goto page_fault;
+        }
+
+        pb++;
+
+        rm = sib & 7;
+        if ( (sib & 0x38) != 0x20 )
+            index = decode_register((sib >> 3) & 7, regs, 0);
+        scale = sib >> 6;
     }
 
     /* Decode R/M field. */
-    memreg = decode_register(rm,  regs, 0);
+    base = decode_register(rm, regs, 0);
 
     /* Decode Mod field. */
-    switch ( modrm >> 6 )
+    switch ( mod )
     {
     case 0:
-        disp32 = 0;
         if ( rm == 5 ) /* disp32 rather than (EBP) */
         {
-            memreg = NULL;
+            base = NULL;
             if ( get_user(disp32, (u32 *)pb) )
             {
-                dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n");
+                dprintk(XENLOG_DEBUG, "Fault while extracting <base32>.\n");
                 goto page_fault;
             }
             pb += 4;
@@ -484,13 +473,13 @@ int gpf_emulate_4gb(struct cpu_user_regs
             goto page_fault;
         }
         pb++;
-        disp32 = (disp8 & 0x80) ? (disp8 | ~0xff) : disp8;;
+        disp32 = disp8;
         break;
 
     case 2:
         if ( get_user(disp32, (u32 *)pb) )
         {
-            dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n");
+            dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n");
             goto page_fault;
         }
         pb += 4;
@@ -502,8 +491,10 @@ int gpf_emulate_4gb(struct cpu_user_regs
     }
 
     offset = disp32;
-    if ( memreg != NULL )
-        offset += *(u32 *)memreg;
+    if ( base != NULL )
+        offset += *base;
+    if ( index != NULL )
+        offset += *index << scale;
 
  skip_modrm:
     if ( !fixup_seg((u16)regs->gs, offset) )
@@ -513,10 +504,11 @@ int gpf_emulate_4gb(struct cpu_user_regs
     perfc_incr(seg_fixups);
 
     /* If requested, give a callback on otherwise unused vector 15. */
-    if ( VM_ASSIST(d->domain, VMASST_TYPE_4gb_segments_notify) )
+    if ( VM_ASSIST(curr->domain, VMASST_TYPE_4gb_segments_notify) )
     {
-        ti  = &d->arch.guest_context.trap_ctxt[15];
-        tb  = &d->arch.trap_bounce;
+        struct trap_info   *ti  = &curr->arch.guest_context.trap_ctxt[15];
+        struct trap_bounce *tb  = &curr->arch.trap_bounce;
+
         tb->flags      = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
         tb->error_code = pb - eip;
         tb->cs         = ti->cs;
@@ -527,13 +519,6 @@ int gpf_emulate_4gb(struct cpu_user_regs
 
     return EXCRET_fault_fixed;
 
- fixme:
-    dprintk(XENLOG_DEBUG, "Undecodable instruction "
-            "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x "
-            "caused GPF(0) at %04x:%08x\n",
-            eip[0], eip[1], eip[2], eip[3],
-            eip[4], eip[5], eip[6], eip[7],
-            regs->cs, regs->eip);
  fail:
     return 0;
 
Index: 2007-11-13/xen/arch/x86/x86_emulate.c
===================================================================
--- 2007-11-13.orig/xen/arch/x86/x86_emulate.c  2007-11-13 15:50:56.000000000 
+0100
+++ 2007-11-13/xen/arch/x86/x86_emulate.c       2007-11-13 15:51:41.000000000 
+0100
@@ -2263,7 +2263,7 @@ x86_emulate(
     case 0x09: /* wbinvd */
         generate_exception_if(!mode_ring0(), EXC_GP);
         fail_if(ops->wbinvd == NULL);
-        if ( (rc = ops->wbinvd(ctxt)) != 0 )
+        if ( (rc = (ops->wbinvd)(ctxt)) != 0 )
             goto done;
         break;
 
Index: 2007-11-13/xen/include/asm-x86/desc.h
===================================================================
--- 2007-11-13.orig/xen/include/asm-x86/desc.h  2007-11-13 15:50:56.000000000 
+0100
+++ 2007-11-13/xen/include/asm-x86/desc.h       2007-11-13 15:18:35.000000000 
+0100
@@ -137,6 +137,11 @@ struct desc_struct {
 
 #if defined(__x86_64__)
 
+#include <asm/uaccess.h>
+
+/* Use __put_user() just to ensure the compiler doesn't split the write. */
+#define write_guest_descriptor(pent, d) ((void)__put_user(d, pent))
+
 typedef struct {
     u64 a, b;
 } idt_entry_t;
@@ -168,6 +173,29 @@ do {                                    
 
 #elif defined(__i386__)
 
+#include <asm/system.h>
+
+static inline int read_guest_descriptor(struct desc_struct *pent,
+                                        struct desc_struct *pd)
+{
+    union { u64 raw; struct desc_struct d; } val = { 0 };
+    int rc = cmpxchg_user(pent, val.raw, val.raw);
+
+    *pd = val.d;
+    return rc;
+}
+
+static inline void write_guest_descriptor(struct desc_struct *pent,
+                                          struct desc_struct d)
+{
+    u64 cur = ((u64)pent->b << 32) | pent->a, old;
+
+    do
+    {
+        old = cmpxchg((u64 *)pent, cur, ((u64)d.b << 32) | d.a);
+    } while ( old ^ cur );
+}
+
 typedef struct desc_struct idt_entry_t;
 
 #define _set_gate(gate_addr,type,dpl,addr) \



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.