# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID 673f62edbfbe4098ea1d5a34d8a77667da762090
# Parent 88f97bb8f3ae7e0fb85dbe8fb420d7f02f844a34
# Parent d8451bb6278cb5f3f477dd9392213be7c66730b4
merge
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_32
--- a/buildconfigs/linux-defconfig_xen0_x86_32 Wed Mar 1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen0_x86_32 Wed Mar 1 19:47:25 2006
@@ -1320,6 +1320,7 @@
# CONFIG_XEN_BLKDEV_TAP_BE is not set
CONFIG_XEN_NETDEV_BACKEND=y
# CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
# CONFIG_XEN_TPMDEV_BACKEND is not set
CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_64
--- a/buildconfigs/linux-defconfig_xen0_x86_64 Wed Mar 1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen0_x86_64 Wed Mar 1 19:47:25 2006
@@ -1244,6 +1244,7 @@
# CONFIG_XEN_BLKDEV_TAP_BE is not set
CONFIG_XEN_NETDEV_BACKEND=y
# CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
# CONFIG_XEN_TPMDEV_BACKEND is not set
CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_32
--- a/buildconfigs/linux-defconfig_xen_x86_32 Wed Mar 1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen_x86_32 Wed Mar 1 19:47:25 2006
@@ -2986,6 +2986,7 @@
# CONFIG_XEN_BLKDEV_TAP_BE is not set
CONFIG_XEN_NETDEV_BACKEND=y
# CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
# CONFIG_XEN_TPMDEV_BACKEND is not set
CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_64
--- a/buildconfigs/linux-defconfig_xen_x86_64 Wed Mar 1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen_x86_64 Wed Mar 1 19:47:25 2006
@@ -2656,6 +2656,7 @@
# CONFIG_XEN_BLKDEV_TAP_BE is not set
CONFIG_XEN_NETDEV_BACKEND=y
# CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
# CONFIG_XEN_TPMDEV_BACKEND is not set
CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/mk.linux-2.6-xen
--- a/buildconfigs/mk.linux-2.6-xen Wed Mar 1 17:01:54 2006
+++ b/buildconfigs/mk.linux-2.6-xen Wed Mar 1 19:47:25 2006
@@ -2,8 +2,8 @@
OS = linux
LINUX_SERIES = 2.6
-LINUX_VER = 2.6.16-rc4
-LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc4.bz2
+LINUX_VER = 2.6.16-rc5
+LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc5.bz2
LINUX_PDIR = linux-$(LINUX_VER)
EXTRAVERSION ?= xen
@@ -34,7 +34,7 @@
touch $(@D)/.hgskip
touch $@
-pristine-linux-%.16-rc4/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
+pristine-linux-%.16-rc5/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
touch $@ # update timestamp to avoid rebuild
$(LINUX_DIR)/include/linux/autoconf.h: ref-$(OS)-$(LINUX_VER)/.valid-ref
diff -r 88f97bb8f3ae -r 673f62edbfbe docs/src/user.tex
--- a/docs/src/user.tex Wed Mar 1 17:01:54 2006
+++ b/docs/src/user.tex Wed Mar 1 19:47:25 2006
@@ -626,7 +626,7 @@
allow you to monitor and log the Xen boot process via serial console and
can be very useful in debugging.
-%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1
+%% kernel /boot/xen-2.0.gz dom0_mem=131072 console=com1,vga com1=115200,8n1
%% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro
In order to configure Xen serial console output, it is necessary to
@@ -637,8 +637,9 @@
\end{verbatim}}
\end{quote}
-This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1
-stop bit and no parity. Modify these parameters for your environment.
+This configures Xen to output on COM1 at 115,200 baud, 8 data bits, no
+parity and 1 stop bit. Modify these parameters for your environment.
+See Section~\ref{s:xboot} for an explanation of all boot parameters.
One can also configure XenLinux to share the serial console; to achieve
this append ``\path{console=ttyS0}'' to your module line.
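
For readers configuring this by hand, the two settings combine into a grub entry like the following sketch; the paths, memory size and root device are illustrative values taken from the surrounding examples, not requirements:

    title Xen (serial console)
        kernel /boot/xen-2.0.gz dom0_mem=131072 console=com1,vga com1=115200,8n1
        module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=ttyS0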
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/Kconfig
--- a/linux-2.6-xen-sparse/arch/i386/Kconfig Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/Kconfig Wed Mar 1 19:47:25 2006
@@ -770,7 +770,7 @@
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
- depends on SMP && HOTPLUG && EXPERIMENTAL
+ depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
---help---
Say Y here to experiment with turning CPUs off and on. CPUs
can be controlled through /sys/devices/system/cpu.
@@ -1122,6 +1122,7 @@
config KPROBES
bool "Kprobes (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && MODULES
help
Kprobes allows you to trap at almost any kernel address and
execute a callback function. register_kprobe() establishes
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/i386/kernel/Makefile Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/Makefile Wed Mar 1 19:47:25 2006
@@ -7,7 +7,7 @@
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- quirks.o i8237.o
+ quirks.o i8237.o topology.o
obj-y += cpu/
obj-y += timers/
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c Wed Mar 1 19:47:25 2006
@@ -44,9 +44,6 @@
extern int gsi_irq_sharing(int gsi);
#include <asm/proto.h>
-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
-
-
#else /* X86 */
#ifdef CONFIG_X86_LOCAL_APIC
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Wed Mar 1 19:47:25 2006
@@ -4,6 +4,7 @@
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/percpu.h>
+#include <linux/bootmem.h>
#include <asm/semaphore.h>
#include <asm/processor.h>
#include <asm/i387.h>
@@ -18,6 +19,9 @@
#include <asm/hypervisor.h>
#include "cpu.h"
+
+DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
+EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
#ifndef CONFIG_XEN
DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
@@ -598,6 +602,8 @@
struct tss_struct * t = &per_cpu(init_tss, cpu);
#endif
struct thread_struct *thread = &current->thread;
+ struct desc_struct *gdt;
+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
if (cpu_test_and_set(cpu, cpu_initialized)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -614,7 +620,54 @@
set_in_cr4(X86_CR4_TSD);
}
- cpu_gdt_init(&cpu_gdt_descr[cpu]);
+#ifndef CONFIG_XEN
+ /*
+ * This is a horrible hack to allocate the GDT. The problem
+ * is that cpu_init() is called really early for the boot CPU
+ * (and hence needs bootmem) but much later for the secondary
+ * CPUs, when bootmem will have gone away
+ */
+ if (NODE_DATA(0)->bdata->node_bootmem_map) {
+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
+ /* alloc_bootmem_pages panics on failure, so no check */
+ memset(gdt, 0, PAGE_SIZE);
+ } else {
+ gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
+ if (unlikely(!gdt)) {
+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+ for (;;)
+ local_irq_enable();
+ }
+ }
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+
+ /* Set up GDT entry for 16bit stack */
+ *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
+ ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
+ ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
+ (CPU_16BIT_STACK_SIZE - 1);
+
+ cpu_gdt_descr->size = GDT_SIZE - 1;
+ cpu_gdt_descr->address = (unsigned long)gdt;
+#else
+ if (cpu == 0 && cpu_gdt_descr->address == 0) {
+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
+ /* alloc_bootmem_pages panics on failure, so no check */
+ memset(gdt, 0, PAGE_SIZE);
+
+ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+
+ cpu_gdt_descr->size = GDT_SIZE;
+ cpu_gdt_descr->address = (unsigned long)gdt;
+ }
+#endif
+
+ cpu_gdt_init(cpu_gdt_descr);
/*
* Set up and load the per-CPU TSS and LDT
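
The #ifndef CONFIG_XEN branch above encodes a two-phase allocation policy: the boot CPU runs cpu_init() while only the bootmem allocator exists, secondary CPUs run it after bootmem has been retired. A minimal user-space model of that decision, with illustrative names standing in for the kernel allocators:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096

    static int bootmem_active = 1;  /* stands in for NODE_DATA(0)->bdata->node_bootmem_map */

    /* Boot-time allocator: cannot fail gracefully (alloc_bootmem_pages panics). */
    static void *alloc_bootmem_page(void)
    {
        void *p = calloc(1, PAGE_SIZE);
        if (p == NULL)
            abort();
        return p;
    }

    /* Runtime allocator: may fail, so the caller must check (get_zeroed_page). */
    static void *alloc_runtime_page(void)
    {
        return calloc(1, PAGE_SIZE);
    }

    static void *alloc_gdt_page(int cpu)
    {
        void *p;

        if (bootmem_active)
            return alloc_bootmem_page();
        p = alloc_runtime_page();
        if (p == NULL)
            fprintf(stderr, "CPU%d failed to allocate GDT\n", cpu);
        return p;
    }

    int main(void)
    {
        void *boot_gdt = alloc_gdt_page(0);  /* boot CPU: bootmem still live */
        bootmem_active = 0;                  /* bootmem retired after early boot */
        void *ap_gdt = alloc_gdt_page(1);    /* secondary CPU: page allocator */
        printf("boot GDT %p, AP GDT %p\n", boot_gdt, ap_gdt);
        free(ap_gdt);
        free(boot_gdt);
        return 0;
    }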
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S
--- a/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S Wed Mar 1 19:47:25 2006
@@ -87,19 +87,9 @@
*/
.data
- ALIGN
- .word 0 # 32 bit align gdt_desc.address
- .globl cpu_gdt_descr
-cpu_gdt_descr:
- .word GDT_SIZE
- .long cpu_gdt_table
-
- .fill NR_CPUS-1,8,0 # space for the other GDT descriptors
-
/*
* The Global Descriptor Table contains 28 quadwords, per-CPU.
*/
- .align PAGE_SIZE_asm
ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x0000000000000000 /* 0x0b reserved */
@@ -148,10 +138,6 @@
.quad 0x0000000000000000 /* 0xf0 - unused */
.quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
- /* Be sure this is zeroed to avoid false validations in Xen */
- .fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
-
-
/*
* __xen_guest information
*/
@@ -176,6 +162,7 @@
.ascii ",FEATURES=writable_page_tables"
.ascii "|writable_descriptor_tables"
.ascii "|auto_translated_physmap"
+ .ascii "|pae_pgdir_above_4gb"
.ascii "|supervisor_mode_kernel"
#ifdef CONFIG_X86_PAE
.ascii ",PAE=yes"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c Wed Mar 1 19:47:25 2006
@@ -2634,8 +2634,10 @@
spin_unlock_irqrestore(&ioapic_lock, flags);
/* Sanity check */
- if (reg_00.bits.ID != apic_id)
- panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
+ if (reg_00.bits.ID != apic_id) {
+ printk("IOAPIC[%d]: Unable to change apic_id!\n",
ioapic);
+ return -1;
+ }
}
apic_printk(APIC_VERBOSE, KERN_INFO
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c Wed Mar 1 19:47:25 2006
@@ -935,6 +935,7 @@
u32 gsi_base)
{
int idx = 0;
+ int tmpid;
if (nr_ioapics >= MAX_IO_APICS) {
printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -957,9 +958,14 @@
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
#endif
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
- mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
+ tmpid = io_apic_get_unique_id(idx, id);
else
- mp_ioapics[idx].mpc_apicid = id;
+ tmpid = id;
+ if (tmpid == -1) {
+ nr_ioapics--;
+ return;
+ }
+ mp_ioapics[idx].mpc_apicid = tmpid;
mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
/*
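
The tmpid change above replaces a direct assignment with a check-then-commit sequence: the table slot is reserved, the ID is validated, and the reservation is rolled back if no usable ID exists (matching the io_apic-xen.c change that returns -1 instead of panicking). A small user-space model of the pattern, all names illustrative:

    #include <stdio.h>

    #define MAX_IO_APICS 8
    static int ids[MAX_IO_APICS];
    static int nr_ioapics;

    static int get_unique_id(int requested)
    {
        int i;

        for (i = 0; i < nr_ioapics - 1; i++)  /* exclude our own, still-empty slot */
            if (ids[i] == requested)
                return -1;                    /* cannot fix up: report failure */
        return requested;
    }

    static void register_ioapic(int id)
    {
        int idx, tmpid;

        if (nr_ioapics >= MAX_IO_APICS)
            return;
        idx = nr_ioapics++;                   /* reserve the slot */
        tmpid = get_unique_id(id);
        if (tmpid == -1) {
            nr_ioapics--;                     /* roll back, skip the device */
            printf("IOAPIC id %d rejected\n", id);
            return;
        }
        ids[idx] = tmpid;                     /* commit */
    }

    int main(void)
    {
        register_ioapic(2);
        register_ioapic(2);   /* duplicate: rolled back rather than panicking */
        printf("registered %d IOAPIC(s)\n", nr_ioapics);
        return 0;
    }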
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c Wed Mar 1 19:47:25 2006
@@ -898,12 +898,6 @@
unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0;
- if (!cpu_gdt_descr[cpu].address &&
- !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
- printk("Failed to allocate GDT for CPU %d\n", cpu);
- return 1;
- }
-
++cpucount;
/*
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c Wed Mar 1 19:47:25 2006
@@ -48,6 +48,8 @@
#include <linux/mca.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <linux/posix-timers.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -70,6 +72,7 @@
#include <asm/arch_hooks.h>
#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
#if defined (__i386__)
#include <asm/i8259.h>
@@ -122,6 +125,13 @@
/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time; /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);
+
+/* How much CPU time was spent blocked and how much was 'stolen'? */
+static DEFINE_PER_CPU(u64, processed_stolen_time);
+static DEFINE_PER_CPU(u64, processed_blocked_time);
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)
@@ -477,14 +487,45 @@
EXPORT_SYMBOL(do_settimeofday);
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static void sync_xen_wallclock(unsigned long dummy);
+static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
+static void sync_xen_wallclock(unsigned long dummy)
+{
+ time_t sec;
+ s64 nsec;
+ dom0_op_t op;
+
+ if (!ntp_synced() || independent_wallclock ||
+ !(xen_start_info->flags & SIF_INITDOMAIN))
+ return;
+
+ write_seqlock_irq(&xtime_lock);
+
+ sec = xtime.tv_sec;
+ nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
+ __normalize_time(&sec, &nsec);
+
+ op.cmd = DOM0_SETTIME;
+ op.u.settime.secs = sec;
+ op.u.settime.nsecs = nsec;
+ op.u.settime.system_time = processed_system_time;
+ HYPERVISOR_dom0_op(&op);
+
+ update_wallclock();
+
+ write_sequnlock_irq(&xtime_lock);
+
+ /* Once per minute. */
+ mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
+}
+
static int set_rtc_mmss(unsigned long nowtime)
{
int retval;
WARN_ON(irqs_disabled());
- if (!(xen_start_info->flags & SIF_INITDOMAIN))
+ if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
return 0;
/* gets recalled with irq locally disabled */
@@ -497,12 +538,6 @@
return retval;
}
-#else
-static int set_rtc_mmss(unsigned long nowtime)
-{
- return 0;
-}
-#endif
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
* Note: This function is required to return accurate
@@ -567,19 +602,37 @@
irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
- s64 delta, delta_cpu;
+ s64 delta, delta_cpu, stolen, blocked;
+ u64 sched_time;
int i, cpu = smp_processor_id();
struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+ struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
write_seqlock(&xtime_lock);
do {
get_time_values_from_xen();
+ /* Obtain a consistent snapshot of elapsed wallclock cycles. */
delta = delta_cpu =
shadow->system_timestamp + get_nsec_offset(shadow);
delta -= processed_system_time;
delta_cpu -= per_cpu(processed_system_time, cpu);
+
+ /*
+ * Obtain a consistent snapshot of stolen/blocked cycles. We
+ * can use state_entry_time to detect if we get preempted here.
+ */
+ do {
+ sched_time = runstate->state_entry_time;
+ barrier();
+ stolen = runstate->time[RUNSTATE_runnable] +
+ runstate->time[RUNSTATE_offline] -
+ per_cpu(processed_stolen_time, cpu);
+ blocked = runstate->time[RUNSTATE_blocked] -
+ per_cpu(processed_blocked_time, cpu);
+ barrier();
+ } while (sched_time != runstate->state_entry_time);
}
while (!time_values_up_to_date(cpu));
@@ -612,18 +665,67 @@
write_sequnlock(&xtime_lock);
/*
- * Local CPU jiffy work. No need to hold xtime_lock, and I'm not sure
- * if there is risk of deadlock if we do (since update_process_times
- * may do scheduler rebalancing work and thus acquire runqueue locks).
- */
- while (delta_cpu >= NS_PER_TICK) {
- delta_cpu -= NS_PER_TICK;
- per_cpu(processed_system_time, cpu) += NS_PER_TICK;
- update_process_times(user_mode(regs));
- profile_tick(CPU_PROFILING, regs);
- }
+ * Account stolen ticks.
+ * HACK: Passing NULL to account_steal_time()
+ * ensures that the ticks are accounted as stolen.
+ */
+ if (stolen > 0) {
+ delta_cpu -= stolen;
+ do_div(stolen, NS_PER_TICK);
+ per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
+ per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
+ account_steal_time(NULL, (cputime_t)stolen);
+ }
+
+ /*
+ * Account blocked ticks.
+ * HACK: Passing idle_task to account_steal_time()
+ * ensures that the ticks are accounted as idle/wait.
+ */
+ if (blocked > 0) {
+ delta_cpu -= blocked;
+ do_div(blocked, NS_PER_TICK);
+ per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
+ per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
+ account_steal_time(idle_task(cpu), (cputime_t)blocked);
+ }
+
+ /* Account user/system ticks. */
+ if (delta_cpu > 0) {
+ do_div(delta_cpu, NS_PER_TICK);
+ per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+ if (user_mode(regs))
+ account_user_time(current, (cputime_t)delta_cpu);
+ else
+ account_system_time(current, HARDIRQ_OFFSET,
+ (cputime_t)delta_cpu);
+ }
+
+ /* Local timer processing (see update_process_times()). */
+ run_local_timers();
+ if (rcu_pending(cpu))
+ rcu_check_callbacks(cpu, user_mode(regs));
+ scheduler_tick();
+ run_posix_cpu_timers(current);
return IRQ_HANDLED;
+}
+
+static void init_missing_ticks_accounting(int cpu)
+{
+ struct vcpu_register_runstate_memory_area area;
+ struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+ memset(runstate, 0, sizeof(*runstate));
+
+ area.addr.v = runstate;
+ HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+
+ per_cpu(processed_blocked_time, cpu) =
+ runstate->time[RUNSTATE_blocked];
+ per_cpu(processed_stolen_time, cpu) =
+ runstate->time[RUNSTATE_runnable] +
+ runstate->time[RUNSTATE_offline];
}
/* not static: needed by APM */
@@ -691,6 +793,7 @@
void notify_arch_cmos_timer(void)
{
mod_timer(&sync_cmos_timer, jiffies + 1);
+ mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
}
static long clock_cmos_diff, sleep_start;
@@ -814,6 +917,7 @@
processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
per_cpu(processed_system_time, 0) = processed_system_time;
+ init_missing_ticks_accounting(0);
update_wallclock();
@@ -891,6 +995,7 @@
processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
per_cpu(processed_system_time, 0) = processed_system_time;
+ init_missing_ticks_accounting(0);
update_wallclock();
}
@@ -909,6 +1014,7 @@
/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
per_cpu(processed_system_time, cpu) =
per_cpu(shadow_time, 0).system_timestamp;
+ init_missing_ticks_accounting(cpu);
} while (read_seqretry(&xtime_lock, seq));
sprintf(timer_name[cpu], "timer%d", cpu);
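
The stolen/blocked snapshot above is lock-free: the hypervisor may update the registered runstate area at any moment, but state_entry_time changes on every update, so reading it before and after the copy and retrying on mismatch yields a consistent snapshot, seqlock-style. A user-space model of just that loop (the hypervisor side is omitted and the names are illustrative, not the Xen interface):

    #include <stdint.h>
    #include <stdio.h>

    /* Modelled on the four Xen runstates: running, runnable, blocked, offline. */
    struct runstate {
        volatile uint64_t state_entry_time;  /* bumped on every hypervisor update */
        volatile uint64_t time[4];
    };

    static void snapshot(const struct runstate *rs, uint64_t out[4])
    {
        uint64_t entry;
        int i;

        do {
            entry = rs->state_entry_time;
            __asm__ __volatile__("" ::: "memory");  /* compiler barrier, as barrier() */
            for (i = 0; i < 4; i++)
                out[i] = rs->time[i];
            __asm__ __volatile__("" ::: "memory");
        } while (entry != rs->state_entry_time);    /* torn read: retry */
    }

    int main(void)
    {
        struct runstate rs = { 1, { 5, 10, 15, 20 } };
        uint64_t t[4];

        snapshot(&rs, t);
        /* stolen time = runnable + offline, as in the patch */
        printf("stolen = %llu ns\n", (unsigned long long)(t[1] + t[3]));
        return 0;
    }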
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile
--- a/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile Wed Mar 1 19:47:25 2006
@@ -2,6 +2,4 @@
# Makefile for the linux kernel.
#
-obj-y := setup.o topology.o
-
-topology-y := ../mach-default/topology.o
+obj-y := setup.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Wed Mar 1 19:47:25 2006
@@ -454,6 +454,7 @@
static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL(__supported_pte_mask);
/*
* noexec = on|off
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/Kconfig
--- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig Wed Mar 1 19:47:25 2006
@@ -381,21 +381,6 @@
as it is off-chip. You can find the HPET spec at
<http://www.intel.com/hardwaredesign/hpetspec.htm>.
-config X86_PM_TIMER
- bool "PM timer" if EMBEDDED
- depends on ACPI && !X86_64_XEN
- default y
- help
- Support the ACPI PM timer for time keeping. This is slow,
- but is useful on some chipsets without HPET on systems with more
- than one CPU. On a single processor or single socket multi core
- system it is normally not required.
- When the PM timer is active 64bit vsyscalls are disabled
- and should not be enabled (/proc/sys/kernel/vsyscall64 should
- not be changed).
- The kernel selects the PM timer only as a last resort, so it is
- useful to enable just in case.
-
config HPET_EMULATE_RTC
bool "Provide RTC interrupt"
depends on HPET_TIMER && RTC=y
@@ -640,6 +625,7 @@
config KPROBES
bool "Kprobes (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && MODULES
help
Kprobes allows you to trap at almost any kernel address and
execute a callback function. register_kprobe() establishes
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile Wed Mar 1 19:47:25 2006
@@ -45,7 +45,7 @@
bootflag-y += ../../i386/kernel/bootflag.o
cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
-topology-y += ../../i386/mach-default/topology.o
+topology-y += ../../i386/kernel/topology.o
microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
quirks-y += ../../i386/kernel/quirks.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c Wed Mar 1 19:47:25 2006
@@ -114,6 +114,8 @@
irq_exit();
}
+int __initdata unsync_tsc_on_multicluster;
+
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
*/
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S Wed Mar 1 19:47:25 2006
@@ -51,6 +51,7 @@
#include <asm/page.h>
#include <asm/errno.h>
#include <xen/interface/arch-x86_64.h>
+#include <xen/interface/features.h>
#include "irq_vectors.h"
@@ -146,16 +147,19 @@
*/
.macro HYPERVISOR_IRET flag
testb $3,1*8(%rsp)
- jnz 1f
+ jnz 2f
testl $NMI_MASK,2*8(%rsp)
+ jnz 2f
+
+ testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
jnz 1f
/* Direct iret to kernel space. Correct CS and SS. */
orb $3,1*8(%rsp)
orb $3,4*8(%rsp)
- iretq
-
-1: /* Slow iret via hypervisor. */
+1: iretq
+
+2: /* Slow iret via hypervisor. */
andl $~NMI_MASK, 16(%rsp)
pushq $\flag
jmp hypercall_page + (__HYPERVISOR_iret * 32)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Wed Mar 1 19:47:25 2006
@@ -51,6 +51,8 @@
int disable_timer_pin_1 __initdata;
#ifndef CONFIG_XEN
+int timer_over_8254 __initdata = 1;
+
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
#endif
@@ -300,6 +302,22 @@
__setup("noapic", disable_ioapic_setup);
__setup("apic", enable_ioapic_setup);
+
+#ifndef CONFIG_XEN
+static int __init setup_disable_8254_timer(char *s)
+{
+ timer_over_8254 = -1;
+ return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+ timer_over_8254 = 2;
+ return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+#endif /* !CONFIG_XEN */
#include <asm/pci-direct.h>
#include <linux/pci_ids.h>
@@ -360,27 +378,20 @@
/* RED-PEN skip them on mptables too? */
return;
case PCI_VENDOR_ID_ATI:
+
+ /* This should be actually default, but
+ for 2.6.16 let's do it for ATI only where
+ it's really needed. */
#ifndef CONFIG_XEN
- if (apic_runs_main_timer != 0)
- break;
-#ifdef CONFIG_ACPI
- /* Don't do this for laptops right
- right now because their timer
- doesn't necessarily tick in C2/3 */
- if (acpi_fadt.revision >= 3 &&
- (acpi_fadt.plvl2_lat + acpi_fadt.plvl3_lat) < 1100) {
- printk(KERN_INFO
-"ATI board detected, but seems to be a laptop. Timer might be shakey,
sorry\n");
- break;
- }
-#endif
+ if (timer_over_8254 == 1) {
+ timer_over_8254 = 0;
printk(KERN_INFO
- "ATI board detected. Using APIC/PM timer.\n");
- apic_runs_main_timer = 1;
- nohpet = 1;
+ "ATI board detected. Disabling timer routing over 8254.\n");
+ }
#endif
return;
}
+
/* No multi-function device? */
type = read_pci_config_byte(num,slot,func,
@@ -1848,6 +1859,8 @@
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for modern platforms only.
*/
static inline void check_timer(void)
{
@@ -1870,7 +1883,8 @@
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- enable_8259A_irq(0);
+ if (timer_over_8254 > 0)
+ enable_8259A_irq(0);
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1925,7 +1939,7 @@
}
printk(" failed.\n");
- if (nmi_watchdog) {
+ if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC -
disabling NMI Watchdog!\n");
nmi_watchdog = 0;
}
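
timer_over_8254 above is a tri-state: -1 is forced off via disable_8254_timer, 2 is forced on via enable_8254_timer, and 1 is the default that quirks such as the ATI check may downgrade to 0. The point of the guard is that a quirk only overrides the default, never an explicit user choice. A compact user-space model, names illustrative:

    #include <stdio.h>
    #include <string.h>

    static int timer_over_8254 = 1;  /* 1: default on; 2: user on; -1: user off */

    static void parse_boot_arg(const char *arg)  /* models the __setup() hooks */
    {
        if (strcmp(arg, "disable_8254_timer") == 0)
            timer_over_8254 = -1;
        else if (strcmp(arg, "enable_8254_timer") == 0)
            timer_over_8254 = 2;
    }

    static void ati_bridge_quirk(void)  /* models the PCI early-quirk path */
    {
        if (timer_over_8254 == 1) {  /* downgrade only the default */
            timer_over_8254 = 0;
            printf("ATI board detected. Disabling timer routing over 8254.\n");
        }
    }

    int main(void)
    {
        parse_boot_arg("enable_8254_timer");
        ati_bridge_quirk();  /* no effect: explicit user choice wins */
        printf("timer_over_8254 = %d\n", timer_over_8254);
        return 0;
    }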
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c Wed Mar 1 19:47:25 2006
@@ -462,6 +462,12 @@
else if(!memcmp(from, "elfcorehdr=", 11))
elfcorehdr_addr = memparse(from+11, &from);
#endif
+
+#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
+ else if (!memcmp(from, "additional_cpus=", 16))
+ setup_additional_cpus(from+16);
+#endif
+
next_char:
c = *(from++);
if (!c)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/acpi/Kconfig
--- a/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar 1 19:47:25 2006
@@ -247,7 +247,7 @@
Enter the full path name to the file wich includes the AmlCode
declaration.
config ACPI_BLACKLIST_YEAR
- int "Disable ACPI for systems before Jan 1st this year" if X86
+ int "Disable ACPI for systems before Jan 1st this year" if X86_32
default 0
help
enter a 4-digit year, eg. 2001 to disable ACPI by default
@@ -285,9 +285,9 @@
dump your ACPI DSDT table using /proc/acpi/dsdt.
config X86_PM_TIMER
- bool "Power Management Timer Support"
- depends on X86
- depends on !X86_64
+ bool "Power Management Timer Support" if EMBEDDED
+ depends on X86
+ depends on !XEN
default y
help
The Power Management Timer is available on all ACPI-capable,
@@ -298,9 +298,8 @@
voltage scaling, unlike the commonly used Time Stamp Counter
(TSC) timing source.
- So, if you see messages like 'Losing too many ticks!' in the
- kernel logs, and/or you are using this on a notebook which
- does not yet have an HPET, you should say "Y" here.
+ You should nearly always say Y here because many modern
+ systems require this timer.
config ACPI_CONTAINER
tristate "ACPI0004,PNP0A05 and PNP0A06 Container Driver (EXPERIMENTAL)"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/video/Kconfig
--- a/linux-2.6-xen-sparse/drivers/video/Kconfig Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/video/Kconfig Wed Mar 1 19:47:25 2006
@@ -520,7 +520,7 @@
config FB_GBE_MEM
int "Video memory size in MB"
depends on FB_GBE
- default 8
+ default 4
help
This is the amount of memory reserved for the framebuffer,
which can be any value between 1MB and 8MB.
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/Kconfig
--- a/linux-2.6-xen-sparse/drivers/xen/Kconfig Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/Kconfig Wed Mar 1 19:47:25 2006
@@ -68,7 +68,7 @@
default n
config XEN_BLKDEV_BACKEND
- bool "Block-device backend driver"
+ tristate "Block-device backend driver"
default y
help
The block-device backend driver allows the kernel to export its
@@ -76,7 +76,7 @@
interface.
config XEN_BLKDEV_TAP_BE
- bool "Block Tap support for backend driver (DANGEROUS)"
+ tristate "Block Tap support for backend driver (DANGEROUS)"
depends on XEN_BLKDEV_BACKEND
default n
help
@@ -89,7 +89,7 @@
modified to use grant tables.
config XEN_NETDEV_BACKEND
- bool "Network-device backend driver"
+ tristate "Network-device backend driver"
default y
help
The network-device backend driver allows the kernel to export its
@@ -109,8 +109,16 @@
are unsure; or if you experience network hangs when this option is
enabled; then you must say N here.
+config XEN_NETDEV_LOOPBACK
+ tristate "Network-device loopback driver"
+ depends on XEN_NETDEV_BACKEND
+ default y
+ help
+ A two-interface loopback device to emulate a local netfront-netback
+ connection.
+
config XEN_TPMDEV_BACKEND
- bool "TPM-device backend driver"
+ tristate "TPM-device backend driver"
default n
help
The TPM-device backend driver
@@ -145,7 +153,7 @@
(domain 0), then you almost certainly want to say Y here.
config XEN_BLKDEV_TAP
- bool "Block device tap driver"
+ tristate "Block device tap driver"
default n
help
This driver allows a VM to interact on block device channels
@@ -154,7 +162,7 @@
space. Odds are that you want to say N here.
config XEN_TPMDEV_FRONTEND
- bool "TPM-device frontend driver"
+ tristate "TPM-device frontend driver"
default n
select TCG_TPM
select TCG_XEN
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar 1 19:47:25 2006
@@ -1,2 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
-obj-y := blkback.o xenbus.o interface.o vbd.o
+blkbk-y := blkback.o xenbus.o interface.o vbd.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Wed Mar 1 19:47:25 2006
@@ -29,14 +29,10 @@
* 64 should be enough to keep us competitive with Linux.
*/
static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
static int mmap_pages;
-
-static int __init set_blkif_reqs(char *str)
-{
- get_option(&str, &blkif_reqs);
- return 1;
-}
-__setup("blkif_reqs=", set_blkif_reqs);
/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats = 0;
@@ -574,10 +570,20 @@
list_add_tail(&pending_reqs[i].free_list, &pending_free);
blkif_xenbus_init();
+ __unsafe(THIS_MODULE);
return 0;
}
-__initcall(blkif_init);
+module_init(blkif_init);
+
+static void blkif_exit(void)
+{
+ BUG();
+}
+
+module_exit(blkif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
/*
* Local variables:
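
The blkif_reqs change above is the standard conversion from a boot-time __setup() hook to a module parameter, which works for both built-in and modular builds and appears under /sys/module/.../parameters/. A minimal kernel-style skeleton of the pattern (an illustrative module, not the blkback code itself):

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/init.h>

    static int reqs = 64;
    module_param_named(reqs, reqs, int, 0);
    MODULE_PARM_DESC(reqs, "Number of requests to allocate");

    static int __init example_init(void)
    {
        printk(KERN_INFO "example: reqs=%d\n", reqs);
        return 0;
    }

    static void __exit example_exit(void)
    {
        /* nothing to undo in this sketch */
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("Dual BSD/GPL");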
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/skbuff.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c Wed Mar 1 19:47:25 2006
@@ -16,6 +16,7 @@
/* Referenced in netback.c. */
/*static*/ kmem_cache_t *skbuff_cachep;
+EXPORT_SYMBOL(skbuff_cachep);
#define MAX_SKBUFF_ORDER 4
static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/smpboot.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c Wed Mar 1 19:47:25 2006
@@ -150,6 +150,11 @@
{
vcpu_guest_context_t ctxt;
struct task_struct *idle = idle_task(vcpu);
+#ifdef __x86_64__
+ struct desc_ptr *gdt_descr = &cpu_gdt_descr[vcpu];
+#else
+ struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, vcpu);
+#endif
if (vcpu == 0)
return;
@@ -171,8 +176,8 @@
ctxt.ldt_ents = 0;
- ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[vcpu].address);
- ctxt.gdt_ents = cpu_gdt_descr[vcpu].size / 8;
+ ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
+ ctxt.gdt_ents = gdt_descr->size / 8;
#ifdef __i386__
ctxt.user_regs.cs = __KERNEL_CS;
@@ -210,6 +215,11 @@
{
int cpu;
struct task_struct *idle;
+#ifdef __x86_64__
+ struct desc_ptr *gdt_descr;
+#else
+ struct Xgt_desc_struct *gdt_descr;
+#endif
cpu_data[0] = boot_cpu_data;
@@ -225,6 +235,22 @@
for_each_cpu_mask (cpu, cpu_possible_map) {
if (cpu == 0)
continue;
+
+#ifdef __x86_64__
+ gdt_descr = &cpu_gdt_descr[cpu];
+#else
+ gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+#endif
+ gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+ if (unlikely(!gdt_descr->address)) {
+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+ continue;
+ }
+ gdt_descr->size = GDT_SIZE;
+ memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
+ make_page_readonly(
+ (void *)gdt_descr->address,
+ XENFEAT_writable_descriptor_tables);
cpu_data[cpu] = boot_cpu_data;
cpu_2_logical_apicid[cpu] = cpu;
@@ -241,17 +267,6 @@
#endif
irq_ctx_init(cpu);
-
- cpu_gdt_descr[cpu].address =
- __get_free_page(GFP_KERNEL|__GFP_ZERO);
- BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE);
- cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
- memcpy((void *)cpu_gdt_descr[cpu].address,
- (void *)cpu_gdt_descr[0].address,
- cpu_gdt_descr[0].size);
- make_page_readonly(
- (void *)cpu_gdt_descr[cpu].address,
- XENFEAT_writable_descriptor_tables);
#ifdef CONFIG_HOTPLUG_CPU
if (xen_start_info->flags & SIF_INITDOMAIN)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/net_driver_util.c
--- a/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c Wed Mar 1 19:47:25 2006
@@ -30,6 +30,7 @@
#include <linux/if_ether.h>
#include <linux/err.h>
+#include <linux/module.h>
#include <xen/net_driver_util.h>
@@ -54,7 +55,7 @@
kfree(macstr);
return 0;
}
-
+EXPORT_SYMBOL(xen_net_read_mac);
/*
* Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar 1 19:47:25 2006
@@ -1,2 +1,5 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
+obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
-obj-y := netback.o xenbus.o interface.o loopback.o
+netbk-y := netback.o xenbus.o interface.o
+netloop-y := loopback.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/loopback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c Wed Mar 1 19:47:25 2006
@@ -178,6 +178,23 @@
return err;
}
+static void __init clean_loopback(int i)
+{
+ struct net_device *dev1, *dev2;
+ char dev_name[IFNAMSIZ];
+
+ sprintf(dev_name, "vif0.%d", i);
+ dev1 = dev_get_by_name(dev_name);
+ sprintf(dev_name, "veth%d", i);
+ dev2 = dev_get_by_name(dev_name);
+ if (dev1 && dev2) {
+ unregister_netdev(dev2);
+ unregister_netdev(dev1);
+ free_netdev(dev2);
+ free_netdev(dev1);
+ }
+}
+
static int __init loopback_init(void)
{
int i, err = 0;
@@ -190,6 +207,18 @@
}
module_init(loopback_init);
+
+static void __exit loopback_exit(void)
+{
+ int i;
+
+ for (i = nloopbacks; i-- > 0; )
+ clean_loopback(i);
+}
+
+module_exit(loopback_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
/*
* Local variables:
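
The exit path added above tears the device pair down in strict order: both ends are unregistered, so the network stack stops using them, before either is freed. A kernel-style sketch of that invariant, with illustrative names:

    #include <linux/netdevice.h>

    static void destroy_pair(struct net_device *a, struct net_device *b)
    {
        unregister_netdev(b);  /* detach from the stack first... */
        unregister_netdev(a);
        free_netdev(b);        /* ...then release the memory */
        free_netdev(a);
    }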
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/netback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Wed Mar 1 19:47:25 2006
@@ -505,14 +505,12 @@
/* Still too big to send right now? Set a callback. */
if (txreq.size > netif->remaining_credit) {
netif->remaining_credit = 0;
- netif->credit_timeout.expires =
- next_credit;
netif->credit_timeout.data =
(unsigned long)netif;
netif->credit_timeout.function =
tx_credit_callback;
- add_timer_on(&netif->credit_timeout,
- smp_processor_id());
+ __mod_timer(&netif->credit_timeout,
+ next_credit);
break;
}
}
@@ -811,6 +809,8 @@
&netif_be_dbg);
#endif
+ __unsafe(THIS_MODULE);
+
return 0;
}
@@ -821,6 +821,8 @@
module_init(netback_init);
module_exit(netback_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
/*
* Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Wed Mar 1 19:47:25 2006
@@ -114,6 +114,7 @@
/* Receive-ring batched refills. */
#define RX_MIN_TARGET 8
+#define RX_DFL_MIN_TARGET 64
#define RX_MAX_TARGET NET_RX_RING_SIZE
int rx_min_target, rx_max_target, rx_target;
struct sk_buff_head rx_batch;
@@ -1102,8 +1103,8 @@
spin_lock_init(&np->rx_lock);
skb_queue_head_init(&np->rx_batch);
- np->rx_target = RX_MIN_TARGET;
- np->rx_min_target = RX_MIN_TARGET;
+ np->rx_target = RX_DFL_MIN_TARGET;
+ np->rx_min_target = RX_DFL_MIN_TARGET;
np->rx_max_target = RX_MAX_TARGET;
init_timer(&np->rx_refill_timer);
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar 1 19:47:25 2006
@@ -54,9 +54,11 @@
void tpmif_disconnect_complete(tpmif_t * tpmif);
tpmif_t *tpmif_find(domid_t domid, long int instance);
void tpmif_interface_init(void);
+void tpmif_interface_exit(void);
void tpmif_schedule_work(tpmif_t * tpmif);
void tpmif_deschedule_work(tpmif_t * tpmif);
void tpmif_xenbus_init(void);
+void tpmif_xenbus_exit(void);
int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
int tpmif_vtpm_open(tpmif_t *tpmif, domid_t domain, u32 instance);
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c Wed Mar 1 19:47:25 2006
@@ -186,6 +186,12 @@
0, 0, NULL, NULL);
}
+void __init
+tpmif_interface_exit(void)
+{
+ kmem_cache_destroy(tpmif_cachep);
+}
+
/*
* Local variables:
* c-file-style: "linux"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c Wed Mar 1 19:47:25 2006
@@ -1092,7 +1092,20 @@
return 0;
}
-__initcall(tpmback_init);
+module_init(tpmback_init);
+
+static void __exit
+tpmback_exit(void)
+{
+
+ tpmif_xenbus_exit();
+ tpmif_interface_exit();
+ misc_deregister(&ibmvtpms_miscdevice);
+}
+
+module_exit(tpmback_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
/*
* Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar 1 19:47:25 2006
@@ -317,6 +317,11 @@
xenbus_register_backend(&tpmback);
}
+void tpmif_xenbus_exit(void)
+{
+ xenbus_unregister_driver(&tpmback);
+}
+
/*
* Local variables:
* c-file-style: "linux"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c Wed Mar 1 19:47:25 2006
@@ -480,6 +480,11 @@
xenbus_register_frontend(&tpmfront);
}
+static void __exit exit_tpm_xenbus(void)
+{
+ xenbus_unregister_driver(&tpmfront);
+}
+
static int
tpm_allocate_buffers(struct tpm_private *tp)
@@ -700,7 +705,18 @@
return 0;
}
-__initcall(tpmif_init);
+module_init(tpmif_init);
+
+static void __exit
+tpmif_exit(void)
+{
+ exit_tpm_xenbus();
+ gnttab_free_grant_references(gref_head);
+}
+
+module_exit(tpmif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
/*
* Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar 1 19:47:25 2006
@@ -23,11 +23,13 @@
unsigned short pad;
} __attribute__ ((packed));
-extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
+extern struct Xgt_desc_struct idt_descr;
+DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
+
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
- return ((struct desc_struct *)cpu_gdt_descr[cpu].address);
+ return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
}
#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
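
The header change above converts cpu_gdt_descr from an NR_CPUS-sized array into a per-CPU variable. The pattern, as a kernel-style sketch with an illustrative struct name: DEFINE_PER_CPU() goes in exactly one .c file, DECLARE_PER_CPU() in the shared header, and per_cpu() replaces the old array indexing.

    #include <linux/percpu.h>

    struct example_desc {
        unsigned short size;
        unsigned long address;
    } __attribute__((packed));

    DEFINE_PER_CPU(struct example_desc, example_gdt_descr);   /* in one .c file */
    DECLARE_PER_CPU(struct example_desc, example_gdt_descr);  /* in the header */

    static unsigned long example_gdt_address(unsigned int cpu)
    {
        return per_cpu(example_gdt_descr, cpu).address;  /* was descr[cpu].address */
    }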
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h Wed Mar 1 19:47:25 2006
@@ -18,8 +18,6 @@
#define pcibios_assign_all_busses() 0
#endif
#define pcibios_scan_all_fns(a, b) 0
-
-extern int no_iommu, force_iommu;
extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_IO 0x1000
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h Wed Mar 1 19:47:25 2006
@@ -169,7 +169,7 @@
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
-#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
#define FIRST_USER_ADDRESS 0
#ifndef __ASSEMBLY__
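
The USER_PTRS_PER_PGD change swaps truncating division for ceiling division: (n - 1)/d + 1 also counts a final, partially covered page-directory slot, which matters when TASK_SIZE is not an exact multiple of PGDIR_SIZE. A quick demonstration:

    #include <stdio.h>

    int main(void)
    {
        unsigned long d = 8;  /* stand-in for PGDIR_SIZE */
        unsigned long n;

        for (n = 15; n <= 17; n++)  /* stand-ins for TASK_SIZE */
            printf("n=%lu floor=%lu ceil=%lu\n", n, n / d, (n - 1) / d + 1);
        return 0;  /* at n=15 and n=17 the two formulas disagree */
    }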
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/linux/mm.h
--- a/linux-2.6-xen-sparse/include/linux/mm.h Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/linux/mm.h Wed Mar 1 19:47:25 2006
@@ -1064,7 +1064,11 @@
void drop_pagecache(void);
void drop_slab(void);
+#ifndef CONFIG_MMU
+#define randomize_va_space 0
+#else
extern int randomize_va_space;
+#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
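
Defining randomize_va_space as the constant 0 when CONFIG_MMU is off is a common kernel idiom: call sites compile unchanged, and the compiler folds the guarded branches away. A self-contained illustration (leave CONFIG_MMU undefined to exercise the constant path):

    #include <stdio.h>

    /* #define CONFIG_MMU 1 */        /* toggle to see both variants */
    #ifndef CONFIG_MMU
    #define randomize_va_space 0      /* constant: branches fold away */
    #else
    extern int randomize_va_space;    /* real tunable defined elsewhere */
    #endif

    int main(void)
    {
        if (randomize_va_space)
            printf("randomizing layout\n");
        else
            printf("randomization compiled out\n");
        return 0;
    }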
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/mm/page_alloc.c
--- a/linux-2.6-xen-sparse/mm/page_alloc.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/mm/page_alloc.c Wed Mar 1 19:47:25 2006
@@ -1017,7 +1017,7 @@
if (page)
goto got_pg;
- out_of_memory(gfp_mask, order);
+ out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/net/core/skbuff.c
--- a/linux-2.6-xen-sparse/net/core/skbuff.c Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/net/core/skbuff.c Wed Mar 1 19:47:25 2006
@@ -434,6 +434,9 @@
C(pkt_type);
C(ip_summed);
C(priority);
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ C(ipvs_property);
+#endif
C(protocol);
n->destructor = NULL;
#ifdef CONFIG_NETFILTER
@@ -441,13 +444,6 @@
C(nfct);
nf_conntrack_get(skb->nfct);
C(nfctinfo);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- C(nfct_reasm);
- nf_conntrack_get_reasm(skb->nfct_reasm);
-#endif
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
- C(ipvs_property);
-#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
C(nfct_reasm);
nf_conntrack_get_reasm(skb->nfct_reasm);
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/Makefile
--- a/tools/examples/Makefile Wed Mar 1 17:01:54 2006
+++ b/tools/examples/Makefile Wed Mar 1 19:47:25 2006
@@ -26,10 +26,11 @@
XEN_SCRIPTS += network-nat vif-nat
XEN_SCRIPTS += block
XEN_SCRIPTS += block-enbd block-nbd
-XEN_SCRIPTS += vtpm
-XEN_SCRIPT_DATA = xen-script-common.sh
+XEN_SCRIPTS += vtpm vtpm-delete
+XEN_SCRIPTS += xen-hotplug-cleanup
+XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
-XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh
+XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
XEN_HOTPLUG_DIR = /etc/hotplug
XEN_HOTPLUG_SCRIPTS = xen-backend.agent
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vif-common.sh
--- a/tools/examples/vif-common.sh Wed Mar 1 17:01:54 2006
+++ b/tools/examples/vif-common.sh Wed Mar 1 19:47:25 2006
@@ -125,7 +125,7 @@
#
function ip_of()
{
- ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed 's,/.*,,' | head
-1
+ ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed -n '1 s,/.*,,p'
}
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm
--- a/tools/examples/vtpm Wed Mar 1 17:01:54 2006
+++ b/tools/examples/vtpm Wed Mar 1 19:47:25 2006
@@ -1,7 +1,7 @@
#!/bin/sh
dir=$(dirname "$0")
-. "$dir/vtpm-common.sh"
+. "$dir/vtpm-hotplug-common.sh"
vtpm_fatal_error=0
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-common.sh
--- a/tools/examples/vtpm-common.sh Wed Mar 1 17:01:54 2006
+++ b/tools/examples/vtpm-common.sh Wed Mar 1 19:47:25 2006
@@ -17,21 +17,8 @@
#
dir=$(dirname "$0")
-. "$dir/xen-hotplug-common.sh"
-
-findCommand "$@"
-if [ "$command" != "online" ] &&
- [ "$command" != "offline" ] &&
- [ "$command" != "add" ] &&
- [ "$command" != "remove" ]
-then
- log err "Invalid command: $command"
- exit 1
-fi
-
-
-XENBUS_PATH="${XENBUS_PATH:?}"
-
+. "$dir/logging.sh"
+. "$dir/locking.sh"
VTPMDB="/etc/xen/vtpm.db"
@@ -58,7 +45,11 @@
function vtpm_resume() {
true
}
+ function vtpm_delete() {
+ true
+ }
fi
+
#Find the instance number for the vtpm given the name of the domain
# Parameters
@@ -66,7 +57,7 @@
# Return value
# Returns '0' if instance number could not be found, otherwise
# it returns the instance number in the variable 'instance'
-function find_instance () {
+function vtpmdb_find_instance () {
local vmname=$1
local ret=0
instance=`cat $VTPMDB | \
@@ -80,18 +71,17 @@
} \
}'`
if [ "$instance" != "" ]; then
- ret=1
- fi
- return $ret
+ ret=$instance
+ fi
+ echo "$ret"
}
# Check whether a particular instance number is still available
-# returns '1' if it is available
-function is_free_instancenum () {
+# returns "0" if it is not available, "1" otherwise.
+function vtpmdb_is_free_instancenum () {
local instance=$1
local avail=1
-
#Allowed instance number range: 1-255
if [ $instance -eq 0 -o $instance -gt 255 ]; then
avail=0
@@ -110,13 +100,13 @@
fi
done
fi
- return $avail
+ echo "$avail"
}
# Get an available instance number given the database
# Returns an unused instance number
-function get_free_instancenum () {
+function vtpmdb_get_free_instancenum () {
local ctr
local instances
local don
@@ -145,12 +135,12 @@
fi
let ctr=ctr+1
done
- let instance=$ctr
+ echo "$ctr"
}
# Add a domain name and instance number to the DB file
-function add_instance () {
+function vtpmdb_add_instance () {
local vmname=$1
local inst=$2
@@ -159,8 +149,8 @@
echo "#1st column: domain name" >> $VTPMDB
echo "#2nd column: TPM instance number" >> $VTPMDB
fi
- validate_entry $vmname $inst
- if [ $? -eq 0 ]; then
+ res=$(vtpmdb_validate_entry $vmname $inst)
+ if [ $res -eq 0 ]; then
echo "$vmname $inst" >> $VTPMDB
fi
}
@@ -168,11 +158,10 @@
#Validate whether an entry is the same as passed to this
#function
-function validate_entry () {
+function vtpmdb_validate_entry () {
local rc=0
local vmname=$1
local inst=$2
- local res
res=`cat $VTPMDB | \
gawk -vvmname=$vmname \
@@ -197,13 +186,15 @@
elif [ "$res" == "2" ]; then
let rc=2
fi
- return $rc
+ echo "$rc"
}
#Remove an entry from the vTPM database given its domain name
-function remove_entry () {
+#and instance number
+function vtpmdb_remove_entry () {
local vmname=$1
+ local instance=$2
local VTPMDB_TMP="$VTPMDB".tmp
`cat $VTPMDB | \
gawk -vvmname=$vmname \
@@ -214,6 +205,7 @@
'} > $VTPMDB_TMP`
if [ -e $VTPMDB_TMP ]; then
mv -f $VTPMDB_TMP $VTPMDB
+ vtpm_delete $instance
else
log err "Error creating temporary file '$VTPMDB_TMP'."
fi
@@ -222,7 +214,7 @@
# Find the reason for the creation of this device:
# Set global REASON variable to 'resume' or 'create'
-function get_create_reason () {
+function vtpm_get_create_reason () {
local resume=$(xenstore-read $XENBUS_PATH/resume)
if [ "$resume" == "True" ]; then
REASON="resume"
@@ -230,6 +222,7 @@
REASON="create"
fi
}
+
#Create a vTPM instance
# If no entry in the TPM database is found, the instance is
@@ -237,26 +230,23 @@
function vtpm_create_instance () {
local domname=$(xenstore_read "$XENBUS_PATH"/domain)
local res
- set +e
- get_create_reason
+ local instance
+ vtpm_get_create_reason
claim_lock vtpmdb
-
- find_instance $domname
- res=$?
- if [ $res -eq 0 ]; then
+ instance=$(vtpmdb_find_instance $domname)
+ if [ "$instance" == "0" ]; then
#Try to give the preferred instance to the domain
instance=$(xenstore_read "$XENBUS_PATH"/pref_instance)
if [ "$instance" != "" ]; then
- is_free_instancenum $instance
- res=$?
+ res=$(vtpmdb_is_free_instancenum $instance)
if [ $res -eq 0 ]; then
- get_free_instancenum
+ instance=$(vtpmdb_get_free_instancenum)
fi
else
- get_free_instancenum
+ instance=$(vtpmdb_get_free_instancenum)
fi
- add_instance $domname $instance
+ vtpmdb_add_instance $domname $instance
if [ "$REASON" == "create" ]; then
vtpm_create $instance
elif [ "$REASON" == "resume" ]; then
@@ -279,25 +269,40 @@
true
fi
xenstore_write $XENBUS_PATH/instance $instance
- set -e
-}
-
-
-#Remove an instance
+}
+
+
+#Remove an instance when a VM is terminating or suspending.
+#Since it is assumed that the VM will appear again, the
+#entry is kept in the VTPMDB file.
function vtpm_remove_instance () {
local domname=$(xenstore_read "$XENBUS_PATH"/domain)
- set +e
- find_instance $domname
- res=$?
- if [ $res -eq 0 ]; then
- #Something is really wrong with the DB
- log err "vTPM DB file $VTPMDB has no entry for '$domname'"
- else
+
+ claim_lock vtpmdb
+
+ instance=$(vtpmdb_find_instance $domname)
+
+ if [ "$instance" != "0" ]; then
if [ "$REASON" == "suspend" ]; then
vtpm_suspend $instance
fi
fi
- set -e
-}
-
-
+
+ release_lock vtpmdb
+}
+
+
+#Remove an entry in the VTPMDB file given the domain's name
+#1st parameter: The name of the domain
+function vtpm_delete_instance () {
+ local rc
+
+ claim_lock vtpmdb
+
+ instance=$(vtpmdb_find_instance $1)
+ if [ "$instance" != "0" ]; then
+ vtpmdb_remove_entry $1 $instance
+ fi
+
+ release_lock vtpmdb
+}
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.agent
--- a/tools/examples/xen-backend.agent Wed Mar 1 17:01:54 2006
+++ b/tools/examples/xen-backend.agent Wed Mar 1 19:47:25 2006
@@ -18,12 +18,7 @@
add)
;;
remove)
- # remove device frontend store entries
- xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
-
- # remove device backend store entries
- xenstore-rm -t "$XENBUS_PATH" || true
- xenstore-rm -t "error/$XENBUS_PATH" || true
+ /etc/xen/scripts/xen-hotplug-cleanup
;;
online)
;;
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.rules
--- a/tools/examples/xen-backend.rules Wed Mar 1 17:01:54 2006
+++ b/tools/examples/xen-backend.rules Wed Mar 1 19:47:25 2006
@@ -2,6 +2,4 @@
SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm
$env{ACTION}"
SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script}
online"
SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="offline",
RUN+="$env{script} offline"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/bin/bash -c
'/usr/bin/xenstore-rm -t $$(/usr/bin/xenstore-read $env{XENBUS_PATH}/frontend)'"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t
$env{XENBUS_PATH}"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t
error/$env{XENBUS_PATH}"
+SUBSYSTEM=="xen-backend", ACTION=="remove",
RUN+="/etc/xen/scripts/xen-hotplug-cleanup"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-common.sh
--- a/tools/examples/xen-hotplug-common.sh Wed Mar 1 17:01:54 2006
+++ b/tools/examples/xen-hotplug-common.sh Wed Mar 1 19:47:25 2006
@@ -17,19 +17,15 @@
dir=$(dirname "$0")
+. "$dir/logging.sh"
. "$dir/xen-script-common.sh"
+. "$dir/locking.sh"
exec 2>>/var/log/xen-hotplug.log
export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
export LANG="POSIX"
unset $(set | grep ^LC_ | cut -d= -f1)
-
-log() {
- local level="$1"
- shift
- logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
-}
fatal() {
xenstore_write "$XENBUS_PATH"/hotplug-status error
@@ -93,87 +89,4 @@
}
-#
-# Serialisation
-#
-
-LOCK_SLEEPTIME=1
-LOCK_SPINNING_RETRIES=5
-LOCK_RETRIES=10
-LOCK_BASEDIR=/var/run/xen-hotplug
-
-
-claim_lock()
-{
- local lockdir="$LOCK_BASEDIR/$1"
- mkdir -p "$LOCK_BASEDIR"
- _claim_lock "$lockdir"
-}
-
-
-release_lock()
-{
- _release_lock "$LOCK_BASEDIR/$1"
-}
-
-
-_claim_lock()
-{
- local lockdir="$1"
- local owner=$(_lock_owner "$lockdir")
- local retries=0
-
- while [ $retries -lt $LOCK_RETRIES ]
- do
- mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
- _update_lock_info "$lockdir" && return
-
- local new_owner=$(_lock_owner "$lockdir")
- if [ "$new_owner" != "$owner" ]
- then
- owner="$new_owner"
- retries=0
- fi
-
- if [ $retries -gt $LOCK_SPINNING_RETRIES ]
- then
- sleep $LOCK_SLEEPTIME
- else
- sleep 0
- fi
- retries=$(($retries + 1))
- done
- _steal_lock "$lockdir"
-}
-
-
-_release_lock()
-{
- trap sigerr ERR
- rm -rf "$1" 2>/dev/null || true
-}
-
-
-_steal_lock()
-{
- local lockdir="$1"
- local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
- log err "Forced to steal lock on $lockdir from $owner!"
- _release_lock "$lockdir"
- _claim_lock "$lockdir"
-}
-
-
-_lock_owner()
-{
- cat "$1/owner" 2>/dev/null || echo "unknown"
-}
-
-
-_update_lock_info()
-{
- echo "$$: $0" >"$1/owner"
-}
-
-
log debug "$@" "XENBUS_PATH=$XENBUS_PATH"
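
The locking helpers removed here (relocated to locking.sh) use mkdir as the mutual-exclusion primitive: directory creation is atomic, so exactly one contender succeeds. The same idea in C, as a sketch; the lock path and retry counts are illustrative:

    #include <stdio.h>
    #include <errno.h>
    #include <unistd.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    static int claim_lock(const char *dir)
    {
        int tries;

        for (tries = 0; tries < 10; tries++) {
            if (mkdir(dir, 0700) == 0)
                return 0;               /* creation is atomic: lock acquired */
            if (errno != EEXIST)
                return -1;              /* real failure, not contention */
            sleep(1);                   /* held by someone else: wait, retry */
        }
        return -1;                      /* caller may steal, as the script does */
    }

    static void release_lock(const char *dir)
    {
        rmdir(dir);
    }

    int main(void)
    {
        const char *lock = "/tmp/xen-hotplug-example.lck";

        if (claim_lock(lock) == 0) {
            puts("lock held");
            release_lock(lock);
        }
        return 0;
    }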
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/hvmloader/Makefile
--- a/tools/firmware/hvmloader/Makefile Wed Mar 1 17:01:54 2006
+++ b/tools/firmware/hvmloader/Makefile Wed Mar 1 19:47:25 2006
@@ -19,7 +19,7 @@
#
XEN_ROOT = ../../..
-include $(XEN_ROOT)/tools/Rules.mk
+include $(XEN_ROOT)/Config.mk
# The HVM loader is started in 32-bit mode at the address below:
LOADADDR = 0x100000
@@ -29,9 +29,13 @@
OBJECTS = hvmloader.o acpi_madt.o
-CC = gcc
+# Disable PIE/SSP if GCC supports them. They can break us.
+CFLAGS += $(call test-gcc-flag,$(CC),-nopie)
+CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector)
+CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
+
OBJCOPY = objcopy
-CFLAGS = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
+CFLAGS += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
CFLAGS += -m32 -march=i686
LDFLAGS = -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,$(LOADADDR)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vgabios/Makefile
--- a/tools/firmware/vgabios/Makefile Wed Mar 1 17:01:54 2006
+++ b/tools/firmware/vgabios/Makefile Wed Mar 1 19:47:25 2006
@@ -1,6 +1,4 @@
CC = gcc
-CFLAGS = -g -O2 -Wall -Wstrict-prototypes
-LDFLAGS =
GCC = gcc
BCC = bcc
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vmxassist/Makefile
--- a/tools/firmware/vmxassist/Makefile Wed Mar 1 17:01:54 2006
+++ b/tools/firmware/vmxassist/Makefile Wed Mar 1 19:47:25 2006
@@ -19,7 +19,7 @@
#
XEN_ROOT = ../../..
-include $(XEN_ROOT)/tools/Rules.mk
+include $(XEN_ROOT)/Config.mk
# The emulator code lives in ROM space
TEXTADDR=0x000D0000
@@ -27,11 +27,14 @@
DEFINES=-DDEBUG -DTEXTADDR=$(TEXTADDR)
XENINC=-I$(XEN_ROOT)/tools/libxc
-LD = ld
-CC = gcc
+# Disable PIE/SSP if GCC supports them. They can break us.
+CFLAGS += $(call test-gcc-flag,$(CC),-nopie)
+CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector)
+CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
+
CPP = cpp -P
OBJCOPY = objcopy -p -O binary -R .note -R .comment -R .bss -S --gap-fill=0
-CFLAGS = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
+CFLAGS += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
CFLAGS += -m32 -march=i686
LDFLAGS = -m elf_i386
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/Makefile
--- a/tools/ioemu/Makefile Wed Mar 1 17:01:54 2006
+++ b/tools/ioemu/Makefile Wed Mar 1 19:47:25 2006
@@ -1,6 +1,9 @@
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
-include config-host.mak
-CFLAGS=-Wall -O2 -g -fno-strict-aliasing
+CFLAGS+=-Wall -O2 -g -fno-strict-aliasing
ifdef CONFIG_DARWIN
CFLAGS+= -mdynamic-no-pic
endif
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/ide.c
--- a/tools/ioemu/hw/ide.c Wed Mar 1 17:01:54 2006
+++ b/tools/ioemu/hw/ide.c Wed Mar 1 19:47:25 2006
@@ -669,9 +669,6 @@
}
if (s->io_buffer_index >= s->io_buffer_size && s->nsector == 0) {
s->status = READY_STAT | SEEK_STAT;
- s->bmdma->status &= ~BM_STATUS_DMAING;
- s->bmdma->status |= BM_STATUS_INT;
- ide_set_irq(s);
#ifdef DEBUG_IDE_ATAPI
printf("dma status=0x%x\n", s->status);
#endif
@@ -738,9 +735,6 @@
if (n == 0) {
/* end of transfer */
s->status = READY_STAT | SEEK_STAT;
- s->bmdma->status &= ~BM_STATUS_DMAING;
- s->bmdma->status |= BM_STATUS_INT;
- ide_set_irq(s);
return 0;
}
if (n > MAX_MULT_SECTORS)
@@ -987,9 +981,6 @@
if (s->packet_transfer_size <= 0) {
s->status = READY_STAT;
s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO |
ATAPI_INT_REASON_CD;
- s->bmdma->status &= ~BM_STATUS_DMAING;
- s->bmdma->status |= BM_STATUS_INT;
- ide_set_irq(s);
#ifdef DEBUG_IDE_ATAPI
printf("dma status=0x%x\n", s->status);
#endif
@@ -2025,6 +2016,17 @@
}
}
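+/* Complete a DMA transfer: clear the DMAING status bit, latch the
+ * interrupt status bit and raise the IDE IRQ exactly once. */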
+static void ide_dma_finish(BMDMAState *bm)
+{
+ IDEState *s = bm->ide_if;
+
+ bm->status &= ~BM_STATUS_DMAING;
+ bm->status |= BM_STATUS_INT;
+ bm->dma_cb = NULL;
+ bm->ide_if = NULL;
+ ide_set_irq(s);
+}
+
/* XXX: full callback usage to prepare non blocking I/Os support -
error handling */
#ifdef DMA_MULTI_THREAD
@@ -2070,9 +2072,8 @@
cur_addr += 8;
}
/* end of transfer */
- the_end:
- bm->dma_cb = NULL;
- bm->ide_if = NULL;
+the_end:
+ ide_dma_finish(bm);
}
static void ide_dma_start(IDEState *s, IDEDMAFunc *dma_cb)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/pcnet.c
--- a/tools/ioemu/hw/pcnet.c Wed Mar 1 17:01:54 2006
+++ b/tools/ioemu/hw/pcnet.c Wed Mar 1 19:47:25 2006
@@ -376,6 +376,10 @@
if (s->recv_pos > 0)
return 0;
+ pcnet_rdte_poll(s);
+ if (!(CSR_CRST(s) & 0x8000)) {
+ return 0;
+ }
return sizeof(s->buffer)-16;
}
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/target-i386-dm/Makefile
--- a/tools/ioemu/target-i386-dm/Makefile Wed Mar 1 17:01:54 2006
+++ b/tools/ioemu/target-i386-dm/Makefile Wed Mar 1 19:47:25 2006
@@ -1,7 +1,8 @@
+include config.mak
+override TARGET_ARCH=i386
+
XEN_ROOT=../../..
include $(XEN_ROOT)/tools/Rules.mk
-include config.mak
-override TARGET_ARCH=i386
INSTALL_DIR := $(DESTDIR)/usr/$(LIBDIR)/xen/bin
TARGET_PATH=$(SRC_PATH)/target-$(TARGET_ARCH)
@@ -12,7 +13,7 @@
VPATH+=:$(SRC_PATH)/linux-user
DEFINES+=-I$(SRC_PATH)/linux-user -I$(SRC_PATH)/linux-user/$(TARGET_ARCH)
endif
-CFLAGS=-Wall -O2 -g -fno-strict-aliasing
+CFLAGS+=-Wall -O2 -g -fno-strict-aliasing
LDFLAGS=-g
LIBS=
HELPER_CFLAGS=$(CFLAGS)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c Wed Mar 1 17:01:54 2006
+++ b/tools/libxc/xc_linux_build.c Wed Mar 1 19:47:25 2006
@@ -45,6 +45,77 @@
#ifdef __ia64__
#define probe_aout9(image,image_size,load_funcs) 1
#endif
+
+static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
+ [XENFEAT_writable_page_tables] = "writable_page_tables",
+ [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
+ [XENFEAT_auto_translated_physmap] = "auto_translated_physmap",
+ [XENFEAT_supervisor_mode_kernel] = "supervisor_mode_kernel",
+ [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb"
+};
+
+static inline void set_feature_bit (int nr, uint32_t *addr)
+{
+ addr[nr>>5] |= (1<<(nr&31));
+}
+
+static inline int test_feature_bit(int nr, uint32_t *addr)
+{
+ return !!(addr[nr>>5] & (1<<(nr&31)));
+}
+
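+/*
+ * Parse a feature string of the form "feat1|feat2|..." terminated by a
+ * comma or NUL. A '!' prefix marks a feature the guest kernel requires.
+ */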
+static int parse_features(
+ const char *feats,
+ uint32_t supported[XENFEAT_NR_SUBMAPS],
+ uint32_t required[XENFEAT_NR_SUBMAPS])
+{
+ const char *end, *p;
+ int i, req;
+
+ if ( (end = strchr(feats, ',')) == NULL )
+ end = feats + strlen(feats);
+
+ while ( feats < end )
+ {
+ p = strchr(feats, '|');
+ if ( (p == NULL) || (p > end) )
+ p = end;
+
+ req = (*feats == '!');
+ if ( req )
+ feats++;
+
+ for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
+ {
+ if ( feature_names[i] == NULL )
+ continue;
+
+ if ( strncmp(feature_names[i], feats, p-feats) == 0 )
+ {
+ set_feature_bit(i, supported);
+ if ( required && req )
+ set_feature_bit(i, required);
+ break;
+ }
+ }
+
+ if ( i == XENFEAT_NR_SUBMAPS*32 )
+ {
+ ERROR("Unknown feature \"%.*s\".\n", (int)(p-feats), feats);
+ if ( req )
+ {
+ ERROR("Kernel requires an unknown hypervisor feature.\n");
+ return -EINVAL;
+ }
+ }
+
+ feats = p;
+ if ( *feats == '|' )
+ feats++;
+ }
+
+ return -EINVAL;
+}
static int probeimageformat(char *image,
unsigned long image_size,
@@ -344,7 +415,8 @@
unsigned long shared_info_frame,
unsigned long flags,
unsigned int store_evtchn, unsigned long *store_mfn,
- unsigned int console_evtchn, unsigned long *console_mfn)
+ unsigned int console_evtchn, unsigned long *console_mfn,
+ uint32_t required_features[XENFEAT_NR_SUBMAPS])
{
unsigned long *page_array = NULL;
struct load_funcs load_funcs;
@@ -483,7 +555,8 @@
unsigned long shared_info_frame,
unsigned long flags,
unsigned int store_evtchn, unsigned long *store_mfn,
- unsigned int console_evtchn, unsigned long *console_mfn)
+ unsigned int console_evtchn, unsigned long *console_mfn,
+ uint32_t required_features[XENFEAT_NR_SUBMAPS])
{
unsigned long *page_array = NULL;
unsigned long count, i, hypercall_pfn;
@@ -515,8 +588,9 @@
unsigned long vpt_start;
unsigned long vpt_end;
unsigned long v_end;
- unsigned shadow_mode_enabled;
unsigned long guest_store_mfn, guest_console_mfn, guest_shared_info_mfn;
+ unsigned long shadow_mode_enabled;
+ uint32_t supported_features[XENFEAT_NR_SUBMAPS] = { 0, };
rc = probeimageformat(image, image_size, &load_funcs);
if ( rc != 0 )
@@ -534,8 +608,6 @@
goto error_out;
}
- shadow_mode_enabled = !!strstr(dsi.xen_guest_string,
- "SHADOW=translate");
/*
* Why do we need this? The number of page-table frames depends on the
* size of the bootstrap address space. But the size of the address space
@@ -637,6 +709,35 @@
(load_funcs.loadimage)(image, image_size, xc_handle, dom, page_array,
&dsi);
+ /* Parse and validate kernel features. */
+ p = strstr(dsi.xen_guest_string, "FEATURES=");
+ if ( p != NULL )
+ {
+ if ( !parse_features(p + strlen("FEATURES="),
+ supported_features,
+ required_features) )
+ {
+ ERROR("Failed to parse guest kernel features.\n");
+ goto error_out;
+ }
+
+ fprintf(stderr, "Supported features = { %08x }.\n",
+ supported_features[0]);
+ fprintf(stderr, "Required features = { %08x }.\n",
+ required_features[0]);
+ }
+
+ for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
+ {
+ if ( (supported_features[i]&required_features[i]) != required_features[i] )
+ {
+ ERROR("Guest kernel does not support a required feature.\n");
+ goto error_out;
+ }
+ }
+
+ shadow_mode_enabled = test_feature_bit(XENFEAT_auto_translated_physmap, required_features);
+
/* Load the initial ramdisk image. */
if ( initrd_len != 0 )
{
@@ -870,6 +971,7 @@
const char *image_name,
const char *ramdisk_name,
const char *cmdline,
+ const char *features,
unsigned long flags,
unsigned int store_evtchn,
unsigned long *store_mfn,
@@ -886,6 +988,16 @@
char *image = NULL;
unsigned long image_size, initrd_size=0;
unsigned long vstartinfo_start, vkern_entry, vstack_start;
+ uint32_t features_bitmap[XENFEAT_NR_SUBMAPS] = { 0, };
+
+ if ( features != NULL )
+ {
+ if ( !parse_features(features, features_bitmap, NULL) )
+ {
+ PERROR("Failed to parse configured features\n");
+ goto error_out;
+ }
+ }
if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 )
{
@@ -940,7 +1052,8 @@
&vstack_start, ctxt, cmdline,
op.u.getdomaininfo.shared_info_frame,
flags, store_evtchn, store_mfn,
- console_evtchn, console_mfn) < 0 )
+ console_evtchn, console_mfn,
+ features_bitmap) < 0 )
{
ERROR("Error constructing guest OS");
goto error_out;
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h Wed Mar 1 17:01:54 2006
+++ b/tools/libxc/xenguest.h Wed Mar 1 19:47:25 2006
@@ -47,6 +47,7 @@
const char *image_name,
const char *ramdisk_name,
const char *cmdline,
+ const char *features,
unsigned long flags,
unsigned int store_evtchn,
unsigned long *store_mfn,
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/pygrub/src/pygrub
--- a/tools/pygrub/src/pygrub Wed Mar 1 17:01:54 2006
+++ b/tools/pygrub/src/pygrub Wed Mar 1 19:47:25 2006
@@ -94,11 +94,17 @@
return struct.unpack("<L", buf[poff+8:poff+12])[0] * SECTOR_SIZE
return -1
-def get_config(fn):
+def get_config(fn, isconfig = False):
if not os.access(fn, os.R_OK):
raise RuntimeError, "Unable to access %s" %(fn,)
cf = grub.GrubConf.GrubConfigFile()
+
+ if isconfig:
+ # set the config file and parse it
+ cf.filename = fn
+ cf.parse()
+ return cf
offset = 0
if is_disk_image(fn):
@@ -130,9 +136,7 @@
# then parse the grub config
cf.parse(buf)
else:
- # set the config file and parse it
- cf.filename = fn
- cf.parse()
+ raise RuntimeError, "Unable to read filesystem"
return cf
@@ -214,7 +218,8 @@
try:
opts, args = getopt.gnu_getopt(sys.argv[1:], 'qh::',
- ["quiet", "help", "output=", "entry="])
+ ["quiet", "help", "output=", "entry=",
+ "isconfig"])
except getopt.GetoptError:
usage()
sys.exit(1)
@@ -227,6 +232,7 @@
output = None
entry = None
interactive = True
+ isconfig = False
for o, a in opts:
if o in ("-q", "--quiet"):
interactive = False
@@ -239,13 +245,15 @@
entry = a
# specifying the entry to boot implies non-interactive
interactive = False
+ elif o in ("--isconfig",):
+ isconfig = True
if output is None or output == "-":
fd = sys.stdout.fileno()
else:
fd = os.open(output, os.O_WRONLY)
- cf = get_config(file)
+ cf = get_config(file, isconfig)
if interactive:
curses.wrapper(run_main)
else:
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Mar 1 17:01:54 2006
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Mar 1 19:47:25 2006
@@ -326,27 +326,29 @@
PyObject *kwds)
{
uint32_t dom;
- char *image, *ramdisk = NULL, *cmdline = "";
+ char *image, *ramdisk = NULL, *cmdline = "", *features = NULL;
int flags = 0;
int store_evtchn, console_evtchn;
unsigned long store_mfn = 0;
unsigned long console_mfn = 0;
- static char *kwd_list[] = { "dom", "store_evtchn",
- "console_evtchn", "image",
+ static char *kwd_list[] = { "dom", "store_evtchn",
+ "console_evtchn", "image",
/* optional */
- "ramdisk", "cmdline", "flags", NULL };
-
- if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssi", kwd_list,
+ "ramdisk", "cmdline", "flags",
+ "features", NULL };
+
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssis", kwd_list,
&dom, &store_evtchn,
- &console_evtchn, &image,
+ &console_evtchn, &image,
/* optional */
- &ramdisk, &cmdline, &flags) )
+ &ramdisk, &cmdline, &flags,
+ &features) )
return NULL;
if ( xc_linux_build(self->xc_handle, dom, image,
- ramdisk, cmdline, flags,
- store_evtchn, &store_mfn,
+ ramdisk, cmdline, features, flags,
+ store_evtchn, &store_mfn,
console_evtchn, &console_mfn) != 0 ) {
if (!errno)
errno = EINVAL;
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendBootloader.py
--- a/tools/python/xen/xend/XendBootloader.py Wed Mar 1 17:01:54 2006
+++ b/tools/python/xen/xend/XendBootloader.py Wed Mar 1 19:47:25 2006
@@ -1,7 +1,7 @@
#
# XendBootloader.py - Framework to run a boot loader for picking the kernel
#
-# Copyright 2005 Red Hat, Inc.
+# Copyright 2005-2006 Red Hat, Inc.
# Jeremy Katz <katzj@xxxxxxxxxx>
#
# This software may be freely redistributed under the terms of the GNU
@@ -13,12 +13,11 @@
#
import os, select, errno
+import random
import sxp
from XendLogging import log
from XendError import VmError
-
-BL_FIFO = "/var/lib/xen/xenbl"
def bootloader(blexec, disk, quiet = 0, vcpus = None, entry = None):
"""Run the boot loader executable on the given disk and return a
@@ -38,14 +37,18 @@
log.error(msg)
raise VmError(msg)
- os.mkfifo(BL_FIFO, 0600)
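+ # Pick a random fifo name that is not already in use, so that several
+ # bootloader instances can run concurrently.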
+ while True:
+ fifo = "/var/lib/xen/xenbl.%s" %(random.randint(0, 32000),)
+ if not os.path.exists(fifo):
+ break
+ os.mkfifo(fifo, 0600)
child = os.fork()
if (not child):
args = [ blexec ]
if quiet:
args.append("-q")
- args.append("--output=%s" %(BL_FIFO,))
+ args.append("--output=%s" %(fifo,))
if entry is not None:
args.append("--entry=%s" %(entry,))
args.append(disk)
@@ -59,7 +62,7 @@
while 1:
try:
- r = os.open(BL_FIFO, os.O_RDONLY)
+ r = os.open(fifo, os.O_RDONLY)
except OSError, e:
if e.errno == errno.EINTR:
continue
@@ -74,7 +77,7 @@
os.waitpid(child, 0)
os.close(r)
- os.unlink(BL_FIFO)
+ os.unlink(fifo)
if len(ret) == 0:
msg = "Boot loader didn't return any data!"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Wed Mar 1 17:01:54 2006
+++ b/tools/python/xen/xend/XendDomainInfo.py Wed Mar 1 19:47:25 2006
@@ -1502,15 +1502,14 @@
if not self.info['bootloader']:
return
# if we're restarting with a bootloader, we need to run it
- # FIXME: this assumes the disk is the first device and
- # that we're booting from the first disk
blcfg = None
config = self.sxpr()
# FIXME: this assumes that we want to use the first disk
- dev = sxp.child_value(config, "device")
- if dev:
- disk = sxp.child_value(dev, "uname")
- fn = blkdev_uname_to_file(disk)
+ for dev in sxp.children(config, "device"):
+ disk = sxp.child(dev, "vbd")
+ if disk is None:
+ continue
+ fn = blkdev_uname_to_file(sxp.child_value(disk, "uname"))
blcfg = bootloader(self.info['bootloader'], fn, 1,
self.info['vcpus'])
if blcfg is None:
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py Wed Mar 1 17:01:54 2006
+++ b/tools/python/xen/xend/image.py Wed Mar 1 19:47:25 2006
@@ -68,6 +68,7 @@
self.kernel = None
self.ramdisk = None
self.cmdline = None
+ self.features = None
self.configure(imageConfig, deviceConfig)
@@ -89,6 +90,7 @@
if args:
self.cmdline += " " + args
self.ramdisk = get_cfg("ramdisk", '')
+ self.features = get_cfg("features", '')
self.vm.storeVm(("image/ostype", self.ostype),
("image/kernel", self.kernel),
@@ -175,13 +177,15 @@
log.debug("cmdline = %s", self.cmdline)
log.debug("ramdisk = %s", self.ramdisk)
log.debug("vcpus = %d", self.vm.getVCpuCount())
+ log.debug("features = %s", self.features)
return xc.linux_build(dom = self.vm.getDomid(),
image = self.kernel,
store_evtchn = store_evtchn,
console_evtchn = console_evtchn,
cmdline = self.cmdline,
- ramdisk = self.ramdisk)
+ ramdisk = self.ramdisk,
+ features = self.features)
class HVMImageHandler(ImageHandler):
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/server/netif.py
--- a/tools/python/xen/xend/server/netif.py Wed Mar 1 17:01:54 2006
+++ b/tools/python/xen/xend/server/netif.py Wed Mar 1 19:47:25 2006
@@ -113,7 +113,8 @@
script.replace(xroot.network_script_dir + os.sep,
"")])
if ip:
- result.append(['ip', ip.split(" ")])
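+ # 'ip' may hold several space-separated addresses; emit one
+ # ['ip', addr] entry per address.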
+ for i in ip.split(" "):
+ result.append(['ip', i])
if bridge:
result.append(['bridge', bridge])
if mac:
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py Wed Mar 1 17:01:54 2006
+++ b/tools/python/xen/xm/create.py Wed Mar 1 19:47:25 2006
@@ -137,6 +137,10 @@
fn=set_value, default='',
use="Path to ramdisk.")
+gopts.var('features', val='FEATURES',
+ fn=set_value, default='',
+ use="Features to enable in guest kernel")
+
gopts.var('builder', val='FUNCTION',
fn=set_value, default='linux',
use="Function to use to build the domain.")
@@ -445,6 +449,8 @@
config_image.append(['root', cmdline_root])
if vals.extra:
config_image.append(['args', vals.extra])
+ if vals.features:
+ config_image.append(['features', vals.features])
if vals.builder == 'hvm':
configure_hvm(config_image, vals)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/tests/Makefile
--- a/tools/tests/Makefile Wed Mar 1 17:01:54 2006
+++ b/tools/tests/Makefile Wed Mar 1 19:47:25 2006
@@ -4,13 +4,12 @@
TARGET := test_x86_emulator
-CC := gcc
-CFLAGS := -O2 -Wall -Werror -D__TEST_HARNESS__
+HOSTCFLAGS += -D__TEST_HARNESS__
all: $(TARGET)
$(TARGET): x86_emulate.o test_x86_emulator.o
- $(CC) -o $@ $^
+ $(HOSTCC) -o $@ $^
clean:
rm -rf $(TARGET) *.o *~ core
@@ -18,7 +17,7 @@
install:
x86_emulate.o: $(XEN_ROOT)/xen/arch/x86/x86_emulate.c
- $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
+ $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
%.o: %.c
- $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
+ $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xenstore/xs.c
--- a/tools/xenstore/xs.c Wed Mar 1 17:01:54 2006
+++ b/tools/xenstore/xs.c Wed Mar 1 19:47:25 2006
@@ -31,7 +31,6 @@
#include <signal.h>
#include <stdint.h>
#include <errno.h>
-#include <sys/ioctl.h>
#include <pthread.h>
#include "xs.h"
#include "list.h"
@@ -343,7 +342,6 @@
free(ret);
saved_errno = EBADF;
goto close_fd;
-
}
return ret;
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/configure.ac
--- a/tools/xm-test/configure.ac Wed Mar 1 17:01:54 2006
+++ b/tools/xm-test/configure.ac Wed Mar 1 19:47:25 2006
@@ -93,6 +93,7 @@
tests/unpause/Makefile
tests/vcpu-pin/Makefile
tests/vcpu-disable/Makefile
+ tests/vtpm/Makefile
tests/enforce_dom0_cpus/Makefile
lib/XmTestReport/xmtest.py
lib/XmTestLib/config.py
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/Network.py
--- a/tools/xm-test/lib/XmTestLib/Network.py Wed Mar 1 17:01:54 2006
+++ b/tools/xm-test/lib/XmTestLib/Network.py Wed Mar 1 19:47:25 2006
@@ -22,6 +22,7 @@
import sys;
import os;
import atexit;
+import random;
from Test import *
from Xm import *
@@ -53,12 +54,22 @@
if rc == 0:
SKIP("Zeroconf address found: " + out)
+ # Randomize one octet of the IP addresses we choose, so that
+ # multiple machines running network tests don't interfere
+ # with each other.
+ self.subnet = random.randint(1,254)
+
def calc_ip_address(self, dom, interface):
# Generate an IP address from the dom# and eth#:
- # 169.254.(eth#+153).(dom#+10)
+ # 169.254.(self.subnet).(eth#*16 + dom# + 1)
ethnum = int(interface[len("eth"):])
+ if (ethnum > 15):
+ raise NetworkError("ethnum > 15 : " + interface)
domnum = int(dom[len("dom"):])
- return "169.254."+ str(ethnum+153) + "." + str(domnum+10)
+ if (domnum > 14):
+ raise NetworkError("domnum > 14 : " + dom)
+
+ return "169.254."+ str(self.subnet) + "." + str(ethnum*16+domnum+1)
def ip(self, dom, interface, todomname=None, toeth=None, bridge=None):
newip = self.calc_ip_address(dom, interface)
@@ -96,4 +107,4 @@
return newip
def mask(self, dom, interface):
- return "255.255.255.0"
+ return "255.255.255.240"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/XenDomain.py
--- a/tools/xm-test/lib/XmTestLib/XenDomain.py Wed Mar 1 17:01:54 2006
+++ b/tools/xm-test/lib/XmTestLib/XenDomain.py Wed Mar 1 19:47:25 2006
@@ -99,6 +99,7 @@
# These options need to be lists
self.defaultOpts["disk"] = []
self.defaultOpts["vif"] = []
+ self.defaultOpts["vtpm"] = []
self.opts = self.defaultOpts
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/Makefile.am
--- a/tools/xm-test/tests/Makefile.am Wed Mar 1 17:01:54 2006
+++ b/tools/xm-test/tests/Makefile.am Wed Mar 1 19:47:25 2006
@@ -23,6 +23,7 @@
unpause \
vcpu-disable \
vcpu-pin \
+ vtpm \
enforce_dom0_cpus \
save restore migrate
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/Rules.mk
--- a/xen/Rules.mk Wed Mar 1 17:01:54 2006
+++ b/xen/Rules.mk Wed Mar 1 19:47:25 2006
@@ -45,7 +45,7 @@
include $(BASEDIR)/arch/$(TARGET_ARCH)/Rules.mk
-CFLAGS += -g
+CFLAGS += -g -D__XEN__
ifneq ($(debug),y)
CFLAGS += -DNDEBUG
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/vmx/vmx_hypercall.c
--- a/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar 1 19:47:25 2006
@@ -57,45 +57,7 @@
vcpu_set_gr(vcpu, 8, ret, 0);
vmx_vcpu_increment_iip(vcpu);
}
-/* turn off temporarily, we will merge hypercall parameter convention with xeno, when
- VTI domain need to call hypercall */
-#if 0
-unsigned long __hypercall_create_continuation(
- unsigned int op, unsigned int nr_args, ...)
-{
- struct mc_state *mcs = &mc_state[smp_processor_id()];
- VCPU *vcpu = current;
- struct cpu_user_regs *regs = vcpu_regs(vcpu);
- unsigned int i;
- va_list args;
-
- va_start(args, nr_args);
- if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
- panic("PREEMPT happen in multicall\n"); // Not support yet
- } else {
- vcpu_set_gr(vcpu, 15, op, 0);
- for ( i = 0; i < nr_args; i++) {
- switch (i) {
- case 0: vcpu_set_gr(vcpu, 16, va_arg(args, unsigned long), 0);
- break;
- case 1: vcpu_set_gr(vcpu, 17, va_arg(args, unsigned long), 0);
- break;
- case 2: vcpu_set_gr(vcpu, 18, va_arg(args, unsigned long), 0);
- break;
- case 3: vcpu_set_gr(vcpu, 19, va_arg(args, unsigned long), 0);
- break;
- case 4: vcpu_set_gr(vcpu, 20, va_arg(args, unsigned long), 0);
- break;
- default: panic("Too many args for hypercall continuation\n");
- break;
- }
- }
- }
- vcpu->arch.hypercall_continuation = 1;
- va_end(args);
- return op;
-}
-#endif
+
void hyper_dom_mem_op(void)
{
VCPU *vcpu=current;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/xen/process.c
--- a/xen/arch/ia64/xen/process.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/ia64/xen/process.c Wed Mar 1 19:47:25 2006
@@ -801,30 +801,48 @@
reflect_interruption(isr,regs,vector);
}
-unsigned long __hypercall_create_continuation(
- unsigned int op, unsigned int nr_args, ...)
+unsigned long hypercall_create_continuation(
+ unsigned int op, const char *format, ...)
{
struct mc_state *mcs = &mc_state[smp_processor_id()];
struct vcpu *v = current;
+ const char *p = format;
+ unsigned long arg;
unsigned int i;
va_list args;
- va_start(args, nr_args);
+ va_start(args, format);
if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
panic("PREEMPT happen in multicall\n"); // Not support yet
} else {
vcpu_set_gr(v, 2, op, 0);
- for ( i = 0; i < nr_args; i++) {
+ for ( i = 0; *p != '\0'; i++) {
+ switch ( *p++ )
+ {
+ case 'i':
+ arg = (unsigned long)va_arg(args, unsigned int);
+ break;
+ case 'l':
+ arg = (unsigned long)va_arg(args, unsigned long);
+ break;
+ case 'p':
+ case 'h':
+ arg = (unsigned long)va_arg(args, void *);
+ break;
+ default:
+ arg = 0;
+ BUG();
+ }
switch (i) {
- case 0: vcpu_set_gr(v, 14, va_arg(args, unsigned long), 0);
+ case 0: vcpu_set_gr(v, 14, arg, 0);
break;
- case 1: vcpu_set_gr(v, 15, va_arg(args, unsigned long), 0);
+ case 1: vcpu_set_gr(v, 15, arg, 0);
break;
- case 2: vcpu_set_gr(v, 16, va_arg(args, unsigned long), 0);
+ case 2: vcpu_set_gr(v, 16, arg, 0);
break;
- case 3: vcpu_set_gr(v, 17, va_arg(args, unsigned long), 0);
+ case 3: vcpu_set_gr(v, 17, arg, 0);
break;
- case 4: vcpu_set_gr(v, 18, va_arg(args, unsigned long), 0);
+ case 4: vcpu_set_gr(v, 18, arg, 0);
break;
default: panic("Too many args for hypercall continuation\n");
break;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/Makefile Wed Mar 1 19:47:25 2006
@@ -33,6 +33,10 @@
endif
endif
+ifneq ($(supervisor_mode_kernel),y)
+OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS))
+endif
+
OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS))
OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS))
@@ -44,7 +48,7 @@
$(TARGET): $(TARGET)-syms boot/mkelf32
./boot/mkelf32 $(TARGET)-syms $(TARGET) 0x100000 \
- `nm $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
+ `$(NM) $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
$(CURDIR)/arch.o: $(OBJS)
$(LD) $(LDFLAGS) -r -o $@ $(OBJS)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Rules.mk
--- a/xen/arch/x86/Rules.mk Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/Rules.mk Wed Mar 1 19:47:25 2006
@@ -6,6 +6,7 @@
# 'make clean' before rebuilding.
#
pae ?= n
+supervisor_mode_kernel ?= n
CFLAGS += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing
CFLAGS += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe
@@ -32,6 +33,9 @@
CFLAGS += -DCONFIG_X86_PAE=1
endif
endif
+ifeq ($(supervisor_mode_kernel),y)
+CFLAGS += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
+endif
ifeq ($(TARGET_SUBARCH),x86_64)
CFLAGS += -m64 -mno-red-zone -fpic -fno-reorder-blocks
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/boot/mkelf32.c
--- a/xen/arch/x86/boot/mkelf32.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/boot/mkelf32.c Wed Mar 1 19:47:25 2006
@@ -244,7 +244,7 @@
inimage = argv[1];
outimage = argv[2];
- loadbase = strtoul(argv[3], NULL, 16);
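+ /* The load address may not fit in 32 bits, so parse it as 64-bit. */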
+ loadbase = strtoull(argv[3], NULL, 16);
final_exec_addr = strtoul(argv[4], NULL, 16);
infd = open(inimage, O_RDONLY);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/dom0_ops.c Wed Mar 1 19:47:25 2006
@@ -181,10 +181,13 @@
{
dom0_physinfo_t *pi = &op->u.physinfo;
- pi->threads_per_core = smp_num_siblings;
- pi->cores_per_socket = boot_cpu_data.x86_max_cores;
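+ /* Derive the topology from CPU 0's sibling and core maps rather than
+ from boot-time static values. */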
+ pi->threads_per_core =
+ cpus_weight(cpu_sibling_map[0]);
+ pi->cores_per_socket =
+ cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
pi->sockets_per_node =
- num_online_cpus() / (pi->threads_per_core * pi->cores_per_socket);
+ num_online_cpus() / cpus_weight(cpu_core_map[0]);
+
pi->nr_nodes = 1;
pi->total_pages = total_pages;
pi->free_pages = avail_domheap_pages();
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/domain.c Wed Mar 1 19:47:25 2006
@@ -351,17 +351,17 @@
if ( !(c->flags & VGCF_HVM_GUEST) )
{
- fixup_guest_selector(c->user_regs.ss);
- fixup_guest_selector(c->kernel_ss);
- fixup_guest_selector(c->user_regs.cs);
+ fixup_guest_stack_selector(c->user_regs.ss);
+ fixup_guest_stack_selector(c->kernel_ss);
+ fixup_guest_code_selector(c->user_regs.cs);
#ifdef __i386__
- fixup_guest_selector(c->event_callback_cs);
- fixup_guest_selector(c->failsafe_callback_cs);
+ fixup_guest_code_selector(c->event_callback_cs);
+ fixup_guest_code_selector(c->failsafe_callback_cs);
#endif
for ( i = 0; i < 256; i++ )
- fixup_guest_selector(c->trap_ctxt[i].cs);
+ fixup_guest_code_selector(c->trap_ctxt[i].cs);
}
else if ( !hvm_enabled )
return -EINVAL;
@@ -784,6 +784,11 @@
context_saved(prev);
+ /* Update per-VCPU guest runstate shared memory area (if registered). */
+ if ( next->runstate_guest != NULL )
+ __copy_to_user(next->runstate_guest, &next->runstate,
+ sizeof(next->runstate));
+
schedule_tail(next);
BUG();
}
@@ -820,56 +825,77 @@
flush_tlb_mask(v->vcpu_dirty_cpumask);
}
-unsigned long __hypercall_create_continuation(
- unsigned int op, unsigned int nr_args, ...)
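+/* Fetch the next continuation argument as directed by the format string:
+ * 'i' = unsigned int, 'l' = unsigned long, 'p'/'h' = pointer/handle. */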
+#define next_arg(fmt, args) ({ \
+ unsigned long __arg; \
+ switch ( *(fmt)++ ) \
+ { \
+ case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
+ case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
+ case 'p': __arg = (unsigned long)va_arg(args, void *); break; \
+ case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
+ default: __arg = 0; BUG(); \
+ } \
+ __arg; \
+})
+
+unsigned long hypercall_create_continuation(
+ unsigned int op, const char *format, ...)
{
struct mc_state *mcs = &mc_state[smp_processor_id()];
struct cpu_user_regs *regs;
+ const char *p = format;
+ unsigned long arg;
unsigned int i;
va_list args;
- va_start(args, nr_args);
+ va_start(args, format);
if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
{
__set_bit(_MCSF_call_preempted, &mcs->flags);
- for ( i = 0; i < nr_args; i++ )
- mcs->call.args[i] = va_arg(args, unsigned long);
+ for ( i = 0; *p != '\0'; i++ )
+ mcs->call.args[i] = next_arg(p, args);
}
else
{
regs = guest_cpu_user_regs();
#if defined(__i386__)
regs->eax = op;
- regs->eip -= 2; /* re-execute 'int 0x82' */
-
- for ( i = 0; i < nr_args; i++ )
- {
+
+ if ( supervisor_mode_kernel )
+ regs->eip &= ~31; /* re-execute entire hypercall entry stub */
+ else
+ regs->eip -= 2; /* re-execute 'int 0x82' */
+
+ for ( i = 0; *p != '\0'; i++ )
+ {
+ arg = next_arg(p, args);
switch ( i )
{
- case 0: regs->ebx = va_arg(args, unsigned long); break;
- case 1: regs->ecx = va_arg(args, unsigned long); break;
- case 2: regs->edx = va_arg(args, unsigned long); break;
- case 3: regs->esi = va_arg(args, unsigned long); break;
- case 4: regs->edi = va_arg(args, unsigned long); break;
- case 5: regs->ebp = va_arg(args, unsigned long); break;
+ case 0: regs->ebx = arg; break;
+ case 1: regs->ecx = arg; break;
+ case 2: regs->edx = arg; break;
+ case 3: regs->esi = arg; break;
+ case 4: regs->edi = arg; break;
+ case 5: regs->ebp = arg; break;
}
}
#elif defined(__x86_64__)
regs->rax = op;
regs->rip -= 2; /* re-execute 'syscall' */
- for ( i = 0; i < nr_args; i++ )
- {
+ for ( i = 0; *p != '\0'; i++ )
+ {
+ arg = next_arg(p, args);
switch ( i )
{
- case 0: regs->rdi = va_arg(args, unsigned long); break;
- case 1: regs->rsi = va_arg(args, unsigned long); break;
- case 2: regs->rdx = va_arg(args, unsigned long); break;
- case 3: regs->r10 = va_arg(args, unsigned long); break;
- case 4: regs->r8 = va_arg(args, unsigned long); break;
- case 5: regs->r9 = va_arg(args, unsigned long); break;
+ case 0: regs->rdi = arg; break;
+ case 1: regs->rsi = arg; break;
+ case 2: regs->rdx = arg; break;
+ case 3: regs->r10 = arg; break;
+ case 4: regs->r8 = arg; break;
+ case 5: regs->r9 = arg; break;
}
}
#endif
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/domain_build.c Wed Mar 1 19:47:25 2006
@@ -27,6 +27,9 @@
#include <asm/shadow.h>
#include <public/version.h>
+
+extern unsigned long initial_images_nrpages(void);
+extern void discard_initial_images(void);
static long dom0_nrpages;
@@ -181,7 +184,8 @@
{
printk("Unknown kernel feature \"%.*s\".\n",
(int)(p-feats), feats);
- panic("Domain 0 requires an unknown hypervisor feature.\n");
+ if ( req )
+ panic("Domain 0 requires an unknown hypervisor feature.\n");
}
feats = p;
@@ -248,9 +252,6 @@
uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };
- extern void translate_l2pgtable(
- struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);
-
/* Sanity! */
BUG_ON(d->domain_id != 0);
BUG_ON(d->vcpu[0] == NULL);
@@ -271,18 +272,14 @@
*/
if ( dom0_nrpages == 0 )
{
- dom0_nrpages = avail_domheap_pages() +
- ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
- ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
dom0_nrpages = -dom0_nrpages;
}
/* Negative memory specification means "all memory - specified amount". */
if ( dom0_nrpages < 0 )
- nr_pages = avail_domheap_pages() +
- ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
- ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
+ nr_pages = avail_domheap_pages() + initial_images_nrpages() +
dom0_nrpages;
else
nr_pages = dom0_nrpages;
@@ -704,16 +701,12 @@
hypercall_page_initialise((void *)hypercall_page);
}
- init_domheap_pages(
- _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);
-
- /* Copy the initial ramdisk and free temporary buffer. */
+ /* Copy the initial ramdisk. */
if ( initrd_len != 0 )
- {
memcpy((void *)vinitrd_start, initrd_start, initrd_len);
- init_domheap_pages(
- _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
- }
+
+ /* Free temporary buffers. */
+ discard_initial_images();
/* Set up start info area. */
si = (start_info_t *)vstartinfo_start;
@@ -790,6 +783,25 @@
{
shadow_mode_enable(d, SHM_enable);
update_pagetables(v);
+ }
+
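+ /* With supervisor_mode_kernel=y, dom0 runs in ring 0: clear the RPL
+ * bits of its initial segment selectors. */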
+ if ( supervisor_mode_kernel )
+ {
+ v->arch.guest_context.kernel_ss &= ~3;
+ v->arch.guest_context.user_regs.ss &= ~3;
+ v->arch.guest_context.user_regs.es &= ~3;
+ v->arch.guest_context.user_regs.ds &= ~3;
+ v->arch.guest_context.user_regs.fs &= ~3;
+ v->arch.guest_context.user_regs.gs &= ~3;
+ printk("Dom0 runs in ring 0 (supervisor mode)\n");
+ if ( !test_bit(XENFEAT_supervisor_mode_kernel,
+ dom0_features_supported) )
+ panic("Dom0 does not support supervisor-mode execution\n");
+ }
+ else
+ {
+ if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
+ panic("Dom0 requires supervisor-mode execution\n");
}
rc = 0;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/hvm.c Wed Mar 1 19:47:25 2006
@@ -25,6 +25,7 @@
#include <xen/sched.h>
#include <xen/irq.h>
#include <xen/softirq.h>
+#include <xen/domain.h>
#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/io.h>
@@ -59,9 +60,9 @@
for ( i = 0; i < nr_pfn; i++ )
{
- if ( pfn + i >= 0xfffff )
+ if ( pfn + i >= 0xfffff )
break;
-
+
__copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
}
}
@@ -217,7 +218,7 @@
global_iodata_t *spg;
u16 *virq_line, irqs;
struct hvm_virpic *pic = &v->domain->arch.hvm_domain.vpic;
-
+
spg = &get_sp(v->domain)->sp_global;
virq_line = &spg->pic_clear_irr;
if ( *virq_line ) {
@@ -312,6 +313,52 @@
}
/*
+ * Only called in HVM domain BSP context;
+ * when booting, vcpuid always equals apic_id.
+ */
+int hvm_bringup_ap(int vcpuid, int trampoline_vector)
+{
+ struct vcpu *bsp = current, *v;
+ struct domain *d = bsp->domain;
+ struct vcpu_guest_context *ctxt;
+ int rc = 0;
+
+ /* current must be HVM domain BSP */
+ if ( !(HVM_DOMAIN(bsp) && bsp->vcpu_id == 0) ) {
+ printk("Not calling hvm_bringup_ap from BSP context.\n");
+ domain_crash_synchronous();
+ }
+
+ if ( (v = d->vcpu[vcpuid]) == NULL )
+ return -ENOENT;
+
+ if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL ) {
+ printk("Failed to allocate memory in hvm_bringup_ap.\n");
+ return -ENOMEM;
+ }
+
+ hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
+
+ LOCK_BIGLOCK(d);
+ rc = -EEXIST;
+ if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+ rc = boot_vcpu(d, vcpuid, ctxt);
+ UNLOCK_BIGLOCK(d);
+
+ if ( rc != 0 )
+ printk("AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
+ else {
+ if ( test_and_clear_bit(_VCPUF_down, &d->vcpu[vcpuid]->vcpu_flags) )
+ vcpu_wake(d->vcpu[vcpuid]);
+ printk("AP %d bringup suceeded.\n", vcpuid);
+ }
+
+ xfree(ctxt);
+
+ return rc;
+}
+
+/*
* Local variables:
* mode: C
* c-set-style: "BSD"
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/emulate.c Wed Mar 1 19:47:25 2006
@@ -86,7 +86,7 @@
case 0x7:
value = regs->edi;
break;
-#if X86_64
+#if __x86_64__
case 0x8:
value = regs->r8;
break;
@@ -318,20 +318,14 @@
/* Get the register/mode number of src register in ModRM register. */
-unsigned int decode_dest_reg(u8 m)
-{
-#if __x86_64__
- ASSERT(0); /* Need to adjust for REX prefix if applicable */
-#endif
- return (m >> 3) & 7;
-}
-
-unsigned int decode_src_reg(u8 m)
-{
-#if __x86_64__
- ASSERT(0); /* Need to adjust for REX prefix if applicable */
-#endif
- return m & 7;
+unsigned int decode_dest_reg(u8 prefix, u8 m)
+{
+ return DECODE_MODRM_REG(prefix, m);
+}
+
+unsigned int decode_src_reg(u8 prefix, u8 m)
+{
+ return DECODE_MODRM_RM(prefix, m);
}
@@ -431,7 +425,7 @@
* The caller can either pass a NULL pointer to the guest_eip_buf, or a pointer
* to enough bytes to satisfy the instruction including prefix bytes.
*/
-unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
+int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
enum instruction_index *list, unsigned int list_count,
u8 *guest_eip_buf, enum instruction_index *match)
{
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/intr.c
--- a/xen/arch/x86/hvm/svm/intr.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/intr.c Wed Mar 1 19:47:25 2006
@@ -80,12 +80,7 @@
{
struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
- switch(type)
- {
- case VLAPIC_DELIV_MODE_EXT:
- case VLAPIC_DELIV_MODE_FIXED:
- case VLAPIC_DELIV_MODE_LPRI:
- if ( is_pit_irq(v, vector, type) ) {
+ if ( is_pit_irq(v, vector, type) ) {
if ( !vpit->first_injected ) {
vpit->first_injected = 1;
vpit->pending_intr_nr = 0;
@@ -95,12 +90,15 @@
}
vpit->inject_point = NOW();
svm_set_tsc_shift (v, vpit);
- }
+ }
+
+ switch(type)
+ {
+ case VLAPIC_DELIV_MODE_EXT:
break;
default:
- printk("Not support interrupt type: %d\n", type);
- break;
+ vlapic_post_injection(v, vector, type);
}
}
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/svm.c Wed Mar 1 19:47:25 2006
@@ -164,7 +164,7 @@
}
static inline void svm_inject_exception(struct vmcb_struct *vmcb,
- int trap, int error_code)
+ int trap, int ev, int error_code)
{
eventinj_t event;
@@ -172,7 +172,7 @@
event.fields.v = 1;
event.fields.type = EVENTTYPE_EXCEPTION;
event.fields.vector = trap;
- event.fields.ev = 1;
+ event.fields.ev = ev;
event.fields.errorcode = error_code;
ASSERT(vmcb->eventinj.fields.v == 0);
@@ -237,61 +237,16 @@
}
#ifdef __x86_64__
-static struct svm_msr_state percpu_msr[NR_CPUS];
-
-static u32 msr_data_index[VMX_MSR_COUNT] =
-{
- MSR_LSTAR, MSR_STAR, MSR_CSTAR,
- MSR_SYSCALL_MASK, MSR_EFER,
-};
void svm_save_segments(struct vcpu *v)
{
- rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_svm.msr_content.shadow_gs);
-}
-
-/*
- * To avoid MSR save/restore at every VM exit/entry time, we restore
- * the x86_64 specific MSRs at domain switch time. Since those MSRs are
- * are not modified once set for generic domains, we don't save them,
- * but simply reset them to the values set at percpu_traps_init().
- */
+}
void svm_load_msrs(void)
{
- struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
- int i;
-
- while ( host_state->flags )
- {
- i = find_first_set_bit(host_state->flags);
- wrmsrl(msr_data_index[i], host_state->msr_items[i]);
- clear_bit(i, &host_state->flags);
- }
-}
-
-static void svm_save_init_msrs(void)
-{
- struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
- int i;
-
- for ( i = 0; i < SVM_MSR_COUNT; i++ )
- rdmsrl(msr_data_index[i], host_state->msr_items[i]);
-}
-
-#define CASE_READ_MSR(address) \
- case MSR_ ## address: \
- msr_content = msr->msr_items[SVM_INDEX_MSR_ ## address]; \
- break
-
-#define CASE_WRITE_MSR(address) \
- case MSR_ ## address: \
- msr->msr_items[SVM_INDEX_MSR_ ## address] = msr_content; \
- if (!test_bit(SVM_INDEX_MSR_ ## address, &msr->flags)) \
- { \
- set_bit(SVM_INDEX_MSR_ ## address, &msr->flags); \
- } \
- break
-
+}
+void svm_restore_msrs(struct vcpu *v)
+{
+}
#define IS_CANO_ADDRESS(add) 1
@@ -299,47 +254,45 @@
{
u64 msr_content = 0;
struct vcpu *vc = current;
- struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
+ // struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
switch (regs->ecx)
{
case MSR_EFER:
- msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
- HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n",
- (unsigned long long)msr_content);
-
- if (test_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state))
- msr_content |= 1 << _EFER_LME;
-
- if (SVM_LONG_GUEST(vc))
- msr_content |= 1 << _EFER_LMA;
-
+ // msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
+ msr_content = vmcb->efer;
+ msr_content &= ~EFER_SVME;
break;
case MSR_FS_BASE:
- if (!(SVM_LONG_GUEST(vc)))
- /* XXX should it be GP fault */
- domain_crash_synchronous();
-
msr_content = vmcb->fs.base;
break;
case MSR_GS_BASE:
- if (!(SVM_LONG_GUEST(vc)))
- domain_crash_synchronous();
-
msr_content = vmcb->gs.base;
break;
case MSR_SHADOW_GS_BASE:
- msr_content = msr->shadow_gs;
- break;
-
- CASE_READ_MSR(STAR);
- CASE_READ_MSR(LSTAR);
- CASE_READ_MSR(CSTAR);
- CASE_READ_MSR(SYSCALL_MASK);
+ msr_content = vmcb->kerngsbase;
+ break;
+
+ case MSR_STAR:
+ msr_content = vmcb->star;
+ break;
+
+ case MSR_LSTAR:
+ msr_content = vmcb->lstar;
+ break;
+
+ case MSR_CSTAR:
+ msr_content = vmcb->cstar;
+ break;
+
+ case MSR_SYSCALL_MASK:
+ msr_content = vmcb->sfmask;
+ break;
+
default:
return 0;
}
@@ -356,8 +309,6 @@
{
u64 msr_content = regs->eax | ((u64)regs->edx << 32);
struct vcpu *vc = current;
- struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
- struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
HVM_DBG_LOG(DBG_LEVEL_1, "mode_do_msr_write msr %lx msr_content %lx\n",
@@ -373,26 +324,20 @@
|| !test_bit(SVM_CPU_STATE_PAE_ENABLED,
&vc->arch.hvm_svm.cpu_state))
{
- svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+ svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
}
}
if (msr_content & EFER_LME)
set_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state);
+ /* We have already recorded that we want LME, so it will be set
+ * next time CR0 gets updated. So we clear that bit and continue.
+ */
+ if ((msr_content ^ vmcb->efer) & EFER_LME)
+ msr_content &= ~EFER_LME;
/* No update for LME/LMA since it have no effect */
- msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
- if (msr_content & ~(EFER_LME | EFER_LMA))
- {
- msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
- if (!test_bit(SVM_INDEX_MSR_EFER, &msr->flags))
- {
- rdmsrl(MSR_EFER, host_state->msr_items[SVM_INDEX_MSR_EFER]);
- set_bit(SVM_INDEX_MSR_EFER, &host_state->flags);
- set_bit(SVM_INDEX_MSR_EFER, &msr->flags);
- wrmsrl(MSR_EFER, msr_content);
- }
- }
+ vmcb->efer = msr_content | EFER_SVME;
break;
case MSR_FS_BASE:
@@ -403,63 +348,42 @@
if (!IS_CANO_ADDRESS(msr_content))
{
HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
- svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+ svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
}
if (regs->ecx == MSR_FS_BASE)
- vmcb->fs.base = msr_content;
+ vmcb->fs.base = msr_content;
else
- vmcb->gs.base = msr_content;
+ vmcb->gs.base = msr_content;
break;
case MSR_SHADOW_GS_BASE:
- if (!(SVM_LONG_GUEST(vc)))
- domain_crash_synchronous();
-
- vc->arch.hvm_svm.msr_content.shadow_gs = msr_content;
- wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
- break;
-
- CASE_WRITE_MSR(STAR);
- CASE_WRITE_MSR(LSTAR);
- CASE_WRITE_MSR(CSTAR);
- CASE_WRITE_MSR(SYSCALL_MASK);
+ vmcb->kerngsbase = msr_content;
+ break;
+
+ case MSR_STAR:
+ vmcb->star = msr_content;
+ break;
+
+ case MSR_LSTAR:
+ vmcb->lstar = msr_content;
+ break;
+
+ case MSR_CSTAR:
+ vmcb->cstar = msr_content;
+ break;
+
+ case MSR_SYSCALL_MASK:
+ vmcb->sfmask = msr_content;
+ break;
+
default:
return 0;
}
return 1;
}
-void
-svm_restore_msrs(struct vcpu *v)
-{
- int i = 0;
- struct svm_msr_state *guest_state;
- struct svm_msr_state *host_state;
- unsigned long guest_flags;
-
- guest_state = &v->arch.hvm_svm.msr_content;;
- host_state = &percpu_msr[smp_processor_id()];
-
- wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
- guest_flags = guest_state->flags;
- if (!guest_flags)
- return;
-
- while (guest_flags){
- i = find_first_set_bit(guest_flags);
-
- HVM_DBG_LOG(DBG_LEVEL_2,
- "restore guest's index %d msr %lx with %lx\n",
- i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]);
- set_bit(i, &host_state->flags);
- wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
- clear_bit(i, &guest_flags);
- }
-}
#else
-#define svm_save_init_msrs() ((void)0)
-
static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
{
return 0;
@@ -497,9 +421,28 @@
{
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
-
- mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
+ /* check which operating mode the guest is running */
+ if( vmcb->efer & EFER_LMA )
+ mode = vmcb->cs.attributes.fields.l ? 8 : 4;
+ else
+ mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
return svm_instrlen(guest_cpu_user_regs(), mode);
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+{
+ switch ( num )
+ {
+ case 0:
+ return v->arch.hvm_svm.cpu_shadow_cr0;
+ case 2:
+ return v->arch.hvm_svm.cpu_cr2;
+ case 3:
+ return v->arch.hvm_svm.cpu_cr3;
+ default:
+ BUG();
+ }
+ return 0; /* dummy */
}
int start_svm(void)
@@ -519,8 +462,6 @@
asidpool_init(smp_processor_id());
printk("AMD SVM Extension is enabled for cpu %d.\n", smp_processor_id());
- svm_save_init_msrs();
-
/* Setup HVM interfaces */
hvm_funcs.disable = stop_svm;
@@ -542,6 +483,7 @@
hvm_funcs.realmode = svm_realmode;
hvm_funcs.paging_enabled = svm_paging_enabled;
hvm_funcs.instruction_length = svm_instruction_length;
+ hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
hvm_enabled = 1;
@@ -631,8 +573,17 @@
}
#if defined (__x86_64__)
-void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *c )
-{
+void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v )
+{
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+ regs->rip = vmcb->rip;
+ regs->rsp = vmcb->rsp;
+ regs->rflags = vmcb->rflags;
+ regs->cs = vmcb->cs.sel;
+ regs->ds = vmcb->ds.sel;
+ regs->es = vmcb->es.sel;
+ regs->ss = vmcb->ss.sel;
}
#elif defined (__i386__)
void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v)
@@ -810,7 +761,8 @@
vpit = &v->domain->arch.hvm_domain.vpit;
kill_timer(&vpit->pit_timer);
kill_timer(&v->arch.hvm_svm.hlt_timer);
- if ( hvm_apic_support(v->domain) ) {
+ if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
+ {
kill_timer( &(VLAPIC(v)->vlapic_timer) );
xfree( VLAPIC(v) );
}
@@ -819,8 +771,29 @@
void arch_svm_do_resume(struct vcpu *v)
{
- svm_do_resume(v);
- reset_stack_and_jump(svm_asm_do_resume);
+ /* pinning VCPU to a different core? */
+ if ( v->arch.hvm_svm.launch_core == smp_processor_id()) {
+ svm_do_resume( v );
+ reset_stack_and_jump( svm_asm_do_resume );
+ }
+ else {
+ printk("VCPU core pinned: %d to %d\n", v->arch.hvm_svm.launch_core,
smp_processor_id() );
+ v->arch.hvm_svm.launch_core = smp_processor_id();
+ svm_migrate_timers( v );
+ svm_do_resume( v );
+ reset_stack_and_jump( svm_asm_do_resume );
+ }
+}
+
+
+void svm_migrate_timers(struct vcpu *v)
+{
+ struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
+
+ migrate_timer( &vpit->pit_timer, v->processor );
+ migrate_timer( &v->arch.hvm_svm.hlt_timer, v->processor );
+ if ( hvm_apic_support(v->domain) && VLAPIC( v ))
+ migrate_timer( &(VLAPIC(v)->vlapic_timer ), v->processor );
}
@@ -860,9 +833,9 @@
/* No support for APIC */
if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
{
- unsigned long inst_len;
- inst_len = svm_instruction_length(v);
- if (inst_len == (unsigned long)-1)
+ int inst_len;
+ inst_len = svm_instruction_length(v);
+ if (inst_len == -1)
{
printf("%s: INST_LEN - Unable to decode properly.\n",
__func__);
domain_crash_synchronous();
@@ -914,6 +887,14 @@
eip = vmcb->rip;
error_code = vmcb->exitinfo1;
+
+ if (vmcb->idtr.limit == 0) {
+ printf("Huh? We got a GP Fault with an invalid IDTR!\n");
+ svm_dump_vmcb(__func__, vmcb);
+ svm_dump_regs(__func__, regs);
+ svm_dump_inst(vmcb->rip);
+ __hvm_bug(regs);
+ }
HVM_DBG_LOG(DBG_LEVEL_1,
"svm_general_protection_fault: eip = %lx, erro_code = %lx",
@@ -927,7 +908,7 @@
/* Reflect it back into the guest */
- svm_inject_exception(vmcb, TRAP_gp_fault, error_code);
+ svm_inject_exception(vmcb, TRAP_gp_fault, 1, error_code);
}
/* Reserved bits: [31:14], [12:1] */
@@ -939,7 +920,7 @@
unsigned int eax, ebx, ecx, edx;
unsigned long eip;
struct vcpu *v = current;
- unsigned int inst_len;
+ int inst_len;
ASSERT(vmcb);
@@ -956,21 +937,29 @@
if (input == 1)
{
+#ifndef __x86_64__
if ( hvm_apic_support(v->domain) &&
!vlapic_global_enabled((VLAPIC(v))) )
+#endif
clear_bit(X86_FEATURE_APIC, &edx);
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS < 3
+ clear_bit(X86_FEATURE_PAE, &edx);
+ clear_bit(X86_FEATURE_PSE, &edx);
+ clear_bit(X86_FEATURE_PSE36, &edx);
+#else
if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-#endif
{
+ if ( !v->domain->arch.hvm_domain.pae_enabled )
+ clear_bit(X86_FEATURE_PAE, &edx);
clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PAE, &edx);
clear_bit(X86_FEATURE_PSE36, &edx);
}
+#endif
/* Clear out reserved bits. */
ecx &= ~SVM_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */
+ clear_bit(X86_FEATURE_MWAIT & 31, &ecx);
}
#ifdef __i386__
else if ( input == 0x80000001 )
@@ -991,6 +980,7 @@
eip, input, eax, ebx, ecx, edx);
inst_len = __get_instruction_length(vmcb, INSTR_CPUID, NULL);
+ ASSERT(inst_len > 0);
__update_guest_eip(vmcb, inst_len);
}
@@ -1083,9 +1073,11 @@
unsigned long *reg_p = 0;
unsigned int gpreg = 0;
unsigned long eip;
- unsigned int inst_len;
+ int inst_len;
+ int index;
struct vmcb_struct *vmcb;
u8 buffer[MAX_INST_LEN];
+ u8 prefix = 0;
vmcb = v->arch.hvm_svm.vmcb;
@@ -1093,13 +1085,15 @@
eip = vmcb->rip;
inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
-
- ASSERT(buffer[0] == 0x0f && (buffer[1] & 0xFD) == 0x21);
-
- gpreg = decode_src_reg(buffer[2]);
-#if DEBUG
- ASSERT(reg == decode_dest_reg(buffer[2]));
-#endif
+ index = skip_prefix_bytes(buffer, sizeof(buffer));
+
+ ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21);
+
+ if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
+ prefix = buffer[index-1];
+
+ gpreg = decode_src_reg(prefix, buffer[index + 2]);
+ ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2]));
HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x",
eip, reg, gpreg);
@@ -1120,6 +1114,7 @@
__hvm_bug(regs);
break;
}
+ ASSERT(inst_len > 0);
__update_guest_eip(vmcb, inst_len);
}
@@ -1335,13 +1330,13 @@
}
}
-
static int svm_set_cr0(unsigned long value)
{
struct vcpu *v = current;
unsigned long mfn;
int paging_enabled;
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+ unsigned long crn;
ASSERT(vmcb);
@@ -1377,7 +1372,7 @@
&v->arch.hvm_svm.cpu_state))
{
HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
- svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+ svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
}
if (test_bit(SVM_CPU_STATE_LME_ENABLED, &v->arch.hvm_svm.cpu_state))
@@ -1386,14 +1381,7 @@
HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
set_bit(SVM_CPU_STATE_LMA_ENABLED,
&v->arch.hvm_svm.cpu_state);
-#if 0
- __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
- vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
- __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-#else
- printk("Cannot yet set SVM_CPU_STATE_LMA_ENABLED\n");
- domain_crash_synchronous();
-#endif
+ vmcb->efer |= (EFER_LMA | EFER_LME);
#if CONFIG_PAGING_LEVELS >= 4
if (!shadow_set_guest_paging_levels(v->domain, 4))
@@ -1404,8 +1392,9 @@
#endif
}
else
+#endif /* __x86_64__ */
{
-#if CONFIG_PAGING_LEVELS >= 4
+#if CONFIG_PAGING_LEVELS >= 3
if (!shadow_set_guest_paging_levels(v->domain, 2))
{
printk("Unsupported guest paging levels\n");
@@ -1414,33 +1403,18 @@
#endif
}
-#if 0
- unsigned long crn;
-
/* update CR4's PAE if needed */
- __vmread(GUEST_CR4, &crn);
+ crn = vmcb->cr4;
if ((!(crn & X86_CR4_PAE))
&& test_bit(SVM_CPU_STATE_PAE_ENABLED,
&v->arch.hvm_svm.cpu_state))
{
HVM_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n");
- __vmwrite(GUEST_CR4, crn | X86_CR4_PAE);
- }
-#else
- printk("Cannot yet set SVM_CPU_STATE_PAE_ENABLED\n");
- domain_crash_synchronous();
-#endif
-#elif defined(__i386__)
- {
- unsigned long old_base_mfn;
- old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
- if (old_base_mfn)
- put_page(mfn_to_page(old_base_mfn));
- }
-#endif
+ vmcb->cr4 |= X86_CR4_PAE;
+ }
/* Now arch.guest_table points to machine physical. */
- v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+ v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
update_pagetables(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
@@ -1461,7 +1435,7 @@
*/
if ((value & X86_CR0_PE) == 0) {
if (value & X86_CR0_PG) {
- svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+ svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
return 0;
}
@@ -1471,7 +1445,6 @@
return 1;
}
-
/*
* Read from control registers. CR0 and CR4 are read from the shadow.
@@ -1497,7 +1470,7 @@
value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
break;
case 4:
- value = vmcb->cr4;
+ value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
break;
case 8:
#if 0
@@ -1579,7 +1552,7 @@
}
old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
- v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+ v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
if (old_base_mfn)
put_page(mfn_to_page(old_base_mfn));
@@ -1596,12 +1569,19 @@
case 4:
/* CR4 */
- if (value & X86_CR4_PAE)
- __hvm_bug(regs); /* not implemented */
-
- old_cr = vmcb->cr4;
-
- vmcb->cr4 = value;
+ if (value & X86_CR4_PAE) {
+ set_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
+ } else {
+ if (test_bit(SVM_CPU_STATE_LMA_ENABLED,
+ &v->arch.hvm_svm.cpu_state)) {
+ svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
+ }
+ clear_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
+ }
+
+ old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
+ v->arch.hvm_svm.cpu_shadow_cr4 = value;
+ vmcb->cr4 = value | SVM_CR4_HOST_MASK;
/*
* Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
@@ -1630,10 +1610,12 @@
struct cpu_user_regs *regs)
{
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
- unsigned int inst_len = 0;
+ int inst_len = 0;
+ int index;
unsigned int gpreg;
unsigned long value;
- u8 buffer[6];
+ u8 buffer[MAX_INST_LEN];
+ u8 prefix = 0;
int result = 1;
enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
@@ -1642,29 +1624,41 @@
ASSERT(vmcb);
inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
+ /* get index to first actual instruction byte - as we will need to know where the
+ * prefix lives later on
+ */
+ index = skip_prefix_bytes(buffer, sizeof(buffer));
if (type == TYPE_MOV_TO_CR)
{
inst_len = __get_instruction_length_from_list(vmcb, list_a,
- ARR_SIZE(list_a), buffer, &match);
+ ARR_SIZE(list_a), &buffer[index], &match);
}
else
{
inst_len = __get_instruction_length_from_list(vmcb, list_b,
- ARR_SIZE(list_b), buffer, &match);
- }
+ ARR_SIZE(list_b), &buffer[index], &match);
+ }
+
+ ASSERT(inst_len > 0);
+
+ inst_len += index;
+
+ /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
+ if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
+ prefix = buffer[index-1];
HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
switch (match)
{
case INSTR_MOV2CR:
- gpreg = decode_src_reg(buffer[2]);
+ gpreg = decode_src_reg(prefix, buffer[index+2]);
result = mov_to_cr(gpreg, cr, regs);
break;
case INSTR_MOVCR2:
- gpreg = decode_src_reg(buffer[2]);
+ gpreg = decode_src_reg(prefix, buffer[index+2]);
mov_from_cr(cr, gpreg, regs);
break;
@@ -1680,7 +1674,7 @@
if (svm_dbg_on)
svm_dump_inst(svm_rip2pointer(vmcb));
- gpreg = decode_src_reg(buffer[2]);
+ gpreg = decode_src_reg(prefix, buffer[index+2]);
value = get_reg(gpreg, regs, vmcb) & 0xF;
if (svm_dbg_on)
@@ -1698,7 +1692,7 @@
case INSTR_SMSW:
svm_dump_inst(svm_rip2pointer(vmcb));
value = v->arch.hvm_svm.cpu_shadow_cr0;
- gpreg = decode_src_reg(buffer[2]);
+ gpreg = decode_src_reg(prefix, buffer[index+2]);
set_reg(gpreg, value, regs, vmcb);
if (svm_dbg_on)
@@ -1721,7 +1715,7 @@
static inline void svm_do_msr_access(struct vcpu *v, struct cpu_user_regs
*regs)
{
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
- unsigned int inst_len;
+ int inst_len;
int64_t tsc_sum;
ASSERT(vmcb);
@@ -1813,7 +1807,9 @@
next_wakeup = next_pit;
if ( next_wakeup != - 1 )
set_timer(¤t->arch.hvm_svm.hlt_timer, next_wakeup);
+/* temporary workaround for 8828/8822 evtchn patches causing SVM failure.
hvm_safe_block();
+*/
}
@@ -1860,7 +1856,7 @@
struct vcpu *v = current;
u8 opcode[MAX_INST_SIZE], prefix, length = MAX_INST_SIZE;
unsigned long g_vaddr;
- unsigned int inst_len;
+ int inst_len;
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
ASSERT(vmcb);
@@ -1877,6 +1873,7 @@
if (invlpga)
{
inst_len = __get_instruction_length(vmcb, INSTR_INVLPGA, opcode);
+ ASSERT(inst_len > 0);
__update_guest_eip(vmcb, inst_len);
/*
@@ -1890,6 +1887,7 @@
/* What about multiple prefix codes? */
prefix = (is_prefix(opcode[0])?opcode[0]:0);
inst_len = __get_instruction_length(vmcb, INSTR_INVLPG, opcode);
+ ASSERT(inst_len > 0);
inst_len--;
length -= inst_len;
@@ -1941,7 +1939,10 @@
v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
vmcb->cr2 = 0;
- vmcb->cr4 = 0;
+ vmcb->efer = EFER_SVME;
+
+ vmcb->cr4 = SVM_CR4_HOST_MASK;
+ v->arch.hvm_svm.cpu_shadow_cr4 = 0;
/* This will jump to ROMBIOS */
vmcb->rip = 0xFFF0;
@@ -2011,12 +2012,13 @@
static int svm_do_vmmcall(struct vcpu *v, struct cpu_user_regs *regs)
{
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
- unsigned int inst_len;
+ int inst_len;
ASSERT(vmcb);
ASSERT(regs);
inst_len = __get_instruction_length(vmcb, INSTR_VMCALL, NULL);
+ ASSERT(inst_len > 0);
/* VMMCALL sanity check */
if (vmcb->cpl > get_vmmcall_cpl(regs->edi))
@@ -2470,7 +2472,7 @@
{
v->arch.hvm_svm.injecting_event = 1;
/* Inject #PG using Interruption-Information Fields */
- svm_inject_exception(vmcb, TRAP_page_fault, regs.error_code);
+ svm_inject_exception(vmcb, TRAP_page_fault, 1, regs.error_code);
v->arch.hvm_svm.cpu_cr2 = va;
vmcb->cr2 = va;
@@ -2665,26 +2667,23 @@
{
struct vcpu *v = current;
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
- int core = smp_processor_id();
- int oldcore = v->arch.hvm_svm.core;
- /*
- * if need to assign new asid or if switching cores,
- * then retire asid for old core, and assign new for new core.
- */
- if( v->arch.hvm_svm.core != core ) {
- if (svm_dbg_on)
- printk("old core %d new core
%d\n",(int)v->arch.hvm_svm.core,(int)core);
- v->arch.hvm_svm.core = core;
- }
- if( test_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags) ||
- (oldcore != core)) {
- if(!asidpool_assign_next(vmcb, 1,
- oldcore, core)) {
+
+ /*
+ * if need to assign new asid, or if switching cores,
+ * retire asid for the old core, and assign a new asid to the current core.
+ */
+ if ( test_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags ) ||
+ ( v->arch.hvm_svm.asid_core != v->arch.hvm_svm.launch_core )) {
+ /* recycle asid */
+ if ( !asidpool_assign_next( vmcb, 1,
+ v->arch.hvm_svm.asid_core, v->arch.hvm_svm.launch_core )) {
/* If we get here, we have a major problem */
domain_crash_synchronous();
}
- }
- clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
+
+ v->arch.hvm_svm.asid_core = v->arch.hvm_svm.launch_core;
+ clear_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags );
+ }
}
/*
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/vmcb.c Wed Mar 1 19:47:25 2006
@@ -190,7 +190,6 @@
unsigned long eflags;
unsigned long shadow_cr;
struct vmcb_struct *vmcb = arch_svm->vmcb;
- struct Xgt_desc_struct desc;
/* Allows IRQs to be shared */
vmcb->vintr.fields.intr_masking = 1;
@@ -224,9 +223,9 @@
vmcb->fs.base = 0;
vmcb->gs.base = 0;
- __asm__ __volatile__ ("sidt (%0) \n" :: "a"(&desc) : "memory");
- vmcb->idtr.base = desc.address;
- vmcb->idtr.limit = desc.size;
+ /* Guest Interrupt descriptor table */
+ vmcb->idtr.base = 0;
+ vmcb->idtr.limit = 0;
/* Set up segment attributes */
attrib.bytes = 0;
@@ -248,15 +247,11 @@
attrib.fields.type = 0xb; /* type=0xb -> executable/readable, accessed */
vmcb->cs.attributes = attrib;
- /* Global descriptor table */
- //NMERGE7500 - can probably remove access to gdtr
- vmcb->gdtr.base = regs->edx;
- regs->edx = 0;
- ASSERT(regs->eax <= 0xFFFF); /* Make sure we're in the limit */
- vmcb->gdtr.limit = regs->eax;
- regs->eax = 0;
-
- /* Local Descriptor Table */
+ /* Guest Global descriptor table */
+ vmcb->gdtr.base = 0;
+ vmcb->gdtr.limit = 0;
+
+ /* Guest Local Descriptor Table */
attrib.fields.s = 0; /* not code or data segment */
attrib.fields.type = 0x2; /* LDT */
attrib.fields.db = 0; /* 16-bit */
@@ -279,11 +274,10 @@
/* CR3 is set in svm_final_setup_guest */
__asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) :);
- shadow_cr = crn;
- vmcb->cr4 = shadow_cr;
-
-//MERGE7500 - should write a 0 instead to rsp?
- vmcb->rsp = regs->esp;
+ arch_svm->cpu_shadow_cr4 = crn & ~(X86_CR4_PGE | X86_CR4_PSE);
+ vmcb->cr4 = crn | SVM_CR4_HOST_MASK;
+
+ vmcb->rsp = 0;
vmcb->rip = regs->eip;
eflags = regs->eflags & ~HVM_EFLAGS_RESERVED_0; /* clear 0s */
@@ -306,7 +300,7 @@
{
if(arch_svm->vmcb != NULL)
{
- asidpool_retire(arch_svm->vmcb, arch_svm->core);
+ asidpool_retire(arch_svm->vmcb, arch_svm->asid_core);
free_vmcb(arch_svm->vmcb);
}
if(arch_svm->iopm != NULL) {
@@ -404,18 +398,17 @@
void svm_do_launch(struct vcpu *v)
{
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+ int core = smp_processor_id();
+ ASSERT(vmcb);
+
/* Update CR3, GDT, LDT, TR */
- struct vmcb_struct *vmcb;
- int core = smp_processor_id();
- vmcb = v->arch.hvm_svm.vmcb;
- ASSERT(vmcb);
-
svm_stts(v);
- /* current core is the one we will perform the vmrun on */
- v->arch.hvm_svm.core = core;
+ /* current core is the one we intend to perform the VMRUN on */
+ v->arch.hvm_svm.launch_core = v->arch.hvm_svm.asid_core = core;
clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- if ( !asidpool_assign_next(vmcb, 0, core, core) )
+ if ( !asidpool_assign_next( vmcb, 0, core, core ))
BUG();
if (v->vcpu_id == 0)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/x86_64/exits.S
--- a/xen/arch/x86/hvm/svm/x86_64/exits.S Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/x86_64/exits.S Wed Mar 1 19:47:25 2006
@@ -107,8 +107,6 @@
movq %rax, VMCB_rax(%rcx)
movq VCPU_svm_hsa_pa(%rbx), %rax
VMSAVE
- /* XXX FPU SAVE */
- /* XXX DO TSC OFFSET */
movq VCPU_svm_vmcb_pa(%rbx), %rax
popq %r15
@@ -137,9 +135,7 @@
VMSAVE
/* rax is the only register we're allowed to touch here... */
- /* XXX FPU SAVE */
GET_CURRENT(%rax)
- /* XXX DO TSC OFFSET */
movq VCPU_svm_hsa_pa(%rax), %rax
VMLOAD
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vlapic.c Wed Mar 1 19:47:25 2006
@@ -225,27 +225,35 @@
break;
case VLAPIC_DELIV_MODE_INIT:
- if (!level && trig_mode == 1) { //Deassert
+ if ( !level && trig_mode == 1 ) { //Deassert
printk("This hvm_vlapic is for P4, no work for De-assert init\n");
} else {
/* FIXME How to check the situation after vcpu reset? */
- vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
- if (vlapic->vcpu) {
- vcpu_pause(vlapic->vcpu);
+ if ( test_and_clear_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
+ printk("Reset hvm vcpu not supported yet\n");
+ domain_crash_synchronous();
}
+ v->arch.hvm_vcpu.init_sipi_sipi_state =
+ HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
+ result = 1;
}
break;
case VLAPIC_DELIV_MODE_STARTUP:
- if (vlapic->init_sipi_sipi_state != VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI)
+ if ( v->arch.hvm_vcpu.init_sipi_sipi_state ==
+ HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM )
break;
- vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_NORM;
- if (!vlapic->vcpu) {
- /* XXX Call hvm_bringup_ap here */
- result = 0;
- }else{
- //hvm_vcpu_reset(vlapic->vcpu);
- }
+
+ v->arch.hvm_vcpu.init_sipi_sipi_state =
+ HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM;
+
+ if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
+ printk("SIPI for initialized vcpu vcpuid %x\n", v->vcpu_id);
+ domain_crash_synchronous();
+ }
+
+ if ( hvm_bringup_ap(v->vcpu_id, vector) != 0 )
+ result = 0;
break;
default:
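The reworked delivery logic above amounts to a small per-VCPU state machine; a sketch of the transitions it implements:

    /*
     * NORM --INIT--> WAIT_SIPI --SIPI--> NORM (hvm_bringup_ap called)
     *
     * INIT to an already-initialised VCPU, or SIPI to one, currently
     * crashes the domain: full VCPU reset is not yet supported.
     */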
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/io.c
--- a/xen/arch/x86/hvm/vmx/io.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vmx/io.c Wed Mar 1 19:47:25 2006
@@ -113,13 +113,15 @@
struct hvm_virpit *vpit = &plat->vpit;
struct hvm_virpic *pic= &plat->vpic;
- hvm_pic_assist(v);
- __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
- if ( vpit->pending_intr_nr ) {
+ if ( v->vcpu_id == 0 )
+ hvm_pic_assist(v);
+
+ if ( (v->vcpu_id == 0) && vpit->pending_intr_nr ) {
pic_set_irq(pic, 0, 0);
pic_set_irq(pic, 0, 1);
}
+ __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
__vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields);
if (intr_fields & INTR_INFO_VALID_MASK) {
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Mar 1 19:47:25 2006
@@ -448,6 +448,37 @@
return 0; /* dummy */
}
+/* SMP VMX guest support */
+void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
+ int vcpuid, int trampoline_vector)
+{
+ int i;
+
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ /*
+ * Initial register values:
+ */
+ ctxt->user_regs.eip = VMXASSIST_BASE;
+ ctxt->user_regs.edx = vcpuid;
+ ctxt->user_regs.ebx = trampoline_vector;
+
+ ctxt->flags = VGCF_HVM_GUEST;
+
+ /* Virtual IDT is empty at start-of-day. */
+ for ( i = 0; i < 256; i++ )
+ {
+ ctxt->trap_ctxt[i].vector = i;
+ ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
+ }
+
+ /* No callback handlers. */
+#if defined(__i386__)
+ ctxt->event_callback_cs = FLAT_KERNEL_CS;
+ ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
+#endif
+}
+
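vmx_init_ap_context() fixes the register contract for application-processor bring-up: a secondary VCPU started via hvm_bringup_ap() begins execution as sketched below (VMXASSIST_BASE is the vmxassist entry point defined elsewhere in the tree):

    /* AP start-of-day state (illustrative):
     *   %eip = VMXASSIST_BASE      -- enters the vmxassist monitor
     *   %edx = vcpuid              -- which VCPU this is
     *   %ebx = trampoline_vector   -- SIPI vector to continue from
     * All other registers are zero, and every virtual IDT vector
     * points at FLAT_KERNEL_CS.
     */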
void do_nmi(struct cpu_user_regs *);
static int check_vmx_controls(ctrls, msr)
@@ -544,6 +575,8 @@
hvm_funcs.paging_enabled = vmx_paging_enabled;
hvm_funcs.instruction_length = vmx_instruction_length;
hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
+
+ hvm_funcs.init_ap_context = vmx_init_ap_context;
hvm_enabled = 1;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/mm.c Wed Mar 1 19:47:25 2006
@@ -97,11 +97,11 @@
#include <xen/domain_page.h>
#include <xen/event.h>
#include <xen/iocap.h>
+#include <xen/guest_access.h>
#include <asm/shadow.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/io.h>
-#include <asm/uaccess.h>
#include <asm/ldt.h>
#include <asm/x86_emulate.h>
#include <public/memory.h>
@@ -475,7 +475,8 @@
{
MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
" for dom%d",
- mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id);
+ mfn, get_gpfn_from_mfn(mfn),
+ l1e_get_intpte(l1e), d->domain_id);
}
return okay;
@@ -515,7 +516,6 @@
#if CONFIG_PAGING_LEVELS >= 3
-
static int
get_page_from_l3e(
l3_pgentry_t l3e, unsigned long pfn,
@@ -545,11 +545,9 @@
#endif
return rc;
}
-
#endif /* 3 level */
#if CONFIG_PAGING_LEVELS >= 4
-
static int
get_page_from_l4e(
l4_pgentry_t l4e, unsigned long pfn,
@@ -579,7 +577,6 @@
return rc;
}
-
#endif /* 4 level */
@@ -649,27 +646,22 @@
#if CONFIG_PAGING_LEVELS >= 3
-
static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
{
if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
(l3e_get_pfn(l3e) != pfn) )
put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
}
-
#endif
#if CONFIG_PAGING_LEVELS >= 4
-
static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
{
if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
(l4e_get_pfn(l4e) != pfn) )
put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
}
-
#endif
-
static int alloc_l1_table(struct page_info *page)
{
@@ -1569,43 +1561,71 @@
int okay;
unsigned long old_base_mfn;
+ ASSERT(writable_pagetable_in_sync(d));
+
if ( shadow_mode_refcounts(d) )
+ {
okay = get_page_from_pagenr(mfn, d);
+ if ( unlikely(!okay) )
+ {
+ MEM_LOG("Error while installing new baseptr %lx", mfn);
+ return 0;
+ }
+ }
else
+ {
okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
-
- if ( likely(okay) )
- {
- invalidate_shadow_ldt(v);
-
- old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
- v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
- update_pagetables(v); /* update shadow_table and monitor_table */
-
- write_ptbase(v);
-
+ if ( unlikely(!okay) )
+ {
+ /* Switch to idle pagetable: this VCPU has no active p.t. now. */
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+ v->arch.guest_table = mk_pagetable(0);
+ update_pagetables(v);
+ write_cr3(__pa(idle_pg_table));
+ if ( old_base_mfn != 0 )
+ put_page_and_type(mfn_to_page(old_base_mfn));
+
+ /* Retry the validation with no active p.t. for this VCPU. */
+ okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+ if ( !okay )
+ {
+ /* Failure here is unrecoverable: the VCPU has no pagetable! */
+ MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
+ domain_crash(d);
+ percpu_info[v->processor].deferred_ops = 0;
+ return 0;
+ }
+ }
+ }
+
+ invalidate_shadow_ldt(v);
+
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+ v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+ update_pagetables(v); /* update shadow_table and monitor_table */
+
+ write_ptbase(v);
+
+ if ( likely(old_base_mfn != 0) )
+ {
if ( shadow_mode_refcounts(d) )
put_page(mfn_to_page(old_base_mfn));
else
put_page_and_type(mfn_to_page(old_base_mfn));
-
- /* CR3 also holds a ref to its shadow... */
- if ( shadow_mode_enabled(d) )
- {
- if ( v->arch.monitor_shadow_ref )
- put_shadow_ref(v->arch.monitor_shadow_ref);
- v->arch.monitor_shadow_ref =
- pagetable_get_pfn(v->arch.monitor_table);
- ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
- get_shadow_ref(v->arch.monitor_shadow_ref);
- }
- }
- else
- {
- MEM_LOG("Error while installing new baseptr %lx", mfn);
- }
-
- return okay;
+ }
+
+ /* CR3 also holds a ref to its shadow... */
+ if ( shadow_mode_enabled(d) )
+ {
+ if ( v->arch.monitor_shadow_ref )
+ put_shadow_ref(v->arch.monitor_shadow_ref);
+ v->arch.monitor_shadow_ref =
+ pagetable_get_pfn(v->arch.monitor_table);
+ ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
+ get_shadow_ref(v->arch.monitor_shadow_ref);
+ }
+
+ return 1;
}
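The rewritten new_guest_cr3() above gains a second-chance path: if validating the new base fails while the old page table is still installed, the VCPU first detaches onto the idle page table and retries, since the old table may be the very thing pinning the new one's type. A condensed sketch of that path, with names as in the patch:

    okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
    if ( !okay )
    {
        /* Detach: run on the idle page table and drop the old refs. */
        v->arch.guest_table = mk_pagetable(0);
        update_pagetables(v);
        write_cr3(__pa(idle_pg_table));
        if ( old_base_mfn != 0 )
            put_page_and_type(mfn_to_page(old_base_mfn));

        /* Retry with no active page table; failure now is fatal. */
        if ( !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d) )
            domain_crash(d);
    }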
static void process_deferred_ops(unsigned int cpu)
@@ -1625,7 +1645,7 @@
else
local_flush_tlb();
}
-
+
if ( deferred_ops & DOP_RELOAD_LDT )
(void)map_ldt_shadow_page(0);
@@ -1752,9 +1772,9 @@
{
if ( hypercall_preempt_check() )
{
- rc = hypercall4_create_continuation(
- __HYPERVISOR_mmuext_op, uops,
- (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "pipi",
+ uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
break;
}
@@ -2018,9 +2038,9 @@
{
if ( hypercall_preempt_check() )
{
- rc = hypercall4_create_continuation(
- __HYPERVISOR_mmu_update, ureqs,
- (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "pipi",
+ ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
break;
}
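Both conversions above move from the fixed-arity hypercallN_create_continuation() helpers to a single varargs helper whose format string types each argument; judging from the call sites in this changeset, 'i' is an int-sized value, 'l' a long, 'p' a kernel pointer and 'h' a guest handle. So the call above reads:

    /* "pipi" == pointer, int, pointer, int: */
    rc = hypercall_create_continuation(
        __HYPERVISOR_mmu_update, "pipi",
        ureqs,                              /* p */
        (count - i) | MMU_UPDATE_PREEMPTED, /* i */
        pdone,                              /* p */
        foreigndom);                        /* i */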
@@ -2769,7 +2789,7 @@
}
-long arch_memory_op(int op, void *arg)
+long arch_memory_op(int op, GUEST_HANDLE(void) arg)
{
struct xen_reserved_phys_area xrpa;
unsigned long pfn;
@@ -2779,7 +2799,7 @@
switch ( op )
{
case XENMEM_reserved_phys_area:
- if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) )
+ if ( copy_from_guest(&xrpa, arg, 1) )
return -EFAULT;
/* No guest has more than one reserved area. */
@@ -2813,7 +2833,7 @@
put_domain(d);
- if ( copy_to_user(arg, &xrpa, sizeof(xrpa)) )
+ if ( copy_to_guest(arg, &xrpa, 1) )
return -EFAULT;
break;
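The copy_from_guest()/copy_to_guest() pair used above counts in elements rather than bytes; the element type comes from the destination and the typed guest handle, so sizeof() disappears from call sites. A minimal before/after sketch:

    /* old: byte-counted, raw pointer */
    if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) )
        return -EFAULT;

    /* new: element-counted, typed guest handle */
    if ( copy_from_guest(&xrpa, arg, 1) )   /* 1 element of xrpa's type */
        return -EFAULT;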
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/setup.c Wed Mar 1 19:47:25 2006
@@ -144,6 +144,20 @@
static struct e820entry e820_raw[E820MAX];
+static unsigned long initial_images_start, initial_images_end;
+
+unsigned long initial_images_nrpages(void)
+{
+ unsigned long s = initial_images_start + PAGE_SIZE - 1;
+ unsigned long e = initial_images_end;
+ return ((e >> PAGE_SHIFT) - (s >> PAGE_SHIFT));
+}
+
+void discard_initial_images(void)
+{
+ init_domheap_pages(initial_images_start, initial_images_end);
+}
+
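initial_images_nrpages() rounds inward, so only pages wholly inside the image range are counted. For example, with 4KiB pages, start = 0x1800 and end = 0x4000:

    /* s = (0x1800 + 0xFFF) >> 12 = 2,  e = 0x4000 >> 12 = 4
     * => 2 whole pages (0x2000-0x2FFF and 0x3000-0x3FFF) are counted. */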
void __init __start_xen(multiboot_info_t *mbi)
{
char *cmdline;
@@ -152,7 +166,6 @@
unsigned int initrdidx = 1;
module_t *mod = (module_t *)__va(mbi->mods_addr);
unsigned long nr_pages, modules_length;
- unsigned long initial_images_start, initial_images_end;
paddr_t s, e;
int i, e820_warn = 0, e820_raw_nr = 0, bytes = 0;
struct ns16550_defaults ns16550 = {
@@ -437,11 +450,7 @@
set_in_cr4(X86_CR4_OSXMMEXCPT);
if ( opt_nosmp )
- {
max_cpus = 0;
- smp_num_siblings = 1;
- boot_cpu_data.x86_max_cores = 1;
- }
smp_prepare_cpus(max_cpus);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow32.c
--- a/xen/arch/x86/shadow32.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/shadow32.c Wed Mar 1 19:47:25 2006
@@ -43,7 +43,8 @@
static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
#endif
-static void free_p2m_table(struct vcpu *v);
+static int alloc_p2m_table(struct domain *d);
+static void free_p2m_table(struct domain *d);
/********
@@ -739,7 +740,7 @@
mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
memset(mpl2e, 0, PAGE_SIZE);
- memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
&idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
@@ -760,6 +761,23 @@
if ( v->vcpu_id == 0 )
alloc_p2m_table(d);
+ else
+ {
+ unsigned long mfn;
+
+ mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+ if ( mfn )
+ {
+ l2_pgentry_t *l2tab;
+
+ l2tab = map_domain_page(mfn);
+
+ mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+ l2tab[l2_table_offset(RO_MPT_VIRT_START)];
+
+ unmap_domain_page(l2tab);
+ }
+ }
}
/*
@@ -771,7 +789,7 @@
unsigned long mfn;
ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
-
+
mpl2e = v->arch.monitor_vtable;
/*
@@ -794,7 +812,7 @@
}
if ( v->vcpu_id == 0 )
- free_p2m_table(v);
+ free_p2m_table(v->domain);
/*
* Then free monitor_table.
@@ -808,8 +826,8 @@
}
static int
-map_p2m_entry(
- l1_pgentry_t *l1tab, unsigned long va, unsigned long gpa, unsigned long mfn)
+map_p2m_entry(l1_pgentry_t *l1tab, unsigned long va,
+ unsigned long gpa, unsigned long mfn)
{
unsigned long *l0tab = NULL;
l1_pgentry_t l1e = { 0 };
@@ -820,27 +838,22 @@
{
page = alloc_domheap_page(NULL);
if ( !page )
- goto fail;
-
- if ( l0tab )
- unmap_domain_page(l0tab);
+ return 0;
+
l0tab = map_domain_page(page_to_mfn(page));
- memset(l0tab, 0, PAGE_SIZE );
+ memset(l0tab, 0, PAGE_SIZE);
+
l1e = l1tab[l1_table_offset(va)] =
l1e_from_page(page, __PAGE_HYPERVISOR);
}
- else if ( l0tab == NULL)
+ else
l0tab = map_domain_page(l1e_get_pfn(l1e));
- l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
-
- if ( l0tab )
- unmap_domain_page(l0tab);
+ l0tab[gpa & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
+
+ unmap_domain_page(l0tab);
return 1;
-
-fail:
- return 0;
}
int
@@ -853,7 +866,6 @@
l1_pgentry_t *l1;
struct page_info *l1page;
unsigned long va = pfn << PAGE_SHIFT;
- int error;
if ( shadow_mode_external(d) )
{
@@ -877,6 +889,7 @@
if ( shadow_mode_external(d) )
{
+ int error;
l1_pgentry_t *l1tab = NULL;
l2_pgentry_t l2e;
@@ -885,14 +898,13 @@
ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT );
l1tab = map_domain_page(l2e_get_pfn(l2e));
- error = map_p2m_entry(l1tab, va, pfn, mfn);
- if ( !error )
- domain_crash_synchronous();
+ if ( !(error = map_p2m_entry(l1tab, va, pfn, mfn)) )
+ domain_crash(d);
unmap_domain_page(l1tab);
unmap_domain_page_with_cache(l2, l2cache);
- return 1;
+ return error;
}
/*
@@ -926,7 +938,7 @@
return 1;
}
-int
+static int
alloc_p2m_table(struct domain *d)
{
struct list_head *list_ent;
@@ -937,7 +949,7 @@
l2_pgentry_t l2e = { 0 };
struct page_info *page;
unsigned long gpfn, mfn;
- int error;
+ int error = 0;
if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) )
{
@@ -955,6 +967,9 @@
}
else
l1tab = map_domain_page(l2e_get_pfn(l2e));
+
+ if ( l2tab )
+ unmap_domain_page(l2tab);
}
else
{
@@ -972,23 +987,23 @@
page = list_entry(list_ent, struct page_info, list);
mfn = page_to_mfn(page);
- error = map_p2m_entry(l1tab, va, gpfn, mfn);
- if ( !error )
- domain_crash_synchronous();
+ if ( !(error = map_p2m_entry(l1tab, va, gpfn, mfn)) )
+ {
+ domain_crash(d);
+ break;
+ }
list_ent = frame_table[mfn].list.next;
va += sizeof(mfn);
}
- if (l2tab)
- unmap_domain_page(l2tab);
unmap_domain_page(l1tab);
- return 1;
-}
-
-static void
-free_p2m_table(struct vcpu *v)
+ return error;
+}
+
+static void
+free_p2m_table(struct domain *d)
{
unsigned long va;
l2_pgentry_t *l2tab;
@@ -996,10 +1011,10 @@
l2_pgentry_t l2e;
l1_pgentry_t l1e;
- ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
+ ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
l2tab = map_domain_page(
- pagetable_get_pfn(v->arch.monitor_table));
+ pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
{
@@ -1015,11 +1030,13 @@
if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
- va += PAGE_SIZE;
+ va += PAGE_SIZE;
}
unmap_domain_page(l1tab);
free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
}
+ else
+ va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
}
unmap_domain_page(l2tab);
}
@@ -1246,7 +1263,7 @@
if ( shadow_mode_refcounts(d) )
{
- struct list_head *list_ent;
+ struct list_head *list_ent;
struct page_info *page;
/*
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow_public.c
--- a/xen/arch/x86/shadow_public.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/shadow_public.c Wed Mar 1 19:47:25 2006
@@ -31,7 +31,8 @@
#include <xen/trace.h>
#include <asm/shadow_64.h>
-static void free_p2m_table(struct vcpu *v);
+static int alloc_p2m_table(struct domain *d);
+static void free_p2m_table(struct domain *d);
#define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
@@ -328,6 +329,23 @@
if ( v->vcpu_id == 0 )
alloc_p2m_table(d);
+ else
+ {
+ unsigned long mfn;
+
+ mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+ if ( mfn )
+ {
+ l4_pgentry_t *l4tab;
+
+ l4tab = map_domain_page(mfn);
+
+ mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)];
+
+ unmap_domain_page(l4tab);
+ }
+ }
}
void free_monitor_pagetable(struct vcpu *v)
@@ -338,7 +356,7 @@
* free monitor_table.
*/
if ( v->vcpu_id == 0 )
- free_p2m_table(v);
+ free_p2m_table(v->domain);
/*
* Then free monitor_table.
@@ -397,13 +415,49 @@
l2e_empty();
mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty();
- unmap_domain_page(mpl2e);
-
v->arch.monitor_table = mk_pagetable(m3mfn << PAGE_SHIFT); /* < 4GB */
v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
if ( v->vcpu_id == 0 )
alloc_p2m_table(d);
+ else
+ {
+ unsigned long mfn;
+
+ mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+ if ( mfn )
+ {
+ l3_pgentry_t *l3tab, l3e;
+ l2_pgentry_t *l2tab;
+
+ l3tab = map_domain_page(mfn);
+ l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
+
+ /*
+ * NB: when CONFIG_PAGING_LEVELS == 3,
+ * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+ * alloc_monitor_pagetable should guarantee this.
+ */
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+ BUG();
+
+ l2tab = map_domain_page(l3e_get_pfn(l3e));
+
+ /*
+ * Just one l2 slot is used here, so at most 2M for p2m table:
+ * ((4K * 512)/sizeof(unsigned long)) * 4K = 2G
+ * should be OK on PAE xen, since Qemu DM can only map 1.5G VMX
+ * guest memory.
+ */
+ mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+ l2tab[l2_table_offset(RO_MPT_VIRT_START)];
+
+ unmap_domain_page(l2tab);
+ unmap_domain_page(l3tab);
+ }
+ }
+
+ unmap_domain_page(mpl2e);
}
void free_monitor_pagetable(struct vcpu *v)
@@ -413,7 +467,7 @@
* free monitor_table.
*/
if ( v->vcpu_id == 0 )
- free_p2m_table(v);
+ free_p2m_table(v->domain);
m3mfn = pagetable_get_pfn(v->arch.monitor_table);
m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
@@ -1348,14 +1402,14 @@
}
static int
-map_p2m_entry(
- pgentry_64_t *top_tab, unsigned long va, unsigned long gpa, unsigned long mfn)
+map_p2m_entry(pgentry_64_t *top_tab, unsigned long va,
+ unsigned long gpfn, unsigned long mfn)
{
#if CONFIG_PAGING_LEVELS >= 4
pgentry_64_t l4e = { 0 };
+ pgentry_64_t *l3tab = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
- pgentry_64_t *l3tab = NULL;
pgentry_64_t l3e = { 0 };
#endif
l2_pgentry_t *l2tab = NULL;
@@ -1367,7 +1421,7 @@
#if CONFIG_PAGING_LEVELS >= 4
l4e = top_tab[l4_table_offset(va)];
- if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
+ if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
{
page = alloc_domheap_page(NULL);
if ( !page )
@@ -1375,17 +1429,14 @@
l3tab = map_domain_page(page_to_mfn(page));
memset(l3tab, 0, PAGE_SIZE);
- l4e = top_tab[l4_table_offset(va)] =
+ l4e = top_tab[l4_table_offset(va)] =
entry_from_page(page, __PAGE_HYPERVISOR);
- }
- else if ( l3tab == NULL)
+ }
+ else
l3tab = map_domain_page(entry_get_pfn(l4e));
l3e = l3tab[l3_table_offset(va)];
-#else
- l3e = top_tab[l3_table_offset(va)];
-#endif
- if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
+ if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
{
page = alloc_domheap_page(NULL);
if ( !page )
@@ -1393,14 +1444,29 @@
l2tab = map_domain_page(page_to_mfn(page));
memset(l2tab, 0, PAGE_SIZE);
- l3e = l3tab[l3_table_offset(va)] =
+ l3e = l3tab[l3_table_offset(va)] =
entry_from_page(page, __PAGE_HYPERVISOR);
- }
- else if ( l2tab == NULL)
+ }
+ else
l2tab = map_domain_page(entry_get_pfn(l3e));
+ unmap_domain_page(l3tab);
+#else
+ l3e = top_tab[l3_table_offset(va)];
+
+ /*
+ * NB: when CONFIG_PAGING_LEVELS == 3,
+ * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+ * alloc_monitor_pagetable should guarantee this.
+ */
+ if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
+ BUG();
+
+ l2tab = map_domain_page(entry_get_pfn(l3e));
+#endif
+
l2e = l2tab[l2_table_offset(va)];
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
{
page = alloc_domheap_page(NULL);
if ( !page )
@@ -1408,14 +1474,16 @@
l1tab = map_domain_page(page_to_mfn(page));
memset(l1tab, 0, PAGE_SIZE);
- l2e = l2tab[l2_table_offset(va)] =
+ l2e = l2tab[l2_table_offset(va)] =
l2e_from_page(page, __PAGE_HYPERVISOR);
- }
- else if ( l1tab == NULL)
+ }
+ else
l1tab = map_domain_page(l2e_get_pfn(l2e));
+ unmap_domain_page(l2tab);
+
l1e = l1tab[l1_table_offset(va)];
- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
+ if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
{
page = alloc_domheap_page(NULL);
if ( !page )
@@ -1423,96 +1491,88 @@
l0tab = map_domain_page(page_to_mfn(page));
memset(l0tab, 0, PAGE_SIZE);
- l1e = l1tab[l1_table_offset(va)] =
+ l1e = l1tab[l1_table_offset(va)] =
l1e_from_page(page, __PAGE_HYPERVISOR);
}
- else if ( l0tab == NULL)
+ else
l0tab = map_domain_page(l1e_get_pfn(l1e));
- l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
-
- if ( l2tab )
- {
- unmap_domain_page(l2tab);
- l2tab = NULL;
- }
- if ( l1tab )
- {
- unmap_domain_page(l1tab);
- l1tab = NULL;
- }
- if ( l0tab )
- {
- unmap_domain_page(l0tab);
- l0tab = NULL;
- }
+ unmap_domain_page(l1tab);
+
+ l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
+
+ unmap_domain_page(l0tab);
return 1;
nomem:
-
return 0;
}
int
-set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
+set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
struct domain_mmap_cache *l2cache,
struct domain_mmap_cache *l1cache)
{
- unsigned long tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- pgentry_64_t *top;
- unsigned long va = RO_MPT_VIRT_START + (pfn * sizeof (unsigned long));
+ unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+ unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(unsigned long));
+ pgentry_64_t *top_tab;
int error;
- ASSERT(tabpfn != 0);
+ ASSERT(tabmfn != 0);
ASSERT(shadow_lock_is_acquired(d));
- top = map_domain_page_with_cache(tabpfn, l2cache);
- error = map_p2m_entry(top, va, pfn, mfn);
- unmap_domain_page_with_cache(top, l2cache);
-
- if ( !error )
- domain_crash_synchronous();
-
- return 1;
-}
-
-int
+ top_tab = map_domain_page_with_cache(tabmfn, l2cache);
+
+ if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
+ domain_crash(d);
+
+ unmap_domain_page_with_cache(top_tab, l2cache);
+
+ return error;
+}
+
+static int
alloc_p2m_table(struct domain *d)
{
struct list_head *list_ent;
unsigned long va = RO_MPT_VIRT_START; /* phys_to_machine_mapping */
pgentry_64_t *top_tab = NULL;
unsigned long mfn;
- int gpa;
-
- ASSERT ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
+ int gpfn, error = 0;
+
+ ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
top_tab = map_domain_page(
pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-
list_ent = d->page_list.next;
- for ( gpa = 0; list_ent != &d->page_list; gpa++ )
+ for ( gpfn = 0; list_ent != &d->page_list; gpfn++ )
{
struct page_info *page;
+
page = list_entry(list_ent, struct page_info, list);
mfn = page_to_mfn(page);
- map_p2m_entry(top_tab, va, gpa, mfn);
+ if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
+ {
+ domain_crash(d);
+ break;
+ }
+
list_ent = frame_table[mfn].list.next;
va += sizeof(mfn);
}
unmap_domain_page(top_tab);
- return 1;
+ return error;
}
#if CONFIG_PAGING_LEVELS >= 3
static void
-free_p2m_table(struct vcpu *v)
+free_p2m_table(struct domain *d)
{
unsigned long va;
l1_pgentry_t *l1tab;
@@ -1520,27 +1580,35 @@
l2_pgentry_t *l2tab;
l2_pgentry_t l2e;
#if CONFIG_PAGING_LEVELS >= 3
- l3_pgentry_t *l3tab;
+ l3_pgentry_t *l3tab;
l3_pgentry_t l3e;
#endif
#if CONFIG_PAGING_LEVELS == 4
int i3;
- l4_pgentry_t *l4tab;
+ l4_pgentry_t *l4tab;
l4_pgentry_t l4e;
#endif
- ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
+ ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
#if CONFIG_PAGING_LEVELS == 4
l4tab = map_domain_page(
- pagetable_get_pfn(v->arch.monitor_table));
+ pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
#endif
#if CONFIG_PAGING_LEVELS == 3
l3tab = map_domain_page(
- pagetable_get_pfn(v->arch.monitor_table));
-
- va = RO_MPT_VIRT_START;
- l3e = l3tab[l3_table_offset(va)];
+ pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
+
+ l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
+
+ /*
+ * NB: when CONFIG_PAGING_LEVELS == 3,
+ * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+ * alloc_monitor_pagetable should guarantee this.
+ */
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+ BUG();
+
l2tab = map_domain_page(l3e_get_pfn(l3e));
#endif
@@ -1555,8 +1623,8 @@
for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
{
-
l3e = l3tab[l3_table_offset(va)];
+
if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
{
int i2;
@@ -1567,12 +1635,13 @@
{
#endif
l2e = l2tab[l2_table_offset(va)];
+
if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
{
int i1;
l1tab = map_domain_page(l2e_get_pfn(l2e));
-
+
/*
* unsigned long phys_to_machine_mapping[]
*/
@@ -1591,7 +1660,7 @@
else
va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
-#if CONFIG_PAGING_LEVELS == 4
+#if CONFIG_PAGING_LEVELS == 4
}
unmap_domain_page(l2tab);
free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
@@ -1603,7 +1672,7 @@
free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
}
else
- va += PAGE_SIZE *
+ va += PAGE_SIZE *
L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES *
L3_PAGETABLE_ENTRIES;
#endif
}
@@ -1622,7 +1691,7 @@
paddr_t pa, l1_pgentry_t gpte,
struct domain_mmap_cache *cache)
{
- unsigned long sl1mfn;
+ unsigned long sl1mfn;
l1_pgentry_t *spl1e, spte;
shadow_lock(d);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/traps.c Wed Mar 1 19:47:25 2006
@@ -951,6 +951,7 @@
case 3: /* Write CR3 */
LOCK_BIGLOCK(v->domain);
+ cleanup_writable_pagetable(v->domain);
(void)new_guest_cr3(gmfn_to_mfn(v->domain, paddr_to_pfn(*reg)));
UNLOCK_BIGLOCK(v->domain);
break;
@@ -1002,7 +1003,6 @@
#endif
default:
if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
- (regs->ecx != MSR_EFER) ||
(regs->eax != l) || (regs->edx != h) )
DPRINTK("Domain attempted WRMSR %p from "
"%08x:%08x to %08lx:%08lx.\n",
@@ -1033,8 +1033,8 @@
goto fail;
break;
default:
- DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));
/* Everyone can read the MSR space. */
+ /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
goto fail;
break;
@@ -1416,8 +1416,8 @@
{
if ( hypercall_preempt_check() )
{
- rc = hypercall1_create_continuation(
- __HYPERVISOR_set_trap_table, traps);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_set_trap_table, "p", traps);
break;
}
@@ -1430,7 +1430,7 @@
if ( cur.address == 0 )
break;
- fixup_guest_selector(cur.cs);
+ fixup_guest_code_selector(cur.cs);
memcpy(&dst[cur.vector], &cur, sizeof(cur));
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/asm-offsets.c Wed Mar 1 19:47:25 2006
@@ -72,6 +72,13 @@
DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
BLANK();
+ OFFSET(TSS_ss0, struct tss_struct, ss0);
+ OFFSET(TSS_esp0, struct tss_struct, esp0);
+ OFFSET(TSS_ss1, struct tss_struct, ss1);
+ OFFSET(TSS_esp1, struct tss_struct, esp1);
+ DEFINE(TSS_sizeof, sizeof(struct tss_struct));
+ BLANK();
+
OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
OFFSET(VCPU_svm_hsa_pa, struct vcpu, arch.hvm_svm.host_save_pa);
OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/entry.S Wed Mar 1 19:47:25 2006
@@ -77,6 +77,13 @@
restore_all_guest:
testl $X86_EFLAGS_VM,UREGS_eflags(%esp)
jnz restore_all_vm86
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+ testl $2,UREGS_cs(%esp)
+ jnz 1f
+ call restore_ring0_guest
+ jmp restore_all_vm86
+1:
+#endif
FLT1: mov UREGS_ds(%esp),%ds
FLT2: mov UREGS_es(%esp),%es
FLT3: mov UREGS_fs(%esp),%fs
@@ -157,6 +164,7 @@
ALIGN
ENTRY(hypercall)
subl $4,%esp
+ FIXUP_RING0_GUEST_STACK
SAVE_ALL(b)
sti
GET_CURRENT(%ebx)
@@ -294,6 +302,11 @@
popl %eax
shll $16,%eax # Bits 16-23: saved_upcall_mask
movw UREGS_cs+4(%esp),%ax # Bits 0-15: CS
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+ testw $2,%ax
+ jnz FLT15
+ and $~3,%ax # RPL 1 -> RPL 0
+#endif
FLT15: movl %eax,%gs:4(%esi)
test $0x00FF0000,%eax # Bits 16-23: saved_upcall_mask
setz %ch # %ch == !saved_upcall_mask
@@ -388,6 +401,7 @@
pushl $TRAP_divide_error<<16
ALIGN
error_code:
+ FIXUP_RING0_GUEST_STACK
SAVE_ALL_NOSEGREGS(a)
SET_XEN_SEGMENTS(a)
testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp)
@@ -505,6 +519,10 @@
jmp error_code
ENTRY(nmi)
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+ # NMI entry protocol is incompatible with guest kernel in ring 0.
+ iret
+#else
# Save state but do not trash the segment registers!
# We may otherwise be unable to reload them or copy them to ring 1.
pushl %eax
@@ -546,6 +564,7 @@
movl $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \
TRAP_deferred_nmi),%ss:APIC_ICR(%eax)
jmp restore_all_xen
+#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */
ENTRY(setup_vm86_frame)
# Copies the entire stack frame forwards by 16 bytes.
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/mm.c
--- a/xen/arch/x86/x86_32/mm.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/mm.c Wed Mar 1 19:47:25 2006
@@ -23,6 +23,7 @@
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
+#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
@@ -180,9 +181,18 @@
page_set_owner(page, dom_xen);
}
}
-}
-
-long subarch_memory_op(int op, void *arg)
+
+ if ( supervisor_mode_kernel )
+ {
+ /* Guest kernel runs in ring 0, not ring 1. */
+ struct desc_struct *d;
+ d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+ d[0].b &= ~_SEGMENT_DPL;
+ d[1].b &= ~_SEGMENT_DPL;
+ }
+}
+
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
{
struct xen_machphys_mfn_list xmml;
unsigned long mfn;
@@ -192,7 +202,7 @@
switch ( op )
{
case XENMEM_machphys_mfn_list:
- if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
+ if ( copy_from_guest(&xmml, arg, 1) )
return -EFAULT;
max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21);
@@ -201,11 +211,12 @@
{
mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset(
RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21);
- if ( put_user(mfn, &xmml.extent_start[i]) )
+ if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
return -EFAULT;
}
- if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
+ xmml.nr_extents = i;
+ if ( copy_to_guest(arg, &xmml, 1) )
return -EFAULT;
break;
@@ -223,7 +234,7 @@
int nr = smp_processor_id();
struct tss_struct *t = &init_tss[nr];
- fixup_guest_selector(ss);
+ fixup_guest_stack_selector(ss);
current->arch.guest_context.kernel_ss = ss;
current->arch.guest_context.kernel_sp = esp;
@@ -239,6 +250,10 @@
unsigned long base, limit;
u32 a = d->a, b = d->b;
u16 cs;
+
+ /* Let a ring0 guest kernel set any descriptor it wants to. */
+ if ( supervisor_mode_kernel )
+ return 1;
/* A not-present descriptor will always fault, so is safe. */
if ( !(b & _SEGMENT_P) )
@@ -273,7 +288,7 @@
/* Validate and fix up the target code selector. */
cs = a >> 16;
- fixup_guest_selector(cs);
+ fixup_guest_code_selector(cs);
if ( !guest_gate_selector_okay(cs) )
goto bad;
a = d->a = (d->a & 0xffffU) | (cs << 16);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/traps.c Wed Mar 1 19:47:25 2006
@@ -256,8 +256,14 @@
* We can't virtualise interrupt gates, as there's no way to get
* the CPU to automatically clear the events_mask variable. Also we
* must ensure that the CS is safe to poke into an interrupt gate.
- */
- if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) )
+ *
+ * When running with supervisor_mode_kernel enabled a direct trap
+ * to the guest OS cannot be used because the INT instruction will
+ * switch to the Xen stack and we need to swap back to the guest
+ * kernel stack before passing control to the system call entry point.
+ */
+ if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ||
+ supervisor_mode_kernel )
{
v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
return;
@@ -278,8 +284,8 @@
{
struct vcpu *d = current;
- fixup_guest_selector(event_selector);
- fixup_guest_selector(failsafe_selector);
+ fixup_guest_code_selector(event_selector);
+ fixup_guest_code_selector(failsafe_selector);
d->arch.guest_context.event_callback_cs = event_selector;
d->arch.guest_context.event_callback_eip = event_address;
@@ -289,12 +295,51 @@
return 0;
}
-void hypercall_page_initialise(void *hypercall_page)
-{
+static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
+{
+ extern asmlinkage int hypercall(void);
char *p;
int i;
/* Fill in all the transfer points with template machine code. */
+
+ for ( i = 0; i < NR_hypercalls; i++ )
+ {
+ p = (char *)(hypercall_page + (i * 32));
+
+ *(u8 *)(p+ 0) = 0x9c; /* pushf */
+ *(u8 *)(p+ 1) = 0xfa; /* cli */
+ *(u8 *)(p+ 2) = 0xb8; /* mov $<i>,%eax */
+ *(u32 *)(p+ 3) = i;
+ *(u8 *)(p+ 7) = 0x9a; /* lcall $__HYPERVISOR_CS,&hypercall */
+ *(u32 *)(p+ 8) = (u32)&hypercall;
+ *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
+ *(u8 *)(p+14) = 0xc3; /* ret */
+ }
+
+ /*
+ * HYPERVISOR_iret is special because it doesn't return and expects a
+ * special stack frame. Guests jump at this transfer point instead of
+ * calling it.
+ */
+ p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
+ *(u8 *)(p+ 0) = 0x50; /* push %eax */
+ *(u8 *)(p+ 1) = 0x9c; /* pushf */
+ *(u8 *)(p+ 2) = 0xfa; /* cli */
+ *(u8 *)(p+ 3) = 0xb8; /* mov $<i>,%eax */
+ *(u32 *)(p+ 4) = __HYPERVISOR_iret;
+ *(u8 *)(p+ 8) = 0x9a; /* lcall $__HYPERVISOR_CS,&hypercall */
+ *(u32 *)(p+ 9) = (u32)&hypercall;
+ *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
+}
+
+static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
+{
+ char *p;
+ int i;
+
+ /* Fill in all the transfer points with template machine code. */
+
for ( i = 0; i < (PAGE_SIZE / 32); i++ )
{
p = (char *)(hypercall_page + (i * 32));
@@ -314,6 +359,14 @@
*(u8 *)(p+ 1) = 0xb8; /* mov $__HYPERVISOR_iret,%eax */
*(u32 *)(p+ 2) = __HYPERVISOR_iret;
*(u16 *)(p+ 6) = 0x82cd; /* int $0x82 */
+}
+
+void hypercall_page_initialise(void *hypercall_page)
+{
+ if ( supervisor_mode_kernel )
+ hypercall_page_initialise_ring0_kernel(hypercall_page);
+ else
+ hypercall_page_initialise_ring1_kernel(hypercall_page);
}
/*
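Disassembled, each 32-byte ring-0 stub written above comes out as follows; a ring-1 guest instead gets the int $0x82 form built by hypercall_page_initialise_ring1_kernel():

    /*  pushf                                ; 0x9c
     *  cli                                  ; 0xfa
     *  mov    $<hypercall_nr>,%eax          ; 0xb8 imm32
     *  lcall  $__HYPERVISOR_CS,$hypercall   ; 0x9a off32, sel16
     *  ret                                  ; 0xc3
     */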
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_64/mm.c
--- a/xen/arch/x86/x86_64/mm.c Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/x86_64/mm.c Wed Mar 1 19:47:25 2006
@@ -22,6 +22,7 @@
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
+#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
@@ -182,7 +183,7 @@
}
}
-long subarch_memory_op(int op, void *arg)
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
{
struct xen_machphys_mfn_list xmml;
l3_pgentry_t l3e;
@@ -194,7 +195,7 @@
switch ( op )
{
case XENMEM_machphys_mfn_list:
- if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
+ if ( copy_from_guest(&xmml, arg, 1) )
return -EFAULT;
for ( i = 0, v = RDWR_MPT_VIRT_START;
@@ -209,11 +210,12 @@
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
break;
mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
- if ( put_user(mfn, &xmml.extent_start[i]) )
+ if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
return -EFAULT;
}
- if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
+ xmml.nr_extents = i;
+ if ( copy_to_guest(arg, &xmml, 1) )
return -EFAULT;
break;
@@ -228,7 +230,7 @@
long do_stack_switch(unsigned long ss, unsigned long esp)
{
- fixup_guest_selector(ss);
+ fixup_guest_stack_selector(ss);
current->arch.guest_context.kernel_ss = ss;
current->arch.guest_context.kernel_sp = esp;
return 0;
@@ -315,7 +317,7 @@
/* Validate and fix up the target code selector. */
cs = a >> 16;
- fixup_guest_selector(cs);
+ fixup_guest_code_selector(cs);
if ( !guest_gate_selector_okay(cs) )
goto bad;
a = d->a = (d->a & 0xffffU) | (cs << 16);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/dom0_ops.c Wed Mar 1 19:47:25 2006
@@ -46,6 +46,7 @@
struct vcpu *v;
u64 cpu_time = 0;
int flags = DOMFLAGS_BLOCKED;
+ struct vcpu_runstate_info runstate;
info->domain = d->domain_id;
info->nr_online_vcpus = 0;
@@ -55,7 +56,8 @@
* - domain is marked as running if any of its vcpus is running
*/
for_each_vcpu ( d, v ) {
- cpu_time += v->cpu_time;
+ vcpu_runstate_get(v, &runstate);
+ cpu_time += runstate.time[RUNSTATE_running];
info->max_vcpu_id = v->vcpu_id;
if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
{
@@ -165,7 +167,15 @@
domid_t dom;
struct vcpu *v;
unsigned int i, cnt[NR_CPUS] = { 0 };
+ cpumask_t cpu_exclude_map;
static domid_t rover = 0;
+
+ /*
+ * Running the domain 0 kernel in ring 0 is not compatible
+ * with multiple guests.
+ */
+ if ( supervisor_mode_kernel )
+ return -EINVAL;
dom = op->u.createdomain.domain;
if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
@@ -195,18 +205,29 @@
read_lock(&domlist_lock);
for_each_domain ( d )
for_each_vcpu ( d, v )
- cnt[v->processor]++;
+ if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
+ cnt[v->processor]++;
read_unlock(&domlist_lock);
/*
- * If we're on a HT system, we only use the first HT for dom0, other
- * domains will all share the second HT of each CPU. Since dom0 is on
- * CPU 0, we favour high numbered CPUs in the event of a tie.
+ * If we're on a HT system, we only auto-allocate to a non-primary HT.
+ * We favour high numbered CPUs in the event of a tie.
*/
- pro = smp_num_siblings - 1;
- for ( i = pro; i < num_online_cpus(); i += smp_num_siblings )
+ pro = first_cpu(cpu_sibling_map[0]);
+ if ( cpus_weight(cpu_sibling_map[0]) > 1 )
+ pro = next_cpu(pro, cpu_sibling_map[0]);
+ cpu_exclude_map = cpu_sibling_map[0];
+ for_each_online_cpu ( i )
+ {
+ if ( cpu_isset(i, cpu_exclude_map) )
+ continue;
+ if ( (i == first_cpu(cpu_sibling_map[i])) &&
+ (cpus_weight(cpu_sibling_map[i]) > 1) )
+ continue;
+ cpus_or(cpu_exclude_map, cpu_exclude_map, cpu_sibling_map[i]);
if ( cnt[i] <= cnt[pro] )
pro = i;
+ }
ret = -ENOMEM;
if ( (d = domain_create(dom, pro)) == NULL )
@@ -485,6 +506,7 @@
{
struct domain *d;
struct vcpu *v;
+ struct vcpu_runstate_info runstate;
ret = -ESRCH;
if ( (d = find_domain_by_id(op->u.getvcpuinfo.domain)) == NULL )
@@ -498,10 +520,12 @@
if ( (v = d->vcpu[op->u.getvcpuinfo.vcpu]) == NULL )
goto getvcpuinfo_out;
+ vcpu_runstate_get(v, &runstate);
+
op->u.getvcpuinfo.online = !test_bit(_VCPUF_down, &v->vcpu_flags);
op->u.getvcpuinfo.blocked = test_bit(_VCPUF_blocked, &v->vcpu_flags);
op->u.getvcpuinfo.running = test_bit(_VCPUF_running, &v->vcpu_flags);
- op->u.getvcpuinfo.cpu_time = v->cpu_time;
+ op->u.getvcpuinfo.cpu_time = runstate.time[RUNSTATE_running];
op->u.getvcpuinfo.cpu = v->processor;
op->u.getvcpuinfo.cpumap = 0;
memcpy(&op->u.getvcpuinfo.cpumap,
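The new auto-placement loop above is easiest to follow with a worked trace. On a two-core, two-thread box with sibling sets {0,1} and {2,3} and dom0 on CPU 0:

    /* pro     = first_cpu({0,1}) = 0; the set has two CPUs, so pro -> 1
     * exclude = {0,1}
     * i = 2   : primary thread of {2,3}               -> skipped
     * i = 3   : secondary thread; pro = 3 if cnt[3] <= cnt[1]
     * Result: new domains land on non-primary hyperthreads only. */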
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/domain.c
--- a/xen/common/domain.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/domain.c Wed Mar 1 19:47:25 2006
@@ -451,6 +451,41 @@
case VCPUOP_is_up:
rc = !test_bit(_VCPUF_down, &v->vcpu_flags);
break;
+
+ case VCPUOP_get_runstate_info:
+ {
+ struct vcpu_runstate_info runstate;
+ vcpu_runstate_get(v, &runstate);
+ if ( copy_to_user(arg, &runstate, sizeof(runstate)) )
+ rc = -EFAULT;
+ break;
+ }
+
+ case VCPUOP_register_runstate_memory_area:
+ {
+ struct vcpu_register_runstate_memory_area area;
+
+ rc = -EINVAL;
+ if ( v != current )
+ break;
+
+ rc = -EFAULT;
+ if ( copy_from_user(&area, arg, sizeof(area)) )
+ break;
+
+ if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
+ break;
+
+ rc = 0;
+ v->runstate_guest = area.addr.v;
+ __copy_to_user(v->runstate_guest, &v->runstate, sizeof(v->runstate));
+
+ break;
+ }
+
+ default:
+ rc = -ENOSYS;
+ break;
}
return rc;
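A guest would use the new VCPUOP as sketched below: the area must belong to the calling VCPU, and Xen copies the current runstate into it immediately on registration. The HYPERVISOR_vcpu_op wrapper name follows the usual guest-side convention and is an assumption here:

    static struct vcpu_runstate_info runstate;   /* per-VCPU in practice */
    struct vcpu_register_runstate_memory_area area;

    area.addr.v = &runstate;
    if ( HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                            smp_processor_id(), &area) != 0 )
        /* fall back to polling via VCPUOP_get_runstate_info */;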
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/kernel.c
--- a/xen/common/kernel.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/kernel.c Wed Mar 1 19:47:25 2006
@@ -195,6 +195,8 @@
(1U << XENFEAT_writable_page_tables) |
(1U << XENFEAT_auto_translated_physmap) |
(1U << XENFEAT_pae_pgdir_above_4gb);
+ if ( supervisor_mode_kernel )
+ fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
break;
default:
return -EINVAL;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/keyhandler.c
--- a/xen/common/keyhandler.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/keyhandler.c Wed Mar 1 19:47:25 2006
@@ -169,8 +169,6 @@
}
extern void dump_runq(unsigned char key);
-extern void print_sched_histo(unsigned char key);
-extern void reset_sched_histo(unsigned char key);
#ifndef NDEBUG
extern void audit_domains_key(unsigned char key);
#endif
@@ -206,10 +204,6 @@
'd', dump_registers, "dump registers");
register_keyhandler(
'h', show_handlers, "show this message");
- register_keyhandler(
- 'l', print_sched_histo, "print sched latency histogram");
- register_keyhandler(
- 'L', reset_sched_histo, "reset sched latency histogram");
register_keyhandler(
'q', dump_domains, "dump domain (and guest debug) info");
register_keyhandler(
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/memory.c
--- a/xen/common/memory.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/memory.c Wed Mar 1 19:47:25 2006
@@ -16,6 +16,7 @@
#include <xen/event.h>
#include <xen/shadow.h>
#include <xen/iocap.h>
+#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/hardirq.h>
#include <public/memory.h>
@@ -30,7 +31,7 @@
static long
increase_reservation(
struct domain *d,
- unsigned long *extent_list,
+ GUEST_HANDLE(xen_ulong) extent_list,
unsigned int nr_extents,
unsigned int extent_order,
unsigned int flags,
@@ -39,8 +40,8 @@
struct page_info *page;
unsigned long i, mfn;
- if ( (extent_list != NULL) &&
- !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+ if ( !guest_handle_is_null(extent_list) &&
+ !guest_handle_okay(extent_list, nr_extents) )
return 0;
if ( (extent_order != 0) &&
@@ -65,10 +66,10 @@
}
/* Inform the domain of the new page's machine address. */
- if ( extent_list != NULL )
+ if ( !guest_handle_is_null(extent_list) )
{
mfn = page_to_mfn(page);
- if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
+ if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
return i;
}
}
@@ -79,16 +80,16 @@
static long
populate_physmap(
struct domain *d,
- unsigned long *extent_list,
- unsigned int nr_extents,
- unsigned int extent_order,
- unsigned int flags,
- int *preempted)
+ GUEST_HANDLE(xen_ulong) extent_list,
+ unsigned int nr_extents,
+ unsigned int extent_order,
+ unsigned int flags,
+ int *preempted)
{
struct page_info *page;
unsigned long i, j, gpfn, mfn;
- if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+ if ( !guest_handle_okay(extent_list, nr_extents) )
return 0;
if ( (extent_order != 0) &&
@@ -103,7 +104,7 @@
goto out;
}
- if ( unlikely(__copy_from_user(&gpfn, &extent_list[i], sizeof(gpfn))) )
+ if ( unlikely(__copy_from_guest_offset(&gpfn, extent_list, i, 1)) )
goto out;
if ( unlikely((page = alloc_domheap_pages(
@@ -128,7 +129,7 @@
set_gpfn_from_mfn(mfn + j, gpfn + j);
/* Inform the domain of the new page's machine address. */
- if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
+ if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
goto out;
}
}
@@ -139,8 +140,8 @@
static long
decrease_reservation(
- struct domain *d,
- unsigned long *extent_list,
+ struct domain *d,
+ GUEST_HANDLE(xen_ulong) extent_list,
unsigned int nr_extents,
unsigned int extent_order,
unsigned int flags,
@@ -149,7 +150,7 @@
struct page_info *page;
unsigned long i, j, gmfn, mfn;
- if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+ if ( !guest_handle_okay(extent_list, nr_extents) )
return 0;
for ( i = 0; i < nr_extents; i++ )
@@ -160,7 +161,7 @@
return i;
}
- if ( unlikely(__copy_from_user(&gmfn, &extent_list[i], sizeof(gmfn))) )
+ if ( unlikely(__copy_from_guest_offset(&gmfn, extent_list, i, 1)) )
return i;
for ( j = 0; j < (1 << extent_order); j++ )
@@ -197,21 +198,21 @@
static long
translate_gpfn_list(
- struct xen_translate_gpfn_list *uop, unsigned long *progress)
+ GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress)
{
struct xen_translate_gpfn_list op;
unsigned long i, gpfn, mfn;
struct domain *d;
- if ( copy_from_user(&op, uop, sizeof(op)) )
+ if ( copy_from_guest(&op, uop, 1) )
return -EFAULT;
/* Is size too large for us to encode a continuation? */
if ( op.nr_gpfns > (ULONG_MAX >> START_EXTENT_SHIFT) )
return -EINVAL;
- if ( !array_access_ok(op.gpfn_list, op.nr_gpfns, sizeof(*op.gpfn_list)) ||
- !array_access_ok(op.mfn_list, op.nr_gpfns, sizeof(*op.mfn_list)) )
+ if ( !guest_handle_okay(op.gpfn_list, op.nr_gpfns) ||
+ !guest_handle_okay(op.mfn_list, op.nr_gpfns) )
return -EFAULT;
if ( op.domid == DOMID_SELF )
@@ -237,8 +238,7 @@
return -EAGAIN;
}
- if ( unlikely(__copy_from_user(&gpfn, &op.gpfn_list[i],
- sizeof(gpfn))) )
+ if ( unlikely(__copy_from_guest_offset(&gpfn, op.gpfn_list, i, 1)) )
{
put_domain(d);
return -EFAULT;
@@ -246,8 +246,7 @@
mfn = gmfn_to_mfn(d, gpfn);
- if ( unlikely(__copy_to_user(&op.mfn_list[i], &mfn,
- sizeof(mfn))) )
+ if ( unlikely(__copy_to_guest_offset(op.mfn_list, i, &mfn, 1)) )
{
put_domain(d);
return -EFAULT;
@@ -258,7 +257,7 @@
return 0;
}
-long do_memory_op(unsigned long cmd, void *arg)
+long do_memory_op(unsigned long cmd, GUEST_HANDLE(void) arg)
{
struct domain *d;
int rc, op, flags = 0, preempted = 0;
@@ -273,7 +272,7 @@
case XENMEM_increase_reservation:
case XENMEM_decrease_reservation:
case XENMEM_populate_physmap:
- if ( copy_from_user(&reservation, arg, sizeof(reservation)) )
+ if ( copy_from_guest(&reservation, arg, 1) )
return -EFAULT;
/* Is size too large for us to encode a continuation? */
@@ -283,9 +282,9 @@
start_extent = cmd >> START_EXTENT_SHIFT;
if ( unlikely(start_extent > reservation.nr_extents) )
return -EINVAL;
-
- if ( reservation.extent_start != NULL )
- reservation.extent_start += start_extent;
+
+ if ( !guest_handle_is_null(reservation.extent_start) )
+ guest_handle_add_offset(reservation.extent_start, start_extent);
reservation.nr_extents -= start_extent;
if ( (reservation.address_bits != 0) &&
@@ -342,8 +341,9 @@
rc += start_extent;
if ( preempted )
- return hypercall2_create_continuation(
- __HYPERVISOR_memory_op, op | (rc << START_EXTENT_SHIFT), arg);
+ return hypercall_create_continuation(
+ __HYPERVISOR_memory_op, "lh",
+ op | (rc << START_EXTENT_SHIFT), arg);
break;
@@ -353,10 +353,10 @@
case XENMEM_current_reservation:
case XENMEM_maximum_reservation:
- if ( copy_from_user(&domid, (domid_t *)arg, sizeof(domid)) )
+ if ( copy_from_guest(&domid, arg, 1) )
return -EFAULT;
- if ( likely((domid = (unsigned long)arg) == DOMID_SELF) )
+ if ( likely(domid == DOMID_SELF) )
d = current->domain;
else if ( !IS_PRIV(current->domain) )
return -EPERM;
@@ -372,12 +372,13 @@
case XENMEM_translate_gpfn_list:
progress = cmd >> START_EXTENT_SHIFT;
- rc = translate_gpfn_list(arg, &progress);
+ rc = translate_gpfn_list(
+ guest_handle_cast(arg, xen_translate_gpfn_list_t),
+ &progress);
if ( rc == -EAGAIN )
- return hypercall2_create_continuation(
- __HYPERVISOR_memory_op,
- op | (progress << START_EXTENT_SHIFT),
- arg);
+ return hypercall_create_continuation(
+ __HYPERVISOR_memory_op, "lh",
+ op | (progress << START_EXTENT_SHIFT), arg);
break;
default:
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/multicall.c
--- a/xen/common/multicall.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/multicall.c Wed Mar 1 19:47:25 2006
@@ -81,8 +81,8 @@
if ( i < nr_calls )
{
mcs->flags = 0;
- return hypercall2_create_continuation(
- __HYPERVISOR_multicall, &call_list[i], nr_calls-i);
+ return hypercall_create_continuation(
+ __HYPERVISOR_multicall, "pi", &call_list[i], nr_calls-i);
}
}
}
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/page_alloc.c
--- a/xen/common/page_alloc.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/page_alloc.c Wed Mar 1 19:47:25 2006
@@ -32,6 +32,7 @@
#include <xen/softirq.h>
#include <xen/shadow.h>
#include <xen/domain_page.h>
+#include <xen/keyhandler.h>
#include <asm/page.h>
/*
@@ -662,6 +663,26 @@
}
+static void pagealloc_keyhandler(unsigned char key)
+{
+ printk("Physical memory information:\n");
+ printk(" Xen heap: %lukB free\n"
+ " DMA heap: %lukB free\n"
+ " Dom heap: %lukB free\n",
+ avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
+ avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
+ avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+}
+
+
+static __init int pagealloc_keyhandler_init(void)
+{
+ register_keyhandler('m', pagealloc_keyhandler, "memory info");
+ return 0;
+}
+__initcall(pagealloc_keyhandler_init);
+
+
/*************************
* PAGE SCRUBBING
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_bvt.c
--- a/xen/common/sched_bvt.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/sched_bvt.c Wed Mar 1 19:47:25 2006
@@ -132,13 +132,13 @@
vcpu_schedule_unlock_irq(v);
}
-static inline u32 calc_avt(struct vcpu *d, s_time_t now)
+static inline u32 calc_avt(struct vcpu *v, s_time_t now)
{
u32 ranfor, mcus;
- struct bvt_dom_info *inf = BVT_INFO(d->domain);
- struct bvt_vcpu_info *einf = EBVT_INFO(d);
-
- ranfor = (u32)(now - d->lastschd);
+ struct bvt_dom_info *inf = BVT_INFO(v->domain);
+ struct bvt_vcpu_info *einf = EBVT_INFO(v);
+
+ ranfor = (u32)(now - v->runstate.state_entry_time);
mcus = (ranfor + MCU - 1)/MCU;
return einf->avt + mcus * inf->mcu_advance;
@@ -262,7 +262,7 @@
curr_evt = calc_evt(curr, calc_avt(curr, now));
/* Calculate the time the current domain would run assuming
the second smallest evt is of the newly woken domain */
- r_time = curr->lastschd +
+ r_time = curr->runstate.state_entry_time +
((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) +
ctx_allow;
@@ -558,7 +558,6 @@
printk("%3d: %u has=%c ", loop++, v->domain->domain_id,
test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F');
bvt_dump_runq_el(v);
- printk("c=0x%X%08X\n", (u32)(v->cpu_time>>32), (u32)v->cpu_time);
printk(" l: %p n: %p p: %p\n",
&vcpu_inf->run_list, vcpu_inf->run_list.next,
vcpu_inf->run_list.prev);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/sched_sedf.c Wed Mar 1 19:47:25 2006
@@ -1408,18 +1408,14 @@
{
printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id,
test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F');
- printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64
+ printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu"
" sc=%i xtr(%s)=%"PRIu64" ew=%hu",
EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs,
- EDOM_INFO(d)->weight, d->cpu_time,
+ EDOM_INFO(d)->weight,
EDOM_INFO(d)->score[EXTRA_UTIL_Q],
(EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no",
EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight);
- if ( d->cpu_time != 0 )
- printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100)
- / d->cpu_time);
-
#ifdef SEDF_STATS
if ( EDOM_INFO(d)->block_time_tot != 0 )
printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) /
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/schedule.c
--- a/xen/common/schedule.c Wed Mar 1 17:01:54 2006
+++ b/xen/common/schedule.c Wed Mar 1 19:47:25 2006
@@ -36,14 +36,6 @@
static char opt_sched[10] = "sedf";
string_param("sched", opt_sched);
-/*#define WAKE_HISTO*/
-/*#define BLOCKTIME_HISTO*/
-#if defined(WAKE_HISTO)
-#define BUCKETS 31
-#elif defined(BLOCKTIME_HISTO)
-#define BUCKETS 200
-#endif
-
#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */
/* Various timer handlers. */
@@ -73,6 +65,36 @@
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct timer t_timer[NR_CPUS];
+static inline void vcpu_runstate_change(
+ struct vcpu *v, int new_state, s_time_t new_entry_time)
+{
+ ASSERT(v->runstate.state != new_state);
+ ASSERT(spin_is_locked(&schedule_data[v->processor].schedule_lock));
+
+ v->runstate.time[v->runstate.state] +=
+ new_entry_time - v->runstate.state_entry_time;
+ v->runstate.state_entry_time = new_entry_time;
+ v->runstate.state = new_state;
+}
+
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
+{
+ if ( likely(v == current) )
+ {
+ /* Fast lock-free path. */
+ memcpy(runstate, &v->runstate, sizeof(*runstate));
+ ASSERT(runstate->state == RUNSTATE_running);
+ runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
+ }
+ else
+ {
+ vcpu_schedule_lock_irq(v);
+ memcpy(runstate, &v->runstate, sizeof(*runstate));
+ runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
+ vcpu_schedule_unlock_irq(v);
+ }
+}
+
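With cpu_time and the wake/block histograms gone, everything derives timing from the runstate record; the dom0_ops hunks earlier in this changeset consume it like this:

    struct vcpu_runstate_info rs;

    vcpu_runstate_get(v, &rs);
    cpu_time = rs.time[RUNSTATE_running];   /* ns spent actually executing */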
struct domain *alloc_domain(void)
{
struct domain *d;
@@ -119,6 +141,9 @@
v->cpu_affinity = is_idle_domain(d) ?
cpumask_of_cpu(cpu_id) : CPU_MASK_ALL;
+ v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
+ v->runstate.state_entry_time = NOW();
+
if ( (vcpu_id != 0) && !is_idle_domain(d) )
set_bit(_VCPUF_down, &v->vcpu_flags);
@@ -165,8 +190,15 @@
unsigned long flags;
vcpu_schedule_lock_irqsave(v, flags);
+
if ( likely(!vcpu_runnable(v)) )
+ {
+ if ( v->runstate.state == RUNSTATE_runnable )
+ vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
SCHED_OP(sleep, v);
+ }
+
vcpu_schedule_unlock_irqrestore(v, flags);
TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
@@ -187,11 +219,19 @@
unsigned long flags;
vcpu_schedule_lock_irqsave(v, flags);
+
if ( likely(vcpu_runnable(v)) )
{
+ if ( v->runstate.state >= RUNSTATE_blocked )
+ vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
SCHED_OP(wake, v);
- v->wokenup = NOW();
- }
+ }
+ else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
+ {
+ if ( v->runstate.state == RUNSTATE_blocked )
+ vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+ }
+
vcpu_schedule_unlock_irqrestore(v, flags);
TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
@@ -376,8 +416,6 @@
stop_timer(&schedule_data[cpu].s_timer);
- prev->cpu_time += now - prev->lastschd;
-
/* get policy-specific decision on scheduling... */
next_slice = ops.do_schedule(now);
@@ -386,8 +424,6 @@
schedule_data[cpu].curr = next;
- next->lastschd = now;
-
set_timer(&schedule_data[cpu].s_timer, now + r_time);
if ( unlikely(prev == next) )
@@ -397,38 +433,23 @@
}
TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
- prev->domain->domain_id, now - prev->lastschd);
+ prev->domain->domain_id,
+ now - prev->runstate.state_entry_time);
TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
- next->domain->domain_id, now - next->wokenup, r_time);
-
- /*
- * Logic of wokenup field in domain struct:
- * Used to calculate "waiting time", which is the time that a domain
- * spends being "runnable", but not actually running. wokenup is set
- * set whenever a domain wakes from sleeping. However, if wokenup is not
- * also set here then a preempted runnable domain will get a screwed up
- * "waiting time" value next time it is scheduled.
- */
- prev->wokenup = now;
-
-#if defined(WAKE_HISTO)
- if ( !is_idle_vcpu(next) && next->wokenup )
- {
- ulong diff = (ulong)(now - next->wokenup);
- diff /= (ulong)MILLISECS(1);
- if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++;
- else schedule_data[cpu].hist[BUCKETS-1]++;
- }
- next->wokenup = (s_time_t)0;
-#elif defined(BLOCKTIME_HISTO)
- prev->lastdeschd = now;
- if ( !is_idle_vcpu(next) )
- {
- ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
- if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++;
- else schedule_data[cpu].hist[BUCKETS-1]++;
- }
-#endif
+ next->domain->domain_id,
+ (next->runstate.state == RUNSTATE_runnable) ?
+ (now - next->runstate.state_entry_time) : 0,
+ r_time);
+
+ ASSERT(prev->runstate.state == RUNSTATE_running);
+ vcpu_runstate_change(
+ prev,
+ (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
+ (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+ now);
+
+ ASSERT(next->runstate.state != RUNSTATE_running);
+ vcpu_runstate_change(next, RUNSTATE_running, now);
ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
set_bit(_VCPUF_running, &next->vcpu_flags);
@@ -567,47 +588,6 @@
local_irq_restore(flags);
}
-
-#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)
-
-void print_sched_histo(unsigned char key)
-{
- int i, j, k;
- for_each_online_cpu ( k )
- {
- j = 0;
- printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
- for ( i = 0; i < BUCKETS; i++ )
- {
- if ( schedule_data[k].hist[i] != 0 )
- {
- if ( i < BUCKETS-1 )
- printk("%2d:[%7u] ", i, schedule_data[k].hist[i]);
- else
- printk(" >:[%7u] ", schedule_data[k].hist[i]);
- if ( !(++j % 5) )
- printk("\n");
- }
- }
- printk("\n");
- }
-
-}
-
-void reset_sched_histo(unsigned char key)
-{
- int i, j;
- for ( j = 0; j < NR_CPUS; j++ )
- for ( i=0; i < BUCKETS; i++ )
- schedule_data[j].hist[i] = 0;
-}
-
-#else
-
-void print_sched_histo(unsigned char key) { }
-void reset_sched_histo(unsigned char key) { }
-
-#endif
/*
* Local variables:
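
The schedule.c hunks above replace the old per-vcpu cpu_time/wokenup/lastschd
bookkeeping with a single four-state runstate machine: a vCPU is always in
exactly one of running, runnable, blocked or offline, and
vcpu_runstate_change() charges the time elapsed since state_entry_time to the
state being left. A minimal stand-alone sketch of that accounting invariant
(plain C with NOW() stubbed out; an illustration, not Xen code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

enum { RS_running, RS_runnable, RS_blocked, RS_offline, RS_NR };

struct runstate {
    int      state;             /* current state, cf. RUNSTATE_* */
    uint64_t state_entry_time;  /* ns timestamp of last transition */
    uint64_t time[RS_NR];       /* total ns accumulated in each state */
};

static uint64_t now_ns(void)   /* stand-in for Xen's NOW() */
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Mirrors vcpu_runstate_change(): charge elapsed time to the state left. */
static void runstate_change(struct runstate *r, int new_state, uint64_t now)
{
    assert(r->state != new_state);
    r->time[r->state] += now - r->state_entry_time;
    r->state_entry_time = now;
    r->state = new_state;
}

int main(void)
{
    struct runstate r = { RS_offline, now_ns(), { 0 } };

    runstate_change(&r, RS_runnable, now_ns());  /* vcpu_wake */
    runstate_change(&r, RS_running,  now_ns());  /* scheduled in */
    runstate_change(&r, RS_blocked,  now_ns());  /* blocks; descheduled */

    printf("ran for %llu ns\n", (unsigned long long)r.time[RS_running]);
    return 0;
}

Because every transition charges the interval to exactly one state, the sum
of the four time[] counters always equals elapsed time since the vCPU was
created, which is the guarantee the public interface later documents.
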
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/drivers/char/console.c
--- a/xen/drivers/char/console.c Wed Mar 1 17:01:54 2006
+++ b/xen/drivers/char/console.c Wed Mar 1 19:47:25 2006
@@ -335,8 +335,9 @@
}
if ( hypercall_preempt_check() )
- return hypercall3_create_continuation(
- __HYPERVISOR_console_io, CONSOLEIO_write, count, buffer);
+ return hypercall_create_continuation(
+ __HYPERVISOR_console_io, "iip",
+ CONSOLEIO_write, count, buffer);
kcount = min_t(int, count, sizeof(kbuf)-1);
if ( copy_from_user(kbuf, buffer, kcount) )
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/config.h
--- a/xen/include/asm-ia64/config.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-ia64/config.h Wed Mar 1 19:47:25 2006
@@ -36,6 +36,8 @@
//#define CONFIG_NR_CPUS 16
//leave SMP for a later time
//#undef CONFIG_SMP
+
+#define supervisor_mode_kernel (0)
#define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */
@@ -190,11 +192,6 @@
#define find_first_set_bit(x) (ffs(x)-1) // FIXME: Is this right???
-// from include/asm-x86/*/uaccess.h
-#define array_access_ok(addr,count,size) \
- (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ && \
- access_ok(type,addr,count*size))
-
// see drivers/char/console.c
#ifndef VALIDATE_VT
#define OPT_CONSOLE_STR "com1"
@@ -299,7 +296,6 @@
//#define raw_smp_processor_id() 0
//#endif
-
#ifndef __ASSEMBLY__
#include <linux/linkage.h>
#define FORCE_CRASH() asm("break.m 0;;");
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/linux-xen/asm/README.origin
--- a/xen/include/asm-ia64/linux-xen/asm/README.origin Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-ia64/linux-xen/asm/README.origin Wed Mar 1 19:47:25 2006
@@ -22,4 +22,3 @@
system.h -> linux/include/asm-ia64/system.h
tlbflush.h -> linux/include/asm-ia64/tlbflush.h
types.h -> linux/include/asm-ia64/types.h
-uaccess.h -> linux/include/asm-ia64/uaccess.h
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/config.h Wed Mar 1 19:47:25 2006
@@ -36,6 +36,12 @@
#define OPT_CONSOLE_STR "com1,vga"
#define NR_CPUS 32
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+# define supervisor_mode_kernel (1)
+#else
+# define supervisor_mode_kernel (0)
+#endif
/* Linkage for x86 */
#define __ALIGN .align 16,0x90
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/desc.h
--- a/xen/include/asm-x86/desc.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/desc.h Wed Mar 1 19:47:25 2006
@@ -27,9 +27,22 @@
#endif
/* Fix up the RPL of a guest segment selector. */
-#define fixup_guest_selector(sel) \
+#define __fixup_guest_selector(sel) \
((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) : \
(((sel) & ~3) | GUEST_KERNEL_RPL))
+
+/* Stack selectors don't need fixing up if the kernel runs in ring 0. */
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define fixup_guest_stack_selector(ss) ((void)0)
+#else
+#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss)
+#endif
+
+/*
+ * Code selectors are always fixed up. It allows the Xen exit stub to detect
+ * return to guest context, even when the guest kernel runs in ring 0.
+ */
+#define fixup_guest_code_selector(cs) __fixup_guest_selector(cs)
/*
* We need this function because enforcing the correct guest kernel RPL is
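
The split above leaves stack-selector fixup conditional on
CONFIG_X86_SUPERVISOR_MODE_KERNEL while code selectors are always adjusted.
The underlying operation is plain bit arithmetic on the selector's two RPL
bits; a stand-alone illustration (GUEST_KERNEL_RPL of 1 is an assumption for
32-bit guests; this is not the patch's code):

#include <assert.h>
#include <stdint.h>

/* Assumption for illustration: 32-bit Xen runs guest kernels in ring 1. */
#define GUEST_KERNEL_RPL 1

/* Expression form of __fixup_guest_selector: raise the requested privilege
 * level (low two bits) so a guest can never load a ring-0 selector. */
static uint16_t fixup_guest_selector(uint16_t sel)
{
    return ((sel & 3) >= GUEST_KERNEL_RPL)
        ? sel
        : (uint16_t)((sel & ~3) | GUEST_KERNEL_RPL);
}

int main(void)
{
    assert(fixup_guest_selector(0x0008) == 0x0009); /* ring 0 -> ring 1 */
    assert(fixup_guest_selector(0x000b) == 0x000b); /* ring 3 untouched */
    return 0;
}
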
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/hvm.h Wed Mar 1 19:47:25 2006
@@ -67,6 +67,9 @@
int (*paging_enabled)(struct vcpu *v);
int (*instruction_length)(struct vcpu *v);
unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
+
+ void (*init_ap_context)(struct vcpu_guest_context *ctxt,
+ int vcpuid, int trampoline_vector);
};
extern struct hvm_function_table hvm_funcs;
@@ -173,4 +176,14 @@
return hvm_funcs.get_guest_ctrl_reg(v, num);
return 0; /* force to fail */
}
+
+static inline void
+hvm_init_ap_context(struct vcpu_guest_context *ctxt,
+ int vcpuid, int trampoline_vector)
+{
+ hvm_funcs.init_ap_context(ctxt, vcpuid, trampoline_vector);
+}
+
+extern int hvm_bringup_ap(int vcpuid, int trampoline_vector);
+
#endif /* __ASM_X86_HVM_HVM_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/emulate.h
--- a/xen/include/asm-x86/hvm/svm/emulate.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/emulate.h Wed Mar 1 19:47:25 2006
@@ -83,15 +83,15 @@
struct cpu_user_regs *regs, const u8 prefix, const u8 *operand,
u8 *size);
extern OPERATING_MODE get_operating_mode (struct vmcb_struct *vmcb);
-extern unsigned int decode_dest_reg(u8 modrm);
-extern unsigned int decode_src_reg(u8 modrm);
+extern unsigned int decode_dest_reg(u8 prefix, u8 modrm);
+extern unsigned int decode_src_reg(u8 prefix, u8 modrm);
extern unsigned long svm_rip2pointer(struct vmcb_struct *vmcb);
-extern unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
+extern int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
enum instruction_index *list, unsigned int list_count,
u8 *guest_eip_buf, enum instruction_index *match);
-static inline unsigned int __get_instruction_length(struct vmcb_struct *vmcb,
+static inline int __get_instruction_length(struct vmcb_struct *vmcb,
enum instruction_index instr, u8 *guest_eip_buf)
{
return __get_instruction_length_from_list(vmcb, &instr, 1, guest_eip_buf,
@@ -138,9 +138,20 @@
}
+static inline int skip_prefix_bytes(u8 *buf, size_t size)
+{
+ int index;
+ for (index = 0; index < size && is_prefix(buf[index]); index ++)
+ /* do nothing */ ;
+ return index;
+}
+
+
+
static void inline __update_guest_eip(struct vmcb_struct *vmcb,
- unsigned long inst_len)
+ int inst_len)
{
+ ASSERT(inst_len > 0);
vmcb->rip += inst_len;
}
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/svm.h
--- a/xen/include/asm-x86/hvm/svm/svm.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/svm.h Wed Mar 1 19:47:25 2006
@@ -54,6 +54,8 @@
/* For debugging. Remove when no longer needed. */
extern void svm_dump_host_regs(const char *from);
+extern void svm_migrate_timers(struct vcpu *v);
+
/* ASID API */
enum {
ASID_AVAILABLE = 0,
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h Wed Mar 1 19:47:25 2006
@@ -269,21 +269,6 @@
#define SVM_LONG_GUEST(ed) \
(test_bit(SVM_CPU_STATE_LMA_ENABLED, &ed->arch.hvm_svm.cpu_state))
-enum {
- SVM_INDEX_MSR_LSTAR = 0,
- SVM_INDEX_MSR_STAR,
- SVM_INDEX_MSR_CSTAR,
- SVM_INDEX_MSR_SYSCALL_MASK,
- SVM_INDEX_MSR_EFER,
-
- SVM_MSR_COUNT,
-};
-
-struct svm_msr_state {
- unsigned long flags;
- unsigned long msr_items[SVM_MSR_COUNT];
- unsigned long shadow_gs;
-};
/*
* Attribute for segment selector. This is a copy of bit 40:47 & 52:55 of the
@@ -449,7 +434,7 @@
struct arch_svm_struct {
struct vmcb_struct *vmcb;
- void *host_save_area;
+ void *host_save_area;
u64 host_save_pa;
u64 vmcb_pa;
u32 *iopm;
@@ -457,14 +442,15 @@
u64 vmexit_tsc; /* tsc read at #VMEXIT. for TSC_OFFSET */
int injecting_event;
int saved_irq_vector;
- u32 core; /* cpu of last vmexit */
+ u32 launch_core;
+ u32 asid_core;
unsigned long flags; /* VMCB flags */
- unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+ unsigned long cpu_shadow_cr0; /* Guest value for CR0 */
+ unsigned long cpu_shadow_cr4; /* Guest value for CR4 */
unsigned long cpu_cr2;
unsigned long cpu_cr3;
unsigned long cpu_state;
- struct svm_msr_state msr_content;
struct timer hlt_timer; /* hlt ins emulation wakeup timer */
};
@@ -485,6 +471,14 @@
#define VMCB_EFLAGS_RESERVED_0 0xffc08028 /* bitmap for 0 */
#define VMCB_EFLAGS_RESERVED_1 0x00000002 /* bitmap for 1 */
+
+/* These bits in the CR4 are owned by the host */
+#ifdef __i386__
+#define SVM_CR4_HOST_MASK (0)
+#else
+#define SVM_CR4_HOST_MASK (X86_CR4_PAE)
+#endif
+
#endif /* ASM_X86_HVM_SVM_VMCS_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/vcpu.h Wed Mar 1 19:47:25 2006
@@ -25,10 +25,15 @@
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/svm/vmcb.h>
+#define HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM 0
+#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1
+
struct hvm_vcpu {
- unsigned long ioflags;
- struct mmio_op mmio_op;
- struct vlapic *vlapic;
+ unsigned long ioflags;
+ struct mmio_op mmio_op;
+ struct vlapic *vlapic;
+ /* For AP startup */
+ unsigned long init_sipi_sipi_state;
union {
struct arch_vmx_struct vmx;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vlapic.h
--- a/xen/include/asm-x86/hvm/vlapic.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/vlapic.h Wed Mar 1 19:47:25 2006
@@ -158,9 +158,6 @@
int deliver_mode;
int source[6];
} direct_intr_info_t;
-
-#define VLAPIC_INIT_SIPI_SIPI_STATE_NORM 0
-#define VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1
struct vlapic
{
@@ -197,7 +194,6 @@
unsigned long init_ticks;
uint32_t err_write_count;
uint64_t apic_base_msr;
- uint32_t init_sipi_sipi_state;
struct vcpu *vcpu;
struct domain *domain;
};
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/mm.h Wed Mar 1 19:47:25 2006
@@ -337,6 +337,10 @@
UNLOCK_BIGLOCK(d); \
} while ( 0 )
+#define writable_pagetable_in_sync(d) \
+ (!((d)->arch.ptwr[PTWR_PT_ACTIVE].l1va | \
+ (d)->arch.ptwr[PTWR_PT_INACTIVE].l1va))
+
int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
#ifndef NDEBUG
@@ -376,7 +380,7 @@
int __sync_lazy_execstate(void);
/* Arch-specific portion of memory_op hypercall. */
-long arch_memory_op(int op, void *arg);
-long subarch_memory_op(int op, void *arg);
+long arch_memory_op(int op, GUEST_HANDLE(void) arg);
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg);
#endif /* __ASM_X86_MM_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_64.h
--- a/xen/include/asm-x86/shadow_64.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/shadow_64.h Wed Mar 1 19:47:25 2006
@@ -223,6 +223,7 @@
int i;
pgentry_64_t *le_e;
pgentry_64_t *le_p = NULL;
+ pgentry_64_t *phys_vtable = NULL;
unsigned long mfn;
int index;
u32 level = flag & L_MASK;
@@ -251,25 +252,35 @@
{
root_level = PAE_PAGING_LEVELS;
index = table_offset_64(va, root_level);
- le_e = (pgentry_64_t *)map_domain_page(
+ phys_vtable = (pgentry_64_t *)map_domain_page(
pagetable_get_pfn(v->domain->arch.phys_table));
+ le_e = &phys_vtable[index];
}
/*
* If it's not external mode, then mfn should be machine physical.
*/
- for (i = root_level - level; i > 0; i--) {
- if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) ) {
+ for ( i = root_level - level; i > 0; i-- )
+ {
+ if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) )
+ {
if ( le_p )
unmap_domain_page(le_p);
+
+ if ( phys_vtable )
+ unmap_domain_page(phys_vtable);
+
return 0;
}
+
mfn = entry_get_pfn(*le_e);
if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) )
mfn = get_mfn_from_gpfn(mfn);
+
if ( le_p )
unmap_domain_page(le_p);
le_p = (pgentry_64_t *)map_domain_page(mfn);
+
if ( flag & SHADOW_ENTRY )
index = table_offset_64(va, (level + i - 1));
else
@@ -285,8 +296,10 @@
if ( le_p )
unmap_domain_page(le_p);
+ if ( phys_vtable )
+ unmap_domain_page(phys_vtable);
+
return 1;
-
}
static inline int __rw_entry(
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_public.h
--- a/xen/include/asm-x86/shadow_public.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/shadow_public.h Wed Mar 1 19:47:25 2006
@@ -21,8 +21,6 @@
#ifndef _XEN_SHADOW_PUBLIC_H
#define _XEN_SHADOW_PUBLIC_H
-
-extern int alloc_p2m_table(struct domain *d);
#if CONFIG_PAGING_LEVELS >= 3
#define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/x86_32/asm_defns.h
--- a/xen/include/asm-x86/x86_32/asm_defns.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-x86/x86_32/asm_defns.h Wed Mar 1 19:47:25 2006
@@ -48,9 +48,24 @@
#ifdef PERF_COUNTERS
#define PERFC_INCR(_name,_idx) \
- lock incl perfcounters+_name(,_idx,4)
+ lock incl perfcounters+_name(,_idx,4)
#else
#define PERFC_INCR(_name,_idx)
+#endif
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define FIXUP_RING0_GUEST_STACK \
+ testl $2,8(%esp); \
+ jnz 1f; /* rings 2 & 3 permitted */ \
+ testl $1,8(%esp); \
+ jz 2f; \
+ ud2; /* ring 1 should not be used */ \
+ 2:cmpl $(__HYPERVISOR_VIRT_START),%esp; \
+ jge 1f; \
+ call fixup_ring0_guest_stack; \
+ 1:
+#else
+#define FIXUP_RING0_GUEST_STACK
#endif
#define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v)
@@ -61,6 +76,7 @@
".globl " STR(x) "\n\t" \
STR(x) ":\n\t" \
"pushl $"#v"<<16\n\t" \
+ STR(FIXUP_RING0_GUEST_STACK) \
STR(SAVE_ALL(a)) \
"movl %esp,%eax\n\t" \
"pushl %eax\n\t" \
@@ -72,6 +88,7 @@
__asm__( \
"\n" __ALIGN_STR"\n" \
"common_interrupt:\n\t" \
+ STR(FIXUP_RING0_GUEST_STACK) \
STR(SAVE_ALL(a)) \
"movl %esp,%eax\n\t" \
"pushl %eax\n\t" \
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/memory.h
--- a/xen/include/public/memory.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/public/memory.h Wed Mar 1 19:47:25 2006
@@ -29,7 +29,7 @@
* OUT: GMFN bases of extents that were allocated
* (NB. This command also updates the mach_to_phys translation table)
*/
- unsigned long *extent_start;
+ GUEST_HANDLE(xen_ulong) extent_start;
/* Number of extents, and size/alignment of each (2^extent_order pages). */
unsigned long nr_extents;
@@ -50,6 +50,7 @@
domid_t domid;
} xen_memory_reservation_t;
+DEFINE_GUEST_HANDLE(xen_memory_reservation_t);
/*
* Returns the maximum machine frame number of mapped RAM in this system.
@@ -85,7 +86,7 @@
* any large discontiguities in the machine address space, 2MB gaps in
* the machphys table will be represented by an MFN base of zero.
*/
- unsigned long *extent_start;
+ GUEST_HANDLE(xen_ulong) extent_start;
/*
* Number of extents written to the above array. This will be smaller
@@ -93,6 +94,7 @@
*/
unsigned int nr_extents;
} xen_machphys_mfn_list_t;
+DEFINE_GUEST_HANDLE(xen_machphys_mfn_list_t);
/*
* Returns the base and size of the specified reserved 'RAM hole' in the
@@ -113,6 +115,7 @@
/* Base and size of the specified reserved area. */
unsigned long first_gpfn, nr_gpfns;
} xen_reserved_phys_area_t;
+DEFINE_GUEST_HANDLE(xen_reserved_phys_area_t);
/*
* Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
@@ -127,14 +130,15 @@
unsigned long nr_gpfns;
/* List of GPFNs to translate. */
- unsigned long *gpfn_list;
+ GUEST_HANDLE(xen_ulong) gpfn_list;
/*
* Output list to contain MFN translations. May be the same as the input
* list (in which case each input GPFN is overwritten with the output MFN).
*/
- unsigned long *mfn_list;
+ GUEST_HANDLE(xen_ulong) mfn_list;
} xen_translate_gpfn_list_t;
+DEFINE_GUEST_HANDLE(xen_translate_gpfn_list_t);
#endif /* __XEN_PUBLIC_MEMORY_H__ */
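
Because GUEST_HANDLE(type) degenerates to a plain 'type *' outside __XEN__,
guest callers of these memory operations keep assigning raw pointers; only
hypervisor-side code sees the opaque wrapper. A hedged guest-side sketch of
filling in a reservation (the hypercall wrapper declaration and include path
are assumptions; the field names follow the structure above):

#include <xen/interface/memory.h>  /* assumed include path for a Linux guest */

/* Assumed hypercall wrapper provided by the guest's hypercall layer. */
extern long HYPERVISOR_memory_op(unsigned int cmd, void *arg);

/* Ask the hypervisor for 'n' single-page extents; MFN bases are written
 * back through extent_start, which is a bare pointer on the guest side. */
static long grow_reservation(unsigned long *mfn_list, unsigned long n)
{
    xen_memory_reservation_t res = {
        .extent_start = mfn_list,   /* raw pointer: GUEST_HANDLE == type * */
        .nr_extents   = n,
        .extent_order = 0,          /* 2^0 = one page per extent */
        .domid        = DOMID_SELF, /* assumed constant from xen.h */
    };

    return HYPERVISOR_memory_op(XENMEM_increase_reservation, &res);
}
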
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/vcpu.h
--- a/xen/include/public/vcpu.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/public/vcpu.h Wed Mar 1 19:47:25 2006
@@ -51,6 +51,61 @@
/* Returns 1 if the given VCPU is up. */
#define VCPUOP_is_up 3
+/*
+ * Return information about the state and running time of a VCPU.
+ * @extra_arg == pointer to vcpu_runstate_info structure.
+ */
+#define VCPUOP_get_runstate_info 4
+typedef struct vcpu_runstate_info {
+ /* VCPU's current state (RUNSTATE_*). */
+ int state;
+ /* When was current state entered (system time, ns)? */
+ uint64_t state_entry_time;
+ /*
+ * Time spent in each RUNSTATE_* (ns). The sum of these times is
+ * guaranteed not to drift from system time.
+ */
+ uint64_t time[4];
+} vcpu_runstate_info_t;
+
+/* VCPU is currently running on a physical CPU. */
+#define RUNSTATE_running 0
+
+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
+#define RUNSTATE_runnable 1
+
+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
+#define RUNSTATE_blocked 2
+
+/*
+ * VCPU is not runnable, but it is not blocked.
+ * This is a 'catch all' state for things like hotplug and pauses by the
+ * system administrator (or for critical sections in the hypervisor).
+ * RUNSTATE_blocked dominates this state (it is the preferred state).
+ */
+#define RUNSTATE_offline 3
+
+/*
+ * Register a shared memory area from which the guest may obtain its own
+ * runstate information without needing to execute a hypercall.
+ * Notes:
+ * 1. The registered address may be virtual or physical, depending on the
+ * platform. The virtual address should be registered on x86 systems.
+ * 2. Only one shared area may be registered per VCPU. The shared area is
+ * updated by the hypervisor each time the VCPU is scheduled. Thus
+ * runstate.state will always be RUNSTATE_running and
+ * runstate.state_entry_time will indicate the system time at which the
+ * VCPU was last scheduled to run.
+ * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
+ */
+#define VCPUOP_register_runstate_memory_area 5
+typedef struct vcpu_register_runstate_memory_area {
+ union {
+ struct vcpu_runstate_info *v;
+ uint64_t p;
+ } addr;
+} vcpu_register_runstate_memory_area_t;
+
#endif /* __XEN_PUBLIC_VCPU_H__ */
/*
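
A guest reaches these operations through the vcpu_op hypercall. A sketch for
a paravirtual guest, assuming the conventional
HYPERVISOR_vcpu_op(cmd, vcpuid, extra_arg) wrapper (the wrapper and include
path are assumptions; the op numbers, structures and RUNSTATE_* constants
come from the header above):

#include <stdint.h>
#include <xen/interface/vcpu.h>   /* assumed include path for a Linux guest */

/* Assumed hypercall wrapper, as provided by a PV guest's hypercall layer. */
extern long HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_arg);

/* One-shot query: time VCPU 0 spent ready but not running ("steal time"). */
static uint64_t vcpu0_steal_ns(void)
{
    vcpu_runstate_info_t info;

    if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, 0, &info) != 0)
        return 0;
    return info.time[RUNSTATE_runnable] + info.time[RUNSTATE_offline];
}

/* Register a shared area so later reads need no hypercall at all. */
static vcpu_runstate_info_t vcpu0_runstate;

static void register_runstate_area(void)
{
    vcpu_register_runstate_memory_area_t area;

    area.addr.v = &vcpu0_runstate;   /* virtual address, per note 1 above */
    (void)HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, 0, &area);
}
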
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/version.h
--- a/xen/include/public/version.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/public/version.h Wed Mar 1 19:47:25 2006
@@ -48,36 +48,8 @@
uint32_t submap; /* OUT: 32-bit submap */
} xen_feature_info_t;
-/*
- * If set, the guest does not need to write-protect its pagetables, and can
- * update them via direct writes.
- */
-#define XENFEAT_writable_page_tables 0
-
-/*
- * If set, the guest does not need to write-protect its segment descriptor
- * tables, and can update them via direct writes.
- */
-#define XENFEAT_writable_descriptor_tables 1
-
-/*
- * If set, translation between the guest's 'pseudo-physical' address space
- * and the host's machine address space are handled by the hypervisor. In this
- * mode the guest does not need to perform phys-to/from-machine translations
- * when performing page table operations.
- */
-#define XENFEAT_auto_translated_physmap 2
-
-/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
-#define XENFEAT_supervisor_mode_kernel 3
-
-/*
- * If set, the guest does not need to allocate x86 PAE page directories
- * below 4GB. This flag is usually implied by auto_translated_physmap.
- */
-#define XENFEAT_pae_pgdir_above_4gb 4
-
-#define XENFEAT_NR_SUBMAPS 1
+/* Declares the features reported by XENVER_get_features. */
+#include "features.h"
#endif /* __XEN_PUBLIC_VERSION_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/xen.h
--- a/xen/include/public/xen.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/public/xen.h Wed Mar 1 19:47:25 2006
@@ -8,6 +8,22 @@
#ifndef __XEN_PUBLIC_XEN_H__
#define __XEN_PUBLIC_XEN_H__
+
+#ifdef __XEN__
+#define DEFINE_GUEST_HANDLE(type) struct __guest_handle_ ## type { type *p; }
+#define GUEST_HANDLE(type) struct __guest_handle_ ## type
+#else
+#define DEFINE_GUEST_HANDLE(type)
+#define GUEST_HANDLE(type) type *
+#endif
+
+#ifndef __ASSEMBLY__
+/* Guest handle for unsigned long pointer. The typedef gives the type a
+ * single-token name, which the handle-defining macros above require. */
+typedef unsigned long xen_ulong;
+DEFINE_GUEST_HANDLE(xen_ulong);
+/* Guest handle for arbitrary-type pointer (void *). */
+DEFINE_GUEST_HANDLE(void);
+#endif
#if defined(__i386__)
#include "arch-x86_32.h"
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/xen/sched-if.h Wed Mar 1 19:47:25 2006
@@ -8,9 +8,6 @@
#ifndef __XEN_SCHED_IF_H__
#define __XEN_SCHED_IF_H__
-#define BUCKETS 10
-/*300*/
-
struct schedule_data {
spinlock_t schedule_lock; /* spinlock protecting curr */
struct vcpu *curr; /* current task */
@@ -18,9 +15,6 @@
void *sched_priv;
struct timer s_timer; /* scheduling timer */
unsigned long tick; /* current periodic 'tick' */
-#ifdef BUCKETS
- u32 hist[BUCKETS]; /* for scheduler latency histogram */
-#endif
} __cacheline_aligned;
extern struct schedule_data schedule_data[];
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/xen/sched.h Wed Mar 1 19:47:25 2006
@@ -8,6 +8,7 @@
#include <xen/smp.h>
#include <public/xen.h>
#include <public/dom0_ops.h>
+#include <public/vcpu.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/grant_table.h>
@@ -63,14 +64,13 @@
struct vcpu *next_in_list;
- struct timer timer; /* one-shot timer for timeout values */
+ struct timer timer; /* one-shot timer for timeout values */
unsigned long sleep_tick; /* tick at which this vcpu started sleep */
- s_time_t lastschd; /* time this domain was last scheduled */
- s_time_t lastdeschd; /* time this domain was last descheduled */
- s_time_t cpu_time; /* total CPU time received till now */
- s_time_t wokenup; /* time domain got woken up */
void *sched_priv; /* scheduler-specific data */
+
+ struct vcpu_runstate_info runstate;
+ struct vcpu_runstate_info *runstate_guest; /* guest address */
unsigned long vcpu_flags;
@@ -303,31 +303,18 @@
void startup_cpu_idle_loop(void);
-unsigned long __hypercall_create_continuation(
- unsigned int op, unsigned int nr_args, ...);
-#define hypercall0_create_continuation(_op) \
- __hypercall_create_continuation((_op), 0)
-#define hypercall1_create_continuation(_op, _a1) \
- __hypercall_create_continuation((_op), 1, \
- (unsigned long)(_a1))
-#define hypercall2_create_continuation(_op, _a1, _a2) \
- __hypercall_create_continuation((_op), 2, \
- (unsigned long)(_a1), (unsigned long)(_a2))
-#define hypercall3_create_continuation(_op, _a1, _a2, _a3) \
- __hypercall_create_continuation((_op), 3, \
- (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3))
-#define hypercall4_create_continuation(_op, _a1, _a2, _a3, _a4) \
- __hypercall_create_continuation((_op), 4, \
- (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
- (unsigned long)(_a4))
-#define hypercall5_create_continuation(_op, _a1, _a2, _a3, _a4, _a5) \
- __hypercall_create_continuation((_op), 5, \
- (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
- (unsigned long)(_a4), (unsigned long)(_a5))
-#define hypercall6_create_continuation(_op, _a1, _a2, _a3, _a4, _a5, _a6) \
- __hypercall_create_continuation((_op), 6, \
- (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
- (unsigned long)(_a4), (unsigned long)(_a5), (unsigned long)(_a6))
+/*
+ * Creates a continuation to resume the current hypercall. The caller should
+ * return immediately, propagating the value returned from this invocation.
+ * The format string specifies the types and number of hypercall arguments.
+ * It contains one character per argument as follows:
+ * 'i' [unsigned] {char, int}
+ * 'l' [unsigned] long
+ * 'p' pointer (foo *)
+ * 'h' guest handle (GUEST_HANDLE(foo))
+ */
+unsigned long hypercall_create_continuation(
+ unsigned int op, const char *format, ...);
#define hypercall_preempt_check() (unlikely( \
softirq_pending(smp_processor_id()) | \
@@ -397,7 +384,6 @@
#define _DOMF_debugging 4
#define DOMF_debugging (1UL<<_DOMF_debugging)
-
static inline int vcpu_runnable(struct vcpu *v)
{
return ( (atomic_read(&v->pausecnt) == 0) &&
@@ -415,6 +401,8 @@
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
+
static inline void vcpu_unblock(struct vcpu *v)
{
if ( test_and_clear_bit(_VCPUF_blocked, &v->vcpu_flags) )
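
The console.c hunk earlier in this changeset shows the new interface in
action ("iip" for an int, an int, and a pointer). As a further hedged
sketch, a preemptible handler using the new varargs form might look like
this (__HYPERVISOR_example_op and the handler itself are hypothetical; the
format characters are those documented above):

/* Hypothetical preemptible handler; __HYPERVISOR_example_op is invented
 * for illustration. Real callers look like the console.c hunk above. */
static long do_example_op(int cmd, unsigned int count, void *buffer)
{
    while ( count != 0 )
    {
        /* ... process one unit of work, then consume it ... */
        count--;

        if ( hypercall_preempt_check() )
            /* 'i' = int-sized args (cmd, count), 'p' = pointer (buffer). */
            return hypercall_create_continuation(
                __HYPERVISOR_example_op, "iip", cmd, count, buffer);
    }
    return 0;
}
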
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/string.h
--- a/xen/include/xen/string.h Wed Mar 1 17:01:54 2006
+++ b/xen/include/xen/string.h Wed Mar 1 19:47:25 2006
@@ -24,6 +24,9 @@
#endif
#ifndef __HAVE_ARCH_STRNCPY
extern char * strncpy(char *,const char *, __kernel_size_t);
+#endif
+#ifndef __HAVE_ARCH_STRLCPY
+extern size_t strlcpy(char *,const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRCAT
extern char * strcat(char *, const char *);
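
strlcpy, newly declared above, always NUL-terminates and returns strlen(src),
so truncation is detected by comparing the return value against the buffer
size. A quick usage sketch (standard strlcpy semantics, assumed to match
Xen's implementation; build against a libc that provides strlcpy, or against
xen/string.h in-tree):

#include <stddef.h>
#include <string.h>  /* assumption: a libc (or xen/string.h) with strlcpy */

static int copy_name(char *dst, size_t dstsz, const char *src)
{
    /* A return value >= dstsz means the source did not fit. */
    if (strlcpy(dst, src, dstsz) >= dstsz)
        return -1;  /* truncated */
    return 0;
}
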
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/pgtable.c
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable.c Wed Mar 1 19:47:25 2006
@@ -0,0 +1,283 @@
+/*
+ * linux/arch/i386/mm/pgtable.c
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+void show_mem(void)
+{
+ int total = 0, reserved = 0;
+ int shared = 0, cached = 0;
+ int highmem = 0;
+ struct page *page;
+ pg_data_t *pgdat;
+ unsigned long i;
+ struct page_state ps;
+ unsigned long flags;
+
+ printk(KERN_INFO "Mem-info:\n");
+ show_free_areas();
+ printk(KERN_INFO "Free swap: %6ldkB\n",
nr_swap_pages<<(PAGE_SHIFT-10));
+ for_each_pgdat(pgdat) {
+ pgdat_resize_lock(pgdat, &flags);
+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+ page = pgdat_page_nr(pgdat, i);
+ total++;
+ if (PageHighMem(page))
+ highmem++;
+ if (PageReserved(page))
+ reserved++;
+ else if (PageSwapCache(page))
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+ }
+ pgdat_resize_unlock(pgdat, &flags);
+ }
+ printk(KERN_INFO "%d pages of RAM\n", total);
+ printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
+ printk(KERN_INFO "%d reserved pages\n", reserved);
+ printk(KERN_INFO "%d pages shared\n", shared);
+ printk(KERN_INFO "%d pages swap cached\n", cached);
+
+ get_page_state(&ps);
+ printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
+ printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
+ printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
+ printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
+ printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ /* <pfn,flags> stored as-is, to permit clearing entries */
+ set_pte(pte, pfn_pte(pfn, flags));
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+/*
+ * Associate a large virtual page frame with a given physical page frame
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned.
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
+ printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
+ return; /* BUG(); */
+ }
+ if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
+ printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
+ return; /* BUG(); */
+ }
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
+ return; /* BUG(); */
+ }
+ pud = pud_offset(pgd, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ set_pmd(pmd, pfn_pmd(pfn, flags));
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+static int nr_fixmaps = 0;
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+ unsigned long address = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+ nr_fixmaps++;
+}
+
+void set_fixaddr_top(unsigned long top)
+{
+ BUG_ON(nr_fixmaps > 0);
+ __FIXADDR_TOP = top - PAGE_SIZE;
+}
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+ return pte;
+}
+
+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
+{
+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * The locking scheme was chosen on the basis of manfred's
+ * recommendations and having no core impact whatsoever.
+ * -- wli
+ */
+DEFINE_SPINLOCK(pgd_lock);
+struct page *pgd_list;
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+ page->index = (unsigned long)pgd_list;
+ if (pgd_list)
+ set_page_private(pgd_list, (unsigned long)&page->index);
+ pgd_list = page;
+ set_page_private(page, (unsigned long)&pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *next, **pprev, *page = virt_to_page(pgd);
+ next = (struct page *)page->index;
+ pprev = (struct page **)page_private(page);
+ *pprev = next;
+ if (next)
+ set_page_private(next, (unsigned long)pprev);
+}
+
+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+ unsigned long flags;
+
+ if (PTRS_PER_PMD == 1) {
+ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+ spin_lock_irqsave(&pgd_lock, flags);
+ }
+
+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+ swapper_pg_dir + USER_PTRS_PER_PGD,
+ KERNEL_PGD_PTRS);
+ if (PTRS_PER_PMD > 1)
+ return;
+
+ pgd_list_add(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+/* never called when PTRS_PER_PMD > 1 */
+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+ unsigned long flags; /* can be called from interrupt context */
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ int i;
+ pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+
+ if (PTRS_PER_PMD == 1 || !pgd)
+ return pgd;
+
+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+ if (!pmd)
+ goto out_oom;
+ set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+ }
+ return pgd;
+
+out_oom:
+ for (i--; i >= 0; i--)
+ kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+ kmem_cache_free(pgd_cache, pgd);
+ return NULL;
+}
+
+void pgd_free(pgd_t *pgd)
+{
+ int i;
+
+ /* in the PAE case user pgd entries are overwritten before usage */
+ if (PTRS_PER_PMD > 1)
+ for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+ kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+ /* in the non-PAE case, free_pgtables() clears user pgd entries */
+ kmem_cache_free(pgd_cache, pgd);
+}
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/fixmap.h
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/fixmap.h Wed Mar 1 19:47:25 2006
@@ -0,0 +1,151 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#ifndef _ASM_FIXMAP_H
+#define _ASM_FIXMAP_H
+
+#include <linux/config.h>
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#endif
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process. We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * these 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages. (or larger if used with an increment
+ * higher than 1) use fixmap_set(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+ FIX_HOLE,
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+ FIX_F00F_IDT, /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif
+#ifdef CONFIG_HIGHMEM
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#endif
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+ __end_of_permanent_fixed_addresses,
+ /* temporary boot-time mappings, used before ioremap() is functional */
+#define NR_FIX_BTMAPS 16
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
+ FIX_WP_TEST,
+ __end_of_fixed_addresses
+};
+
+extern void __set_fixmap (enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags);
+
+extern void set_fixaddr_top(unsigned long top);
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define clear_fixmap(idx) \
+ __set_fixmap(idx, 0, __pgprot(0))
+
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
+#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
+
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * this branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way. (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message..
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/page.h
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/page.h Wed Mar 1 19:47:25 2006
@@ -0,0 +1,148 @@
+#ifndef _I386_PAGE_H
+#define _I386_PAGE_H
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (1UL << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
+#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
+#include <linux/config.h>
+
+#ifdef CONFIG_X86_USE_3DNOW
+
+#include <asm/mmx.h>
+
+#define clear_page(page) mmx_clear_page((void *)(page))
+#define copy_page(to,from) mmx_copy_page(to,from)
+
+#else
+
+/*
+ * On older X86 processors it's not a win to use MMX here it seems.
+ * Maybe the K6-III ?
+ */
+
+#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
+
+#endif
+
+#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
+
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+/*
+ * These are used to make use of C type-checking..
+ */
+extern int nx_enabled;
+#ifdef CONFIG_X86_PAE
+extern unsigned long long __supported_pte_mask;
+typedef struct { unsigned long pte_low, pte_high; } pte_t;
+typedef struct { unsigned long long pmd; } pmd_t;
+typedef struct { unsigned long long pgd; } pgd_t;
+typedef struct { unsigned long long pgprot; } pgprot_t;
+#define pmd_val(x) ((x).pmd)
+#define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32))
+#define __pmd(x) ((pmd_t) { (x) } )
+#define HPAGE_SHIFT 21
+#else
+typedef struct { unsigned long pte_low; } pte_t;
+typedef struct { unsigned long pgd; } pgd_t;
+typedef struct { unsigned long pgprot; } pgprot_t;
+#define boot_pte_t pte_t /* or would you rather have a typedef */
+#define pte_val(x) ((x).pte_low)
+#define HPAGE_SHIFT 22
+#endif
+#define PTE_MASK PAGE_MASK
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#endif
+
+#define pgd_val(x) ((x).pgd)
+#define pgprot_val(x) ((x).pgprot)
+
+#define __pte(x) ((pte_t) { (x) } )
+#define __pgd(x) ((pgd_t) { (x) } )
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+#endif /* !__ASSEMBLY__ */
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+
+/*
+ * This handles the memory map.. We could make this a config
+ * option, but too many people screw it up, and too few need
+ * it.
+ *
+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
+ * a virtual address space of one gigabyte, which limits the
+ * amount of physical memory you can use to about 950MB.
+ *
+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ */
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This much address space is reserved for vmalloc() and iomap()
+ * as well as fixmap mappings.
+ */
+extern unsigned int __VMALLOC_RESERVE;
+
+extern int sysctl_legacy_va_layout;
+
+extern int page_is_ram(unsigned long pagenr);
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __ASSEMBLY__
+#define __PAGE_OFFSET CONFIG_PAGE_OFFSET
+#define __PHYSICAL_START CONFIG_PHYSICAL_START
+#else
+#define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET)
+#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
+#endif
+#define __KERNEL_START (__PAGE_OFFSET + __PHYSICAL_START)
+
+
+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
+#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE)
+#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
+#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
+#ifdef CONFIG_FLATMEM
+#define pfn_to_page(pfn) (mem_map + (pfn))
+#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif /* CONFIG_FLATMEM */
+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+
+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+
+#define VM_DATA_DEFAULT_FLAGS \
+ (VM_READ | VM_WRITE | \
+ ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#define __HAVE_ARCH_GATE_AREA 1
+
+#endif /* __KERNEL__ */
+
+#include <asm-generic/page.h>
+
+#endif /* _I386_PAGE_H */
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch Wed Mar 1 19:47:25 2006
@@ -0,0 +1,45 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c 2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/traps.c 2006-02-27 15:55:23.000000000 +0000
+@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
+
+ static void io_check_error(unsigned char reason, struct pt_regs * regs)
+ {
+- unsigned long i;
+-
+ printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+- reason = (reason & 0xf) | 8;
+- outb(reason, 0x61);
+- i = 2000;
+- while (--i) udelay(1000);
+- reason &= ~8;
+- outb(reason, 0x61);
++ clear_io_check_error(reason);
+ }
+
+ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h 2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/mach-default/mach_traps.h 2006-02-27 15:55:23.000000000 +0000
+@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
+ outb(reason, 0x61);
+ }
+
++static inline void clear_io_check_error(unsigned char reason)
++{
++ unsigned long i;
++
++ reason = (reason & 0xf) | 8;
++ outb(reason, 0x61);
++ i = 2000;
++ while (--i) udelay(1000);
++ reason &= ~8;
++ outb(reason, 0x61);
++}
++
+ static inline unsigned char get_nmi_reason(void)
+ {
+ return inb(0x61);
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/net-csum.patch
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/net-csum.patch Wed Mar 1 19:47:25 2006
@@ -0,0 +1,41 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
+--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-27 15:47:38.000000000 +0000
++++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-27 15:55:25.000000000 +0000
+@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
+ if (hdrsize < sizeof(*hdr))
+ return 1;
+
+- hdr->check = ip_nat_cheat_check(~oldip, newip,
++ if ((*pskb)->proto_csum_blank) {
++ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
++ } else {
++ hdr->check = ip_nat_cheat_check(~oldip, newip,
+ ip_nat_cheat_check(oldport ^ 0xFFFF,
+ newport,
+ hdr->check));
++ }
+ return 1;
+ }
+
+diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
+--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-27 15:47:38.000000000 +0000
++++ ./net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-27 15:55:25.000000000 +0000
+@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+- if (hdr->check) /* 0 is a special case meaning no checksum */
+- hdr->check = ip_nat_cheat_check(~oldip, newip,
++ if (hdr->check) { /* 0 is a special case meaning no checksum */
++ if ((*pskb)->proto_csum_blank) {
++ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
++ } else {
++ hdr->check = ip_nat_cheat_check(~oldip, newip,
+ ip_nat_cheat_check(*portptr ^ 0xFFFF,
+ newport,
+ hdr->check));
++ }
++ }
+ *portptr = newport;
+ return 1;
+ }
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/pmd-shared.patch
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/pmd-shared.patch Wed Mar 1 19:47:25 2006
@@ -0,0 +1,111 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c 2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/mm/pageattr.c 2006-02-27 15:55:31.000000000 +0000
+@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
+ unsigned long flags;
+
+ set_pte_atomic(kpte, pte); /* change init_mm */
+- if (PTRS_PER_PMD > 1)
++ if (HAVE_SHARED_KERNEL_PMD)
+ return;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c 2006-01-03 03:21:10.000000000 +0000
++++ ./arch/i386/mm/pgtable.c 2006-02-27 15:55:31.000000000 +0000
+@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
+ spin_lock_irqsave(&pgd_lock, flags);
+ }
+
+- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+- swapper_pg_dir + USER_PTRS_PER_PGD,
+- KERNEL_PGD_PTRS);
++ if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
++ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
++ swapper_pg_dir + USER_PTRS_PER_PGD,
++ KERNEL_PGD_PTRS);
+ if (PTRS_PER_PMD > 1)
+ return;
+
+@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+ goto out_oom;
+ set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+ }
++
++ if (!HAVE_SHARED_KERNEL_PMD) {
++ unsigned long flags;
++
++ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
++ if (!pmd)
++ goto out_oom;
++ set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
++ }
++
++ spin_lock_irqsave(&pgd_lock, flags);
++ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++ unsigned long v = (unsigned long)i << PGDIR_SHIFT;
++ pgd_t *kpgd = pgd_offset_k(v);
++ pud_t *kpud = pud_offset(kpgd, v);
++ pmd_t *kpmd = pmd_offset(kpud, v);
++ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++ memcpy(pmd, kpmd, PAGE_SIZE);
++ }
++ pgd_list_add(pgd);
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ }
++
+ return pgd;
+
+ out_oom:
+@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
+ int i;
+
+ /* in the PAE case user pgd entries are overwritten before usage */
+- if (PTRS_PER_PMD > 1)
+- for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
++ if (PTRS_PER_PMD > 1) {
++ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
++ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++ kmem_cache_free(pmd_cache, pmd);
++ }
++ if (!HAVE_SHARED_KERNEL_PMD) {
++ unsigned long flags;
++ spin_lock_irqsave(&pgd_lock, flags);
++ pgd_list_del(pgd);
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
++ kmem_cache_free(pmd_cache, pmd);
++ }
++ }
++ }
+ /* in the non-PAE case, free_pgtables() clears user pgd entries */
+ kmem_cache_free(pgd_cache, pgd);
+ }
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h 2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/pgtable-2level-defs.h 2006-02-27 15:55:31.000000000 +0000
+@@ -1,6 +1,8 @@
+ #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
+ #define _I386_PGTABLE_2LEVEL_DEFS_H
+
++#define HAVE_SHARED_KERNEL_PMD 0
++
+ /*
+ * traditional i386 two-level paging structure:
+ */
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h 2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/pgtable-3level-defs.h 2006-02-27 15:55:31.000000000 +0000
+@@ -1,6 +1,8 @@
+ #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
+ #define _I386_PGTABLE_3LEVEL_DEFS_H
+
++#define HAVE_SHARED_KERNEL_PMD 1
++
+ /*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/smp-alts.patch
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/smp-alts.patch Wed Mar 1 19:47:25 2006
@@ -0,0 +1,591 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig ./arch/i386/Kconfig
+--- ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig 2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/Kconfig 2006-02-27 15:55:34.000000000 +0000
+@@ -202,6 +202,19 @@ config SMP
+
+ If you don't know what to do here, say N.
+
++config SMP_ALTERNATIVES
++ bool "SMP alternatives support (EXPERIMENTAL)"
++ depends on SMP && EXPERIMENTAL
++ help
++ Try to reduce the overhead of running an SMP kernel on a uniprocessor
++ host slightly by replacing certain key instruction sequences
++ according to whether we currently have more than one CPU available.
++ This should provide a noticeable boost to performance when
++ running SMP kernels on UP machines, and have negligible impact
++ when running on a true SMP host.
++
++ If unsure, say N.
++
+ config NR_CPUS
+ int "Maximum number of CPUs (2-255)"
+ range 2 255
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile 2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/Makefile 2006-02-27 15:55:34.000000000 +0000
+@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o
+ obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
+ obj-$(CONFIG_VM86) += vm86.o
+ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
++obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o
+
+ EXTRA_AFLAGS := -traditional
+
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c 1970-01-01 01:00:00.000000000 +0100
++++ ./arch/i386/kernel/smpalts.c 2006-02-27 15:55:34.000000000 +0000
+@@ -0,0 +1,85 @@
++#include <linux/kernel.h>
++#include <asm/system.h>
++#include <asm/smp_alt.h>
++#include <asm/processor.h>
++#include <asm/string.h>
++
++struct smp_replacement_record {
++ unsigned char targ_size;
++ unsigned char smp1_size;
++ unsigned char smp2_size;
++ unsigned char up_size;
++ unsigned char feature;
++ unsigned char data[0];
++};
++
++struct smp_alternative_record {
++ void *targ_start;
++ struct smp_replacement_record *repl;
++};
++
++extern struct smp_alternative_record __start_smp_alternatives_table,
++ __stop_smp_alternatives_table;
++extern unsigned long __init_begin, __init_end;
++
++void prepare_for_smp(void)
++{
++ struct smp_alternative_record *r;
++ printk(KERN_INFO "Enabling SMP...\n");
++ for (r = &__start_smp_alternatives_table;
++ r != &__stop_smp_alternatives_table;
++ r++) {
++ BUG_ON(r->repl->targ_size < r->repl->smp1_size);
++ BUG_ON(r->repl->targ_size < r->repl->smp2_size);
++ BUG_ON(r->repl->targ_size < r->repl->up_size);
++ if (system_state == SYSTEM_RUNNING &&
++ r->targ_start >= (void *)&__init_begin &&
++ r->targ_start < (void *)&__init_end)
++ continue;
++ if (r->repl->feature != (unsigned char)-1 &&
++ boot_cpu_has(r->repl->feature)) {
++ memcpy(r->targ_start,
++ r->repl->data + r->repl->smp1_size,
++ r->repl->smp2_size);
++ memset(r->targ_start + r->repl->smp2_size,
++ 0x90,
++ r->repl->targ_size - r->repl->smp2_size);
++ } else {
++ memcpy(r->targ_start,
++ r->repl->data,
++ r->repl->smp1_size);
++ memset(r->targ_start + r->repl->smp1_size,
++ 0x90,
++ r->repl->targ_size - r->repl->smp1_size);
++ }
++ }
++ /* Paranoia */
++ asm volatile ("jmp 1f\n1:");
++ mb();
++}
++
++void unprepare_for_smp(void)
++{
++ struct smp_alternative_record *r;
++ printk(KERN_INFO "Disabling SMP...\n");
++ for (r = &__start_smp_alternatives_table;
++ r != &__stop_smp_alternatives_table;
++ r++) {
++ BUG_ON(r->repl->targ_size < r->repl->smp1_size);
++ BUG_ON(r->repl->targ_size < r->repl->smp2_size);
++ BUG_ON(r->repl->targ_size < r->repl->up_size);
++ if (system_state == SYSTEM_RUNNING &&
++ r->targ_start >= (void *)&__init_begin &&
++ r->targ_start < (void *)&__init_end)
++ continue;
++ memcpy(r->targ_start,
++ r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
++ r->repl->up_size);
++ memset(r->targ_start + r->repl->up_size,
++ 0x90,
++ r->repl->targ_size - r->repl->up_size);
++ }
++ /* Paranoia */
++ asm volatile ("jmp 1f\n1:");
++ mb();
++}
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c 2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/smpboot.c 2006-02-27 15:55:34.000000000 +0000
+@@ -1208,6 +1208,11 @@ static void __init smp_boot_cpus(unsigne
+ if (max_cpus <= cpucount+1)
+ continue;
+
++#ifdef CONFIG_SMP_ALTERNATIVES
++ if (kicked == 1)
++ prepare_for_smp();
++#endif
++
+ if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
+ printk("CPU #%d not responding - cannot use it.\n",
+ apicid);
+@@ -1386,6 +1391,11 @@ int __devinit __cpu_up(unsigned int cpu)
+ return -EIO;
+ }
+
++#ifdef CONFIG_SMP_ALTERNATIVES
++ if (num_online_cpus() == 1)
++ prepare_for_smp();
++#endif
++
+ local_irq_enable();
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /* Unleash the CPU! */
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S 2006-01-03 03:21:10.000000000 +0000
++++ ./arch/i386/kernel/vmlinux.lds.S 2006-02-27 15:55:34.000000000 +0000
+@@ -34,6 +34,13 @@ SECTIONS
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
+ __stop___ex_table = .;
+
++ . = ALIGN(16);
++ __start_smp_alternatives_table = .;
++ __smp_alternatives : { *(__smp_alternatives) }
++ __stop_smp_alternatives_table = .;
++
++ __smp_replacements : { *(__smp_replacements) }
++
+ RODATA
+
+ /* writeable */
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h 2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/atomic.h 2006-02-27 15:55:34.000000000 +0000
+@@ -4,18 +4,13 @@
+ #include <linux/config.h>
+ #include <linux/compiler.h>
+ #include <asm/processor.h>
++#include <asm/smp_alt.h>
+
+ /*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc..
+ */
+
+-#ifdef CONFIG_SMP
+-#define LOCK "lock ; "
+-#else
+-#define LOCK ""
+-#endif
+-
+ /*
+ * Make sure gcc doesn't try to be clever and move things around
+ * on us. We need to use _exactly_ the address the user gave us,
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h 2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/bitops.h 2006-02-27 15:55:34.000000000 +0000
+@@ -7,6 +7,7 @@
+
+ #include <linux/config.h>
+ #include <linux/compiler.h>
++#include <asm/smp_alt.h>
+
+ /*
+ * These have to be done with inline assembly: that way the bit-setting
+@@ -16,12 +17,6 @@
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+-#ifdef CONFIG_SMP
+-#define LOCK_PREFIX "lock ; "
+-#else
+-#define LOCK_PREFIX ""
+-#endif
+-
+ #define ADDR (*(volatile long *) addr)
+
+ /**
+@@ -41,7 +36,7 @@
+ */
+ static inline void set_bit(int nr, volatile unsigned long * addr)
+ {
+- __asm__ __volatile__( LOCK_PREFIX
++ __asm__ __volatile__( LOCK
+ "btsl %1,%0"
+ :"+m" (ADDR)
+ :"Ir" (nr));
+@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
+ */
+ static inline void clear_bit(int nr, volatile unsigned long * addr)
+ {
+- __asm__ __volatile__( LOCK_PREFIX
++ __asm__ __volatile__( LOCK
+ "btrl %1,%0"
+ :"+m" (ADDR)
+ :"Ir" (nr));
+@@ -121,7 +116,7 @@ static inline void __change_bit(int nr,
+ */
+ static inline void change_bit(int nr, volatile unsigned long * addr)
+ {
+- __asm__ __volatile__( LOCK_PREFIX
++ __asm__ __volatile__( LOCK
+ "btcl %1,%0"
+ :"+m" (ADDR)
+ :"Ir" (nr));
+@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
+ {
+ int oldbit;
+
+- __asm__ __volatile__( LOCK_PREFIX
++ __asm__ __volatile__( LOCK
+ "btsl %2,%1\n\tsbbl %0,%0"
+ :"=r" (oldbit),"+m" (ADDR)
+ :"Ir" (nr) : "memory");
+@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
+ {
+ int oldbit;
+
+- __asm__ __volatile__( LOCK_PREFIX
++ __asm__ __volatile__( LOCK
+ "btrl %2,%1\n\tsbbl %0,%0"
+ :"=r" (oldbit),"+m" (ADDR)
+ :"Ir" (nr) : "memory");
+@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
+ {
+ int oldbit;
+
+- __asm__ __volatile__( LOCK_PREFIX
++ __asm__ __volatile__( LOCK
+ "btcl %2,%1\n\tsbbl %0,%0"
+ :"=r" (oldbit),"+m" (ADDR)
+ :"Ir" (nr) : "memory");
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h ./include/asm-i386/futex.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h 2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/futex.h 2006-02-27 15:55:34.000000000 +0000
+@@ -28,7 +28,7 @@
+ "1: movl %2, %0\n\
+ movl %0, %3\n" \
+ insn "\n" \
+-"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\
++"2: " LOCK "cmpxchgl %3, %2\n\
+ jnz 1b\n\
+ 3: .section .fixup,\"ax\"\n\
+ 4: mov %5, %1\n\
+@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op,
+ #endif
+ switch (op) {
+ case FUTEX_OP_ADD:
+- __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
++ __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
+ oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_OR:
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h 2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/rwsem.h 2006-02-27 15:55:34.000000000 +0000
+@@ -40,6 +40,7 @@
+
+ #include <linux/list.h>
+ #include <linux/spinlock.h>
++#include <asm/smp_alt.h>
+
+ struct rwsem_waiter;
+
+@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
+ {
+ __asm__ __volatile__(
+ "# beginning down_read\n\t"
+-LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old
value */
++LOCK " incl (%%eax)\n\t" /* adds 0x00000001, returns the old
value */
+ " js 2f\n\t" /* jump if we weren't granted the lock */
+ "1:\n\t"
+ LOCK_SECTION_START("")
+@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
+ " movl %1,%2\n\t"
+ " addl %3,%2\n\t"
+ " jle 2f\n\t"
+-LOCK_PREFIX " cmpxchgl %2,%0\n\t"
++LOCK " cmpxchgl %2,%0\n\t"
+ " jnz 1b\n\t"
+ "2:\n\t"
+ "# ending __down_read_trylock\n\t"
+@@ -150,7 +151,7 @@ static inline void __down_write(struct r
+ tmp = RWSEM_ACTIVE_WRITE_BIAS;
+ __asm__ __volatile__(
+ "# beginning down_write\n\t"
+-LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns
the old value */
++LOCK " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns
the old value */
+ " testl %%edx,%%edx\n\t" /* was the count 0 before? */
+ " jnz 2f\n\t" /* jump if we weren't granted the lock */
+ "1:\n\t"
+@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
+ __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
+ __asm__ __volatile__(
+ "# beginning __up_read\n\t"
+-LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old
value */
++LOCK " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old
value */
+ " js 2f\n\t" /* jump if the lock is being waited upon */
+ "1:\n\t"
+ LOCK_SECTION_START("")
+@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
+ __asm__ __volatile__(
+ "# beginning __up_write\n\t"
+ " movl %2,%%edx\n\t"
+-LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" /* tries to transition
0xffff0001 -> 0x00000000 */
++LOCK " xaddl %%edx,(%%eax)\n\t" /* tries to transition
0xffff0001 -> 0x00000000 */
+ " jnz 2f\n\t" /* jump if the lock is being waited upon */
+ "1:\n\t"
+ LOCK_SECTION_START("")
+@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
+ {
+ __asm__ __volatile__(
+ "# beginning __downgrade_write\n\t"
+-LOCK_PREFIX " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 ->
0xYYYY0001 */
++LOCK " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 ->
0xYYYY0001 */
+ " js 2f\n\t" /* jump if the lock is being waited upon */
+ "1:\n\t"
+ LOCK_SECTION_START("")
+@@ -263,7 +264,7 @@ LOCK_PREFIX " addl %2,(%%eax)\n\t"
+ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+ {
+ __asm__ __volatile__(
+-LOCK_PREFIX "addl %1,%0"
++LOCK "addl %1,%0"
+ : "=m"(sem->count)
+ : "ir"(delta), "m"(sem->count));
+ }
+@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
+ int tmp = delta;
+
+ __asm__ __volatile__(
+-LOCK_PREFIX "xadd %0,(%2)"
++LOCK "xadd %0,(%2)"
+ : "+r"(tmp), "=m"(sem->count)
+ : "r"(sem), "m"(sem->count)
+ : "memory");
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h 1970-01-01 01:00:00.000000000 +0100
++++ ./include/asm-i386/smp_alt.h 2006-02-27 15:55:34.000000000 +0000
+@@ -0,0 +1,32 @@
++#ifndef __ASM_SMP_ALT_H__
++#define __ASM_SMP_ALT_H__
++
++#include <linux/config.h>
++
++#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
++#define LOCK \
++ "6677: nop\n" \
++ ".section __smp_alternatives,\"a\"\n" \
++ ".long 6677b\n" \
++ ".long 6678f\n" \
++ ".previous\n" \
++ ".section __smp_replacements,\"a\"\n" \
++ "6678: .byte 1\n" \
++ ".byte 1\n" \
++ ".byte 0\n" \
++ ".byte 1\n" \
++ ".byte -1\n" \
++ "lock\n" \
++ "nop\n" \
++ ".previous\n"
++void prepare_for_smp(void);
++void unprepare_for_smp(void);
++#else
++#define LOCK "lock ; "
++#endif
++#else
++#define LOCK ""
++#endif
++
++#endif /* __ASM_SMP_ALT_H__ */
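Each LOCK site therefore costs one byte (the nop) in the patched text, plus a pointer pair in __smp_alternatives and a record in __smp_replacements; the five .byte directives above line up with the fields of struct smp_replacement_record in smpalts.c. A hypothetical C rendering of the record emitted for one LOCK site (illustrative only; the assembler emits this data, not C):

    static const unsigned char lock_site_record[] = {
        1,    /* targ_size: one nop byte at the call site  */
        1,    /* smp1_size: the "lock" prefix, 0xf0        */
        0,    /* smp2_size: no feature-gated variant       */
        1,    /* up_size:   a plain nop, 0x90              */
        0xff, /* feature:   (unsigned char)-1, i.e. none   */
        0xf0, /* smp1 payload: lock                        */
        0x90, /* up payload:   nop                         */
    };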
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h 2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/spinlock.h 2006-02-27 15:55:34.000000000 +0000
+@@ -6,6 +6,7 @@
+ #include <asm/page.h>
+ #include <linux/config.h>
+ #include <linux/compiler.h>
++#include <asm/smp_alt.h>
+
+ /*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+@@ -23,7 +24,8 @@
+
+ #define __raw_spin_lock_string \
+ "\n1:\t" \
+- "lock ; decb %0\n\t" \
++ LOCK \
++ "decb %0\n\t" \
+ "jns 3f\n" \
+ "2:\t" \
+ "rep;nop\n\t" \
+@@ -34,7 +36,8 @@
+
+ #define __raw_spin_lock_string_flags \
+ "\n1:\t" \
+- "lock ; decb %0\n\t" \
++ LOCK \
++ "decb %0\n\t" \
+ "jns 4f\n\t" \
+ "2:\t" \
+ "testl $0x200, %1\n\t" \
+@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
+ static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+ {
+ char oldval;
++#ifdef CONFIG_SMP_ALTERNATIVES
+ __asm__ __volatile__(
+- "xchgb %b0,%1"
++ "1:movb %1,%b0\n"
++ "movb $0,%1\n"
++ "2:"
++ ".section __smp_alternatives,\"a\"\n"
++ ".long 1b\n"
++ ".long 3f\n"
++ ".previous\n"
++ ".section __smp_replacements,\"a\"\n"
++ "3: .byte 2b - 1b\n"
++ ".byte 5f-4f\n"
++ ".byte 0\n"
++ ".byte 6f-5f\n"
++ ".byte -1\n"
++ "4: xchgb %b0,%1\n"
++ "5: movb %1,%b0\n"
++ "movb $0,%1\n"
++ "6:\n"
++ ".previous\n"
+ :"=q" (oldval), "=m" (lock->slock)
+ :"0" (0) : "memory");
++#else
++ __asm__ __volatile__(
++ "xchgb %b0,%1\n"
++ :"=q" (oldval), "=m" (lock->slock)
++ :"0" (0) : "memory");
++#endif
+ return oldval > 0;
+ }
+
+@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
+
+ static inline void __raw_read_unlock(raw_rwlock_t *rw)
+ {
+- asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
++ asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
+ }
+
+ static inline void __raw_write_unlock(raw_rwlock_t *rw)
+ {
+- asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
++ asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
+ : "=m" (rw->lock) : : "memory");
+ }
+
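The patched __raw_spin_trylock above is the one site where the UP replacement is more than a deleted prefix: on a single CPU the atomic xchgb can become a plain load of the lock byte followed by a store of zero, since no other processor can race in between. A hedged C sketch of the two bodies (helper names are hypothetical):

    /* SMP variant: atomic exchange, as in the 4:/5: replacement above. */
    static inline int trylock_smp(volatile char *slock)
    {
        char old = 0;
        __asm__ __volatile__("xchgb %b0,%1"
                             : "+q" (old), "+m" (*slock) : : "memory");
        return old > 0;
    }

    /* UP variant: equivalent because no other CPU can observe the window. */
    static inline int trylock_up(volatile char *slock)
    {
        char old = *slock;  /* movb %1,%b0 */
        *slock = 0;         /* movb $0,%1  */
        return old > 0;
    }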
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h ./include/asm-i386/system.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h 2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/system.h 2006-02-27 15:55:34.000000000 +0000
+@@ -5,7 +5,7 @@
+ #include <linux/kernel.h>
+ #include <asm/segment.h>
+ #include <asm/cpufeature.h>
+-#include <linux/bitops.h> /* for LOCK_PREFIX */
++#include <asm/smp_alt.h>
+
+ #ifdef __KERNEL__
+
+@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
+ unsigned long prev;
+ switch (size) {
+ case 1:
+- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
++ __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
++ __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
++ __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
+ unsigned long long new)
+ {
+ unsigned long long prev;
+- __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
++ __asm__ __volatile__(LOCK "cmpxchg8b %3"
+ : "=A"(prev)
+ : "b"((unsigned long)new),
+ "c"((unsigned long)(new >> 32)),
+@@ -503,11 +503,55 @@ struct alt_instr {
+ #endif
+
+ #ifdef CONFIG_SMP
++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
++#define smp_alt_mb(instr) \
++__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
++ ".section __smp_alternatives,\"a\"\n" \
++ ".long 6667b\n" \
++ ".long 6673f\n" \
++ ".previous\n" \
++ ".section __smp_replacements,\"a\"\n" \
++ "6673:.byte 6668b-6667b\n" \
++ ".byte 6670f-6669f\n" \
++ ".byte 6671f-6670f\n" \
++ ".byte 0\n" \
++ ".byte %c0\n" \
++ "6669:lock;addl $0,0(%%esp)\n" \
++ "6670:" instr "\n" \
++ "6671:\n" \
++ ".previous\n" \
++ : \
++ : "i" (X86_FEATURE_XMM2) \
++ : "memory")
++#define smp_rmb() smp_alt_mb("lfence")
++#define smp_mb() smp_alt_mb("mfence")
++#define set_mb(var, value) do { \
++unsigned long __set_mb_temp; \
++__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \
++ ".section __smp_alternatives,\"a\"\n" \
++ ".long 6667b\n" \
++ ".long 6673f\n" \
++ ".previous\n" \
++ ".section __smp_replacements,\"a\"\n" \
++ "6673: .byte 6668b-6667b\n" \
++ ".byte 6670f-6669f\n" \
++ ".byte 0\n" \
++ ".byte 6671f-6670f\n" \
++ ".byte -1\n" \
++ "6669: xchg %1, %0\n" \
++ "6670:movl %1, %0\n" \
++ "6671:\n" \
++ ".previous\n" \
++ : "=m" (var), "=r" (__set_mb_temp) \
++ : "1" (value) \
++ : "memory"); } while (0)
++#else
+ #define smp_mb() mb()
+ #define smp_rmb() rmb()
++#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
++#endif
+ #define smp_wmb() wmb()
+ #define smp_read_barrier_depends() read_barrier_depends()
+-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+ #else
+ #define smp_mb() barrier()
+ #define smp_rmb() barrier()
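With SMP alternatives enabled, each smp_mb()/smp_rmb() site starts life as six nops and is patched to either a locked add on the stack (the pre-SSE2 fallback) or an mfence/lfence when X86_FEATURE_XMM2 is present; on UP the nops stay. A sketch of the three behaviours such a site can take (illustrative only; the real selection happens by code patching, not a branch):

    static inline void mb_up(void) { /* six nops: nothing to order on UP */ }

    static inline void mb_smp_fallback(void)
    {
        /* a locked RMW on the stack acts as a full fence on pre-SSE2 CPUs */
        __asm__ __volatile__("lock; addl $0,0(%%esp)" ::: "memory");
    }

    static inline void mb_smp_sse2(void)
    {
        __asm__ __volatile__("mfence" ::: "memory");
    }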
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/locking.sh
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/locking.sh Wed Mar 1 19:47:25 2006
@@ -0,0 +1,98 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+#
+# Serialisation
+#
+
+LOCK_SLEEPTIME=1
+LOCK_SPINNING_RETRIES=5
+LOCK_RETRIES=10
+LOCK_BASEDIR=/var/run/xen-hotplug
+
+
+claim_lock()
+{
+ local lockdir="$LOCK_BASEDIR/$1"
+ mkdir -p "$LOCK_BASEDIR"
+ _claim_lock "$lockdir"
+}
+
+
+release_lock()
+{
+ _release_lock "$LOCK_BASEDIR/$1"
+}
+
+
+_claim_lock()
+{
+ local lockdir="$1"
+ local owner=$(_lock_owner "$lockdir")
+ local retries=0
+
+ while [ $retries -lt $LOCK_RETRIES ]
+ do
+ mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
+ _update_lock_info "$lockdir" && return
+
+ local new_owner=$(_lock_owner "$lockdir")
+ if [ "$new_owner" != "$owner" ]
+ then
+ owner="$new_owner"
+ retries=0
+ fi
+
+ if [ $retries -gt $LOCK_SPINNING_RETRIES ]
+ then
+ sleep $LOCK_SLEEPTIME
+ else
+ sleep 0
+ fi
+ retries=$(($retries + 1))
+ done
+ _steal_lock "$lockdir"
+}
+
+
+_release_lock()
+{
+ trap sigerr ERR
+ rm -rf "$1" 2>/dev/null || true
+}
+
+
+_steal_lock()
+{
+ local lockdir="$1"
+ local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
+ log err "Forced to steal lock on $lockdir from $owner!"
+ _release_lock "$lockdir"
+ _claim_lock "$lockdir"
+}
+
+
+_lock_owner()
+{
+ cat "$1/owner" 2>/dev/null || echo "unknown"
+}
+
+
+_update_lock_info()
+{
+ echo "$$: $0" >"$1/owner"
+}
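The claim/release pair above relies on mkdir(2) being atomic: exactly one caller can create the lock directory, and everyone else backs off until it disappears or is stolen. A minimal C sketch of the same idiom, under hypothetical names:

    #include <errno.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Returns 0 once the lock directory has been created by us. */
    static int claim_lock(const char *lockdir, int retries)
    {
        while (retries-- > 0) {
            if (mkdir(lockdir, 0700) == 0)
                return 0;               /* we own the lock */
            if (errno != EEXIST)
                return -1;              /* unexpected failure */
            sleep(1);                   /* held by someone else; back off */
        }
        return -1;                      /* caller may decide to steal it */
    }

    static void release_lock(const char *lockdir)
    {
        rmdir(lockdir);
    }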
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/logging.sh
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/logging.sh Wed Mar 1 19:47:25 2006
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+log() {
+ local level="$1"
+ shift
+ logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
+}
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-delete
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/vtpm-delete Wed Mar 1 19:47:25 2006
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# This script must be called the following way:
+# vtpm-delete <domain name>
+
+dir=$(dirname "$0")
+. "$dir/vtpm-common.sh"
+
+vtpm_delete_instance "$1"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-hotplug-common.sh
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/vtpm-hotplug-common.sh Wed Mar 1 19:47:25 2006
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2005 IBM Corporation
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+findCommand "$@"
+if [ "$command" != "online" ] &&
+ [ "$command" != "offline" ] &&
+ [ "$command" != "add" ] &&
+ [ "$command" != "remove" ]
+then
+ log err "Invalid command: $command"
+ exit 1
+fi
+
+
+XENBUS_PATH="${XENBUS_PATH:?}"
+
+. "$dir/vtpm-common.sh"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-cleanup
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/xen-hotplug-cleanup Wed Mar 1 19:47:25 2006
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# Claim the lock protecting /etc/xen/scripts/block. This stops a race whereby
+# paths in the store would disappear underneath that script as it attempted to
+# read from the store checking for device sharing.
+# Any other scripts that do similar things will have to have their lock
+# claimed too.
+# This is pretty horrible, but there's not really a nicer way of solving this.
+claim_lock "block"
+
+# remove device frontend store entries
+xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
+
+# remove device backend store entries
+xenstore-rm -t "$XENBUS_PATH" || true
+xenstore-rm -t "error/$XENBUS_PATH" || true
+
+release_lock "block"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/01_vtpm-list_pos.py
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/01_vtpm-list_pos.py Wed Mar 1 19:47:25 2006
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+# Copyright (C) International Business Machines Corp., 2006
+# Author: Stefan Berger <stefanb@xxxxxxxxxx>
+
+# Positive Test: create domain with virtual TPM attached at build time,
+# verify list
+
+
+from XmTestLib import *
+
+def vtpm_cleanup(domName):
+ # Since this is only a temporary domain I clean up the domain from the
+ # virtual TPM directory
+ traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName)
+
+if ENABLE_HVM_SUPPORT:
+ SKIP("vtpm-list not supported for HVM domains")
+
+config = {"vtpm":"instance=1,backend=0"}
+domain = XmTestDomain(extraConfig=config)
+
+try:
+ domain.start()
+except DomainError, e:
+ if verbose:
+ print e.extra
+ vtpm_cleanup(domain.getName())
+ FAIL("Unable to create domain")
+
+domName = domain.getName()
+
+status, output = traceCommand("xm vtpm-list %s" % domain.getId())
+eyecatcher = "/local/domain/0/backend/vtpm"
+where = output.find(eyecatcher)
+if status != 0:
+ vtpm_cleanup(domName)
+ FAIL("xm vtpm-list returned bad status, expected 0, status is %i" % status)
+elif where < 0:
+ vtpm_cleanup(domName)
+ FAIL("Fail to list virtual TPM device")
+
+domain.stop()
+
+vtpm_cleanup(domName)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py Wed Mar 1 19:47:25 2006
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+
+# Copyright (C) International Business Machines Corp., 2006
+# Author: Stefan Berger <stefanb@xxxxxxxxxx>
+
+# Positive Test: create domain with virtual TPM attached at build time,
+# check list of pcrs
+
+from XmTestLib import *
+
+def vtpm_cleanup(domName):
+ # Since this is only a temporary domain I clean up the domain from the
+ # virtual TPM directory
+ traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName)
+
+if ENABLE_HVM_SUPPORT:
+ SKIP("vtpm-list not supported for HVM domains")
+
+status, output = traceCommand("ls /dev/tpm0")
+if re.search("No such file or directory",output):
+ SKIP("This machine has no hardware TPM; cannot run this test")
+
+status, output = traceCommand("ps aux | grep vtpm_manager | grep -v grep")
+if output == "":
+ FAIL("virtual TPM manager must be started to run this test")
+
+# vtpm manager has been detected
+config = {"vtpm":"instance=1,backend=0"}
+domain = XmTestDomain(extraConfig=config)
+
+try:
+ domain.start()
+except DomainError, e:
+ if verbose:
+ print e.extra
+ vtpm_cleanup(domain.getName())
+ FAIL("Unable to create domain")
+
+domName = domain.getName()
+
+try:
+ console = XmConsole(domain.getName())
+except ConsoleError, e:
+ vtpm_cleanup(domName)
+ FAIL(str(e))
+
+try:
+ console.sendInput("input")
+ run = console.runCmd("ls /sys")
+except ConsoleError, e:
+ saveLog(console.getHistory())
+ vtpm_cleanup(domName)
+ FAIL(str(e))
+
+if re.search("No such file",run["output"]):
+ try:
+ run = console.runCmd("mkdir /sys")
+ run = console.runCmd("mount -t sysfs /sys /sys")
+ except ConsoleError, e:
+ saveLog(console.getHistory())
+ vtpm_cleanup(domName)
+ FAIL(str(e))
+
+try:
+ run = console.runCmd("cat /sys/devices/platform/tpm_vtpm/pcrs")
+except ConsoleError, e:
+ saveLog(console.getHistory())
+ vtpm_cleanup(domName)
+ FAIL(str(e))
+
+if re.search("No such file",run["output"]):
+ FAIL("TPM frontend support not compiled into (domU?) kernel")
+
+console.closeConsole()
+
+domain.stop()
+
+vtpm_cleanup(domName)
+
+if not re.search("PCR-00:",run["output"]):
+ FAIL("Virtual TPM is not working correctly on /dev/vtpm on backend
side")
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/Makefile.am
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/Makefile.am Wed Mar 1 19:47:25 2006
@@ -0,0 +1,22 @@
+
+SUBDIRS =
+
+TESTS = 01_vtpm-list_pos.test \
+ 02_vtpm-cat_pcrs.test
+
+XFAIL_TESTS =
+
+EXTRA_DIST = $(TESTS) $(XFAIL_TESTS)
+
+TESTS_ENVIRONMENT=@TENV@
+
+%.test: %.py
+ cp $< $@
+ chmod +x $@
+
+clean-local: am_config_clean-local
+
+am_config_clean-local:
+ rm -f *test
+ rm -f *log
+ rm -f *~
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/supervisor_mode_kernel.S
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S Wed Mar 1 19:47:25 2006
@@ -0,0 +1,145 @@
+/*
+ * Handle stack fixup for guest running in RING 0.
+ *
+ * Copyright (c) 2006 Ian Campbell
+ *
+ * When a guest kernel is allowed to run in RING 0 a hypercall,
+ * interrupt or exception interrupting the guest kernel will not cause
+ * a privilege level change and therefore the stack will not be swapped
+ * to the Xen stack.
+ *
+ * To fix this we look for RING 0 activation frames with a stack
+ * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel
+ * frame) and fix this up by locating the Xen stack via the TSS
+ * and moving the activation frame to the Xen stack. In the process we
+ * convert the frame into an inter-privilege frame returning to RING 1
+ * so that we can catch and reverse the process on exit.
+ */
+
+#include <xen/config.h>
+#include <asm/asm_defns.h>
+#include <public/xen.h>
+
+ # Upon entry the stack should be the Xen stack and contain:
+ # %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN
+ # On exit the stack should be %ss:%esp (i.e. the guest stack)
+ # and contain:
+ # EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN
+ ALIGN
+ENTRY(restore_ring0_guest)
+ # Point %gs:%esi to guest stack.
+RRG0: movw UREGS_ss+4(%esp),%gs
+ movl UREGS_esp+4(%esp),%esi
+
+ # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack.
+ movl $(UREGS_kernel_sizeof>>2)+1,%ecx
+
+1: subl $4,%esi
+ movl -4(%esp,%ecx,4),%eax
+RRG1: movl %eax,%gs:(%esi)
+ loop 1b
+
+RRG2: andl $~3,%gs:UREGS_cs+4(%esi)
+
+ movl %gs,%eax
+
+ # We need to do this because these registers are not present
+ # on the guest stack so they cannot be restored by the code in
+ # restore_all_guest.
+RRG3: mov UREGS_ds+4(%esp),%ds
+RRG4: mov UREGS_es+4(%esp),%es
+RRG5: mov UREGS_fs+4(%esp),%fs
+RRG6: mov UREGS_gs+4(%esp),%gs
+
+RRG7: movl %eax,%ss
+ movl %esi,%esp
+
+ ret
+.section __ex_table,"a"
+ .long RRG0,domain_crash_synchronous
+ .long RRG1,domain_crash_synchronous
+ .long RRG2,domain_crash_synchronous
+ .long RRG3,domain_crash_synchronous
+ .long RRG4,domain_crash_synchronous
+ .long RRG5,domain_crash_synchronous
+ .long RRG6,domain_crash_synchronous
+ .long RRG7,domain_crash_synchronous
+.previous
+
+ # Upon entry the stack should be a guest stack and contain:
+ # EFLAGS, %cs, %eip, ERROR, RETURN
+ # On exit the stack should be the Xen stack and contain:
+ # %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN
+ ALIGN
+ENTRY(fixup_ring0_guest_stack)
+ pushl %eax
+ pushl %ecx
+ pushl %ds
+ pushl %gs
+ pushl %esi
+
+ movw $__HYPERVISOR_DS,%ax
+ movw %ax,%ds
+
+ # Point %gs:%esi to guest stack frame.
+ movw %ss,%ax
+ movw %ax,%gs
+ movl %esp,%esi
+ # Account for entries on the guest stack:
+ # * Pushed by normal exception/interrupt/hypercall mechanisms
+ # * EFLAGS, %cs, %eip, ERROR == 4 words.
+ # * Pushed by the fixup routine
+ # * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words.
+ addl $((6+4)*4),%esi
+
+ # %gs:%esi now points to the guest stack before the
+ # interrupt/exception occurred.
+
+ /*
+ * Reverse the __TSS macro, giving us the CPU number.
+ * The TSS for this cpu is at init_tss + ( cpu * 128 ).
+ */
+ str %ecx
+ shrl $3,%ecx # Calculate GDT index for TSS.
+ subl $(FIRST_RESERVED_GDT_ENTRY+8),%ecx # %ecx = 2*cpu.
+ shll $6,%ecx # Each TSS entry is 0x80 bytes
+ addl $init_tss,%ecx # but we have 2*cpu from above.
+
+ # Load Xen stack from TSS.
+ movw TSS_ss0(%ecx),%ax
+TRP1: movw %ax,%ss
+ movl TSS_esp0(%ecx),%esp
+
+ pushl %gs
+ pushl %esi
+
+ # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi
+ # from guest stack to Xen stack.
+ movl $10,%ecx
+1: subl $4,%esp
+ subl $4,%esi
+TRP2: movl %gs:(%esi),%eax
+ movl %eax,(%esp)
+ loop 1b
+
+ # CS = CS|1 to simulate RING1 stack frame.
+ orl $1,32(%esp)
+
+ popl %esi
+ popl %gs
+ popl %ds
+ popl %ecx
+ popl %eax
+ ret
+.section __ex_table,"a"
+ .long TRP1,domain_crash_synchronous
+ .long TRP2,domain_crash_synchronous
+.previous
+
+domain_crash_synchronous_string:
+ .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n"
+
+domain_crash_synchronous:
+ pushl $domain_crash_synchronous_string
+ call printf
+ jmp __domain_crash_synchronous
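The selector arithmetic in fixup_ring0_guest_stack reverses the __TSS macro: the task register selector is shifted down to a GDT index, rebased past the reserved entries to give 2*cpu, and scaled by 64 to reach init_tss + cpu*128. The same computation in C, as a hedged sketch (constants passed in rather than taken from Xen's headers):

    /* Hypothetical rendering of the str/shrl/subl/shll/addl sequence above. */
    unsigned long tss_addr(unsigned short tr_selector,
                           unsigned long first_reserved_gdt_entry,
                           unsigned long init_tss_base)
    {
        unsigned long gdt_index = tr_selector >> 3;                 /* shrl $3 */
        unsigned long two_cpu = gdt_index - (first_reserved_gdt_entry + 8);
        return init_tss_base + (two_cpu << 6);  /* (2*cpu) * 0x40 == cpu * 0x80 */
    }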
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/uaccess.h
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-ia64/uaccess.h Wed Mar 1 19:47:25 2006
@@ -0,0 +1,285 @@
+#ifndef _ASM_IA64_UACCESS_H
+#define _ASM_IA64_UACCESS_H
+
+/*
+ * This file defines various macros to transfer memory areas across
+ * the user/kernel boundary. This needs to be done carefully because
+ * this code is executed in kernel mode and uses user-specified
+ * addresses. Thus, we need to be careful not to let the user
+ * trick us into accessing kernel memory that would normally be
+ * inaccessible. This code is also fairly performance sensitive,
+ * so we want to spend as little time doing safety checks as
+ * possible.
+ *
+ * To make matters a bit more interesting, these macros are sometimes also
+ * called from within the kernel itself, in which case the address
+ * validity check must be skipped. The get_fs() macro tells us what
+ * to do: if get_fs()==USER_DS, checking is performed, if
+ * get_fs()==KERNEL_DS, checking is bypassed.
+ *
+ * Note that even if the memory area specified by the user is in a
+ * valid address range, it is still possible that we'll get a page
+ * fault while accessing it. This is handled by filling out an
+ * exception handler fixup entry for each instruction that has the
+ * potential to fault. When such a fault occurs, the page fault
+ * handler checks to see whether the faulting instruction has a fixup
+ * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
+ * then resumes execution at the continuation point.
+ *
+ * Based on <asm-alpha/uaccess.h>.
+ *
+ * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@xxxxxxxxxx>
+ */
+
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h>
+
+#include <asm/intrinsics.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+
+#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
+#define __access_ok(addr) (!IS_VMM_ADDRESS((unsigned long)(addr)))
+#define access_ok(addr, size) (__access_ok(addr))
+#define array_access_ok(addr,count,size)( __access_ok(addr))
+
+/*
+ * These are the main single-value transfer routines. They automatically
+ * use the right size if we just have the right pointer type.
+ *
+ * Careful to not
+ * (a) re-use the arguments for side effects (sizeof/typeof is ok)
+ * (b) require any knowledge of processes at this stage
+ */
+#define put_user(x, ptr) __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
+#define get_user(x, ptr) __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
+
+/*
+ * The "__xxx" versions do not do address space checking, useful when
+ * doing multiple accesses to the same area (the programmer has to do the
+ * checks by hand with "access_ok()")
+ */
+#define __put_user(x, ptr) __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
+#define __get_user(x, ptr) __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+
+extern long __put_user_unaligned_unknown (void);
+
+#define __put_user_unaligned(x, ptr) \
+({ \
+ long __ret; \
+ switch (sizeof(*(ptr))) { \
+ case 1: __ret = __put_user((x), (ptr)); break; \
+ case 2: __ret = (__put_user((x), (u8 __user *)(ptr))) \
+ | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \
+ case 4: __ret = (__put_user((x), (u16 __user *)(ptr))) \
+ | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \
+ case 8: __ret = (__put_user((x), (u32 __user *)(ptr))) \
+ | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \
+ default: __ret = __put_user_unaligned_unknown(); \
+ } \
+ __ret; \
+})
+
+extern long __get_user_unaligned_unknown (void);
+
+#define __get_user_unaligned(x, ptr) \
+({ \
+ long __ret; \
+ switch (sizeof(*(ptr))) { \
+ case 1: __ret = __get_user((x), (ptr)); break; \
+ case 2: __ret = (__get_user((x), (u8 __user *)(ptr))) \
+ | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \
+ case 4: __ret = (__get_user((x), (u16 __user *)(ptr))) \
+ | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \
+ case 8: __ret = (__get_user((x), (u32 __user *)(ptr))) \
+ | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \
+ default: __ret = __get_user_unaligned_unknown(); \
+ } \
+ __ret; \
+})
+
+#ifdef ASM_SUPPORTED
+ struct __large_struct { unsigned long buf[100]; };
+# define __m(x) (*(struct __large_struct __user *)(x))
+
+/* We need to declare the __ex_table section before we can use it in .xdata. */
+asm (".section \"__ex_table\", \"a\"\n\t.previous");
+
+# define __get_user_size(val, addr, n, err) \
+do { \
+ register long __gu_r8 asm ("r8") = 0; \
+ register long __gu_r9 asm ("r9"); \
+ asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n" \
+ "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n" \
+ "[1:]" \
+ : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8)); \
+ (err) = __gu_r8; \
+ (val) = __gu_r9; \
+} while (0)
+
+/*
+ * The "__put_user_size()" macro tells gcc it reads from memory instead of
writing it. This
+ * is because they do not write to any memory gcc knows about, so there are no
aliasing
+ * issues.
+ */
+# define __put_user_size(val, addr, n, err)
\
+do {
\
+ register long __pu_r8 asm ("r8") = 0;
\
+ asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by
exception handler\n" \
+ "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"
\
+ "[1:]"
\
+ : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val),
"0"(__pu_r8)); \
+ (err) = __pu_r8;
\
+} while (0)
+
+#else /* !ASM_SUPPORTED */
+# define RELOC_TYPE 2 /* ip-rel */
+# define __get_user_size(val, addr, n, err) \
+do { \
+ __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE); \
+ (err) = ia64_getreg(_IA64_REG_R8); \
+ (val) = ia64_getreg(_IA64_REG_R9); \
+} while (0)
+# define __put_user_size(val, addr, n, err) \
+do { \
+ __st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val)); \
+ (err) = ia64_getreg(_IA64_REG_R8); \
+} while (0)
+#endif /* !ASM_SUPPORTED */
+
+extern void __get_user_unknown (void);
+
+/*
+ * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
+ * could clobber r8 and r9 (among others). Thus, be careful not to evaluate it while
+ * using r8/r9.
+ */
+#define __do_get_user(check, x, ptr, size, segment) \
+({ \
+ const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \
+ __typeof__ (size) __gu_size = (size); \
+ long __gu_err = -EFAULT, __gu_val = 0; \
+ \
+ if (!check || __access_ok(__gu_ptr)) \
+ switch (__gu_size) { \
+ case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break; \
+ case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break; \
+ case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break; \
+ case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break; \
+ default: __get_user_unknown(); break; \
+ } \
+ (x) = (__typeof__(*(__gu_ptr))) __gu_val; \
+ __gu_err; \
+})
+
+#define __get_user_nocheck(x, ptr, size) __do_get_user(0, x, ptr, size, KERNEL_DS)
+#define __get_user_check(x, ptr, size, segment) __do_get_user(1, x, ptr, size, segment)
+
+extern void __put_user_unknown (void);
+
+/*
+ * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
+ * could clobber r8 (among others). Thus, be careful not to evaluate them while using r8.
+ */
+#define __do_put_user(check, x, ptr, size, segment) \
+({ \
+ __typeof__ (x) __pu_x = (x); \
+ __typeof__ (*(ptr)) __user *__pu_ptr = (ptr); \
+ __typeof__ (size) __pu_size = (size); \
+ long __pu_err = -EFAULT; \
+ \
+ if (!check || __access_ok(__pu_ptr)) \
+ switch (__pu_size) { \
+ case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break; \
+ case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break; \
+ case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break; \
+ case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break; \
+ default: __put_user_unknown(); break; \
+ } \
+ __pu_err; \
+})
+
+#define __put_user_nocheck(x, ptr, size) __do_put_user(0, x, ptr, size, KERNEL_DS)
+#define __put_user_check(x, ptr, size, segment) __do_put_user(1, x, ptr, size, segment)
+
+/*
+ * Complex access routines
+ */
+extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
+ unsigned long count);
+
+static inline unsigned long
+__copy_to_user (void __user *to, const void *from, unsigned long count)
+{
+ return __copy_user(to, (void __user *) from, count);
+}
+
+static inline unsigned long
+__copy_from_user (void *to, const void __user *from, unsigned long count)
+{
+ return __copy_user((void __user *) to, from, count);
+}
+
+#define __copy_to_user_inatomic __copy_to_user
+#define __copy_from_user_inatomic __copy_from_user
+#define copy_to_user(to, from, n) \
+({ \
+ void __user *__cu_to = (to); \
+ const void *__cu_from = (from); \
+ long __cu_len = (n); \
+ \
+ if (__access_ok(__cu_to)) \
+ __cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len); \
+ __cu_len; \
+})
+
+#define copy_from_user(to, from, n) \
+({ \
+ void *__cu_to = (to); \
+ const void __user *__cu_from = (from); \
+ long __cu_len = (n); \
+ \
+ __chk_user_ptr(__cu_from); \
+ if (__access_ok(__cu_from)) \
+ __cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len); \
+ __cu_len; \
+})
+
+#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
+
+static inline unsigned long
+copy_in_user (void __user *to, const void __user *from, unsigned long n)
+{
+ if (likely(access_ok(from, n) && access_ok(to, n)))
+ n = __copy_user(to, from, n);
+ return n;
+}
+
+#define ARCH_HAS_SORT_EXTABLE
+#define ARCH_HAS_SEARCH_EXTABLE
+
+struct exception_table_entry {
+ int addr; /* location-relative address of insn this fixup is for */
+ int cont; /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
+};
+
+extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
+extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
+
+static inline int
+ia64_done_with_exception (struct pt_regs *regs)
+{
+ const struct exception_table_entry *e;
+ e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
+ if (e) {
+ ia64_handle_exception(regs, e);
+ return 1;
+ }
+ return 0;
+}
+
+#endif /* _ASM_IA64_UACCESS_H */
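A short usage sketch for the accessors above (kernel context assumed; the helper name is hypothetical). get_user() evaluates to the r8 error code, so a non-zero result means the fixup path ran and the value must not be trusted:

    static long fetch_guest_long(const long __user *uptr, long *out)
    {
        long val;
        if (get_user(val, uptr))    /* non-zero: faulted, r8 set to -EFAULT */
            return -EFAULT;
        *out = val;
        return 0;
    }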
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/features.h
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/xen/include/public/features.h Wed Mar 1 19:47:25 2006
@@ -0,0 +1,53 @@
+/******************************************************************************
+ * features.h
+ *
+ * Feature flags, reported by XENVER_get_features.
+ *
+ * Copyright (c) 2006, Keir Fraser <keir@xxxxxxxxxxxxx>
+ */
+
+#ifndef __XEN_PUBLIC_FEATURES_H__
+#define __XEN_PUBLIC_FEATURES_H__
+
+/*
+ * If set, the guest does not need to write-protect its pagetables, and can
+ * update them via direct writes.
+ */
+#define XENFEAT_writable_page_tables 0
+
+/*
+ * If set, the guest does not need to write-protect its segment descriptor
+ * tables, and can update them via direct writes.
+ */
+#define XENFEAT_writable_descriptor_tables 1
+
+/*
+ * If set, translation between the guest's 'pseudo-physical' address space
+ * and the host's machine address space are handled by the hypervisor. In this
+ * mode the guest does not need to perform phys-to/from-machine translations
+ * when performing page table operations.
+ */
+#define XENFEAT_auto_translated_physmap 2
+
+/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
+#define XENFEAT_supervisor_mode_kernel 3
+
+/*
+ * If set, the guest does not need to allocate x86 PAE page directories
+ * below 4GB. This flag is usually implied by auto_translated_physmap.
+ */
+#define XENFEAT_pae_pgdir_above_4gb 4
+
+#define XENFEAT_NR_SUBMAPS 1
+
+#endif /* __XEN_PUBLIC_FEATURES_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
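Each flag above is a bit position within one of the 32-bit submaps returned by XENVER_get_features; with XENFEAT_NR_SUBMAPS currently 1 they all live in submap 0. A hypothetical sketch of how a guest might test a flag once it has fetched the submaps:

    static int xen_feature_present(const unsigned int *submaps, unsigned int flag)
    {
        return (submaps[flag / 32] >> (flag % 32)) & 1;
    }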
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/guest_access.h
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/xen/include/xen/guest_access.h Wed Mar 1 19:47:25 2006
@@ -0,0 +1,71 @@
+/******************************************************************************
+ * guest_access.h
+ *
+ * Copyright (c) 2006, K A Fraser
+ */
+
+#ifndef __XEN_GUEST_ACCESS_H__
+#define __XEN_GUEST_ACCESS_H__
+
+#include <asm/uaccess.h>
+
+/* Is the guest handle a NULL reference? */
+#define guest_handle_is_null(hnd) ((hnd).p == NULL)
+
+/* Offset the given guest handle into the array it refers to. */
+#define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr))
+
+/* Cast a guest handle to the specified type of handle. */
+#define guest_handle_cast(hnd, type) ({ \
+ type *_x = (hnd).p; \
+ (GUEST_HANDLE(type)) { _x }; \
+})
+
+/*
+ * Copy an array of objects to guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_to_guest_offset(hnd, off, ptr, nr) ({ \
+ const typeof(ptr) _x = (hnd).p; \
+ const typeof(ptr) _y = (ptr); \
+ copy_to_user(_x+(off), _y, sizeof(*_x)*(nr)); \
+})
+#define copy_to_guest(hnd, ptr, nr) \
+ copy_to_guest_offset(hnd, 0, ptr, nr)
+
+/*
+ * Copy an array of objects from guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_from_guest_offset(ptr, hnd, off, nr) ({ \
+ const typeof(ptr) _x = (hnd).p; \
+ const typeof(ptr) _y = (ptr); \
+ copy_from_user(_y, _x+(off), sizeof(*_x)*(nr)); \
+})
+#define copy_from_guest(ptr, hnd, nr) \
+ copy_from_guest_offset(ptr, hnd, 0, nr)
+
+/*
+ * Pre-validate a guest handle.
+ * Allows use of faster __copy_* functions.
+ */
+#define guest_handle_okay(hnd, nr) \
+ array_access_ok((hnd).p, (nr), sizeof(*(hnd).p))
+
+#define __copy_to_guest_offset(hnd, off, ptr, nr) ({ \
+ const typeof(ptr) _x = (hnd).p; \
+ const typeof(ptr) _y = (ptr); \
+ __copy_to_user(_x+(off), _y, sizeof(*_x)*(nr)); \
+})
+#define __copy_to_guest(hnd, ptr, nr) \
+ __copy_to_guest_offset(hnd, 0, ptr, nr)
+
+#define __copy_from_guest_offset(ptr, hnd, off, nr) ({ \
+ const typeof(ptr) _x = (hnd).p; \
+ const typeof(ptr) _y = (ptr); \
+ __copy_from_user(_y, _x+(off), sizeof(*_x)*(nr)); \
+})
+#define __copy_from_guest(ptr, hnd, nr) \
+ __copy_from_guest_offset(ptr, hnd, 0, nr)
+
+#endif /* __XEN_GUEST_ACCESS_H__ */
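A hedged usage sketch for the helpers above: validate the guest handle once with guest_handle_okay(), then use the faster double-underscore copies. The handle type and function name are assumptions for illustration:

    static long return_results(GUEST_HANDLE(ulong) hnd, ulong *vals,
                               unsigned int nr)
    {
        if (!guest_handle_okay(hnd, nr))
            return -EFAULT;         /* bad guest pointer or range */
        if (__copy_to_guest(hnd, vals, nr))
            return -EFAULT;         /* partial copy: faulted mid-way */
        return 0;
    }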
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch
--- a/patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch Wed Mar 1
17:01:54 2006
+++ /dev/null Wed Mar 1 19:47:25 2006
@@ -1,45 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c 2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/traps.c 2006-02-15 20:40:43.000000000 +0000
-@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
-
- static void io_check_error(unsigned char reason, struct pt_regs * regs)
- {
-- unsigned long i;
--
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-
- /* Re-enable the IOCK line, wait for a few seconds */
-- reason = (reason & 0xf) | 8;
-- outb(reason, 0x61);
-- i = 2000;
-- while (--i) udelay(1000);
-- reason &= ~8;
-- outb(reason, 0x61);
-+ clear_io_check_error(reason);
- }
-
- static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h 2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/mach-default/mach_traps.h 2006-02-15 20:40:43.000000000 +0000
-@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
- outb(reason, 0x61);
- }
-
-+static inline void clear_io_check_error(unsigned char reason)
-+{
-+ unsigned long i;
-+
-+ reason = (reason & 0xf) | 8;
-+ outb(reason, 0x61);
-+ i = 2000;
-+ while (--i) udelay(1000);
-+ reason &= ~8;
-+ outb(reason, 0x61);
-+}
-+
- static inline unsigned char get_nmi_reason(void)
- {
- return inb(0x61);
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/net-csum.patch
--- a/patches/linux-2.6.16-rc4/net-csum.patch Wed Mar 1 17:01:54 2006
+++ /dev/null Wed Mar 1 19:47:25 2006
@@ -1,41 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-02 17:39:51.000000000 +0000
-+++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-02 17:44:18.000000000 +0000
-@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
- if (hdrsize < sizeof(*hdr))
- return 1;
-
-- hdr->check = ip_nat_cheat_check(~oldip, newip,
-+ if ((*pskb)->proto_csum_blank) {
-+ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
-+ } else {
-+ hdr->check = ip_nat_cheat_check(~oldip, newip,
- ip_nat_cheat_check(oldport ^ 0xFFFF,
- newport,
- hdr->check));
-+ }
- return 1;
- }
-
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-02 17:39:51.000000000 +0000
-+++ ./net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-02 17:44:18.000000000 +0000
-@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
- newport = tuple->dst.u.udp.port;
- portptr = &hdr->dest;
- }
-- if (hdr->check) /* 0 is a special case meaning no checksum */
-- hdr->check = ip_nat_cheat_check(~oldip, newip,
-+ if (hdr->check) { /* 0 is a special case meaning no checksum */
-+ if ((*pskb)->proto_csum_blank) {
-+ hdr->check = ip_nat_cheat_check(oldip, ~newip,
hdr->check);
-+ } else {
-+ hdr->check = ip_nat_cheat_check(~oldip, newip,
- ip_nat_cheat_check(*portptr ^ 0xFFFF,
- newport,
- hdr->check));
-+ }
-+ }
- *portptr = newport;
- return 1;
- }
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/pmd-shared.patch
--- a/patches/linux-2.6.16-rc4/pmd-shared.patch Wed Mar 1 17:01:54 2006
+++ /dev/null Wed Mar 1 19:47:25 2006
@@ -1,111 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c 2006-02-02 17:39:29.000000000 +0000
-+++ ./arch/i386/mm/pageattr.c 2006-02-02 17:45:14.000000000 +0000
-@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
- unsigned long flags;
-
- set_pte_atomic(kpte, pte); /* change init_mm */
-- if (PTRS_PER_PMD > 1)
-+ if (HAVE_SHARED_KERNEL_PMD)
- return;
-
- spin_lock_irqsave(&pgd_lock, flags);
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c 2006-01-03 03:21:10.000000000 +0000
-+++ ./arch/i386/mm/pgtable.c 2006-02-02 17:45:14.000000000 +0000
-@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
- spin_lock_irqsave(&pgd_lock, flags);
- }
-
-- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-- swapper_pg_dir + USER_PTRS_PER_PGD,
-- KERNEL_PGD_PTRS);
-+ if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
-+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-+ swapper_pg_dir + USER_PTRS_PER_PGD,
-+ KERNEL_PGD_PTRS);
- if (PTRS_PER_PMD > 1)
- return;
-
-@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
- goto out_oom;
- set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
- }
-+
-+ if (!HAVE_SHARED_KERNEL_PMD) {
-+ unsigned long flags;
-+
-+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-+ if (!pmd)
-+ goto out_oom;
-+ set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
-+ }
-+
-+ spin_lock_irqsave(&pgd_lock, flags);
-+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+ unsigned long v = (unsigned long)i << PGDIR_SHIFT;
-+ pgd_t *kpgd = pgd_offset_k(v);
-+ pud_t *kpud = pud_offset(kpgd, v);
-+ pmd_t *kpmd = pmd_offset(kpud, v);
-+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+ memcpy(pmd, kpmd, PAGE_SIZE);
-+ }
-+ pgd_list_add(pgd);
-+ spin_unlock_irqrestore(&pgd_lock, flags);
-+ }
-+
- return pgd;
-
- out_oom:
-@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
- int i;
-
- /* in the PAE case user pgd entries are overwritten before usage */
-- if (PTRS_PER_PMD > 1)
-- for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-+ if (PTRS_PER_PMD > 1) {
-+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+ kmem_cache_free(pmd_cache, pmd);
-+ }
-+ if (!HAVE_SHARED_KERNEL_PMD) {
-+ unsigned long flags;
-+ spin_lock_irqsave(&pgd_lock, flags);
-+ pgd_list_del(pgd);
-+ spin_unlock_irqrestore(&pgd_lock, flags);
-+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-+ kmem_cache_free(pmd_cache, pmd);
-+ }
-+ }
-+ }
- /* in the non-PAE case, free_pgtables() clears user pgd entries */
- kmem_cache_free(pgd_cache, pgd);
- }
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h 2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/pgtable-2level-defs.h 2006-02-02 17:45:14.000000000 +0000
-@@ -1,6 +1,8 @@
- #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
- #define _I386_PGTABLE_2LEVEL_DEFS_H
-
-+#define HAVE_SHARED_KERNEL_PMD 0
-+
- /*
- * traditional i386 two-level paging structure:
- */
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h 2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/pgtable-3level-defs.h 2006-02-02 17:45:14.000000000 +0000
-@@ -1,6 +1,8 @@
- #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
- #define _I386_PGTABLE_3LEVEL_DEFS_H
-
-+#define HAVE_SHARED_KERNEL_PMD 1
-+
- /*
- * PGDIR_SHIFT determines what a top-level page table entry can map
- */
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/smp-alts.patch
--- a/patches/linux-2.6.16-rc4/smp-alts.patch Wed Mar 1 17:01:54 2006
+++ /dev/null Wed Mar 1 19:47:25 2006
@@ -1,591 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig ./arch/i386/Kconfig
---- ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig 2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/Kconfig 2006-02-15 20:45:57.000000000 +0000
-@@ -202,6 +202,19 @@ config SMP
-
- If you don't know what to do here, say N.
-
-+config SMP_ALTERNATIVES
-+ bool "SMP alternatives support (EXPERIMENTAL)"
-+ depends on SMP && EXPERIMENTAL
-+ help
-+ Try to reduce the overhead of running an SMP kernel on a uniprocessor
-+ host slightly by replacing certain key instruction sequences
-+ according to whether we currently have more than one CPU available.
-+ This should provide a noticeable boost to performance when
-+ running SMP kernels on UP machines, and have negligible impact
-+ when running on an true SMP host.
-+
-+ If unsure, say N.
-+
- config NR_CPUS
- int "Maximum number of CPUs (2-255)"
- range 2 255
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile 2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/Makefile 2006-02-15 20:45:57.000000000 +0000
-@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o
- obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
- obj-$(CONFIG_VM86) += vm86.o
- obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-+obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o
-
- EXTRA_AFLAGS := -traditional
-
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c 1970-01-01 01:00:00.000000000 +0100
-+++ ./arch/i386/kernel/smpalts.c 2006-02-15 20:45:57.000000000 +0000
-@@ -0,0 +1,85 @@
-+#include <linux/kernel.h>
-+#include <asm/system.h>
-+#include <asm/smp_alt.h>
-+#include <asm/processor.h>
-+#include <asm/string.h>
-+
-+struct smp_replacement_record {
-+ unsigned char targ_size;
-+ unsigned char smp1_size;
-+ unsigned char smp2_size;
-+ unsigned char up_size;
-+ unsigned char feature;
-+ unsigned char data[0];
-+};
-+
-+struct smp_alternative_record {
-+ void *targ_start;
-+ struct smp_replacement_record *repl;
-+};
-+
-+extern struct smp_alternative_record __start_smp_alternatives_table,
-+ __stop_smp_alternatives_table;
-+extern unsigned long __init_begin, __init_end;
-+
-+void prepare_for_smp(void)
-+{
-+ struct smp_alternative_record *r;
-+ printk(KERN_INFO "Enabling SMP...\n");
-+ for (r = &__start_smp_alternatives_table;
-+ r != &__stop_smp_alternatives_table;
-+ r++) {
-+ BUG_ON(r->repl->targ_size < r->repl->smp1_size);
-+ BUG_ON(r->repl->targ_size < r->repl->smp2_size);
-+ BUG_ON(r->repl->targ_size < r->repl->up_size);
-+ if (system_state == SYSTEM_RUNNING &&
-+ r->targ_start >= (void *)&__init_begin &&
-+ r->targ_start < (void *)&__init_end)
-+ continue;
-+ if (r->repl->feature != (unsigned char)-1 &&
-+ boot_cpu_has(r->repl->feature)) {
-+ memcpy(r->targ_start,
-+ r->repl->data + r->repl->smp1_size,
-+ r->repl->smp2_size);
-+ memset(r->targ_start + r->repl->smp2_size,
-+ 0x90,
-+ r->repl->targ_size - r->repl->smp2_size);
-+ } else {
-+ memcpy(r->targ_start,
-+ r->repl->data,
-+ r->repl->smp1_size);
-+ memset(r->targ_start + r->repl->smp1_size,
-+ 0x90,
-+ r->repl->targ_size - r->repl->smp1_size);
-+ }
-+ }
-+ /* Paranoia */
-+ asm volatile ("jmp 1f\n1:");
-+ mb();
-+}
-+
-+void unprepare_for_smp(void)
-+{
-+ struct smp_alternative_record *r;
-+ printk(KERN_INFO "Disabling SMP...\n");
-+ for (r = &__start_smp_alternatives_table;
-+ r != &__stop_smp_alternatives_table;
-+ r++) {
-+ BUG_ON(r->repl->targ_size < r->repl->smp1_size);
-+ BUG_ON(r->repl->targ_size < r->repl->smp2_size);
-+ BUG_ON(r->repl->targ_size < r->repl->up_size);
-+ if (system_state == SYSTEM_RUNNING &&
-+ r->targ_start >= (void *)&__init_begin &&
-+ r->targ_start < (void *)&__init_end)
-+ continue;
-+ memcpy(r->targ_start,
-+ r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
-+ r->repl->up_size);
-+ memset(r->targ_start + r->repl->up_size,
-+ 0x90,
-+ r->repl->targ_size - r->repl->up_size);
-+ }
-+ /* Paranoia */
-+ asm volatile ("jmp 1f\n1:");
-+ mb();
-+}
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c 2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/smpboot.c 2006-02-15 20:45:57.000000000 +0000
-@@ -1214,6 +1214,11 @@ static void __init smp_boot_cpus(unsigne
- if (max_cpus <= cpucount+1)
- continue;
-
-+#ifdef CONFIG_SMP_ALTERNATIVES
-+ if (kicked == 1)
-+ prepare_for_smp();
-+#endif
-+
- if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
- printk("CPU #%d not responding - cannot use it.\n",
- apicid);
-@@ -1392,6 +1397,11 @@ int __devinit __cpu_up(unsigned int cpu)
- return -EIO;
- }
-
-+#ifdef CONFIG_SMP_ALTERNATIVES
-+ if (num_online_cpus() == 1)
-+ prepare_for_smp();
-+#endif
-+
- local_irq_enable();
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- /* Unleash the CPU! */
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S 2006-01-03 03:21:10.000000000 +0000
-+++ ./arch/i386/kernel/vmlinux.lds.S 2006-02-15 20:45:57.000000000 +0000
-@@ -34,6 +34,13 @@ SECTIONS
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
- __stop___ex_table = .;
-
-+ . = ALIGN(16);
-+ __start_smp_alternatives_table = .;
-+ __smp_alternatives : { *(__smp_alternatives) }
-+ __stop_smp_alternatives_table = .;
-+
-+ __smp_replacements : { *(__smp_replacements) }
-+
- RODATA
-
- /* writeable */
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h 2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/atomic.h 2006-02-15 20:45:57.000000000 +0000
-@@ -4,18 +4,13 @@
- #include <linux/config.h>
- #include <linux/compiler.h>
- #include <asm/processor.h>
-+#include <asm/smp_alt.h>
-
- /*
- * Atomic operations that C can't guarantee us. Useful for
- * resource counting etc..
- */
-
--#ifdef CONFIG_SMP
--#define LOCK "lock ; "
--#else
--#define LOCK ""
--#endif
--
- /*
- * Make sure gcc doesn't try to be clever and move things around
- * on us. We need to use _exactly_ the address the user gave us,
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h 2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/bitops.h 2006-02-15 20:45:57.000000000 +0000
-@@ -7,6 +7,7 @@
-
- #include <linux/config.h>
- #include <linux/compiler.h>
-+#include <asm/smp_alt.h>
-
- /*
- * These have to be done with inline assembly: that way the bit-setting
-@@ -16,12 +17,6 @@
- * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
- */
-
--#ifdef CONFIG_SMP
--#define LOCK_PREFIX "lock ; "
--#else
--#define LOCK_PREFIX ""
--#endif
--
- #define ADDR (*(volatile long *) addr)
-
- /**
-@@ -41,7 +36,7 @@
- */
- static inline void set_bit(int nr, volatile unsigned long * addr)
- {
-- __asm__ __volatile__( LOCK_PREFIX
-+ __asm__ __volatile__( LOCK
- "btsl %1,%0"
- :"+m" (ADDR)
- :"Ir" (nr));
-@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
- */
- static inline void clear_bit(int nr, volatile unsigned long * addr)
- {
-- __asm__ __volatile__( LOCK_PREFIX
-+ __asm__ __volatile__( LOCK
- "btrl %1,%0"
- :"+m" (ADDR)
- :"Ir" (nr));
-@@ -121,7 +116,7 @@ static inline void __change_bit(int nr,
- */
- static inline void change_bit(int nr, volatile unsigned long * addr)
- {
-- __asm__ __volatile__( LOCK_PREFIX
-+ __asm__ __volatile__( LOCK
- "btcl %1,%0"
- :"+m" (ADDR)
- :"Ir" (nr));
-@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
- {
- int oldbit;
-
-- __asm__ __volatile__( LOCK_PREFIX
-+ __asm__ __volatile__( LOCK
- "btsl %2,%1\n\tsbbl %0,%0"
- :"=r" (oldbit),"+m" (ADDR)
- :"Ir" (nr) : "memory");
-@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
- {
- int oldbit;
-
-- __asm__ __volatile__( LOCK_PREFIX
-+ __asm__ __volatile__( LOCK
- "btrl %2,%1\n\tsbbl %0,%0"
- :"=r" (oldbit),"+m" (ADDR)
- :"Ir" (nr) : "memory");
-@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
- {
- int oldbit;
-
-- __asm__ __volatile__( LOCK_PREFIX
-+ __asm__ __volatile__( LOCK
- "btcl %2,%1\n\tsbbl %0,%0"
- :"=r" (oldbit),"+m" (ADDR)
- :"Ir" (nr) : "memory");
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h ./include/asm-i386/futex.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h	2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/futex.h 2006-02-15 20:45:57.000000000 +0000
-@@ -28,7 +28,7 @@
- "1: movl %2, %0\n\
- movl %0, %3\n" \
- insn "\n" \
--"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\
-+"2: " LOCK "cmpxchgl %3, %2\n\
- jnz 1b\n\
- 3: .section .fixup,\"ax\"\n\
- 4: mov %5, %1\n\
-@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op,
- #endif
- switch (op) {
- case FUTEX_OP_ADD:
-- __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
-+ __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
- oldval, uaddr, oparg);
- break;
- case FUTEX_OP_OR:
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h	2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/rwsem.h 2006-02-15 20:45:57.000000000 +0000
-@@ -40,6 +40,7 @@
-
- #include <linux/list.h>
- #include <linux/spinlock.h>
-+#include <asm/smp_alt.h>
-
- struct rwsem_waiter;
-
-@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
- {
- __asm__ __volatile__(
- "# beginning down_read\n\t"
--LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old
value */
-+LOCK " incl (%%eax)\n\t" /* adds 0x00000001, returns the old
value */
- " js 2f\n\t" /* jump if we weren't granted the lock */
- "1:\n\t"
- LOCK_SECTION_START("")
-@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
- " movl %1,%2\n\t"
- " addl %3,%2\n\t"
- " jle 2f\n\t"
--LOCK_PREFIX " cmpxchgl %2,%0\n\t"
-+LOCK " cmpxchgl %2,%0\n\t"
- " jnz 1b\n\t"
- "2:\n\t"
- "# ending __down_read_trylock\n\t"
-@@ -150,7 +151,7 @@ static inline void __down_write(struct r
- tmp = RWSEM_ACTIVE_WRITE_BIAS;
- __asm__ __volatile__(
- "# beginning down_write\n\t"
--LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns
the old value */
-+LOCK " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns
the old value */
- " testl %%edx,%%edx\n\t" /* was the count 0 before? */
- " jnz 2f\n\t" /* jump if we weren't granted the lock */
- "1:\n\t"
-@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
- __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
- __asm__ __volatile__(
- "# beginning __up_read\n\t"
--LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old
value */
-+LOCK " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old
value */
- " js 2f\n\t" /* jump if the lock is being waited upon */
- "1:\n\t"
- LOCK_SECTION_START("")
-@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
- __asm__ __volatile__(
- "# beginning __up_write\n\t"
- " movl %2,%%edx\n\t"
--LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" /* tries to transition
0xffff0001 -> 0x00000000 */
-+LOCK " xaddl %%edx,(%%eax)\n\t" /* tries to transition
0xffff0001 -> 0x00000000 */
- " jnz 2f\n\t" /* jump if the lock is being waited upon */
- "1:\n\t"
- LOCK_SECTION_START("")
-@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
- {
- __asm__ __volatile__(
- "# beginning __downgrade_write\n\t"
--LOCK_PREFIX " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 ->
0xYYYY0001 */
-+LOCK " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 ->
0xYYYY0001 */
- " js 2f\n\t" /* jump if the lock is being waited upon */
- "1:\n\t"
- LOCK_SECTION_START("")
-@@ -263,7 +264,7 @@ LOCK_PREFIX " addl %2,(%%eax)\n\t"
- static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
- {
- __asm__ __volatile__(
--LOCK_PREFIX "addl %1,%0"
-+LOCK "addl %1,%0"
- : "=m"(sem->count)
- : "ir"(delta), "m"(sem->count));
- }
-@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
- int tmp = delta;
-
- __asm__ __volatile__(
--LOCK_PREFIX "xadd %0,(%2)"
-+LOCK "xadd %0,(%2)"
- : "+r"(tmp), "=m"(sem->count)
- : "r"(sem), "m"(sem->count)
- : "memory");
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h	1970-01-01 01:00:00.000000000 +0100
-+++ ./include/asm-i386/smp_alt.h 2006-02-15 20:45:57.000000000 +0000
-@@ -0,0 +1,32 @@
-+#ifndef __ASM_SMP_ALT_H__
-+#define __ASM_SMP_ALT_H__
-+
-+#include <linux/config.h>
-+
-+#ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define LOCK \
-+ "6677: nop\n" \
-+ ".section __smp_alternatives,\"a\"\n" \
-+ ".long 6677b\n" \
-+ ".long 6678f\n" \
-+ ".previous\n" \
-+ ".section __smp_replacements,\"a\"\n" \
-+ "6678: .byte 1\n" \
-+ ".byte 1\n" \
-+ ".byte 0\n" \
-+ ".byte 1\n" \
-+ ".byte -1\n" \
-+ "lock\n" \
-+ "nop\n" \
-+ ".previous\n"
-+void prepare_for_smp(void);
-+void unprepare_for_smp(void);
-+#else
-+#define LOCK "lock ; "
-+#endif
-+#else
-+#define LOCK ""
-+#endif
-+
-+#endif /* __ASM_SMP_ALT_H__ */
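
Under CONFIG_SMP_ALTERNATIVES, LOCK plants a one-byte nop at the instruction site and records in the two new sections where that site is and which bytes may overwrite it: a one-byte lock prefix for SMP, a nop for UP. A hedged usage sketch (example_locked_inc is not part of the patch):

static inline void example_locked_inc(volatile int *p)
{
	/* Assembles as "nop; incl" on a UP boot; prepare_for_smp()
	 * rewrites the nop into a lock prefix when CPU #2 kicks. */
	__asm__ __volatile__(LOCK "incl %0" : "+m" (*p));
}
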
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h	2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/spinlock.h 2006-02-15 20:45:57.000000000 +0000
-@@ -6,6 +6,7 @@
- #include <asm/page.h>
- #include <linux/config.h>
- #include <linux/compiler.h>
-+#include <asm/smp_alt.h>
-
- /*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
-@@ -23,7 +24,8 @@
-
- #define __raw_spin_lock_string \
- "\n1:\t" \
-- "lock ; decb %0\n\t" \
-+ LOCK \
-+ "decb %0\n\t" \
- "jns 3f\n" \
- "2:\t" \
- "rep;nop\n\t" \
-@@ -34,7 +36,8 @@
-
- #define __raw_spin_lock_string_flags \
- "\n1:\t" \
-- "lock ; decb %0\n\t" \
-+ LOCK \
-+ "decb %0\n\t" \
- "jns 4f\n\t" \
- "2:\t" \
- "testl $0x200, %1\n\t" \
-@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
- static inline int __raw_spin_trylock(raw_spinlock_t *lock)
- {
- char oldval;
-+#ifdef CONFIG_SMP_ALTERNATIVES
- __asm__ __volatile__(
-- "xchgb %b0,%1"
-+ "1:movb %1,%b0\n"
-+ "movb $0,%1\n"
-+ "2:"
-+ ".section __smp_alternatives,\"a\"\n"
-+ ".long 1b\n"
-+ ".long 3f\n"
-+ ".previous\n"
-+ ".section __smp_replacements,\"a\"\n"
-+ "3: .byte 2b - 1b\n"
-+ ".byte 5f-4f\n"
-+ ".byte 0\n"
-+ ".byte 6f-5f\n"
-+ ".byte -1\n"
-+ "4: xchgb %b0,%1\n"
-+ "5: movb %1,%b0\n"
-+ "movb $0,%1\n"
-+ "6:\n"
-+ ".previous\n"
- :"=q" (oldval), "=m" (lock->slock)
- :"0" (0) : "memory");
-+#else
-+ __asm__ __volatile__(
-+ "xchgb %b0,%1\n"
-+ :"=q" (oldval), "=m" (lock->slock)
-+ :"0" (0) : "memory");
-+#endif
- return oldval > 0;
- }
-
-@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
-
- static inline void __raw_read_unlock(raw_rwlock_t *rw)
- {
-- asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
-+ asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
- }
-
- static inline void __raw_write_unlock(raw_rwlock_t *rw)
- {
-- asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
-+ asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
- : "=m" (rw->lock) : : "memory");
- }
-
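
__raw_spin_trylock cannot be fixed up by prefix substitution alone, so the whole sequence is swapped: the SMP text keeps the bus-locked xchgb, while the UP replacement degrades to a plain load/store pair, safe because no other processor can interleave. A hedged C-level paraphrase of the two paths (the real implementation is the inline asm above):

static inline int trylock_up_variant(char *slock)
{
	char old = *slock;	/* single CPU: no window to race in */
	*slock = 0;
	return old > 0;
}

static inline int trylock_smp_variant(char *slock)
{
	char old = 0;
	/* atomic exchange, the moral equivalent of "xchgb %b0,%1" */
	__asm__ __volatile__("xchgb %b0,%1"
			     : "+q" (old), "+m" (*slock) : : "memory");
	return old > 0;
}
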
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h ./include/asm-i386/system.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h	2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/system.h 2006-02-15 20:45:57.000000000 +0000
-@@ -5,7 +5,7 @@
- #include <linux/kernel.h>
- #include <asm/segment.h>
- #include <asm/cpufeature.h>
--#include <linux/bitops.h> /* for LOCK_PREFIX */
-+#include <asm/smp_alt.h>
-
- #ifdef __KERNEL__
-
-@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
- unsigned long prev;
- switch (size) {
- case 1:
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
-+ __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
-+ __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
-+ __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
-@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
- unsigned long long new)
- {
- unsigned long long prev;
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
-+ __asm__ __volatile__(LOCK "cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
-@@ -503,11 +503,55 @@ struct alt_instr {
- #endif
-
- #ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define smp_alt_mb(instr) \
-+__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
-+ ".section __smp_alternatives,\"a\"\n" \
-+ ".long 6667b\n" \
-+ ".long 6673f\n" \
-+ ".previous\n" \
-+ ".section __smp_replacements,\"a\"\n" \
-+ "6673:.byte 6668b-6667b\n" \
-+ ".byte 6670f-6669f\n" \
-+ ".byte 6671f-6670f\n" \
-+ ".byte 0\n" \
-+ ".byte %c0\n" \
-+ "6669:lock;addl $0,0(%%esp)\n" \
-+ "6670:" instr "\n" \
-+ "6671:\n" \
-+ ".previous\n" \
-+ : \
-+ : "i" (X86_FEATURE_XMM2) \
-+ : "memory")
-+#define smp_rmb() smp_alt_mb("lfence")
-+#define smp_mb() smp_alt_mb("mfence")
-+#define set_mb(var, value) do { \
-+unsigned long __set_mb_temp; \
-+__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \
-+ ".section __smp_alternatives,\"a\"\n" \
-+ ".long 6667b\n" \
-+ ".long 6673f\n" \
-+ ".previous\n" \
-+ ".section __smp_replacements,\"a\"\n" \
-+ "6673: .byte 6668b-6667b\n" \
-+ ".byte 6670f-6669f\n" \
-+ ".byte 0\n" \
-+ ".byte 6671f-6670f\n" \
-+ ".byte -1\n" \
-+ "6669: xchg %1, %0\n" \
-+ "6670:movl %1, %0\n" \
-+ "6671:\n" \
-+ ".previous\n" \
-+ : "=m" (var), "=r" (__set_mb_temp) \
-+ : "1" (value) \
-+ : "memory"); } while (0)
-+#else
- #define smp_mb() mb()
- #define smp_rmb() rmb()
-+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-+#endif
- #define smp_wmb() wmb()
- #define smp_read_barrier_depends() read_barrier_depends()
--#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
- #else
- #define smp_mb() barrier()
- #define smp_rmb() barrier()
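
The system.h hunk makes the read and full barriers patchable too: each smp_mb()/smp_rmb() site is six nops, and the replacement record carries two SMP sequences, "lock; addl $0,0(%%esp)" for pre-SSE2 parts and mfence/lfence selected via the X86_FEATURE_XMM2 feature byte. Callers are unaffected; a hedged illustration of the message-passing pattern these barriers order (all names below are illustrative):

static int payload;
static int ready;

static void publish(int v)
{
	payload = v;
	smp_mb();	/* six nops on UP; patched to mfence or the
			 * locked add once a second CPU is running */
	ready = 1;
}

static int consume(void)
{
	if (!ready)
		return -1;
	smp_rmb();	/* patched to lfence on SSE2-capable CPUs */
	return payload;
}
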
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/linux-xen/asm/uaccess.h
--- a/xen/include/asm-ia64/linux-xen/asm/uaccess.h Wed Mar 1 17:01:54 2006
+++ /dev/null Wed Mar 1 19:47:25 2006
@@ -1,415 +0,0 @@
-#ifndef _ASM_IA64_UACCESS_H
-#define _ASM_IA64_UACCESS_H
-
-/*
- * This file defines various macros to transfer memory areas across
- * the user/kernel boundary. This needs to be done carefully because
- * this code is executed in kernel mode and uses user-specified
- * addresses. Thus, we need to be careful not to let the user to
- * trick us into accessing kernel memory that would normally be
- * inaccessible. This code is also fairly performance sensitive,
- * so we want to spend as little time doing safety checks as
- * possible.
- *
- * To make matters a bit more interesting, these macros sometimes also
- * called from within the kernel itself, in which case the address
- * validity check must be skipped. The get_fs() macro tells us what
- * to do: if get_fs()==USER_DS, checking is performed, if
- * get_fs()==KERNEL_DS, checking is bypassed.
- *
- * Note that even if the memory area specified by the user is in a
- * valid address range, it is still possible that we'll get a page
- * fault while accessing it. This is handled by filling out an
- * exception handler fixup entry for each instruction that has the
- * potential to fault. When such a fault occurs, the page fault
- * handler checks to see whether the faulting instruction has a fixup
- * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
- * then resumes execution at the continuation point.
- *
- * Based on <asm-alpha/uaccess.h>.
- *
- * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/page-flags.h>
-#include <linux/mm.h>
-
-#include <asm/intrinsics.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-
-/*
- * For historical reasons, the following macros are grossly misnamed:
- */
-#define KERNEL_DS	((mm_segment_t) { ~0UL })		/* cf. access_ok() */
-#define USER_DS		((mm_segment_t) { TASK_SIZE-1 })	/* cf. access_ok() */
-
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
-#define get_ds() (KERNEL_DS)
-#define get_fs() (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
-
-#define segment_eq(a, b) ((a).seg == (b).seg)
-
-/*
- * When accessing user memory, we need to make sure the entire area really is in
- * user-level space. In order to do this efficiently, we make sure that the page at
- * address TASK_SIZE is never valid. We also need to make sure that the address doesn't
- * point inside the virtually mapped linear page table.
- */
-#ifdef XEN
-#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
-#define __access_ok(addr, size, segment) (!IS_VMM_ADDRESS((unsigned long)(addr)))
-#else
-#define __access_ok(addr, size, segment)						\
-({											\
-	__chk_user_ptr(addr);								\
-	(likely((unsigned long) (addr) <= (segment).seg)				\
-	 && ((segment).seg == KERNEL_DS.seg						\
-	     || likely(REGION_OFFSET((unsigned long) (addr)) < RGN_MAP_LIMIT)));	\
-})
-#endif
-#define access_ok(type, addr, size) __access_ok((addr), (size), get_fs())
-
-/* this function will go away soon - use access_ok() instead */
-static inline int __deprecated
-verify_area (int type, const void __user *addr, unsigned long size)
-{
- return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
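
A hedged example of the replacement pattern the deprecation comment above points at (check_then_copy is made up; access_ok() and __copy_to_user() are the real entry points):

static long check_then_copy(void __user *uptr, const void *src, unsigned long len)
{
	/* Validate once, then use the unchecked fast path. */
	if (!access_ok(VERIFY_WRITE, uptr, len))
		return -EFAULT;
	return __copy_to_user(uptr, src, len) ? -EFAULT : 0;
}
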
-
-/*
- * These are the main single-value transfer routines. They automatically
- * use the right size if we just have the right pointer type.
- *
- * Careful to not
- * (a) re-use the arguments for side effects (sizeof/typeof is ok)
- * (b) require any knowledge of processes at this stage
- */
-#define put_user(x, ptr)	__put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
-#define get_user(x, ptr)	__get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
-
-/*
- * The "__xxx" versions do not do address space checking, useful when
- * doing multiple accesses to the same area (the programmer has to do the
- * checks by hand with "access_ok()")
- */
-#define __put_user(x, ptr)	__put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define __get_user(x, ptr) __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-extern long __put_user_unaligned_unknown (void);
-
-#define __put_user_unaligned(x, ptr)							\
-({											\
-	long __ret;									\
-	switch (sizeof(*(ptr))) {							\
-	  case 1: __ret = __put_user((x), (ptr)); break;				\
-	  case 2: __ret = (__put_user((x), (u8 __user *)(ptr)))				\
-		| (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;		\
-	  case 4: __ret = (__put_user((x), (u16 __user *)(ptr)))			\
-		| (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;		\
-	  case 8: __ret = (__put_user((x), (u32 __user *)(ptr)))			\
-		| (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;		\
-	  default: __ret = __put_user_unaligned_unknown();				\
-	}										\
-	__ret;										\
-})
-
-extern long __get_user_unaligned_unknown (void);
-
-#define __get_user_unaligned(x, ptr)							\
-({											\
-	long __ret;									\
-	switch (sizeof(*(ptr))) {							\
-	  case 1: __ret = __get_user((x), (ptr)); break;				\
-	  case 2: __ret = (__get_user((x), (u8 __user *)(ptr)))				\
-		| (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;		\
-	  case 4: __ret = (__get_user((x), (u16 __user *)(ptr)))			\
-		| (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;		\
-	  case 8: __ret = (__get_user((x), (u32 __user *)(ptr)))			\
-		| (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;		\
-	  default: __ret = __get_user_unaligned_unknown();				\
-	}										\
-	__ret;										\
-})
-
-#ifdef ASM_SUPPORTED
- struct __large_struct { unsigned long buf[100]; };
-# define __m(x) (*(struct __large_struct __user *)(x))
-
-/* We need to declare the __ex_table section before we can use it in .xdata. */
-asm (".section \"__ex_table\", \"a\"\n\t.previous");
-
-# define __get_user_size(val, addr, n, err)						\
-do {											\
-	register long __gu_r8 asm ("r8") = 0;						\
-	register long __gu_r9 asm ("r9");						\
-	asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n" \
-	     "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"					\
-	     "[1:]"									\
-	     : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));		\
-	(err) = __gu_r8;								\
-	(val) = __gu_r9;								\
-} while (0)
-
-/*
- * The "__put_user_size()" macro tells gcc it reads from memory instead of
writing it. This
- * is because they do not write to any memory gcc knows about, so there are no
aliasing
- * issues.
- */
-# define __put_user_size(val, addr, n, err)
\
-do {
\
- register long __pu_r8 asm ("r8") = 0;
\
- asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by
exception handler\n" \
- "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"
\
- "[1:]"
\
- : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val),
"0"(__pu_r8)); \
- (err) = __pu_r8;
\
-} while (0)
-
-#else /* !ASM_SUPPORTED */
-# define RELOC_TYPE 2 /* ip-rel */
-# define __get_user_size(val, addr, n, err) \
-do { \
- __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE); \
- (err) = ia64_getreg(_IA64_REG_R8); \
- (val) = ia64_getreg(_IA64_REG_R9); \
-} while (0)
-# define __put_user_size(val, addr, n, err)						\
-do {											\
-	__st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val)); \
-	(err) = ia64_getreg(_IA64_REG_R8);						\
-} while (0)
-#endif /* !ASM_SUPPORTED */
-
-extern void __get_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 and r9 (among others). Thus, be careful not to evaluate it while
- * using r8/r9.
- */
-#define __do_get_user(check, x, ptr, size, segment)					\
-({											\
-	const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);				\
-	__typeof__ (size) __gu_size = (size);						\
-	long __gu_err = -EFAULT, __gu_val = 0;						\
-											\
-	if (!check || __access_ok(__gu_ptr, size, segment))				\
-		switch (__gu_size) {							\
-		  case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;	\
-		  case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;	\
-		  case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;	\
-		  case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;	\
-		  default: __get_user_unknown(); break;					\
-		}									\
-	(x) = (__typeof__(*(__gu_ptr))) __gu_val;					\
-	__gu_err;									\
-})
-
-#define __get_user_nocheck(x, ptr, size)	__do_get_user(0, x, ptr, size, KERNEL_DS)
-#define __get_user_check(x, ptr, size, segment)	__do_get_user(1, x, ptr, size, segment)
-
-extern void __put_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 (among others). Thus, be careful not to evaluate them while using r8.
- */
-#define __do_put_user(check, x, ptr, size, segment)					\
-({											\
-	__typeof__ (x) __pu_x = (x);							\
-	__typeof__ (*(ptr)) __user *__pu_ptr = (ptr);					\
-	__typeof__ (size) __pu_size = (size);						\
-	long __pu_err = -EFAULT;							\
-											\
-	if (!check || __access_ok(__pu_ptr, __pu_size, segment))			\
-		switch (__pu_size) {							\
-		  case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;	\
-		  case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;	\
-		  case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;	\
-		  case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;	\
-		  default: __put_user_unknown(); break;					\
-		}									\
-	__pu_err;									\
-})
-
-#define __put_user_nocheck(x, ptr, size)	__do_put_user(0, x, ptr, size, KERNEL_DS)
-#define __put_user_check(x, ptr, size, segment)	__do_put_user(1, x, ptr, size, segment)
-
-/*
- * Complex access routines
- */
-extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
-					       unsigned long count);
-
-static inline unsigned long
-__copy_to_user (void __user *to, const void *from, unsigned long count)
-{
- return __copy_user(to, (void __user *) from, count);
-}
-
-static inline unsigned long
-__copy_from_user (void *to, const void __user *from, unsigned long count)
-{
- return __copy_user((void __user *) to, from, count);
-}
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-#define copy_to_user(to, from, n)							\
-({											\
-	void __user *__cu_to = (to);							\
-	const void *__cu_from = (from);							\
-	long __cu_len = (n);								\
-											\
-	if (__access_ok(__cu_to, __cu_len, get_fs()))					\
-		__cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len);	\
-	__cu_len;									\
-})
-
-#define copy_from_user(to, from, n)							\
-({											\
-	void *__cu_to = (to);								\
-	const void __user *__cu_from = (from);						\
-	long __cu_len = (n);								\
-											\
-	__chk_user_ptr(__cu_from);							\
-	if (__access_ok(__cu_from, __cu_len, get_fs()))					\
-		__cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len);	\
-	__cu_len;									\
-})
-
-#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
-
-static inline unsigned long
-copy_in_user (void __user *to, const void __user *from, unsigned long n)
-{
-	if (likely(access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)))
- n = __copy_user(to, from, n);
- return n;
-}
-
-extern unsigned long __do_clear_user (void __user *, unsigned long);
-
-#define __clear_user(to, n) __do_clear_user(to, n)
-
-#define clear_user(to, n) \
-({ \
- unsigned long __cu_len = (n); \
- if (__access_ok(to, __cu_len, get_fs())) \
- __cu_len = __do_clear_user(to, __cu_len); \
- __cu_len; \
-})
-
-
-/*
- * Returns: -EFAULT if exception before terminator, N if the entire buffer filled, else
- * strlen.
- */
-extern long __must_check __strncpy_from_user (char *to, const char __user *from, long to_len);
-
-#define strncpy_from_user(to, from, n) \
-({ \
- const char __user * __sfu_from = (from); \
- long __sfu_ret = -EFAULT; \
- if (__access_ok(__sfu_from, 0, get_fs())) \
- __sfu_ret = __strncpy_from_user((to), __sfu_from, (n)); \
- __sfu_ret; \
-})
-
-/* Returns: 0 if bad, string length+1 (memory size) of string if ok */
-extern unsigned long __strlen_user (const char __user *);
-
-#define strlen_user(str) \
-({ \
- const char __user *__su_str = (str); \
- unsigned long __su_ret = 0; \
- if (__access_ok(__su_str, 0, get_fs())) \
- __su_ret = __strlen_user(__su_str); \
- __su_ret; \
-})
-
-/*
- * Returns: 0 if exception before NUL or reaching the supplied limit
- * (N), a value greater than N if the limit would be exceeded, else
- * strlen.
- */
-extern unsigned long __strnlen_user (const char __user *, long);
-
-#define strnlen_user(str, len) \
-({ \
- const char __user *__su_str = (str); \
- unsigned long __su_ret = 0; \
- if (__access_ok(__su_str, 0, get_fs())) \
- __su_ret = __strnlen_user(__su_str, len); \
- __su_ret; \
-})
-
-/* Generic code can't deal with the location-relative format that we use for compactness. */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
-
-struct exception_table_entry {
-	int addr;	/* location-relative address of insn this fixup is for */
-	int cont;	/* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
-};
-
-extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
-extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
-
-static inline int
-ia64_done_with_exception (struct pt_regs *regs)
-{
- const struct exception_table_entry *e;
- e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
- if (e) {
- ia64_handle_exception(regs, e);
- return 1;
- }
- return 0;
-}
-
-#ifndef XEN
-#define ARCH_HAS_TRANSLATE_MEM_PTR 1
-static __inline__ char *
-xlate_dev_mem_ptr (unsigned long p)
-{
- struct page *page;
- char * ptr;
-
- page = mfn_to_page(p >> PAGE_SHIFT);
- if (PageUncached(page))
- ptr = (char *)p + __IA64_UNCACHED_OFFSET;
- else
- ptr = __va(p);
-
- return ptr;
-}
-
-/*
- * Convert a virtual cached kernel memory pointer to an uncached pointer
- */
-static __inline__ char *
-xlate_dev_kmem_ptr (char * p)
-{
- struct page *page;
- char * ptr;
-
- page = virt_to_page((unsigned long)p >> PAGE_SHIFT);
- if (PageUncached(page))
- ptr = (char *)__pa(p) + __IA64_UNCACHED_OFFSET;
- else
- ptr = p;
-
- return ptr;
-}
-#endif
-
-#endif /* _ASM_IA64_UACCESS_H */
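
That deletes the Xen-private ia64 copy of uaccess.h wholesale. For reference, a hedged illustration of the calling convention it implemented (read_user_word is a made-up example; get_user is the real entry point and returns 0 or -EFAULT):

static long read_user_word(unsigned long __user *uptr, unsigned long *out)
{
	unsigned long v;

	if (get_user(v, uptr))		/* faults resolve via the fixup table */
		return -EFAULT;
	*out = v;
	return 0;
}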