# HG changeset patch
# User djm@xxxxxxxxxxxxxxx
# Node ID b7276814008c9c924fceecf6fd9f67ccddaadcb2
# Parent 44316ce8327754a7a70c80ffff551e7c4619e066
Begin updating to 2.6.13 base
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/Makefile
--- a/xen/arch/ia64/Makefile Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/Makefile Wed Aug 31 20:32:27 2005
@@ -1,18 +1,21 @@
include $(BASEDIR)/Rules.mk
-VPATH = linux linux-xen
+VPATH = linux linux-xen linux/lib
+#VPATH = linux-xen linux/lib
# libs-y += arch/ia64/lib/lib.a
OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \
- xenmisc.o pdb-stub.o acpi.o hypercall.o \
+ xenmisc.o acpi.o hypercall.o \
machvec.o dom0_ops.o domain.o hpsimserial.o pcdp.o \
idle0_task.o pal.o hpsim.o efi.o efi_stub.o ivt.o mm_contig.o \
xenmem.o sal.o cmdline.o mm_init.o tlb.o smpboot.o \
- extable.o linuxextable.o xenirq.o xentime.o \
+ extable.o linuxextable.o sort.o xenirq.o xentime.o \
regionreg.o entry.o unaligned.o privop.o vcpu.o \
irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \
grant_table.o sn_console.o
+
+#OBJS += idiv64.o idiv32.o \
# TMP holder to contain *.0 moved out of CONFIG_VTI
OBJS += vmx_init.o
@@ -22,6 +25,13 @@
vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \
vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o
endif
+
+# files from xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
+OBJS += bitop.o clear_page.o flush.o copy_page_mck.o			\
+ memset.o strlen.o memcpy_mck.o \
+ __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
+ __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
+
# perfmon.o
# unwind.o needed for kernel unwinding (rare)
@@ -30,8 +40,8 @@
# remove following line if not privifying in memory
# OBJS += privify.o
-default: $(OBJS) head.o ia64lib.o xen.lds.s
- $(LD) -r -o arch.o $(OBJS) ia64lib.o
+default: $(OBJS) head.o xen.lds.s
+ $(LD) -r -o arch.o $(OBJS)
$(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \
-Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms
$(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET)
@@ -79,12 +89,29 @@
$(CC) -E $(CPPFLAGS) -P -DXEN -D__ASSEMBLY__ \
-o xen.lds.s xen.lds.S
-ia64lib.o:
- $(MAKE) -C linux/lib && cp linux/lib/ia64lib.o .
+# variants of divide/modulo
+# see files in xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
+__divdi3.o: idiv64.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
+__udivdi3.o: idiv64.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
+__moddi3.o: idiv64.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
+__umoddi3.o: idiv64.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
+__divsi3.o: idiv32.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
+__udivsi3.o: idiv32.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
+__modsi3.o: idiv32.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
+__umodsi3.o: idiv32.S
+ $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
+
clean:
	rm -f *.o *~ core xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s
rm -f asm-xsi-offsets.s $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h
- rm -f lib/*.o
+ rm -f linux/lib/*.o
.PHONY: default clean
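
The eight __div*/__mod* objects added to OBJS above are all built from just two sources, idiv32.S and idiv64.S; the -DUNSIGNED and -DMODULO flags in the new rules pick which of the four operations each object ends up containing. A rough C equivalent of that one-source/four-objects pattern (OP_NAME and INT_T are illustrative names, not from the tree):

	#if defined(UNSIGNED)
	typedef unsigned long INT_T;	/* __udiv* / __umod* flavour */
	#else
	typedef long INT_T;		/* __div* / __mod* flavour */
	#endif

	INT_T OP_NAME(INT_T a, INT_T b)
	{
	#if defined(MODULO)
		return a % b;		/* remainder variants */
	#else
		return a / b;		/* quotient variants */
	#endif
	}
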
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/setup.c
--- a/xen/arch/ia64/linux-xen/setup.c Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/setup.c Wed Aug 31 20:32:27 2005
@@ -4,10 +4,15 @@
* Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
* David Mosberger-Tang <davidm@xxxxxxxxxx>
* Stephane Eranian <eranian@xxxxxxxxxx>
- * Copyright (C) 2000, Rohit Seth <rohit.seth@xxxxxxxxx>
+ * Copyright (C) 2000, 2004 Intel Corp
+ * Rohit Seth <rohit.seth@xxxxxxxxx>
+ * Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
+ * Gordon Jin <gordon.jin@xxxxxxxxx>
* Copyright (C) 1999 VA Linux Systems
* Copyright (C) 1999 Walt Drummond <drummond@xxxxxxxxxxx>
*
+ * 12/26/04 S.Siddha, G.Jin, R.Seth
+ * Add multi-threading and multi-core detection
* 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
* 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
* 03/31/00 R.Seth cpu_initialized and current->processor fixes
@@ -15,6 +20,7 @@
* 02/01/00 R.Seth fixed get_cpuinfo for SMP
* 01/07/99 S.Eranian added the support for command line argument
* 06/24/99 W.Drummond added boot_cpu_data.
+ * 05/28/05 Z. Menyhart Dynamic stride size for "flush_icache_range()"
*/
#include <linux/config.h>
#include <linux/module.h>
@@ -35,6 +41,10 @@
#include <linux/serial_core.h>
#include <linux/efi.h>
#include <linux/initrd.h>
+#ifndef XEN
+#include <linux/platform.h>
+#include <linux/pm.h>
+#endif
#include <asm/ia32.h>
#include <asm/machvec.h>
@@ -51,8 +61,10 @@
#include <asm/smp.h>
#include <asm/system.h>
#include <asm/unistd.h>
+#ifdef XEN
#include <asm/vmx.h>
#include <asm/io.h>
+#endif
#if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
# error "struct cpuinfo_ia64 too big!"
@@ -64,12 +76,16 @@
#endif
DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
+#ifdef XEN
DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr);
+#endif
DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
unsigned long ia64_cycles_per_usec;
struct ia64_boot_param *ia64_boot_param;
struct screen_info screen_info;
+unsigned long vga_console_iobase;
+unsigned long vga_console_membase;
unsigned long ia64_max_cacheline_size;
unsigned long ia64_iobase; /* virtual address for I/O accesses */
@@ -78,7 +94,12 @@
EXPORT_SYMBOL(io_space);
unsigned int num_io_spaces;
-unsigned char aux_device_present = 0xaa;	/* XXX remove this when legacy I/O is gone */
+/*
+ * "flush_icache_range()" needs to know what processor dependent stride size to use
+ * when it makes i-cache(s) coherent with d-caches.
+ */
+#define	I_CACHE_STRIDE_SHIFT	5	/* Safest way to go: 32 bytes by 32 bytes */
+unsigned long ia64_i_cache_stride_shift = ~0;
/*
 * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
@@ -287,23 +308,25 @@
static inline int __init
early_console_setup (char *cmdline)
{
+ int earlycons = 0;
+
#ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
{
extern int sn_serial_console_early_setup(void);
if (!sn_serial_console_early_setup())
- return 0;
+ earlycons++;
}
#endif
#ifdef CONFIG_EFI_PCDP
if (!efi_setup_pcdp_console(cmdline))
- return 0;
+ earlycons++;
#endif
#ifdef CONFIG_SERIAL_8250_CONSOLE
if (!early_serial_console_init(cmdline))
- return 0;
-#endif
-
- return -1;
+ earlycons++;
+#endif
+
+ return (earlycons) ? 0 : -1;
}
static inline void
@@ -315,7 +338,34 @@
#endif
}
-void __init
+#ifdef CONFIG_SMP
+static void
+check_for_logical_procs (void)
+{
+ pal_logical_to_physical_t info;
+ s64 status;
+
+ status = ia64_pal_logical_to_phys(0, &info);
+ if (status == -1) {
+ printk(KERN_INFO "No logical to physical processor mapping "
+ "available\n");
+ return;
+ }
+ if (status) {
+ printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n",
+ status);
+ return;
+ }
+ /*
+ * Total number of siblings that BSP has. Though not all of them
+ * may have booted successfully. The correct number of siblings
+ * booted is in info.overview_num_log.
+ */
+ smp_num_siblings = info.overview_tpc;
+ smp_num_cpucores = info.overview_cpp;
+}
+#endif
+
#ifdef XEN
early_setup_arch (char **cmdline_p)
#else
@@ -398,6 +448,19 @@
#ifdef CONFIG_SMP
cpu_physical_id(0) = hard_smp_processor_id();
+
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+
+ check_for_logical_procs();
+ if (smp_num_cpucores > 1)
+ printk(KERN_INFO
+		       "cpu package is Multi-Core capable: number of cores=%d\n",
+		       smp_num_cpucores);
+	if (smp_num_siblings > 1)
+		printk(KERN_INFO
+		       "cpu package is Multi-Threading capable: number of siblings=%d\n",
+ smp_num_siblings);
#endif
#ifdef XEN
@@ -505,12 +568,23 @@
"cpu regs : %u\n"
"cpu MHz : %lu.%06lu\n"
"itc MHz : %lu.%06lu\n"
- "BogoMIPS : %lu.%02lu\n\n",
+ "BogoMIPS : %lu.%02lu\n",
cpunum, c->vendor, family, c->model, c->revision, c->archrev,
features, c->ppn, c->number,
c->proc_freq / 1000000, c->proc_freq % 1000000,
c->itc_freq / 1000000, c->itc_freq % 1000000,
lpj*HZ/500000, (lpj*HZ/5000) % 100);
+#ifdef CONFIG_SMP
+ seq_printf(m, "siblings : %u\n", c->num_log);
+ if (c->threads_per_core > 1 || c->cores_per_socket > 1)
+ seq_printf(m,
+ "physical id: %u\n"
+ "core id : %u\n"
+ "thread id : %u\n",
+ c->socket_id, c->core_id, c->thread_id);
+#endif
+ seq_printf(m,"\n");
+
return 0;
}
@@ -581,6 +655,14 @@
memcpy(c->vendor, cpuid.field.vendor, 16);
#ifdef CONFIG_SMP
c->cpu = smp_processor_id();
+
+ /* below default values will be overwritten by identify_siblings()
+ * for Multi-Threading/Multi-Core capable cpu's
+ */
+ c->threads_per_core = c->cores_per_socket = c->num_log = 1;
+ c->socket_id = -1;
+
+ identify_siblings(c);
#endif
c->ppn = cpuid.field.ppn;
c->number = cpuid.field.number;
@@ -611,6 +693,12 @@
/* start_kernel() requires this... */
}
+/*
+ * Calculate the max. cache line size.
+ *
+ * In addition, the minimum of the i-cache stride sizes is calculated for
+ * "flush_icache_range()".
+ */
static void
get_max_cacheline_size (void)
{
@@ -624,6 +712,8 @@
		printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
__FUNCTION__, status);
max = SMP_CACHE_BYTES;
+ /* Safest setup for "flush_icache_range()" */
+ ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
goto out;
}
@@ -632,14 +722,31 @@
&cci);
if (status != 0) {
printk(KERN_ERR
-		       "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n",
+		       "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
__FUNCTION__, l, status);
max = SMP_CACHE_BYTES;
+ /* The safest setup for "flush_icache_range()" */
+ cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+ cci.pcci_unified = 1;
}
line_size = 1 << cci.pcci_line_size;
if (line_size > max)
max = line_size;
- }
+ if (!cci.pcci_unified) {
+ status = ia64_pal_cache_config_info(l,
+ /* cache_type
(instruction)= */ 1,
+ &cci);
+ if (status != 0) {
+ printk(KERN_ERR
+ "%s: ia64_pal_cache_config_info(l=%lu, 1)
failed (status=%ld)\n",
+ __FUNCTION__, l, status);
+ /* The safest setup for "flush_icache_range()"
*/
+ cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+ }
+ }
+ if (cci.pcci_stride < ia64_i_cache_stride_shift)
+ ia64_i_cache_stride_shift = cci.pcci_stride;
+ }
out:
if (max > ia64_max_cacheline_size)
ia64_max_cacheline_size = max;
@@ -700,7 +807,17 @@
ia64_set_kr(IA64_KR_FPU_OWNER, 0);
/*
-	 * Initialize default control register to defer all speculative faults.  The
+ * Initialize the page-table base register to a global
+ * directory with all zeroes. This ensure that we can handle
+ * TLB-misses to user address-space even before we created the
+ * first user address-space. This may happen, e.g., due to
+ * aggressive use of lfetch.fault.
+ */
+ ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
+
+ /*
+	 * Initialize default control register to defer speculative faults except
+	 * for those arising from TLB misses, which are not deferred.  The
	 * kernel MUST NOT depend on a particular setting of these bits (in other words,
	 * the kernel must have recovery code for all speculative accesses).  Turn on
	 * dcr.lc as per recommendation by the architecture team.  Most IA-32 apps
@@ -762,6 +879,9 @@
/* size of physical stacked register partition plus 8 bytes: */
__get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
platform_cpu_init();
+#ifndef XEN
+ pm_idle = default_idle;
+#endif
}
void
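
Taken together, the setup.c changes above keep ia64_i_cache_stride_shift at the smallest instruction-cache stride PAL reports across all cache levels, and fall back to I_CACHE_STRIDE_SHIFT (32-byte strides) whenever a PAL query fails; flush_icache_range() later advances by 1 << ia64_i_cache_stride_shift. A simplified C sketch of that selection (illustrative only, not the kernel code; the arrays stand in for per-level PAL results):

	#define I_CACHE_STRIDE_SHIFT	5	/* safest fallback: 32-byte strides */

	/* shifts[]: per-level i-cache stride shifts; ok[]: nonzero if the PAL query succeeded */
	unsigned long pick_i_cache_stride_shift(const unsigned long *shifts,
						const int *ok, int levels)
	{
		unsigned long best = ~0UL;	/* start above any real stride */
		int l;

		for (l = 0; l < levels; l++) {
			unsigned long s = ok[l] ? shifts[l] : I_CACHE_STRIDE_SHIFT;
			if (s < best)
				best = s;	/* keep the smallest stride */
		}
		return best;
	}
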
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/extable.c
--- a/xen/arch/ia64/linux/extable.c Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/extable.c Wed Aug 31 20:32:27 2005
@@ -6,29 +6,29 @@
*/
#include <linux/config.h>
+#include <linux/sort.h>
#include <asm/uaccess.h>
#include <asm/module.h>
-static inline int
-compare_entries (struct exception_table_entry *l, struct exception_table_entry *r)
+static int cmp_ex(const void *a, const void *b)
{
+ const struct exception_table_entry *l = a, *r = b;
u64 lip = (u64) &l->addr + l->addr;
u64 rip = (u64) &r->addr + r->addr;
+ /* avoid overflow */
+ if (lip > rip)
+ return 1;
if (lip < rip)
return -1;
- if (lip == rip)
- return 0;
- else
- return 1;
+ return 0;
}
-static inline void
-swap_entries (struct exception_table_entry *l, struct exception_table_entry *r)
+static void swap_ex(void *a, void *b, int size)
{
+ struct exception_table_entry *l = a, *r = b, tmp;
u64 delta = (u64) r - (u64) l;
- struct exception_table_entry tmp;
tmp = *l;
l->addr = r->addr + delta;
@@ -38,23 +38,20 @@
}
/*
- * Sort the exception table.  It's usually already sorted, but there may be unordered
- * entries due to multiple text sections (such as the .init text section).  Note that the
- * exception-table-entries contain location-relative addresses, which requires a bit of
- * care during sorting to avoid overflows in the offset members (e.g., it would not be
- * safe to make a temporary copy of an exception-table entry on the stack, because the
- * stack may be more than 2GB away from the exception-table).
+ * Sort the exception table. It's usually already sorted, but there
+ * may be unordered entries due to multiple text sections (such as the
+ * .init text section). Note that the exception-table-entries contain
+ * location-relative addresses, which requires a bit of care during
+ * sorting to avoid overflows in the offset members (e.g., it would
+ * not be safe to make a temporary copy of an exception-table entry on
+ * the stack, because the stack may be more than 2GB away from the
+ * exception-table).
*/
-void
-sort_extable (struct exception_table_entry *start, struct
exception_table_entry *finish)
+void sort_extable (struct exception_table_entry *start,
+ struct exception_table_entry *finish)
{
- struct exception_table_entry *p, *q;
-
- /* insertion sort */
- for (p = start + 1; p < finish; ++p)
- /* start .. p-1 is sorted; push p down to it's proper place */
-		for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q)
- swap_entries(&q[0], &q[-1]);
+ sort(start, finish - start, sizeof(struct exception_table_entry),
+ cmp_ex, swap_ex);
}
const struct exception_table_entry *
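
Both cmp_ex() and swap_ex() above operate on self-relative entries: each addr field stores the target minus the field's own address, which is why a swapped entry's offset must be re-biased by the distance it moved. A standalone sketch of that encoding (illustrative only, not the kernel structures):

	#include <stdint.h>

	struct rel_entry { int64_t addr; };	/* self-relative: target - &entry->addr */

	static uint64_t rel_decode(const struct rel_entry *e)
	{
		return (uint64_t) &e->addr + e->addr;
	}

	/* Move src's payload into dst while keeping the decoded target unchanged. */
	static void rel_move(struct rel_entry *dst, const struct rel_entry *src)
	{
		int64_t delta = (int64_t) ((char *) src - (char *) dst);

		dst->addr = src->addr + delta;
	}
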
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/ia64_ksyms.c
--- a/xen/arch/ia64/linux/ia64_ksyms.c Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/ia64_ksyms.c Wed Aug 31 20:32:27 2005
@@ -57,9 +57,6 @@
EXPORT_SYMBOL(__strlen_user);
EXPORT_SYMBOL(__strncpy_from_user);
EXPORT_SYMBOL(__strnlen_user);
-
-#include <asm/unistd.h>
-EXPORT_SYMBOL(__ia64_syscall);
/* from arch/ia64/lib */
extern void __divsi3(void);
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/flush.S
--- a/xen/arch/ia64/linux/lib/flush.S Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/flush.S Wed Aug 31 20:32:27 2005
@@ -1,39 +1,61 @@
/*
* Cache flushing routines.
*
- * Copyright (C) 1999-2001 Hewlett-Packard Co
- * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
+ * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@xxxxxxxxxx>
+ *
+ * 05/28/05 Zoltan Menyhart Dynamic stride size
*/
+
#include <asm/asmmacro.h>
-#include <asm/page.h>
+
/*
* flush_icache_range(start,end)
- * Must flush range from start to end-1 but nothing else (need to
+ *
+ * Make i-cache(s) coherent with d-caches.
+ *
+ *		Must deal with range from start to end-1 but nothing else (need to
* be careful not to touch addresses that may be unmapped).
+ *
+ * Note: "in0" and "in1" are preserved for debugging purposes.
*/
GLOBAL_ENTRY(flush_icache_range)
+
.prologue
- alloc r2=ar.pfs,2,0,0,0
- sub r8=in1,in0,1
+ alloc r2=ar.pfs,2,0,0,0
+ movl r3=ia64_i_cache_stride_shift
+ mov r21=1
;;
- shr.u r8=r8,5 // we flush 32 bytes per iteration
- .save ar.lc, r3
- mov r3=ar.lc // save ar.lc
+ ld8 r20=[r3] // r20: stride shift
+ sub r22=in1,r0,1 // last byte address
+ ;;
+ shr.u r23=in0,r20 // start / (stride size)
+ shr.u r22=r22,r20 // (last byte address) / (stride size)
+ shl r21=r21,r20 // r21: stride size of the i-cache(s)
+ ;;
+ sub r8=r22,r23 // number of strides - 1
+ shl r24=r23,r20 // r24: addresses for "fc.i" =
+					//	"start" rounded down to stride boundary
+ .save ar.lc,r3
+ mov r3=ar.lc // save ar.lc
;;
.body
-
- mov ar.lc=r8
+ mov ar.lc=r8
;;
-.Loop: fc in0 // issuable on M0 only
- add in0=32,in0
+ /*
+ * 32 byte aligned loop, even number of (actually 2) bundles
+ */
+.Loop: fc.i r24 // issuable on M0 only
+	add	r24=r21,r24		// we flush "stride size" bytes per iteration
+ nop.i 0
br.cloop.sptk.few .Loop
;;
sync.i
;;
srlz.i
;;
- mov ar.lc=r3 // restore ar.lc
+ mov ar.lc=r3 // restore ar.lc
br.ret.sptk.many rp
END(flush_icache_range)
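
The rewritten loop above no longer assumes 32-byte lines: it reads the stride from ia64_i_cache_stride_shift, rounds the start address down to a stride boundary, and issues one fc.i per stride. In C terms, the bounds it computes look roughly like this (a model of the arithmetic only, not a replacement for the assembly):

	void flush_icache_range_model(unsigned long start, unsigned long end,
				      unsigned long stride_shift)
	{
		unsigned long stride = 1UL << stride_shift;
		unsigned long addr   = (start >> stride_shift) << stride_shift;
		unsigned long count  = ((end - 1) >> stride_shift)
				       - (start >> stride_shift) + 1;

		while (count--) {
			/* fc.i addr: make the i-caches coherent for this stride */
			addr += stride;
		}
	}
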
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy_mck.S
--- a/xen/arch/ia64/linux/lib/memcpy_mck.S Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/memcpy_mck.S Wed Aug 31 20:32:27 2005
@@ -75,6 +75,7 @@
mov f6=f0
br.cond.sptk .common_code
;;
+END(memcpy)
GLOBAL_ENTRY(__copy_user)
.prologue
// check dest alignment
@@ -300,7 +301,7 @@
add src_pre_mem=0,src0 // prefetch src pointer
add dst_pre_mem=0,dst0 // prefetch dest pointer
and src0=-8,src0 // 1st src pointer
-(p7) mov ar.lc = r21
+(p7) mov ar.lc = cnt
(p8) mov ar.lc = r0
;;
TEXT_ALIGN(32)
@@ -524,7 +525,6 @@
#undef B
#undef C
#undef D
-END(memcpy)
/*
* Due to lack of local tag support in gcc 2.x assembler, it is not clear which
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memset.S
--- a/xen/arch/ia64/linux/lib/memset.S Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/memset.S Wed Aug 31 20:32:27 2005
@@ -57,10 +57,10 @@
{ .mmi
.prologue
alloc tmp = ar.pfs, 3, 0, 0, 0
- .body
lfetch.nt1 [dest] //
.save ar.lc, save_lc
mov.i save_lc = ar.lc
+ .body
} { .mmi
mov ret0 = dest // return value
	cmp.ne	p_nz, p_zr = value, r0	// use stf.spill if value is zero
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/pcdp.h
--- a/xen/arch/ia64/linux/pcdp.h Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/pcdp.h Wed Aug 31 20:32:27 2005
@@ -2,7 +2,7 @@
* Definitions for PCDP-defined console devices
*
* v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf
- * v2.0: http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf
+ * v2.0: http://www.dig64.org/specifications/DIG64_PCDPv20.pdf
*
* (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
* Khalid Aziz <khalid.aziz@xxxxxx>
@@ -52,11 +52,36 @@
u32 clock_rate;
u8 pci_prog_intfc;
u8 flags;
-};
+ u16 conout_index;
+ u32 reserved;
+} __attribute__((packed));
+
+#define PCDP_IF_PCI 1
+
+/* pcdp_if_pci.trans */
+#define PCDP_PCI_TRANS_IOPORT 0x02
+#define PCDP_PCI_TRANS_MMIO 0x01
+
+struct pcdp_if_pci {
+ u8 interconnect;
+ u8 reserved;
+ u16 length;
+ u8 segment;
+ u8 bus;
+ u8 dev;
+ u8 fun;
+ u16 dev_id;
+ u16 vendor_id;
+ u32 acpi_interrupt;
+ u64 mmio_tra;
+ u64 ioport_tra;
+ u8 flags;
+ u8 trans;
+} __attribute__((packed));
struct pcdp_vga {
u8 count; /* address space descriptors */
-};
+} __attribute__((packed));
/* pcdp_device.flags */
#define PCDP_PRIMARY_CONSOLE 1
@@ -66,7 +91,9 @@
u8 flags;
u16 length;
u16 efi_index;
-};
+ /* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */
+ /* next data is device specific type (currently only pcdp_vga) */
+} __attribute__((packed));
struct pcdp {
u8 signature[4];
@@ -81,4 +108,4 @@
u32 num_uarts;
struct pcdp_uart uart[0]; /* actual size is num_uarts */
/* remainder of table is pcdp_device structures */
-};
+} __attribute__((packed));
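
Marking the PCDP structures __attribute__((packed)) matters because the table is laid out byte-for-byte by firmware; without it the compiler may insert alignment padding and every later field (and sizeof) would be off. A minimal illustration of the difference, using made-up field names:

	struct hdr_unpacked {
		unsigned char  type;
		unsigned int   length;		/* usually preceded by 3 padding bytes */
	};

	struct hdr_packed {
		unsigned char  type;
		unsigned int   length;		/* immediately follows 'type' */
	} __attribute__((packed));

	/* sizeof(struct hdr_unpacked) is typically 8; sizeof(struct hdr_packed) is 5. */
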
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/minstate.h
--- /dev/null Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/minstate.h Wed Aug 31 20:32:27 2005
@@ -0,0 +1,254 @@
+#include <linux/config.h>
+
+#include <asm/cache.h>
+
+#include "entry.h"
+
+/*
+ * For ivt.s we want to access the stack virtually so we don't have to disable translation
+ * on interrupts.
+ *
+ * On entry:
+ * r1: pointer to current task (ar.k6)
+ */
+#define MINSTATE_START_SAVE_MIN_VIRT								\
+(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
+	;;											\
+(pUStk)	mov.m r24=ar.rnat;									\
+(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
+(pKStk) mov r1=sp;					/* get sp  */				\
+	;;											\
+(pUStk) lfetch.fault.excl.nt1 [r22];								\
+(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
+(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
+	;;											\
+(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
+	;;											\
+(pUStk)	mov r18=ar.bsp;										\
+(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
+
+#define MINSTATE_END_SAVE_MIN_VIRT								\
+	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
+	;;
+
+/*
+ * For mca_asm.S we want to access the stack physically since the state is saved before we
+ * go virtual and don't want to destroy the iip or ipsr.
+ */
+#define MINSTATE_START_SAVE_MIN_PHYS								\
+(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;								\
+(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;							\
+(pKStk) ld8 r3 = [r3];;									\
+(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;						\
+(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;						\
+(pUStk) mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
+(pUStk) addl r22=IA64_RBS_OFFSET,r1;		/* compute base of register backing store */	\
+	;;											\
+(pUStk) mov r24=ar.rnat;									\
+(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
+(pUStk) mov r23=ar.bspstore;			/* save ar.bspstore */				\
+(pUStk) dep r22=-1,r22,61,3;			/* compute kernel virtual addr of RBS */	\
+	;;											\
+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
+(pUStk) mov ar.bspstore=r22;			/* switch to kernel RBS */			\
+	;;											\
+(pUStk) mov r18=ar.bsp;									\
+(pUStk) mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */	\
+
+#define MINSTATE_END_SAVE_MIN_PHYS								\
+	dep r12=-1,r12,61,3;		/* make sp a kernel virtual address */			\
+	;;
+
+#ifdef MINSTATE_VIRT
+# define MINSTATE_GET_CURRENT(reg) \
+ movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
+ ld8 reg=[reg]
+# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT
+# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT
+#endif
+
+#ifdef MINSTATE_PHYS
+# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg
+# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS
+# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS
+#endif
+
+/*
+ * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
+ * the minimum state necessary that allows us to turn psr.ic back
+ * on.
+ *
+ * Assumed state upon entry:
+ * psr.ic: off
+ * r31: contains saved predicates (pr)
+ *
+ * Upon exit, the state is as follows:
+ * psr.ic: off
+ * r2 = points to &pt_regs.r16
+ * r8 = contents of ar.ccv
+ * r9 = contents of ar.csd
+ * r10 = contents of ar.ssd
+ * r11 = FPSR_DEFAULT
+ * r12 = kernel sp (kernel virtual address)
+ * r13 = points to current task_struct (kernel virtual address)
+ * p15 = TRUE if psr.i is set in cr.ipsr
+ * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
+ * preserved
+ *
+ * Note that psr.ic is NOT turned on by this macro. This is so that
+ * we can pass interruption state as arguments to a handler.
+ */
+#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)							\
+	MINSTATE_GET_CURRENT(r16);	/* M (or M;;I) */					\
+	mov r27=ar.rsc;			/* M */							\
+	mov r20=r1;			/* A */							\
+	mov r25=ar.unat;		/* M */							\
+	mov r29=cr.ipsr;		/* M */							\
+	mov r26=ar.pfs;			/* I */							\
+	mov r28=cr.iip;			/* M */							\
+	mov r21=ar.fpsr;		/* M */							\
+	COVER;				/* B;; (or nothing) */					\
+	;;											\
+	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
+	;;											\
+	ld1 r17=[r16];			/* load current->thread.on_ustack flag */		\
+	st1 [r16]=r0;			/* clear current->thread.on_ustack flag */		\
+	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
+	/* switch from user to kernel RBS: */							\
+	;;											\
+	invala;				/* M */							\
+	SAVE_IFS;										\
+	cmp.eq pKStk,pUStk=r0,r17;	/* are we in kernel mode already? */			\
+	;;											\
+	MINSTATE_START_SAVE_MIN									\
+	adds r17=2*L1_CACHE_BYTES,r1;	/* really: biggest cache-line size */			\
+	adds r16=PT(CR_IPSR),r1;								\
+	;;											\
+	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
+	st8 [r16]=r29;			/* save cr.ipsr */					\
+	;;											\
+	lfetch.fault.excl.nt1 [r17];								\
+	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
+	mov r29=b0										\
+	;;											\
+	adds r16=PT(R8),r1;		/* initialize first base pointer */			\
+	adds r17=PT(R9),r1;		/* initialize second base pointer */			\
+(pKStk)	mov r18=r0;			/* make sure r18 isn't NaT */				\
+	;;											\
+.mem.offset 0,0; st8.spill [r16]=r8,16;								\
+.mem.offset 8,0; st8.spill [r17]=r9,16;								\
+	;;											\
+.mem.offset 0,0; st8.spill [r16]=r10,24;							\
+.mem.offset 8,0; st8.spill [r17]=r11,24;							\
+	;;											\
+	st8 [r16]=r28,16;		/* save cr.iip */					\
+	st8 [r17]=r30,16;		/* save cr.ifs */					\
+(pUStk)	sub r18=r18,r22;		/* r18=RSE.ndirty*8 */					\
+	mov r8=ar.ccv;										\
+	mov r9=ar.csd;										\
+	mov r10=ar.ssd;										\
+	movl r11=FPSR_DEFAULT;		/* L-unit */						\
+	;;											\
+	st8 [r16]=r25,16;		/* save ar.unat */					\
+	st8 [r17]=r26,16;		/* save ar.pfs */					\
+	shl r18=r18,16;			/* compute ar.rsc to be used for "loadrs" */		\
+	;;											\
+	st8 [r16]=r27,16;		/* save ar.rsc */					\
+(pUStk)	st8 [r17]=r24,16;		/* save ar.rnat */					\
+(pKStk)	adds r17=16,r17;		/* skip over ar_rnat field */				\
+	;;				/* avoid RAW on r16 & r17 */				\
+(pUStk)	st8 [r16]=r23,16;		/* save ar.bspstore */					\
+	st8 [r17]=r31,16;		/* save predicates */					\
+(pKStk)	adds r16=16,r16;		/* skip over ar_bspstore field */			\
+	;;											\
+	st8 [r16]=r29,16;		/* save b0 */						\
+	st8 [r17]=r18,16;		/* save ar.rsc value for "loadrs" */			\
+	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
+	;;											\
+.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
+.mem.offset 8,0; st8.spill [r17]=r12,16;							\
+	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
+	;;											\
+.mem.offset 0,0; st8.spill [r16]=r13,16;							\
+.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
+	movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;					\
+	ld8 r13=[r13];			/* establish 'current' */				\
+	;;											\
+.mem.offset 0,0; st8.spill [r16]=r15,16;							\
+.mem.offset 8,0; st8.spill [r17]=r14,16;							\
+	;;											\
+.mem.offset 0,0; st8.spill [r16]=r2,16;								\
+.mem.offset 8,0; st8.spill [r17]=r3,16;								\
+	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
+	;;											\
+	EXTRA;											\
+	movl r1=__gp;			/* establish kernel global pointer */			\
+	;;											\
+	MINSTATE_END_SAVE_MIN
+
+/*
+ * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
+ *
+ * Assumed state upon entry:
+ * psr.ic: on
+ * r2: points to &pt_regs.r16
+ * r3: points to &pt_regs.r17
+ * r8: contents of ar.ccv
+ * r9: contents of ar.csd
+ * r10: contents of ar.ssd
+ * r11: FPSR_DEFAULT
+ *
+ * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
+ */
+#define SAVE_REST \
+.mem.offset 0,0; st8.spill [r2]=r16,16; \
+.mem.offset 8,0; st8.spill [r3]=r17,16; \
+ ;; \
+.mem.offset 0,0; st8.spill [r2]=r18,16; \
+.mem.offset 8,0; st8.spill [r3]=r19,16; \
+ ;; \
+.mem.offset 0,0; st8.spill [r2]=r20,16; \
+.mem.offset 8,0; st8.spill [r3]=r21,16; \
+ mov r18=b6; \
+ ;; \
+.mem.offset 0,0; st8.spill [r2]=r22,16; \
+.mem.offset 8,0; st8.spill [r3]=r23,16; \
+ mov r19=b7; \
+ ;; \
+.mem.offset 0,0; st8.spill [r2]=r24,16; \
+.mem.offset 8,0; st8.spill [r3]=r25,16; \
+ ;; \
+.mem.offset 0,0; st8.spill [r2]=r26,16; \
+.mem.offset 8,0; st8.spill [r3]=r27,16; \
+ ;; \
+.mem.offset 0,0; st8.spill [r2]=r28,16; \
+.mem.offset 8,0; st8.spill [r3]=r29,16; \
+ ;; \
+.mem.offset 0,0; st8.spill [r2]=r30,16; \
+.mem.offset 8,0; st8.spill [r3]=r31,32; \
+ ;; \
+ mov ar.fpsr=r11; /* M-unit */ \
+ st8 [r2]=r8,8; /* ar.ccv */ \
+ adds r24=PT(B6)-PT(F7),r3; \
+ ;; \
+ stf.spill [r2]=f6,32; \
+ stf.spill [r3]=f7,32; \
+ ;; \
+ stf.spill [r2]=f8,32; \
+ stf.spill [r3]=f9,32; \
+ ;; \
+ stf.spill [r2]=f10; \
+ stf.spill [r3]=f11; \
+ adds r25=PT(B7)-PT(F11),r3; \
+ ;; \
+ st8 [r24]=r18,16; /* b6 */ \
+ st8 [r25]=r19,16; /* b7 */ \
+ ;; \
+ st8 [r24]=r9; /* ar.csd */ \
+ st8 [r25]=r10; /* ar.ssd */ \
+ ;;
+
+#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,)
+#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
+#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, )
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/sort.c
--- /dev/null Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/sort.c Wed Aug 31 20:32:27 2005
@@ -0,0 +1,122 @@
+/*
+ * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ *
+ * Jan 23 2005 Matt Mackall <mpm@xxxxxxxxxxx>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#ifdef XEN
+#include <linux/types.h>
+#endif
+
+void u32_swap(void *a, void *b, int size)
+{
+ u32 t = *(u32 *)a;
+ *(u32 *)a = *(u32 *)b;
+ *(u32 *)b = t;
+}
+
+void generic_swap(void *a, void *b, int size)
+{
+ char t;
+
+ do {
+ t = *(char *)a;
+ *(char *)a++ = *(char *)b;
+ *(char *)b++ = t;
+ } while (--size > 0);
+}
+
+/*
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp: pointer to comparison function
+ * @swap: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+
+void sort(void *base, size_t num, size_t size,
+ int (*cmp)(const void *, const void *),
+ void (*swap)(void *, void *, int size))
+{
+ /* pre-scale counters for performance */
+ int i = (num/2) * size, n = num * size, c, r;
+
+ if (!swap)
+ swap = (size == 4 ? u32_swap : generic_swap);
+
+ /* heapify */
+ for ( ; i >= 0; i -= size) {
+ for (r = i; r * 2 < n; r = c) {
+ c = r * 2;
+ if (c < n - size && cmp(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp(base + r, base + c) >= 0)
+ break;
+ swap(base + r, base + c, size);
+ }
+ }
+
+ /* sort */
+ for (i = n - size; i >= 0; i -= size) {
+ swap(base, base + i, size);
+ for (r = 0; r * 2 < i; r = c) {
+ c = r * 2;
+ if (c < i - size && cmp(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp(base + r, base + c) >= 0)
+ break;
+ swap(base + r, base + c, size);
+ }
+ }
+}
+
+EXPORT_SYMBOL(sort);
+
+#if 0
+/* a simple boot-time regression test */
+
+int cmpint(const void *a, const void *b)
+{
+ return *(int *)a - *(int *)b;
+}
+
+static int sort_test(void)
+{
+ int *a, i, r = 1;
+
+ a = kmalloc(1000 * sizeof(int), GFP_KERNEL);
+ BUG_ON(!a);
+
+ printk("testing sort()\n");
+
+ for (i = 0; i < 1000; i++) {
+ r = (r * 725861) % 6599;
+ a[i] = r;
+ }
+
+ sort(a, 1000, sizeof(int), cmpint, NULL);
+
+ for (i = 0; i < 999; i++)
+ if (a[i] > a[i+1]) {
+ printk("sort() failed!\n");
+ break;
+ }
+
+ kfree(a);
+
+ return 0;
+}
+
+module_init(sort_test);
+#endif
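
extable.c above is the first in-tree caller of this generic sort(); a minimal usage sketch looks like the following (illustrative only; passing NULL for the swap callback lets sort() fall back to u32_swap for 4-byte elements and generic_swap otherwise):

	#include <linux/sort.h>

	static int cmp_int(const void *a, const void *b)
	{
		return *(const int *)a - *(const int *)b;
	}

	static void sort_ints(int *v, size_t n)
	{
		sort(v, n, sizeof(int), cmp_int, NULL);	/* heapsort, no extra memory */
	}
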
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/README.origin
--- /dev/null Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/README.origin Wed Aug 31 20:32:27 2005
@@ -0,0 +1,20 @@
+Source files in this directory are identical copies of linux-2.6.13 files:
+
+cmdline.c -> linux/lib/cmdline.c
+efi_stub.S -> linux/arch/ia64/efi_stub.S
+extable.c -> linux/arch/ia64/mm/extable.c
+hpsim.S -> linux/arch/ia64/hp/sim/hpsim.S
+ia64_ksyms.c -> linux/arch/ia64/kernel/ia64_ksyms.c
+linuxextable.c -> linux/kernel/extable.c
+machvec.c -> linux/arch/ia64/kernel/machvec.c
+patch.c -> linux/arch/ia64/kernel/patch.c
+pcdp.h -> drivers/firmware/pcdp.h
+lib/bitop.c -> linux/arch/ia64/lib/bitop.c
+lib/clear_page.S -> linux/arch/ia64/lib/clear_page.S
+lib/copy_page_mck.S -> linux/arch/ia64/lib/copy_page_mck.S
+lib/flush.S -> linux/arch/ia64/lib/flush.S
+lib/idiv32.S -> linux/arch/ia64/lib/idiv32.S
+lib/idiv64.S -> linux/arch/ia64/lib/idiv64.S
+lib/memcpy_mck.S -> linux/arch/ia64/lib/memcpy_mck.S
+lib/memset.S -> linux/arch/ia64/lib/memset.S
+lib/strlen.S -> linux/arch/ia64/lib/strlen.S
diff -r 44316ce83277 -r b7276814008c xen/include/asm-ia64/linux/sort.h
--- /dev/null Tue Aug 30 23:51:51 2005
+++ b/xen/include/asm-ia64/linux/sort.h Wed Aug 31 20:32:27 2005
@@ -0,0 +1,10 @@
+#ifndef _LINUX_SORT_H
+#define _LINUX_SORT_H
+
+#include <linux/types.h>
+
+void sort(void *base, size_t num, size_t size,
+ int (*cmp)(const void *, const void *),
+ void (*swap)(void *, void *, int));
+
+#endif
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/lib/Makefile
--- a/xen/arch/ia64/lib/Makefile Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-#
-# Makefile for ia64-specific library routines..
-#
-
-include $(BASEDIR)/Rules.mk
-
-OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
- __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
- bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \
- clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
- flush.o ip_fast_csum.o do_csum.o copy_user.o \
- memset.o strlen.o memcpy.o
-
-default: $(OBJS)
- $(LD) -r -o ia64lib.o $(OBJS)
-
-AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
-
-__divdi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivdi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__moddi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umoddi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__divsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__modsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umodsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-clean:
- rm -f *.o *~
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/Makefile
--- a/xen/arch/ia64/linux/lib/Makefile Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-#
-# Makefile for ia64-specific library routines..
-#
-
-include $(BASEDIR)/Rules.mk
-
-OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
- __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
- bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \
- clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
- flush.o ip_fast_csum.o do_csum.o copy_user.o \
- memset.o strlen.o memcpy.o
-
-default: $(OBJS)
- $(LD) -r -o ia64lib.o $(OBJS)
-
-AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
-
-__divdi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivdi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__moddi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umoddi3.o: idiv64.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__divsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__modsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umodsi3.o: idiv32.S
- $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-clean:
- rm -f *.o *~
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/carta_random.S
--- a/xen/arch/ia64/linux/lib/carta_random.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,54 +0,0 @@
-/*
- * Fast, simple, yet decent quality random number generator based on
- * a paper by David G. Carta ("Two Fast Implementations of the
- * `Minimal Standard' Random Number Generator," Communications of the
- * ACM, January, 1990).
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-#define a r2
-#define m r3
-#define lo r8
-#define hi r9
-#define t0 r16
-#define t1 r17
-#define seed r32
-
-GLOBAL_ENTRY(carta_random32)
- movl a = (16807 << 16) | 16807
- ;;
- pmpyshr2.u t0 = a, seed, 0
- pmpyshr2.u t1 = a, seed, 16
- ;;
- unpack2.l t0 = t1, t0
- dep m = -1, r0, 0, 31
- ;;
- zxt4 lo = t0
- shr.u hi = t0, 32
- ;;
- dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff)
- ;;
- shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16
- shr t1 = hi, 15 // t1 = (hi >> 15)
- ;;
- add lo = lo, t0
- ;;
- cmp.gtu p6, p0 = lo, m
- ;;
-(p6) and lo = lo, m
- ;;
-(p6) add lo = 1, lo
- ;;
- add lo = lo, t1
- ;;
- cmp.gtu p6, p0 = lo, m
- ;;
-(p6) and lo = lo, m
- ;;
-(p6) add lo = 1, lo
- br.ret.sptk.many rp
-END(carta_random32)
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/checksum.c
--- a/xen/arch/ia64/linux/lib/checksum.c Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,102 +0,0 @@
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- * Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Most of the code coming from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed..
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-unsigned short int
-csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
-		   unsigned short proto, unsigned int sum)
-{
-	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
- ((unsigned long) proto << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-unsigned int
-csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
- unsigned short proto, unsigned int sum)
-{
- unsigned long result;
-
- result = (saddr + daddr + sum +
- ((unsigned long) ntohs(len) << 16) +
- ((unsigned long) proto << 8));
-
-	/* Fold down to 32-bits so we don't lose in the typedef-less network stack. */
- /* 64 to 33 */
- result = (result & 0xffffffff) + (result >> 32);
- /* 33 to 32 */
- result = (result & 0xffffffff) + (result >> 32);
- return result;
-}
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-unsigned int
-csum_partial (const unsigned char * buff, int len, unsigned int sum)
-{
- unsigned long result = do_csum(buff, len);
-
- /* add in old sum, and carry.. */
- result += sum;
- /* 32+c bits -> 32 bits */
- result = (result & 0xffffffff) + (result >> 32);
- return result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-unsigned short
-ip_compute_csum (unsigned char * buff, int len)
-{
- return ~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/clear_user.S
--- a/xen/arch/ia64/linux/lib/clear_user.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,209 +0,0 @@
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- * in0: address of buffer
- * in1: length of buffer in bytes
- * Outputs:
- * r8: number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf r32
-#define len r33
-
-//
-// local registers
-//
-#define cnt r16
-#define buf2 r17
-#define saved_lc r18
-#define saved_pfs r19
-#define tmp r20
-#define len2 r21
-#define len3 r22
-
-//
-// Theory of operations:
-// - we check whether or not the buffer is small, i.e., less than 17
-// in which case we do the byte by byte loop.
-//
-// - Otherwise we go progressively from 1 byte store to 8byte store in
-// the head part, the body is a 16byte store loop and we finish we the
-// tail for the last 15 bytes.
-// The good point about this breakdown is that the long buffer handling
-// contains only 2 branches.
-//
-// The reason for not using shifting & masking for both the head and the
-// tail is to stay semantically correct. This routine is not supposed
-// to write bytes outside of the buffer. While most of the time this would
-// be ok, we can't tolerate a mistake. A classical example is the case
-// of multithreaded code were to the extra bytes touched is actually owned
-// by another thread which runs concurrently to ours. Another, less likely,
-// example is with device drivers where reading an I/O mapped location may
-// have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,0,0,0
- cmp.eq p6,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
- .body
- ;; // avoid WAW on CFM
- adds tmp=-1,len // br.ctop is repeat/until
- mov ret0=len // return value is length at this point
-(p6) br.ret.spnt.many rp
- ;;
- cmp.lt p6,p0=16,len // if len > 16 then long memset
- mov ar.lc=tmp // initialize lc for small count
-(p6) br.cond.dptk .long_do_clear
- ;; // WAR on ar.lc
- //
- // worst case 16 iterations, avg 8 iterations
- //
- // We could have played with the predicates to use the extra
- // M slot for 2 stores/iteration but the cost the initialization
- // the various counters compared to how long the loop is supposed
- // to last on average does not make this solution viable.
- //
-1:
- EX( .Lexit1, st1 [buf]=r0,1 )
- adds len=-1,len // countdown length using len
- br.cloop.dptk 1b
- ;; // avoid RAW on ar.lc
- //
- // .Lexit4: comes from byte by byte loop
- // len contains bytes left
-.Lexit1:
- mov ret0=len // faster than using ar.lc
- mov ar.lc=saved_lc
- br.ret.sptk.many rp // end of short clear_user
-
-
- //
- // At this point we know we have more than 16 bytes to copy
- // so we focus on alignment (no branches required)
- //
- // The use of len/len2 for countdown of the number of bytes left
- // instead of ret0 is due to the fact that the exception code
- // changes the values of r8.
- //
-.long_do_clear:
- tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
- ;;
- EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
-(p6) adds len=-1,len;; // sync because buf is modified
- tbit.nz p6,p0=buf,1
- ;;
- EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
-(p6) adds len=-2,len;;
- tbit.nz p6,p0=buf,2
- ;;
- EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
-(p6) adds len=-4,len;;
- tbit.nz p6,p0=buf,3
- ;;
- EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
-(p6) adds len=-8,len;;
- shr.u cnt=len,4 // number of 128-bit (2x64bit) words
- ;;
- cmp.eq p6,p0=r0,cnt
- adds tmp=-1,cnt
-(p6) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds buf2=8,buf // setup second base pointer
- mov ar.lc=tmp
- ;;
-
- //
- // 16bytes/iteration core loop
- //
- // The second store can never generate a fault because
- // we come into the loop only when we are 16-byte aligned.
- // This means that if we cross a page then it will always be
- // in the first store and never in the second.
- //
- //
-	// We need to keep track of the remaining length. A possible (optimistic)
- // way would be to use ar.lc and derive how many byte were left by
- // doing : left= 16*ar.lc + 16. this would avoid the addition at
- // every iteration.
- // However we need to keep the synchronization point. A template
- // M;;MB does not exist and thus we can keep the addition at no
- // extra cycle cost (use a nop slot anyway). It also simplifies the
- // (unlikely) error recovery code
- //
-
-2: EX(.Lexit3, st8 [buf]=r0,16 )
- ;; // needed to get len correct when error
- st8 [buf2]=r0,16
- adds len=-16,len
- br.cloop.dptk 2b
- ;;
- mov ar.lc=saved_lc
- //
- // tail correction based on len only
- //
- // We alternate the use of len3,len2 to allow parallelism and correct
- // error handling. We also reuse p6/p7 to return correct value.
- // The addition of len2/len3 does not cost anything more compared to
- // the regular memset as we had empty slots.
- //
-.dotail:
- mov len2=len // for parallelization of error handling
- mov len3=len
- tbit.nz p6,p0=len,3
- ;;
- EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
-(p6) adds len3=-8,len2
- tbit.nz p7,p6=len,2
- ;;
- EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
-(p7) adds len2=-4,len3
- tbit.nz p6,p7=len,1
- ;;
- EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
-(p6) adds len3=-2,len2
- tbit.nz p7,p6=len,0
- ;;
- EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
- mov ret0=r0 // success
- br.ret.sptk.many rp // end of most likely path
-
- //
- // Outlined error handling code
- //
-
- //
- // .Lexit3: comes from core loop, need restore pr/lc
- // len contains bytes left
- //
- //
- // .Lexit2:
- // if p6 -> coming from st8 or st2 : len2 contains what's left
- // if p7 -> coming from st4 or st1 : len3 contains what's left
- // We must restore lc/pr even though might not have been used.
-.Lexit2:
- .pred.rel "mutex", p6, p7
-(p6) mov len=len2
-(p7) mov len=len3
- ;;
- //
- // .Lexit4: comes from head, need not restore pr/lc
- // len contains bytes left
- //
-.Lexit3:
- mov ret0=len
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(__do_clear_user)
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_page.S
--- a/xen/arch/ia64/linux/lib/copy_page.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,98 +0,0 @@
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- * in0: address of target page
- * in1: address of source page
- * Output:
- * no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@xxxxxxxxxx>
- * David Mosberger <davidm@xxxxxxxxxx>
- *
- * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
- */
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH 3
-#define EPI p[PIPE_DEPTH-1]
-
-#define lcount r16
-#define saved_pr r17
-#define saved_lc r18
-#define saved_pfs r19
-#define src1 r20
-#define src2 r21
-#define tgt1 r22
-#define tgt2 r23
-#define srcf r24
-#define tgtf r25
-#define tgt_last r26
-
-#define Nrot ((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
- .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
- t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- mov ar.ec=PIPE_DEPTH
-
- mov lcount=PAGE_SIZE/64-1
- .save pr, saved_pr
- mov saved_pr=pr
- mov pr.rot=1<<16
-
- .body
-
- mov src1=in1
- adds src2=8,in1
- mov tgt_last = PAGE_SIZE
- ;;
- adds tgt2=8,in0
- add srcf=512,in1
- mov ar.lc=lcount
- mov tgt1=in0
- add tgtf=512,in0
- add tgt_last = tgt_last, in0
- ;;
-1:
-(p[0]) ld8 t1[0]=[src1],16
-(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0]) ld8 t2[0]=[src2],16
-(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16
- cmp.ltu p6,p0 = tgtf, tgt_last
- ;;
-(p[0]) ld8 t3[0]=[src1],16
-(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0]) ld8 t4[0]=[src2],16
-(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t5[0]=[src1],16
-(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0]) ld8 t6[0]=[src2],16
-(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t7[0]=[src1],16
-(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0]) ld8 t8[0]=[src2],16
-(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6) lfetch [srcf], 64
-(p6) lfetch [tgtf], 64
- br.ctop.sptk.few 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000 // restore predicates
- mov ar.pfs=saved_pfs
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(copy_page)
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_user.S
--- a/xen/arch/ia64/linux/lib/copy_user.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,610 +0,0 @@
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy date across the kernel/user boundary.
- *
- * The source and destination are always on opposite side of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- * in0 address of source buffer
- * in1 address of destination buffer
- * in2 number of bytes to copy
- *
- * Outputs:
- * ret0 0 in case of success. The number of bytes NOT copied in
- * case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Fixme:
- * - handle the case where we have more than 16 bytes and the alignment
- * are different.
- * - more benchmarking
- * - fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK 16 // we do byte copy below (must be >=16)
-#define PIPE_DEPTH 21 // pipe depth
-
-#define EPI p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst in0
-#define src in1
-#define len in2
-
-//
-// local registers
-//
-#define t1 r2 // rshift in bytes
-#define t2 r3 // lshift in bytes
-#define rshift r14 // right shift in bits
-#define lshift r15 // left shift in bits
-#define word1 r16
-#define word2 r17
-#define cnt r18
-#define len2 r19
-#define saved_lc r20
-#define saved_pr r21
-#define tmp r22
-#define val r23
-#define src1 r24
-#define dst1 r25
-#define src2 r26
-#define dst2 r27
-#define len1 r28
-#define enddst r29
-#define endsrc r30
-#define saved_pfs r31
-
-GLOBAL_ENTRY(__copy_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
- .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- adds len2=-1,len // br.ctop is repeat/until
- mov ret0=r0
-
- ;; // RAW of cfm when len=0
- cmp.eq p8,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
-(p8) br.ret.spnt.many rp // empty mempcy()
- ;;
- add enddst=dst,len // first byte after end of source
- add endsrc=src,len // first byte after end of destination
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates
-
- .body
-
- mov dst1=dst // copy because of rotation
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
-
- mov src1=src // copy because of rotation
- mov ar.lc=len2 // initialize lc for small count
- cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
-
- xor tmp=src,dst // same alignment test prepare
-(p10) br.cond.dptk .long_copy_user
- ;; // RAW pr.rot/p16 ?
- //
- // Now we do the byte by byte loop with software pipeline
- //
- // p7 is necessarily false by now
-1:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 1b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs // restore ar.ec
- br.ret.sptk.many rp // end of short memcpy
-
- //
- // Not 8-byte aligned
- //
-.diff_align_copy_user:
- // At this point we know we have more than 16 bytes to copy
- // and also that src and dest do _not_ have the same alignment.
- and src2=0x7,src1 // src offset
- and dst2=0x7,dst1 // dst offset
- ;;
- // The basic idea is that we copy byte-by-byte at the head so
- // that we can reach 8-byte alignment for both src1 and dst1.
- // Then copy the body using software pipelined 8-byte copy,
- // shifting the two back-to-back words right and left, then copy
- // the tail by copying byte-by-byte.
- //
- // Fault handling. If the byte-by-byte at the head fails on the
- // load, then restart and finish the pipleline by copying zeros
- // to the dst1. Then copy zeros for the rest of dst1.
- // If 8-byte software pipeline fails on the load, do the same as
- // failure_in3 does. If the byte-by-byte at the tail fails, it is
- // handled simply by failure_in_pipe1.
- //
- // The case p14 represents the source has more bytes in the
- // the first word (by the shifted part), whereas the p15 needs to
- // copy some bytes from the 2nd word of the source that has the
- // tail of the 1st of the destination.
- //
-
- //
- // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
- // to copy the head to dst1, to start 8-byte copy software pipeline.
- // We know src1 is not 8-byte aligned in this case.
- //
- cmp.eq p14,p15=r0,dst2
-(p15) br.cond.spnt 1f
- ;;
- sub t1=8,src2
- mov t2=src2
- ;;
- shl rshift=t2,3
- sub len1=len,t1 // set len1
- ;;
- sub lshift=64,rshift
- ;;
- br.cond.spnt .word_copy_user
- ;;
-1:
- cmp.leu p14,p15=src2,dst2
- sub t1=dst2,src2
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub word1=8,src2 // (8 - src offset)
-(p15) sub t1=r0,t1 // absolute value
-(p15) sub word1=8,dst2 // (8 - dst offset)
- ;;
- // For the case p14, we don't need to copy the shifted part to
- // the 1st word of destination.
- sub t2=8,t1
-(p14) sub word1=word1,t1
- ;;
- sub len1=len,word1 // resulting len
-(p15) shl rshift=t1,3 // in bits
-(p14) shl rshift=t2,3
- ;;
-(p14) sub len1=len1,t1
- adds cnt=-1,word1
- ;;
- sub lshift=64,rshift
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=cnt
- ;;
-2:
- EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 2b
- ;;
- clrrrb
- ;;
-.word_copy_user:
- cmp.gtu p9,p0=16,len1
-(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
- ;;
- shr.u cnt=len1,3 // number of 64-bit words
- ;;
- adds cnt=-1,cnt
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t2
-(p15) sub src1=src1,t1
- //
- // Now both src1 and dst1 point to an 8-byte aligned address. And
- // we have more than 8 bytes to copy.
- //
- mov ar.lc=cnt
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- ;;
-3:
- //
- // The pipleline consists of 3 stages:
- // 1 (p16): Load a word from src1
- // 2 (EPI_1): Shift right pair, saving to tmp
- // 3 (EPI): Store tmp to dst1
- //
- // To make it simple, use at least 2 (p16) loops to set up val1[n]
- // because we need 2 back-to-back val1[] to get tmp.
- // Note that this implies EPI_2 must be p18 or greater.
- //
-
-#define EPI_1 p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift) \
- (pred) br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift) \
-.copy_user_bit##rshift: \
-1: \
- EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
- EX(3f,(p16) ld8 val1[1]=[src1],8); \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 1b; \
- ;; \
- br.cond.sptk.many .diff_align_do_tail; \
-2: \
-(EPI) st8 [dst1]=tmp,8; \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
-3: \
-(p16) mov val1[1]=r0; \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 2b; \
- ;; \
- br.cond.sptk.many .failure_in2
-
- //
- // Since the instruction 'shrp' requires a fixed 128-bit value
- // specifying the bits to shift, we need to provide 7 cases
- // below.
- //
- SWITCH(p6, 8)
- SWITCH(p7, 16)
- SWITCH(p8, 24)
- SWITCH(p9, 32)
- SWITCH(p10, 40)
- SWITCH(p11, 48)
- SWITCH(p12, 56)
- ;;
- CASE(p6, 8)
- CASE(p7, 16)
- CASE(p8, 24)
- CASE(p9, 32)
- CASE(p10, 40)
- CASE(p11, 48)
- CASE(p12, 56)
- ;;
- BODY(8)
- BODY(16)
- BODY(24)
- BODY(32)
- BODY(40)
- BODY(48)
- BODY(56)
- ;;
-.diff_align_do_tail:
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t1
-(p14) adds dst1=-8,dst1
-(p15) sub dst1=dst1,t1
- ;;
-4:
- // Tail correction.
- //
- // The problem with this piplelined loop is that the last word is not
- // loaded and thus parf of the last word written is not correct.
- // To fix that, we simply copy the tail byte by byte.
-
- sub len1=endsrc,src1,1
- clrrrb
- ;;
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=len1
- ;;
-5:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 5b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // Beginning of long mempcy (i.e. > 16 bytes)
- //
-.long_copy_user:
- tbit.nz p6,p7=src1,0 // odd alignment
- and tmp=7,tmp
- ;;
- cmp.eq p10,p8=r0,tmp
- mov len1=len // copy because of rotation
-(p8) br.cond.dpnt .diff_align_copy_user
- ;;
- // At this point we know we have more than 16 bytes to copy
- // and also that both src and dest have the same alignment
- // which may not be the one we want. So for now we must move
- // forward slowly until we reach 16byte alignment: no need to
- // worry about reaching the end of buffer.
- //
- EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
-(p6) adds len1=-1,len1;;
- tbit.nz p7,p0=src1,1
- ;;
- EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
-(p7) adds len1=-2,len1;;
- tbit.nz p8,p0=src1,2
- ;;
- //
- // Stop bit not required after ld4 because if we fail on ld4
- // we have never executed the ld1, therefore st1 is not executed.
- //
- EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
- ;;
- EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
- tbit.nz p9,p0=src1,3
- ;;
- //
- // Stop bit not required after ld8 because if we fail on ld8
- // we have never executed the ld2, therefore st2 is not executed.
- //
- EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
- EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8) adds len1=-4,len1
- ;;
- EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9) adds len1=-8,len1;;
- shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
- ;;
- EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
- tbit.nz p6,p0=len1,3
- cmp.eq p7,p0=r0,cnt
- adds tmp=-1,cnt // br.ctop is repeat/until
-(p7) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds src2=8,src1
- adds dst2=8,dst1
- mov ar.lc=tmp
- ;;
- //
- // 16bytes/iteration
- //
-2:
- EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16) ld8 val2[0]=[src2],16
-
- EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;; // RAW on src1 when fall through from loop
- //
- // Tail correction based on len only
- //
- // No matter where we come from (loop or test) the src1 pointer
- // is 16 byte aligned AND we have less than 16 bytes to copy.
- //
-.dotail:
- EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
- tbit.nz p7,p0=len1,2
- ;;
- EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
- tbit.nz p8,p0=len1,1
- ;;
- EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
- tbit.nz p9,p0=len1,0
- ;;
- EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
- ;;
- EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
- mov ar.lc=saved_lc
- ;;
- EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
- mov ar.pfs=saved_pfs
- ;;
- EX(.failure_out, (p9) st1 [dst1]=val2[1])
- br.ret.sptk.many rp
-
-
- //
- // Here we handle the case where the byte by byte copy fails
- // on the load.
- // Several factors make the zeroing of the rest of the buffer kind of
- // tricky:
- // - the pipeline: loads/stores are not in sync (pipeline)
- //
- // In the same loop iteration, the dst1 pointer does not directly
- // reflect where the faulty load was.
- //
- // - pipeline effect
- // When you get a fault on load, you may have valid data from
-	//	  previous loads not yet stored in transit. Such data must be
-	//	  stored normally before moving on to zeroing the rest.
- //
- // - single/multi dispersal independence.
- //
- // solution:
- // - we don't disrupt the pipeline, i.e. data in transit in
-	//	  the software pipeline will eventually be moved to memory.
- // We simply replace the load with a simple mov and keep the
- // pipeline going. We can't really do this inline because
- // p16 is always reset to 1 when lc > 0.
- //
-.failure_in_pipe1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-1:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // This is the case where the byte by byte copy fails on the load
- // when we copy the head. We need to finish the pipeline and copy
- // zeros for the rest of the destination. Since this happens
- // at the top we still need to fill the body and tail.
-.failure_in_pipe2:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-2:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 2b
- ;;
- sub len=enddst,dst1,1 // precompute len
- br.cond.dptk.many .failure_in1bis
- ;;
-
- //
- // Here we handle the head & tail part when we check for alignment.
- // The following code handles only the load failures. The
-	// main difficulty comes from the fact that loads/stores are
- // scheduled. So when you fail on a load, the stores corresponding
- // to previous successful loads must be executed.
- //
- // However some simplifications are possible given the way
- // things work.
- //
- // 1) HEAD
- // Theory of operation:
- //
- // Page A | Page B
- // ---------|-----
- // 1|8 x
- // 1 2|8 x
- // 4|8 x
- // 1 4|8 x
- // 2 4|8 x
- // 1 2 4|8 x
- // |1
- // |2 x
- // |4 x
- //
- // page_size >= 4k (2^12). (x means 4, 2, 1)
- // Here we suppose Page A exists and Page B does not.
- //
- // As we move towards eight byte alignment we may encounter faults.
-	//	The numbers on each page show the size of the load (current alignment).
- //
- // Key point:
-	//	- if you fail on 1, 2, 4 then you have never executed any smaller
- // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
- // before.
- //
- // This allows us to simplify the cleanup code, because basically you
- // only have to worry about "pending" stores in the case of a failing
- // ld8(). Given the way the code is written today, this means only
- // worry about st2, st4. There we can use the information encapsulated
- // into the predicates.
- //
- // Other key point:
- // - if you fail on the ld8 in the head, it means you went straight
- // to it, i.e. 8byte alignment within an unexisting page.
-	// Again this comes from the fact that if you crossed just for the ld8 then
- // you are 8byte aligned but also 16byte align, therefore you would
- // either go for the 16byte copy loop OR the ld8 in the tail part.
-	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
- // because it would mean you had 15bytes to copy in which case you
- // would have defaulted to the byte by byte copy.
- //
- //
- // 2) TAIL
-	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
- // aligned.
- //
- // Key point:
- // This means that we either:
- // - are right on a page boundary
- // OR
- // - are at more than 16 bytes from a page boundary with
- // at most 15 bytes to copy: no chance of crossing.
- //
-	// This allows us to assume that if we fail on a load we haven't possibly
- // executed any of the previous (tail) ones, so we don't need to do
- // any stores. For instance, if we fail on ld2, this means we had
- // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
- //
-	// This means that we are in a situation similar to a fault in the
- // head part. That's nice!
- //
-.failure_in1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- sub len=endsrc,src1,1
- //
- // we know that ret0 can never be zero at this point
-	// because we failed while trying to do a load, i.e. there is still
- // some work to do.
- // The failure_in1bis and length problem is taken care of at the
- // calling side.
- //
- ;;
-.failure_in1bis: // from (.failure_in3)
- mov ar.lc=len // Continue with a stupid byte store.
- ;;
-5:
- st1 [dst1]=r0,1
- br.cloop.dptk 5b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // Here we simply restart the loop but instead
- // of doing loads we fill the pipeline with zeroes
- // We can't simply store r0 because we may have valid
- // data in transit in the pipeline.
- // ar.lc and ar.ec are setup correctly at this point
- //
- // we MUST use src1/endsrc here and not dst1/enddst because
- // of the pipeline effect.
- //
-.failure_in3:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- ;;
-2:
-(p16) mov val1[0]=r0
-(p16) mov val2[0]=r0
-(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;;
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
-.failure_in2:
- sub ret0=endsrc,src1
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // handling of failures on stores: that's the easy part
- //
-.failure_out:
- sub ret0=enddst,dst1
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
-
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-END(__copy_user)
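
The fault-handling paths above implement a simple contract for __copy_user: the return value is the number of bytes that could not be copied, and when a load from the source faults, the not-yet-written tail of the destination is zero-filled. A minimal C sketch of that contract follows; probe_read() is a hypothetical stand-in for a load that may fault, and the real routine is of course the software-pipelined assembly above.

#include <string.h>

extern int probe_read(const unsigned char *p);	/* hypothetical: byte value, or -1 on "fault" */

/* Sketch of the __copy_user contract: returns the number of bytes NOT
 * copied; on a source fault the uncopied part of dst is zero-filled. */
static unsigned long copy_user_sketch(void *dst, const void *src, unsigned long len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	unsigned long done;

	for (done = 0; done < len; done++) {
		int b = probe_read(s + done);
		if (b < 0) {				/* "fault" on the source */
			memset(d + done, 0, len - done);/* zero what we could not copy */
			return len - done;		/* bytes left uncopied */
		}
		d[done] = (unsigned char)b;
	}
	return 0;					/* everything copied */
}
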
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/csum_partial_copy.c
--- a/xen/arch/ia64/linux/lib/csum_partial_copy.c Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,151 +0,0 @@
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- * Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <asm/uaccess.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned short
-from64to16(unsigned long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
- int odd, count;
- unsigned long result = (unsigned long)psum;
-
- if (len <= 0)
- goto out;
- odd = 1 & (unsigned long) buff;
- if (odd) {
- result = *buff << 8;
- len--;
- buff++;
- }
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *) buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- if (4 & (unsigned long) buff) {
- result += *(unsigned int *) buff;
- count--;
- len -= 4;
- buff += 4;
- }
- count >>= 1; /* nr of 64-bit words.. */
- if (count) {
- unsigned long carry = 0;
- do {
-				unsigned long w = *(unsigned long *) buff;
- count--;
- buff += 8;
- result += carry;
- result += w;
- carry = (w > result);
- } while (count);
- result += carry;
- result = (result & 0xffffffff) + (result >> 32);
- }
- if (len & 4) {
- result += *(unsigned int *) buff;
- buff += 4;
- }
- }
- if (len & 2) {
- result += *(unsigned short *) buff;
- buff += 2;
- }
- }
- if (len & 1)
- result += *buff;
-
- result = from64to16(result);
-
- if (odd)
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-out:
- return result;
-}
-
-/*
- * XXX Fixme
- *
- * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
- * But it's very tricky to get right even in C.
- */
-extern unsigned long do_csum(const unsigned char *, long);
-
-static unsigned int
-do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
- int len, unsigned int psum, int *errp)
-{
- unsigned long result;
-
- /* XXX Fixme
- * for now we separate the copy from checksum for obvious
- * alignment difficulties. Look at the Alpha code and you'll be
- * scared.
- */
-
- if (__copy_from_user(dst, src, len) != 0 && errp)
- *errp = -EFAULT;
-
- result = do_csum(dst, len);
-
- /* add in old sum, and carry.. */
- result += psum;
- /* 32+c bits -> 32 bits */
- result = (result & 0xffffffff) + (result >> 32);
- return result;
-}
-
-unsigned int
-csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
- int len, unsigned int sum, int *errp)
-{
- if (!access_ok(VERIFY_READ, src, len)) {
- *errp = -EFAULT;
- memset(dst, 0, len);
- return sum;
- }
-
- return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
-}
-
-unsigned int
-csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst,
- int len, unsigned int sum)
-{
- return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
-}
-
-EXPORT_SYMBOL(csum_partial_copy_nocheck);
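
from64to16() above folds a 64-bit running sum down to the 16-bit ones'-complement checksum by repeatedly adding the upper half into the lower half. A small standalone example of the same folding steps, assuming only that unsigned long is 64 bits (as it is on ia64):

#include <stdio.h>

/* Same folding sequence as from64to16() above (64-bit unsigned long assumed). */
static unsigned short fold64to16(unsigned long x)
{
	x = (x & 0xffffffff) + (x >> 32);	/* 64 bits -> 33 bits */
	x = (x & 0xffff) + (x >> 16);		/* 33 bits -> 17 bits + carry */
	x = (x & 0xffff) + (x >> 16);		/* fold the carry */
	x = (x & 0xffff) + (x >> 16);		/* and any remaining carry */
	return (unsigned short)x;
}

int main(void)
{
	/* arbitrary sample value, just to show the fold */
	printf("%#x\n", (unsigned)fold64to16(0x123456789abcdef0UL));
	return 0;
}
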
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/dec_and_lock.c
--- a/xen/arch/ia64/linux/lib/dec_and_lock.c Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2003 Jerome Marchand, Bull S.A.
- * Cleaned up by David Mosberger-Tang <davidm@xxxxxxxxxx>
- *
- * This file is released under the GPLv2, or at your option any later version.
- *
- * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg"
instruction. This
- * code is an adaptation of the x86 version of "atomic_dec_and_lock()".
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-
-/*
- * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. Both of these
- * operations have to be done atomically, so that the count doesn't drop to zero without
- * acquiring the spinlock first.
- */
-int
-_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock)
-{
- int old, new;
-
- do {
- old = atomic_read(refcount);
- new = old - 1;
-
- if (unlikely (old == 1)) {
-			/* oops, we may be decrementing to zero, do it the slow way... */
- spin_lock(lock);
- if (atomic_dec_and_test(refcount))
- return 1;
- spin_unlock(lock);
- return 0;
- }
- } while (cmpxchg(&refcount->counter, old, new) != old);
- return 0;
-}
-
-EXPORT_SYMBOL(_atomic_dec_and_lock);
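
A typical caller of this primitive is a release path that must take a lock only when the reference count really drops to zero. A hedged usage sketch follows; struct thing and its fields are made up for illustration, and atomic_dec_and_lock() is the generic wrapper around the routine above.

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct thing {				/* hypothetical refcounted object */
	atomic_t refcount;
	spinlock_t list_lock;
	struct list_head node;
};

/* Hypothetical release path: the list lock is taken only when the count
 * actually reaches zero, which is what _atomic_dec_and_lock() guarantees. */
void thing_put(struct thing *t)
{
	if (atomic_dec_and_lock(&t->refcount, &t->list_lock)) {
		list_del(&t->node);		/* count is zero and lock is held */
		spin_unlock(&t->list_lock);
		kfree(t);
	}
}
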
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/do_csum.S
--- a/xen/arch/ia64/linux/lib/do_csum.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,323 +0,0 @@
-/*
- *
- * Optimized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- * Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * 02/04/22 Ken Chen <kenneth.w.chen@xxxxxxxxx>
- * Data locality study on the checksum buffer.
- * More optimization cleanup - remove excessive stop bits.
- * 02/04/08 David Mosberger <davidm@xxxxxxxxxx>
- * More cleanup and tuning.
- * 01/04/18 Jun Nakajima <jun.nakajima@xxxxxxxxx>
- * Clean up and optimize and the software pipeline, loading two
- * back-to-back 8-byte words per loop. Clean up the initialization
- * for the loop. Support the cases where load latency = 1 or 2.
- * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-// The goal is to go as quickly as possible to the point where
-// we can checksum 16 bytes/loop. Before reaching that point we must
-// take care of incorrect alignment of first byte.
-//
-// The code hereafter also takes care of the "tail" part of the buffer
-// before entering the core loop, if any. The checksum is a sum so it
-// allows us to commute operations. So we do the "head" and "tail"
-// first to finish at full speed in the body. Once we get the head and
-// tail values, we feed them into the pipeline, very handy initialization.
-//
-// Of course we deal with the special case where the whole buffer fits
-// into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-// possible load latency and also to accommodate for head and tail.
-//
-// The end of the function deals with folding the checksum from 64bits
-// down to 16bits taking care of the carry.
-//
-// This version avoids synchronization in the core loop by also using a
-// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-// wordx[] (x=1,2)
-// |---|
-// | | 0 : new value loaded in pipeline
-// |---|
-// | | - : in transit data
-// |---|
-// | | LOAD_LATENCY : current value to add to checksum
-// |---|
-// | | LOAD_LATENCY+1 : previous value added to checksum
-// |---| (previous iteration)
-//
-// resultx[] (x=1,2)
-// |---|
-// | | 0 : initial value
-// |---|
-// | | LOAD_LATENCY-1 : new checksum
-// |---|
-// | | LOAD_LATENCY : previous value of checksum
-// |---|
-// | | LOAD_LATENCY+1 : final checksum when out of the loop
-// |---|
-//
-//
-// See RFC1071 "Computing the Internet Checksum" for various techniques for
-// calculating the Internet checksum.
-//
-// NOT YET DONE:
-// - Maybe another algorithm which would take care of the folding at the
-// end in a different manner
-// - Work with people more knowledgeable than me on the network stack
-// to figure out if we could not split the function depending on the
-// type of packet or alignment we get. Like the ip_fast_csum() routine
-// where we know we have at least 20bytes worth of data to checksum.
-// - Do a better job of handling small packets.
-//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
-//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
-//	  on the data that buffer points to (partly because the checksum is often preceded by
-//	  a copy_from_user()). This finding indicates that lfetch will not be beneficial since
-//	  the data is already in the cache.
-//
-
-#define saved_pfs r11
-#define hmask r16
-#define tmask r17
-#define first1 r18
-#define firstval r19
-#define firstoff r20
-#define last r21
-#define lastval r22
-#define lastoff r23
-#define saved_lc r24
-#define saved_pr r25
-#define tmp1 r26
-#define tmp2 r27
-#define tmp3 r28
-#define carry1 r29
-#define carry2 r30
-#define first2 r31
-
-#define buf in0
-#define len in1
-
-#define LOAD_LATENCY 2 // XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH (LOAD_LATENCY+2)
-#define ELD p[LOAD_LATENCY] // end of load
-#define ELD_1 p[LOAD_LATENCY+1] // and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,16,0,16
- .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
- .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
- mov ret0=r0 // in case we have zero length
- cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
- ;;
- add tmp1=buf,len // last byte's address
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
-(p6) br.ret.spnt.many rp // return if zero or negative length
-
- mov hmask=-1 // initialize head mask
- tbit.nz p15,p0=buf,0 // is buf an odd address?
- and first1=-8,buf // 8-byte align down address of first1 element
-
- and firstoff=7,buf // how many bytes off for first1 element
- mov tmask=-1 // initialize tail mask
-
- ;;
- adds tmp2=-1,tmp1 // last-1
- and lastoff=7,tmp1 // how many bytes off for last element
- ;;
- sub tmp1=8,lastoff // complement to lastoff
- and last=-8,tmp2 // address of word containing last byte
- ;;
- sub tmp3=last,first1 // tmp3=distance from first1 to last
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // save lc
- cmp.eq p8,p9=last,first1 // everything fits in one word ?
-
- ld8 firstval=[first1],8 // load, ahead of time, "first1" word
- and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
- shl tmp2=firstoff,3 // number of bits
- ;;
-(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
- shl tmp1=tmp1,3 // number of bits
-(p9) adds tmp3=-8,tmp3 // effectively loaded
- ;;
-(p8) mov lastval=r0 // we don't need lastval if first1==last
- shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[
- shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
- ;;
- .body
-#define count tmp3
-
-(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
-(p9) and word2[0]=lastval,tmask // mask last it as appropriate
- shr.u count=count,3 // how many 8-byte?
- ;;
- // If count is odd, finish this 8-byte word so that we can
- // load two back-to-back 8-byte words per loop thereafter.
- and word1[0]=firstval,hmask // and mask it as appropriate
- tbit.nz p10,p11=count,0 // if (count is odd)
- ;;
-(p8) mov result1[0]=word1[0]
-(p9) add result1[0]=word1[0],word2[0]
- ;;
- cmp.ltu p6,p0=result1[0],word1[0] // check the carry
- cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
- ;;
-(p6) adds result1[0]=1,result1[0]
-(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
-(p11) br.cond.dptk .do_csum16 // if (count is even)
-
- // Here count is odd.
- ld8 word1[1]=[first1],8 // load an 8-byte word
- cmp.eq p9,p10=1,count // if (count == 1)
- adds count=-1,count // loaded an 8-byte word
- ;;
- add result1[0]=result1[0],word1[1]
- ;;
- cmp.ltu p6,p0=result1[0],word1[1]
- ;;
-(p6) adds result1[0]=1,result1[0]
-(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
-	// Fall through to calculate the checksum, feeding result1[0] as
- // the initial value in result1[0].
- //
- // Calculate the checksum loading two 8-byte words per loop.
- //
-.do_csum16:
- add first2=8,first1
- shr.u count=count,1 // we do 16 bytes per loop
- ;;
- adds count=-1,count
- mov carry1=r0
- mov carry2=r0
- brp.loop.imp 1f,2f
- ;;
- mov ar.ec=PIPE_DEPTH
- mov ar.lc=count // set lc
- mov pr.rot=1<<16
- // result1[0] must be initialized in advance.
- mov result2[0]=r0
- ;;
- .align 32
-1:
-(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0]) ld8 word1[0]=[first1],16
-(p[0]) ld8 word2[0]=[first2],16
- br.ctop.sptk 1b
- ;;
-	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1 // since we miss the last one
-(pC2[1])adds carry2=1,carry2
- ;;
- add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
- add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
- ;;
- cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
- cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
- ;;
-(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
- ;;
- add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
- ;;
- cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
- ;;
-(p6) adds result1[0]=1,result1[0]
- ;;
-.do_csum_exit:
- //
- // now fold 64 into 16 bits taking care of carry
- // that's not very good because it has lots of sequentiality
- //
- mov tmp3=0xffff
- zxt4 tmp1=result1[0]
- shr.u tmp2=result1[0],32
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add ret0=tmp1,tmp2
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- // if buf was odd then swap bytes
- mov ar.pfs=saved_pfs // restore ar.ec
-(p15) mux1 ret0=ret0,@rev // reverse word
- ;;
- mov ar.lc=saved_lc
-(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
- br.ret.sptk.many rp
-
-// I (Jun Nakajima) wrote an equivalent code (see below), but it was
-// not much better than the original. So keep the original there so that
-// someone else can challenge.
-//
-// shr.u word1[0]=result1[0],32
-// zxt4 result1[0]=result1[0]
-// ;;
-// add result1[0]=result1[0],word1[0]
-// ;;
-// zxt2 result2[0]=result1[0]
-// extr.u word1[0]=result1[0],16,16
-// shr.u carry1=result1[0],32
-// ;;
-// add result2[0]=result2[0],word1[0]
-// ;;
-// add result2[0]=result2[0],carry1
-// ;;
-// extr.u ret0=result2[0],16,16
-// ;;
-// add ret0=ret0,result2[0]
-// ;;
-// zxt2 ret0=ret0
-// mov ar.pfs=saved_pfs // restore ar.ec
-// mov pr=saved_pr,0xffffffffffff0000
-// ;;
-// // if buf was odd then swap bytes
-// mov ar.lc=saved_lc
-//(p15) mux1 ret0=ret0,@rev // reverse word
-// ;;
-//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
-// br.ret.sptk.many rp
-
-END(do_csum)
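
Stripped of the software pipelining and the dual accumulators, the strategy in the "theory of operations" comments above can be written in a few lines of C: align down to an 8-byte word, mask the bytes outside the buffer in the first and last words, sum whole words with end-around carry, then fold to 16 bits. This is a little-endian-only sketch of that idea; the odd-address byte swap and all the performance tricks of the real routine are deliberately omitted.

/* C approximation of the do_csum() strategy above (illustrative only). */
static unsigned long do_csum_sketch(const unsigned char *buf, long len)
{
	const unsigned long *first, *last, *p;
	unsigned long headoff, tailoff, sum = 0, carry = 0, w;

	if (len <= 0)
		return 0;
	first   = (const unsigned long *)((unsigned long)buf & ~7UL);
	last    = (const unsigned long *)(((unsigned long)buf + len - 1) & ~7UL);
	headoff = (unsigned long)buf & 7;		/* bytes before buf in the first word */
	tailoff = ((unsigned long)buf + len) & 7;	/* valid bytes in the last word (0 = all 8) */

	for (p = first; p <= last; p++) {
		w = *p;
		if (p == first)
			w &= ~0UL << (headoff * 8);		/* drop bytes before buf */
		if (p == last && tailoff)
			w &= ~0UL >> ((8 - tailoff) * 8);	/* drop bytes past buf+len */
		sum += w;
		carry += (sum < w);				/* count end-around carries */
	}
	sum += carry;
	if (sum < carry)					/* fold a possible final carry */
		sum++;
	sum = (sum & 0xffffffff) + (sum >> 32);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}
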
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/io.c
--- a/xen/arch/ia64/linux/lib/io.c Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,165 +0,0 @@
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- * This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
- char *dst = to;
-
- while (count) {
- count--;
- *dst++ = readb(from++);
- }
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- * This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
- const char *src = from;
-
- while (count) {
- count--;
- writeb(*src++, to++);
- }
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
- unsigned char ch = (char)(c & 0xff);
-
- while (count) {
- count--;
- writeb(ch, dst);
- dst++;
- }
-}
-EXPORT_SYMBOL(memset_io);
-
-#ifdef CONFIG_IA64_GENERIC
-
-#undef __ia64_inb
-#undef __ia64_inw
-#undef __ia64_inl
-#undef __ia64_outb
-#undef __ia64_outw
-#undef __ia64_outl
-#undef __ia64_readb
-#undef __ia64_readw
-#undef __ia64_readl
-#undef __ia64_readq
-#undef __ia64_readb_relaxed
-#undef __ia64_readw_relaxed
-#undef __ia64_readl_relaxed
-#undef __ia64_readq_relaxed
-#undef __ia64_writeb
-#undef __ia64_writew
-#undef __ia64_writel
-#undef __ia64_writeq
-#undef __ia64_mmiowb
-
-unsigned int
-__ia64_inb (unsigned long port)
-{
- return ___ia64_inb(port);
-}
-
-unsigned int
-__ia64_inw (unsigned long port)
-{
- return ___ia64_inw(port);
-}
-
-unsigned int
-__ia64_inl (unsigned long port)
-{
- return ___ia64_inl(port);
-}
-
-void
-__ia64_outb (unsigned char val, unsigned long port)
-{
- ___ia64_outb(val, port);
-}
-
-void
-__ia64_outw (unsigned short val, unsigned long port)
-{
- ___ia64_outw(val, port);
-}
-
-void
-__ia64_outl (unsigned int val, unsigned long port)
-{
- ___ia64_outl(val, port);
-}
-
-unsigned char
-__ia64_readb (void __iomem *addr)
-{
- return ___ia64_readb (addr);
-}
-
-unsigned short
-__ia64_readw (void __iomem *addr)
-{
- return ___ia64_readw (addr);
-}
-
-unsigned int
-__ia64_readl (void __iomem *addr)
-{
- return ___ia64_readl (addr);
-}
-
-unsigned long
-__ia64_readq (void __iomem *addr)
-{
- return ___ia64_readq (addr);
-}
-
-unsigned char
-__ia64_readb_relaxed (void __iomem *addr)
-{
- return ___ia64_readb (addr);
-}
-
-unsigned short
-__ia64_readw_relaxed (void __iomem *addr)
-{
- return ___ia64_readw (addr);
-}
-
-unsigned int
-__ia64_readl_relaxed (void __iomem *addr)
-{
- return ___ia64_readl (addr);
-}
-
-unsigned long
-__ia64_readq_relaxed (void __iomem *addr)
-{
- return ___ia64_readq (addr);
-}
-
-void
-__ia64_mmiowb(void)
-{
- ___ia64_mmiowb();
-}
-
-#endif /* CONFIG_IA64_GENERIC */
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/ip_fast_csum.S
--- a/xen/arch/ia64/linux/lib/ip_fast_csum.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,90 +0,0 @@
-/*
- * Optimized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer (int)
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@xxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-/*
- * Since we know that most likely this function is called with buf aligned
- * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
- * versus calling generic version of do_csum, which has lots of overhead in
- * handling various alignments and sizes.  However, due to lack of constraints
- * put on the function input argument, cases with alignment not on 4-byte or
- * size not equal to 20 bytes will be handled by the generic do_csum function.
- */
-
-#define in0 r32
-#define in1 r33
-#define ret0 r8
-
-GLOBAL_ENTRY(ip_fast_csum)
- .prologue
- .body
- cmp.ne p6,p7=5,in1 // size other than 20 byte?
- and r14=3,in0 // is it aligned on 4-byte?
- add r15=4,in0 // second source pointer
- ;;
- cmp.ne.or.andcm p6,p7=r14,r0
- ;;
-(p7) ld4 r20=[in0],8
-(p7) ld4 r21=[r15],8
-(p6) br.spnt .generic
- ;;
- ld4 r22=[in0],8
- ld4 r23=[r15],8
- ;;
- ld4 r24=[in0]
- add r20=r20,r21
- add r22=r22,r23
- ;;
- add r20=r20,r22
- ;;
- add r20=r20,r24
- ;;
- shr.u ret0=r20,16 // now need to add the carry
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- shr.u ret0=r20,16 // add carry again
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- shr.u ret0=r20,16
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- andcm ret0=-1,r20
- .restore sp // reset frame state
- br.ret.sptk.many b0
- ;;
-
-.generic:
- .prologue
- .save ar.pfs, r35
- alloc r35=ar.pfs,2,2,2,0
- .save rp, r34
- mov r34=b0
- .body
- dep.z out1=in1,2,30
- mov out0=in0
- ;;
- br.call.sptk.many b0=do_csum
- ;;
- andcm ret0=-1,ret0
- mov ar.pfs=r35
- mov b0=r34
- br.ret.sptk.many b0
-END(ip_fast_csum)
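
The comment block above explains that the fast path assumes a 4-byte-aligned, 20-byte (five-word) IP header and falls back to the generic do_csum() otherwise. A hedged C rendering of just that fast path, again assuming a 64-bit unsigned long as on ia64 (the generic fallback is omitted here):

/* C version of the 20-byte fast path described above: sum five 32-bit
 * words, fold the carries into 16 bits, and return the complement. */
static unsigned short ip_fast_csum_sketch(const unsigned char *iph)
{
	const unsigned int *w = (const unsigned int *)iph;
	unsigned long sum = (unsigned long)w[0] + w[1] + w[2] + w[3] + w[4];

	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries ... */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);	/* ... down to 16 bits */
	return (unsigned short)~sum;		/* return the complement */
}
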
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy.S
--- a/xen/arch/ia64/linux/lib/memcpy.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,301 +0,0 @@
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * no return value
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@xxxxxxxxxx>
- * David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-# define MEM_LAT 21 /* latency to memory */
-
-# define dst r2
-# define src r3
-# define retval r8
-# define saved_pfs r9
-# define saved_lc r10
-# define saved_pr r11
-# define cnt r16
-# define src2 r17
-# define t0 r18
-# define t1 r19
-# define t2 r20
-# define t3 r21
-# define t4 r22
-# define src_end r23
-
-# define N (MEM_LAT + 4)
-# define Nrot ((N + 7) & ~7)
-
- /*
-	 * First, check if everything (src, dst, len) is a multiple of eight.  If
- * so, we handle everything with no taken branches (other than the loop
- * itself) and a small icache footprint. Otherwise, we jump off to
- * the more general copy routine handling arbitrary
- * sizes/alignment etc.
- */
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- or t0=in0,in1
- ;;
-
- or t0=t0,in2
- .save pr, saved_pr
- mov saved_pr=pr
-
- .body
-
- cmp.eq p6,p0=in2,r0 // zero length?
- mov retval=in0 // return dst
-(p6) br.ret.spnt.many rp // zero length, return immediately
- ;;
-
- mov dst=in0 // copy because of rotation
- shr.u cnt=in2,3 // number of 8-byte words to copy
- mov pr.rot=1<<16
- ;;
-
- adds cnt=-1,cnt // br.ctop is repeat/until
- cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
- mov ar.ec=N
- ;;
-
- and t0=0x7,t0
- mov ar.lc=cnt
- ;;
- cmp.ne p6,p0=t0,r0
-
- mov src=in1 // copy because of rotation
-(p7) br.cond.spnt.few .memcpy_short
-(p6) br.cond.spnt.few .memcpy_long
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- .rotr val[N]
- .rotp p[N]
- .align 32
-1: { .mib
-(p[0]) ld8 val[0]=[src],8
- nop.i 0
- brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
- nop.f 0
- br.ctop.dptk.few 1b
-}
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
-	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
-	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
- * get used very often (gcc inlines small copies) and due to atomicity
- * issues, we want to avoid read-modify-write of entire words.
- */
- .align 32
-.memcpy_short:
- adds cnt=-1,in2 // br.ctop is repeat/until
- mov ar.ec=MEM_LAT
- brp.loop.imp 1f, 2f
- ;;
- mov ar.lc=cnt
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- nop.m 0
- ;;
- /*
- * It is faster to put a stop bit in the loop here because it makes
- * the pipeline shorter (and latency is what matters on short copies).
- */
- .align 32
-1: { .mib
-(p[0]) ld1 val[0]=[src],1
- nop.i 0
- brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
- nop.f 0
- br.ctop.dptk.few 1b
-} ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
- * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
- * an overriding concern here, but throughput is. We first do
- * sub-word copying until the destination is aligned, then we check
- * if the source is also aligned. If so, we do a simple load/store-loop
- * until there are less than 8 bytes left over and then we do the tail,
- * by storing the last few bytes using sub-word copying. If the source
- * is not aligned, we branch off to the non-congruent loop.
- *
- * stage: op:
- * 0 ld
- * :
- * MEM_LAT+3 shrp
- * MEM_LAT+4 st
- *
-	 * On Itanium, the pipeline itself runs without stalls.  However, br.ctop
-	 * seems to introduce an unavoidable bubble in the pipeline so the overall
- * latency is 2 cycles/iteration. This gives us a _copy_ throughput
- * of 4 byte/cycle. Still not bad.
- */
-# undef N
-# undef Nrot
-# define N (MEM_LAT + 5) /* number of stages */
-# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
-
-#define LOG_LOOP_SIZE 6
-
-.memcpy_long:
- alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
- and t0=-8,src // t0 = src & ~7
- and t2=7,src // t2 = src & 7
- ;;
- ld8 t0=[t0] // t0 = 1st source word
- adds src2=7,src // src2 = (src + 7)
- sub t4=r0,dst // t4 = -dst
- ;;
- and src2=-8,src2 // src2 = (src + 7) & ~7
- shl t2=t2,3 // t2 = 8*(src & 7)
- shl t4=t4,3 // t4 = 8*(dst & 7)
- ;;
-	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
- sub t3=64,t2 // t3 = 64-8*(src & 7)
- shr.u t0=t0,t2
- ;;
- add src_end=src,in2
- shl t1=t1,t3
- mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7)
- ;;
- or t0=t0,t1
- mov cnt=r0
- adds src_end=-1,src_end
- ;;
-(p3) st1 [dst]=t0,1
-(p3) shr.u t0=t0,8
-(p3) adds cnt=1,cnt
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
-(p4) adds cnt=2,cnt
- ;;
-(p5) st4 [dst]=t0,4
-(p5) adds cnt=4,cnt
- and src_end=-8,src_end // src_end = last word of source buffer
- ;;
-
-	// At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy:
-
-1:{	add	src=cnt,src			// make src point to remainder of source buffer
- sub cnt=in2,cnt // cnt = number of bytes left to copy
- mov t4=ip
- } ;;
- and src2=-8,src // align source pointer
- adds t4=.memcpy_loops-1b,t4
- mov ar.ec=N
-
- and t0=7,src // t0 = src & 7
-	shr.u	t2=cnt,3			// t2 = number of 8-byte words left to copy
- shl cnt=cnt,3 // move bits 0-2 to 3-5
- ;;
-
- .rotr val[N+1], w[2]
- .rotp p[N]
-
- cmp.ne p6,p0=t0,r0 // is src aligned, too?
- shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7)
- adds t2=-1,t2 // br.ctop is repeat/until
- ;;
- add t4=t0,t4
-	mov	pr=cnt,0x38			// set (p5,p4,p3) to # of bytes last-word bytes to copy
- mov ar.lc=t2
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
-(p6) ld8 val[1]=[src2],8 // prime the pump...
- mov b6=t4
- br.sptk.few b6
- ;;
-
-.memcpy_tail:
-	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
- // less than 8) and t0 contains the last few bytes of the src buffer:
-(p5) st4 [dst]=t0,4
-(p5) shr.u t0=t0,32
- mov ar.lc=saved_lc
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
- mov ar.pfs=saved_pfs
- ;;
-(p3) st1 [dst]=t0
- mov pr=saved_pr,-1
- br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
- .align 64
-
-#define COPY(shift,index)							\
- 1: { .mib								\
-	(p[0])		ld8 val[0]=[src2],8;				\
-	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
-			brp.loop.imp 1b, 2f				\
-    };									\
- 2: { .mfb								\
-	(p[MEM_LAT+4])	st8 [dst]=w[1],8;				\
-			nop.f 0;					\
-			br.ctop.dptk.few 1b;				\
-    };									\
-	;;								\
-	ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */ \
-	;;								\
-	shrp t0=val[N-1],val[N-index],shift;				\
-	br .memcpy_tail
-.memcpy_loops:
- COPY(0, 1) /* no point special casing this---it doesn't go any faster
without shrp */
- COPY(8, 0)
- COPY(16, 0)
- COPY(24, 0)
- COPY(32, 0)
- COPY(40, 0)
- COPY(48, 0)
- COPY(56, 0)
-
-END(memcpy)
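
The prologue comments above describe a three-way dispatch: a straight 8-byte word loop when src, dst and len are all multiples of eight, a byte loop for copies shorter than 16 bytes, and the shrp-based path for long copies whose alignment is not congruent. A hedged C sketch of just that dispatch; a byte loop stands in for both slow paths, since the pipelined loops are the assembly above.

#include <stddef.h>

/* Dispatch logic mirroring the comments in memcpy.S above (illustrative only). */
void *memcpy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (len < 16) {
		/* short copy: simple byte-at-a-time loop (.memcpy_short) */
		while (len--)
			*d++ = *s++;
	} else if ((((unsigned long)dst | (unsigned long)src | len) & 7) == 0) {
		/* src, dst and len all multiples of 8: whole-word loop */
		unsigned long *dw = dst;
		const unsigned long *sw = src;
		size_t i;
		for (i = 0; i < len / 8; i++)
			dw[i] = sw[i];
	} else {
		/* long, not 8-byte congruent: the assembly aligns dst and merges
		 * source words with shrp; a byte loop stands in for it here */
		while (len--)
			*d++ = *s++;
	}
	return dst;
}
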
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strlen_user.S
--- a/xen/arch/ia64/linux/lib/strlen_user.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,198 +0,0 @@
-/*
- * Optimized version of the strlen_user() function
- *
- * Inputs:
- * in0 address of buffer
- *
- * Outputs:
- * ret0 0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@xxxxxxxxxx>
- * Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * 01/19/99 S.Eranian heavily enhanced version (see details below)
- * 09/24/99 S.Eranian added speculation recovery code
- */
-
-#include <asm/asmmacro.h>
-
-//
-// int strlen_user(char *)
-// ------------------------
-// Returns:
-// - length of string + 1
-// - 0 in case an exception is raised
-//
-// This is an enhanced version of the basic strlen_user. It includes a
-// combination of compute zero index (czx), parallel comparisons, speculative
-// loads and loop unroll using rotating registers.
-//
-// General Ideas about the algorithm:
-// The goal is to look at the string in chunks of 8 bytes.
-// so we need to do a few extra checks at the beginning because the
-// string may not be 8-byte aligned. In this case we load the 8byte
-// quantity which includes the start of the string and mask the unused
-// bytes with 0xff to avoid confusing czx.
-// We use speculative loads and software pipelining to hide memory
-// latency and do read ahead safely. This way we defer any exception.
-//
-// Because we don't want the kernel to be relying on particular
-// settings of the DCR register, we provide recovery code in case
-// speculation fails. The recovery code is going to "redo" the work using
-// only normal loads. If we still get a fault then we return an
-// error (ret0=0). Otherwise we return the strlen+1 as usual.
-// The fact that speculation may fail can be caused, for instance, by
-// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-// a NaT bit will be set if the translation is not present. The normal
-// load, on the other hand, will cause the translation to be inserted
-// if the mapping exists.
-//
-// It should be noted that we execute recovery code only when we need
-// to use the data that has been speculatively loaded: we don't execute
-// recovery code on pure read ahead data.
-//
-// Remarks:
-// - the cmp r0,r0 is used as a fast way to initialize a predicate
-// register to 1. This is required to make sure that we get the parallel
-// compare correct.
-//
-// - we don't use the epilogue counter to exit the loop but we need to set
-// it to zero beforehand.
-//
-// - after the loop we must test for Nat values because neither the
-// czx nor cmp instruction raise a NaT consumption fault. We must be
-// careful not to look too far for a Nat for which we don't care.
-// For instance we don't need to look at a NaT in val2 if the zero byte
-// was in val1.
-//
-// - Clearly performance tuning is required.
-//
-
-#define saved_pfs r11
-#define tmp r10
-#define base r16
-#define orig r17
-#define saved_pr r18
-#define src r19
-#define mask r20
-#define val r21
-#define val1 r22
-#define val2 r23
-
-GLOBAL_ENTRY(__strlen_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,11,0,0,8
-
- .rotr v[2], w[2] // declares our 4 aliases
-
- extr.u tmp=in0,0,3 // tmp=least significant 3 bits
-	mov orig=in0		// keep track of initial byte address
- dep src=0,in0,0,3 // src=8byte-aligned in0 address
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
- ;;
-
- .body
-
- ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate)
- shl tmp=tmp,3 // multiply by 8bits/byte
- mov mask=-1 // our mask
- ;;
- ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline
- cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and)
- sub tmp=64,tmp // how many bits to shift our mask on the right
- ;;
- shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
- mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
- ;;
- add base=-16,src // keep track of aligned base
- chk.s v[1], .recover // if already NaT, then directly skip to recover
- or v[1]=v[1],mask // now we have a safe initial byte pattern
- ;;
-1:
- ld8.s v[0]=[src],8 // speculatively load next
- czx1.r val1=v[1] // search 0 byte from right
- czx1.r val2=w[1] // search 0 byte from right following 8bytes
- ;;
- ld8.s w[0]=[src],8 // speculatively load next to next
- cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
- cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8
-(p6) br.wtop.dptk.few 1b // loop until p6 == 0
- ;;
- //
- // We must return try the recovery code iff
- // val1_is_nat || (val1==8 && val2_is_nat)
- //
- // XXX Fixme
- // - there must be a better way of doing the test
- //
- cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate)
- tnat.nz p6,p7=val1 // test NaT on val1
-(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
- ;;
- //
- // if we come here p7 is true, i.e., initialized for // cmp
- //
- cmp.eq.and p7,p0=8,val1// val1==8?
- tnat.nz.and p7,p0=val2 // test NaT if val2
-(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
- ;;
-(p8) mov val1=val2 // val2 contains the value
-(p8) adds src=-16,src // correct position when 3 ahead
-(p9) adds src=-24,src // correct position when 4 ahead
- ;;
- sub ret0=src,orig // distance from origin
- sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- sub ret0=ret0,tmp // length=now - back -1
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of normal execution
-
- //
- // Outlined recovery code when speculation failed
- //
- // This time we don't use speculation and rely on the normal exception
- // mechanism. that's why the loop is not as good as the previous one
- // because read ahead is not possible
- //
- // XXX Fixme
- // - today we restart from the beginning of the string instead
- // of trying to continue where we left off.
- //
-.recover:
- EX(.Lexit1, ld8 val=[base],8) // load the initial bytes
- ;;
- or val=val,mask // remask first bytes
- cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
- ;;
- //
- // ar.ec is still zero here
- //
-2:
- EX(.Lexit1, (p6) ld8 val=[base],8)
- ;;
- czx1.r val1=val // search 0 byte from right
- ;;
- cmp.eq p6,p0=8,val1 // val1==8 ?
-(p6) br.wtop.dptk.few 2b // loop until p6 == 0
- ;;
- sub ret0=base,orig // distance from base
- sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- sub ret0=ret0,tmp // length=now - back -1
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of successful recovery code
-
- //
- // We failed even on the normal load (called from exception handler)
- //
-.Lexit1:
- mov ret0=0
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp
-END(__strlen_user)
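
The algorithm notes above reduce to: read the string eight bytes at a time from an aligned address, force the bytes that precede the real start to 0xff so they cannot look like a terminator, and locate the first zero byte in each word. Below is a hedged C sketch of that word-at-a-time idea, using the usual has-zero-byte bit trick in place of czx1 and GCC's __builtin_ctzl(); the speculative loads, the strlen+1 return convention and the fault/recovery path of the real routine are not modelled.

/* Word-at-a-time strlen illustrating the masking described above
 * (little-endian, 64-bit unsigned long assumed).  The scan stops at the
 * aligned word that contains the terminator. */
static unsigned long strlen_sketch(const char *str)
{
	const unsigned long *p = (const unsigned long *)((unsigned long)str & ~7UL);
	unsigned long off = (unsigned long)str & 7;
	/* force the bytes before 'str' to 0xff so they cannot match NUL */
	unsigned long w = *p++ | ~(~0UL << (off * 8));

	for (;;) {
		unsigned long zero = (w - 0x0101010101010101UL)
				     & ~w & 0x8080808080808080UL;
		if (zero) {	/* some byte in w is zero: locate the first one */
			unsigned long idx = __builtin_ctzl(zero) / 8;
			return (unsigned long)((const char *)(p - 1) + idx - str);
		}
		w = *p++;
	}
}
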
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strncpy_from_user.S
--- a/xen/arch/ia64/linux/lib/strncpy_from_user.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- * in0: address of destination buffer
- * in1: address of string to be copied
- * in2: length of buffer in bytes
- * Outputs:
- * r8: -EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
- * by Andreas Schwab <schwab@xxxxxxx>).
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
- alloc r2=ar.pfs,3,0,0,0
- mov r8=0
- mov r9=in1
- ;;
- add r10=in1,in2
- cmp.eq p6,p0=r0,in2
-(p6) br.ret.spnt.many rp
-
- // XXX braindead copy loop---this needs to be optimized
-.Loop1:
- EX(.Lexit, ld1 r8=[in1],1)
- ;;
- EX(.Lexit, st1 [in0]=r8,1)
- cmp.ne p6,p7=r8,r0
- ;;
-(p6) cmp.ne.unc p8,p0=in1,r10
-(p8) br.cond.dpnt.few .Loop1
- ;;
-(p6) mov r8=in2 // buffer filled up---return buffer length
-(p7)	sub r8=in1,r9,1			// return string length (excluding NUL character)
-[.Lexit:]
- br.ret.sptk.many rp
-END(__strncpy_from_user)
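
A plain C rendering of the contract documented in the header above; get_user_byte() is a hypothetical helper standing in for the EX()-protected load (it returns the byte value, or a negative value if the user pointer faults).

#include <errno.h>

extern int get_user_byte(const char *uaddr);	/* hypothetical faulting load */

/* Contract of __strncpy_from_user: -EFAULT on fault, the buffer length if
 * it filled up before a NUL, else the string length excluding the NUL. */
static long strncpy_from_user_sketch(char *dst, const char *src, unsigned long count)
{
	unsigned long i;

	for (i = 0; i < count; i++) {
		int c = get_user_byte(src + i);
		if (c < 0)
			return -EFAULT;		/* fault while reading user memory */
		dst[i] = (char)c;
		if (c == 0)
			return (long)i;		/* NUL found: length excludes it */
	}
	return (long)count;			/* buffer filled up */
}
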
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strnlen_user.S
--- a/xen/arch/ia64/linux/lib/strnlen_user.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,45 +0,0 @@
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen.
- *
- * Inputs:
- * in0: address of buffer
- * in1: string length limit N
- * Outputs:
- * r8: 0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user)
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- .save ar.lc, r16
- mov r16=ar.lc // preserve ar.lc
-
- .body
-
- add r3=-1,in1
- ;;
- mov ar.lc=r3
- mov r9=0
- ;;
- // XXX braindead strlen loop---this needs to be optimized
-.Loop1:
- EXCLR(.Lexit, ld1 r8=[in0],1)
- add r9=1,r9
- ;;
- cmp.eq p6,p0=r8,r0
-(p6) br.cond.dpnt .Lexit
- br.cloop.dptk.few .Loop1
-
- add r9=1,in1 // NUL not found---return N+1
- ;;
-.Lexit:
- mov r8=r9
- mov ar.lc=r16 // restore ar.lc
- br.ret.sptk.many rp
-END(__strnlen_user)
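
The same style of sketch for the limit-bounded variant above, again using the hypothetical get_user_byte() as the faulting load:

/* Contract of __strnlen_user: 0 on fault, strlen(buffer)+1 if a NUL is
 * found within the limit, and limit+1 if the string is longer than that. */
static unsigned long strnlen_user_sketch(const char *src, unsigned long limit)
{
	unsigned long n;

	for (n = 0; n < limit; n++) {
		int c = get_user_byte(src + n);	/* hypothetical faulting load */
		if (c < 0)
			return 0;		/* fault */
		if (c == 0)
			return n + 1;		/* length including the NUL */
	}
	return limit + 1;			/* NUL not found within the limit */
}
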
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/xor.S
--- a/xen/arch/ia64/linux/lib/xor.S Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,184 +0,0 @@
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 3, 0, 13, 16
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- ;;
- .rotr s1[6+1], s2[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[6+1])st8.nta [r8] = d[1], 8
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 4, 0, 20, 24
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
- ;;
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], s3[6]
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 5, 0, 27, 32
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- mov r19 = in4
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r20 = s3[6], s4[6]
- ;;
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r20
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 6, 0, 34, 40
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- mov r19 = in4
- mov r20 = in5
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r21 = s3[6], s4[6]
- ;;
-(p[0]) ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r21
- ;;
-(p[6]) xor d[0] = d[0], s5[6]
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_5)
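
Each xor_ia64_N routine above XORs the remaining N-1 blocks into the first block, eight bytes per element, with in0 holding the byte count (a multiple of 8). A C equivalent of the two-source variant, for reference:

/* C equivalent of xor_ia64_2: XOR 'bytes' bytes of p2 into p1, one 64-bit
 * word at a time (bytes is assumed to be a multiple of 8). */
void xor_2_sketch(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long i, words = bytes / 8;

	for (i = 0; i < words; i++)
		p1[i] ^= p2[i];
}
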
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/minstate.h
--- a/xen/arch/ia64/linux/minstate.h Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,254 +0,0 @@
-#include <linux/config.h>
-
-#include <asm/cache.h>
-
-#include "entry.h"
-
-/*
- * For ivt.s we want to access the stack virtually so we don't have to disable translation
- * on interrupts.
- *
- * On entry:
- * r1: pointer to current task (ar.k6)
- */
-#define MINSTATE_START_SAVE_MIN_VIRT
\
-(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0,
little-endian, loadrs=0 */ \
- ;;
\
-(pUStk) mov.m r24=ar.rnat;
\
-(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base
of RBS */ \
-(pKStk) mov r1=sp; /* get sp */
\
- ;;
\
-(pUStk) lfetch.fault.excl.nt1 [r22];
\
-(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base
of memory stack */ \
-(pUStk) mov r23=ar.bspstore; /* save
ar.bspstore */ \
- ;;
\
-(pUStk) mov ar.bspstore=r22; /* switch to
kernel RBS */ \
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode,
use sp (r12) */ \
- ;;
\
-(pUStk) mov r18=ar.bsp;
\
-(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian,
loadrs=0 */ \
-
-#define MINSTATE_END_SAVE_MIN_VIRT
\
- bsw.1; /* switch back to bank 1 (must be last in insn
group) */ \
- ;;
-
-/*
- * For mca_asm.S we want to access the stack physically since the state is saved before we
- * go virtual and don't want to destroy the iip or ipsr.
- */
-#define MINSTATE_START_SAVE_MIN_PHYS
\
-(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;
\
-(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;
\
-(pKStk) ld8 r3 = [r3];;
\
-(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;
\
-(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;
\
-(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0,
little-endian, loadrs=0 */ \
-(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of
register backing store */ \
- ;;
\
-(pUStk) mov r24=ar.rnat;
\
-(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base
of memory stack */ \
-(pUStk) mov r23=ar.bspstore; /* save
ar.bspstore */ \
-(pUStk) dep r22=-1,r22,61,3; /* compute kernel
virtual addr of RBS */ \
- ;;
\
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp
(r12) */ \
-(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS
*/ \
- ;;
\
-(pUStk) mov r18=ar.bsp;
\
-(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian,
loadrs=0 */ \
-
-#define MINSTATE_END_SAVE_MIN_PHYS
\
- dep r12=-1,r12,61,3; /* make sp a kernel virtual address */
\
- ;;
-
-#ifdef MINSTATE_VIRT
-# define MINSTATE_GET_CURRENT(reg) \
- movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
- ld8 reg=[reg]
-# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT
-# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT
-#endif
-
-#ifdef MINSTATE_PHYS
-# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg
-# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS
-# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS
-#endif
-
-/*
- * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
- * the minimum state necessary that allows us to turn psr.ic back
- * on.
- *
- * Assumed state upon entry:
- * psr.ic: off
- * r31: contains saved predicates (pr)
- *
- * Upon exit, the state is as follows:
- * psr.ic: off
- * r2 = points to &pt_regs.r16
- * r8 = contents of ar.ccv
- * r9 = contents of ar.csd
- * r10 = contents of ar.ssd
- * r11 = FPSR_DEFAULT
- * r12 = kernel sp (kernel virtual address)
- * r13 = points to current task_struct (kernel virtual address)
- * p15 = TRUE if psr.i is set in cr.ipsr
- * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
- * preserved
- *
- * Note that psr.ic is NOT turned on by this macro. This is so that
- * we can pass interruption state as arguments to a handler.
- */
-#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)
\
- MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */
\
- mov r27=ar.rsc; /* M */
\
- mov r20=r1; /* A */
\
- mov r25=ar.unat; /* M */
\
- mov r29=cr.ipsr; /* M */
\
- mov r26=ar.pfs; /* I */
\
- mov r28=cr.iip; /* M */
\
- mov r21=ar.fpsr; /* M */
\
- COVER; /* B;; (or nothing) */
\
- ;;
\
- adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;
\
- ;;
\
- ld1 r17=[r16]; /* load
current->thread.on_ustack flag */ \
- st1 [r16]=r0; /* clear
current->thread.on_ustack flag */ \
- adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
\
- /* switch from user to kernel RBS: */
\
- ;;
\
- invala; /* M */
\
- SAVE_IFS;
\
- cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode
already? */ \
- ;;
\
- MINSTATE_START_SAVE_MIN
\
- adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line
size */ \
- adds r16=PT(CR_IPSR),r1;
\
- ;;
\
- lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;
\
- st8 [r16]=r29; /* save cr.ipsr */
\
- ;;
\
- lfetch.fault.excl.nt1 [r17];
\
- tbit.nz p15,p0=r29,IA64_PSR_I_BIT;
\
- mov r29=b0
\
- ;;
\
- adds r16=PT(R8),r1; /* initialize first base pointer */
\
- adds r17=PT(R9),r1; /* initialize second base pointer */
\
-(pKStk) mov r18=r0; /* make sure r18 isn't NaT */
\
- ;;
\
-.mem.offset 0,0; st8.spill [r16]=r8,16;
\
-.mem.offset 8,0; st8.spill [r17]=r9,16;
\
- ;;
\
-.mem.offset 0,0; st8.spill [r16]=r10,24;
\
-.mem.offset 8,0; st8.spill [r17]=r11,24;
\
- ;;
\
- st8 [r16]=r28,16; /* save cr.iip */
\
- st8 [r17]=r30,16; /* save cr.ifs */
\
-(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */
\
- mov r8=ar.ccv;
\
- mov r9=ar.csd;
\
- mov r10=ar.ssd;
\
- movl r11=FPSR_DEFAULT; /* L-unit */
\
- ;;
\
- st8 [r16]=r25,16; /* save ar.unat */
\
- st8 [r17]=r26,16; /* save ar.pfs */
\
- shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */
\
- ;;
\
- st8 [r16]=r27,16; /* save ar.rsc */
\
-(pUStk) st8 [r17]=r24,16; /* save ar.rnat */
\
-(pKStk) adds r17=16,r17; /* skip over ar_rnat field */
\
- ;; /* avoid RAW on r16 & r17 */
\
-(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */
\
- st8 [r17]=r31,16; /* save predicates */
\
-(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */
\
- ;;
\
- st8 [r16]=r29,16; /* save b0 */
\
- st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */
\
- cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */
\
- ;;
\
-.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */
\
-.mem.offset 8,0; st8.spill [r17]=r12,16;
\
- adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes
of scratch) */ \
- ;;
\
-.mem.offset 0,0; st8.spill [r16]=r13,16;
\
-.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */
\
- movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;
\
- ld8 r13=[r13]; /* establish 'current' */
\
- ;;
\
-.mem.offset 0,0; st8.spill [r16]=r15,16;
\
-.mem.offset 8,0; st8.spill [r17]=r14,16;
\
- ;;
\
-.mem.offset 0,0; st8.spill [r16]=r2,16;
\
-.mem.offset 8,0; st8.spill [r17]=r3,16;
\
- adds r2=IA64_PT_REGS_R16_OFFSET,r1;
\
- ;;
\
- EXTRA;
\
- movl r1=__gp; /* establish kernel global pointer */
\
- ;;
\
- MINSTATE_END_SAVE_MIN
-
-/*
- * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
- *
- * Assumed state upon entry:
- * psr.ic: on
- * r2: points to &pt_regs.r16
- * r3: points to &pt_regs.r17
- * r8: contents of ar.ccv
- * r9: contents of ar.csd
- * r10: contents of ar.ssd
- * r11: FPSR_DEFAULT
- *
- * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
- */
-#define SAVE_REST \
-.mem.offset 0,0; st8.spill [r2]=r16,16; \
-.mem.offset 8,0; st8.spill [r3]=r17,16; \
- ;; \
-.mem.offset 0,0; st8.spill [r2]=r18,16; \
-.mem.offset 8,0; st8.spill [r3]=r19,16; \
- ;; \
-.mem.offset 0,0; st8.spill [r2]=r20,16; \
-.mem.offset 8,0; st8.spill [r3]=r21,16; \
- mov r18=b6; \
- ;; \
-.mem.offset 0,0; st8.spill [r2]=r22,16; \
-.mem.offset 8,0; st8.spill [r3]=r23,16; \
- mov r19=b7; \
- ;; \
-.mem.offset 0,0; st8.spill [r2]=r24,16; \
-.mem.offset 8,0; st8.spill [r3]=r25,16; \
- ;; \
-.mem.offset 0,0; st8.spill [r2]=r26,16; \
-.mem.offset 8,0; st8.spill [r3]=r27,16; \
- ;; \
-.mem.offset 0,0; st8.spill [r2]=r28,16; \
-.mem.offset 8,0; st8.spill [r3]=r29,16; \
- ;; \
-.mem.offset 0,0; st8.spill [r2]=r30,16; \
-.mem.offset 8,0; st8.spill [r3]=r31,32; \
- ;; \
- mov ar.fpsr=r11; /* M-unit */ \
- st8 [r2]=r8,8; /* ar.ccv */ \
- adds r24=PT(B6)-PT(F7),r3; \
- ;; \
- stf.spill [r2]=f6,32; \
- stf.spill [r3]=f7,32; \
- ;; \
- stf.spill [r2]=f8,32; \
- stf.spill [r3]=f9,32; \
- ;; \
- stf.spill [r2]=f10; \
- stf.spill [r3]=f11; \
- adds r25=PT(B7)-PT(F11),r3; \
- ;; \
- st8 [r24]=r18,16; /* b6 */ \
- st8 [r25]=r19,16; /* b7 */ \
- ;; \
- st8 [r24]=r9; /* ar.csd */ \
- st8 [r25]=r10; /* ar.ssd */ \
- ;;
-
-#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,)
-#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
-#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, )
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/pdb-stub.c
--- a/xen/arch/ia64/pdb-stub.c Tue Aug 30 23:51:51 2005
+++ /dev/null Wed Aug 31 20:32:27 2005
@@ -1,59 +0,0 @@
-
-/*
- * pervasive debugger
- * www.cl.cam.ac.uk/netos/pdb
- *
- * alex ho
- * 2004
- * university of cambridge computer laboratory
- *
- * code adapted originally from kgdb, nemesis, & gdbserver
- */
-
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <asm/ptrace.h>
-#include <xen/keyhandler.h>
-#include <asm/processor.h>
-#include <asm/pdb.h>
-#include <xen/list.h>
-#include <xen/serial.h>
-
-#define __PDB_GET_VAL 1
-#define __PDB_SET_VAL 2
-
-/*
- * Read or write memory in an address space
- */
-int pdb_change_values(u_char *buffer, int length,
- unsigned long cr3, unsigned long addr, int rw)
-{
- dummy();
- return 0;
-}
-
-/*
- * Set memory in a domain's address space
- * Set "length" bytes at "address" from "domain" to the values in "buffer".
- * Return the number of bytes set, 0 if there was a problem.
- */
-
-int pdb_set_values(u_char *buffer, int length,
- unsigned long cr3, unsigned long addr)
-{
- int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL);
- return count;
-}
-
-/*
- * Read memory from a domain's address space.
- * Fetch "length" bytes at "address" from "domain" into "buffer".
- * Return the number of bytes read, 0 if there was a problem.
- */
-
-int pdb_get_values(u_char *buffer, int length,
- unsigned long cr3, unsigned long addr)
-{
- return pdb_change_values(buffer, length, cr3, addr, __PDB_GET_VAL);
-}
-