WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/

[Xen-changelog] Begin updating to 2.6.13 base

# HG changeset patch
# User djm@xxxxxxxxxxxxxxx
# Node ID b7276814008c9c924fceecf6fd9f67ccddaadcb2
# Parent  44316ce8327754a7a70c80ffff551e7c4619e066
Begin updating to 2.6.13 base

diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/Makefile
--- a/xen/arch/ia64/Makefile    Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/Makefile    Wed Aug 31 20:32:27 2005
@@ -1,18 +1,21 @@
 include $(BASEDIR)/Rules.mk
 
-VPATH = linux linux-xen
+VPATH = linux linux-xen linux/lib
+#VPATH = linux-xen linux/lib
 
 # libs-y       += arch/ia64/lib/lib.a
 
 OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \
-       xenmisc.o pdb-stub.o acpi.o hypercall.o \
+       xenmisc.o acpi.o hypercall.o \
        machvec.o dom0_ops.o domain.o hpsimserial.o pcdp.o \
        idle0_task.o pal.o hpsim.o efi.o efi_stub.o ivt.o mm_contig.o \
        xenmem.o sal.o cmdline.o mm_init.o tlb.o smpboot.o \
-       extable.o linuxextable.o xenirq.o xentime.o \
+       extable.o linuxextable.o sort.o xenirq.o xentime.o \
        regionreg.o entry.o unaligned.o privop.o vcpu.o \
        irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \
        grant_table.o sn_console.o
+
+#OBJS += idiv64.o idiv32.o                     \
 
 # TMP holder to contain *.0 moved out of CONFIG_VTI
 OBJS += vmx_init.o
@@ -22,6 +25,13 @@
        vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \
        vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o
 endif
+
+# files from xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
+OBJS +=        bitop.o clear_page.o flush.o copy_page_mck.o                    \
+       memset.o strlen.o memcpy_mck.o                                  \
+       __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o                   \
+       __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
+
 # perfmon.o
 # unwind.o needed for kernel unwinding (rare)
 
@@ -30,8 +40,8 @@
 # remove following line if not privifying in memory
 # OBJS += privify.o
 
-default: $(OBJS) head.o ia64lib.o xen.lds.s
-       $(LD) -r -o arch.o $(OBJS) ia64lib.o
+default: $(OBJS) head.o xen.lds.s
+       $(LD) -r -o arch.o $(OBJS)
        $(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \
                -Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms
        $(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET)
@@ -79,12 +89,29 @@
        $(CC) -E $(CPPFLAGS) -P -DXEN -D__ASSEMBLY__ \
                -o xen.lds.s xen.lds.S
 
-ia64lib.o:
-       $(MAKE) -C linux/lib && cp linux/lib/ia64lib.o .
+# variants of divide/modulo
+# see files in xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
+__divdi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
+__udivdi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
+__moddi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
+__umoddi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
+__divsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
+__udivsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
+__modsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
+__umodsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
+
 
 clean:
        rm -f *.o *~ core  xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s
        rm -f asm-xsi-offsets.s $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h
-       rm -f lib/*.o
+       rm -f linux/lib/*.o
 
 .PHONY: default clean
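
(Editorial note, not part of the changeset: the pattern rules above rebuild the two
assembly templates, idiv64.S and idiv32.S, once per -DMODULO/-DUNSIGNED combination,
so eight libgcc-style helpers come out of two source files.  A minimal C rendering of
the same single-source trick follows; the file and function names are invented for
illustration only.)

    /* idiv_sketch.c -- compile it repeatedly with different defines, e.g.
     *    cc -c idiv_sketch.c -o div64_sketch.o
     *    cc -c -DUNSIGNED -DMODULO idiv_sketch.c -o umod64_sketch.o
     */
    #ifdef UNSIGNED
    typedef unsigned long long op_t;
    # define NAME(base) u##base     /* prefix 'u' for the unsigned variants */
    #else
    typedef long long op_t;
    # define NAME(base) base
    #endif

    #ifdef MODULO
    op_t NAME(mod64_sketch)(op_t a, op_t b) { return a % b; }
    #else
    op_t NAME(div64_sketch)(op_t a, op_t b) { return a / b; }
    #endif
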
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/setup.c
--- a/xen/arch/ia64/linux-xen/setup.c   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/setup.c   Wed Aug 31 20:32:27 2005
@@ -4,10 +4,15 @@
  * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
  *     David Mosberger-Tang <davidm@xxxxxxxxxx>
  *     Stephane Eranian <eranian@xxxxxxxxxx>
- * Copyright (C) 2000, Rohit Seth <rohit.seth@xxxxxxxxx>
+ * Copyright (C) 2000, 2004 Intel Corp
+ *     Rohit Seth <rohit.seth@xxxxxxxxx>
+ *     Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
+ *     Gordon Jin <gordon.jin@xxxxxxxxx>
  * Copyright (C) 1999 VA Linux Systems
  * Copyright (C) 1999 Walt Drummond <drummond@xxxxxxxxxxx>
  *
+ * 12/26/04 S.Siddha, G.Jin, R.Seth
+ *                     Add multi-threading and multi-core detection
  * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
  * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
  * 03/31/00 R.Seth     cpu_initialized and current->processor fixes
@@ -15,6 +20,7 @@
  * 02/01/00 R.Seth     fixed get_cpuinfo for SMP
  * 01/07/99 S.Eranian  added the support for command line argument
  * 06/24/99 W.Drummond added boot_cpu_data.
+ * 05/28/05 Z. Menyhart        Dynamic stride size for "flush_icache_range()"
  */
 #include <linux/config.h>
 #include <linux/module.h>
@@ -35,6 +41,10 @@
 #include <linux/serial_core.h>
 #include <linux/efi.h>
 #include <linux/initrd.h>
+#ifndef XEN
+#include <linux/platform.h>
+#include <linux/pm.h>
+#endif
 
 #include <asm/ia32.h>
 #include <asm/machvec.h>
@@ -51,8 +61,10 @@
 #include <asm/smp.h>
 #include <asm/system.h>
 #include <asm/unistd.h>
+#ifdef XEN
 #include <asm/vmx.h>
 #include <asm/io.h>
+#endif
 
 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
 # error "struct cpuinfo_ia64 too big!"
@@ -64,12 +76,16 @@
 #endif
 
 DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
+#ifdef XEN
 DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr);
+#endif
 DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
 DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
 unsigned long ia64_cycles_per_usec;
 struct ia64_boot_param *ia64_boot_param;
 struct screen_info screen_info;
+unsigned long vga_console_iobase;
+unsigned long vga_console_membase;
 
 unsigned long ia64_max_cacheline_size;
 unsigned long ia64_iobase;     /* virtual address for I/O accesses */
@@ -78,7 +94,12 @@
 EXPORT_SYMBOL(io_space);
 unsigned int num_io_spaces;
 
-unsigned char aux_device_present = 0xaa;        /* XXX remove this when legacy I/O is gone */
+/*
+ * "flush_icache_range()" needs to know what processor dependent stride size to use
+ * when it makes i-cache(s) coherent with d-caches.
+ */
+#define        I_CACHE_STRIDE_SHIFT    5       /* Safest way to go: 32 bytes by 32 bytes */
+unsigned long ia64_i_cache_stride_shift = ~0;
 
 /*
  * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
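
(Editorial note, not part of the changeset: ia64_i_cache_stride_shift holds a shift
count rather than a byte size.  With the fallback I_CACHE_STRIDE_SHIFT of 5,
flush_icache_range() advances 1 << 5 = 32 bytes per fc.i; a CPU whose PAL reports
64-byte i-cache lines would yield a shift of 6 and halve the iteration count.  The ~0
initialiser means "no cache level probed yet", so get_max_cacheline_size() below can
simply keep the minimum value PAL reports.)
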
@@ -287,23 +308,25 @@
 static inline int __init
 early_console_setup (char *cmdline)
 {
+       int earlycons = 0;
+
 #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
        {
                extern int sn_serial_console_early_setup(void);
                if (!sn_serial_console_early_setup())
-                       return 0;
+                       earlycons++;
        }
 #endif
 #ifdef CONFIG_EFI_PCDP
        if (!efi_setup_pcdp_console(cmdline))
-               return 0;
+               earlycons++;
 #endif
 #ifdef CONFIG_SERIAL_8250_CONSOLE
        if (!early_serial_console_init(cmdline))
-               return 0;
-#endif
-
-       return -1;
+               earlycons++;
+#endif
+
+       return (earlycons) ? 0 : -1;
 }
 
 static inline void
@@ -315,7 +338,34 @@
 #endif
 }
 
-void __init
+#ifdef CONFIG_SMP
+static void
+check_for_logical_procs (void)
+{
+       pal_logical_to_physical_t info;
+       s64 status;
+
+       status = ia64_pal_logical_to_phys(0, &info);
+       if (status == -1) {
+               printk(KERN_INFO "No logical to physical processor mapping "
+                      "available\n");
+               return;
+       }
+       if (status) {
+               printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n",
+                      status);
+               return;
+       }
+       /*
+        * Total number of siblings that BSP has.  Though not all of them 
+        * may have booted successfully. The correct number of siblings 
+        * booted is in info.overview_num_log.
+        */
+       smp_num_siblings = info.overview_tpc;
+       smp_num_cpucores = info.overview_cpp;
+}
+#endif
+
 #ifdef XEN
 early_setup_arch (char **cmdline_p)
 #else
@@ -398,6 +448,19 @@
 
 #ifdef CONFIG_SMP
        cpu_physical_id(0) = hard_smp_processor_id();
+
+       cpu_set(0, cpu_sibling_map[0]);
+       cpu_set(0, cpu_core_map[0]);
+
+       check_for_logical_procs();
+       if (smp_num_cpucores > 1)
+               printk(KERN_INFO
+                      "cpu package is Multi-Core capable: number of cores=%d\n",
+                      smp_num_cpucores);
+       if (smp_num_siblings > 1)
+               printk(KERN_INFO
+                      "cpu package is Multi-Threading capable: number of siblings=%d\n",
+                      smp_num_siblings);
 #endif
 
 #ifdef XEN
@@ -505,12 +568,23 @@
                   "cpu regs   : %u\n"
                   "cpu MHz    : %lu.%06lu\n"
                   "itc MHz    : %lu.%06lu\n"
-                  "BogoMIPS   : %lu.%02lu\n\n",
+                  "BogoMIPS   : %lu.%02lu\n",
                   cpunum, c->vendor, family, c->model, c->revision, c->archrev,
                   features, c->ppn, c->number,
                   c->proc_freq / 1000000, c->proc_freq % 1000000,
                   c->itc_freq / 1000000, c->itc_freq % 1000000,
                   lpj*HZ/500000, (lpj*HZ/5000) % 100);
+#ifdef CONFIG_SMP
+       seq_printf(m, "siblings   : %u\n", c->num_log);
+       if (c->threads_per_core > 1 || c->cores_per_socket > 1)
+               seq_printf(m,
+                          "physical id: %u\n"
+                          "core id    : %u\n"
+                          "thread id  : %u\n",
+                          c->socket_id, c->core_id, c->thread_id);
+#endif
+       seq_printf(m,"\n");
+
        return 0;
 }
 
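
(Editorial note, not part of the changeset: with the seq_printf() additions above, a
multi-threaded Itanium package would add lines of roughly this shape to /proc/cpuinfo;
the values are invented for illustration.)

    siblings   : 2
    physical id: 0
    core id    : 0
    thread id  : 1
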
@@ -581,6 +655,14 @@
        memcpy(c->vendor, cpuid.field.vendor, 16);
 #ifdef CONFIG_SMP
        c->cpu = smp_processor_id();
+
+       /* below default values will be overwritten  by identify_siblings() 
+        * for Multi-Threading/Multi-Core capable cpu's
+        */
+       c->threads_per_core = c->cores_per_socket = c->num_log = 1;
+       c->socket_id = -1;
+
+       identify_siblings(c);
 #endif
        c->ppn = cpuid.field.ppn;
        c->number = cpuid.field.number;
@@ -611,6 +693,12 @@
        /* start_kernel() requires this... */
 }
 
+/*
+ * Calculate the max. cache line size.
+ *
+ * In addition, the minimum of the i-cache stride sizes is calculated for
+ * "flush_icache_range()".
+ */
 static void
 get_max_cacheline_size (void)
 {
@@ -624,6 +712,8 @@
                 printk(KERN_ERR "%s: ia64_pal_cache_summary() failed 
(status=%ld)\n",
                        __FUNCTION__, status);
                 max = SMP_CACHE_BYTES;
+               /* Safest setup for "flush_icache_range()" */
+               ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
                goto out;
         }
 
@@ -632,14 +722,31 @@
                                                    &cci);
                if (status != 0) {
                        printk(KERN_ERR
-                              "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n",
+                              "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
                               __FUNCTION__, l, status);
                        max = SMP_CACHE_BYTES;
+                       /* The safest setup for "flush_icache_range()" */
+                       cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+                       cci.pcci_unified = 1;
                }
                line_size = 1 << cci.pcci_line_size;
                if (line_size > max)
                        max = line_size;
-        }
+               if (!cci.pcci_unified) {
+                       status = ia64_pal_cache_config_info(l,
+                                                   /* cache_type (instruction)= */ 1,
+                                                   &cci);
+                       if (status != 0) {
+                               printk(KERN_ERR
+                               "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
+                                       __FUNCTION__, l, status);
+                               /* The safest setup for "flush_icache_range()" */
+                               cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+                       }
+               }
+               if (cci.pcci_stride < ia64_i_cache_stride_shift)
+                       ia64_i_cache_stride_shift = cci.pcci_stride;
+       }
   out:
        if (max > ia64_max_cacheline_size)
                ia64_max_cacheline_size = max;
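
(Editorial worked example, not part of the changeset, using invented PAL replies: if a
unified L2 reports pcci_stride = 7 while the split L1 instruction cache reports
pcci_stride = 5, the loop above leaves ia64_i_cache_stride_shift at 5, so
flush_icache_range() steps 32 bytes at a time, the smallest stride any level needs.
Only when a PAL query fails does the code fall back to I_CACHE_STRIDE_SHIFT.)
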
@@ -700,7 +807,17 @@
        ia64_set_kr(IA64_KR_FPU_OWNER, 0);
 
        /*
-        * Initialize default control register to defer all speculative faults.  The
+        * Initialize the page-table base register to a global
+        * directory with all zeroes.  This ensure that we can handle
+        * TLB-misses to user address-space even before we created the
+        * first user address-space.  This may happen, e.g., due to
+        * aggressive use of lfetch.fault.
+        */
+       ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
+
+       /*
+        * Initialize default control register to defer speculative faults except
+        * for those arising from TLB misses, which are not deferred.  The
         * kernel MUST NOT depend on a particular setting of these bits (in other words,
         * the kernel must have recovery code for all speculative accesses).  Turn on
         * dcr.lc as per recommendation by the architecture team.  Most IA-32 apps
@@ -762,6 +879,9 @@
        /* size of physical stacked register partition plus 8 bytes: */
        __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
        platform_cpu_init();
+#ifndef XEN
+       pm_idle = default_idle;
+#endif
 }
 
 void
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/extable.c
--- a/xen/arch/ia64/linux/extable.c     Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/extable.c     Wed Aug 31 20:32:27 2005
@@ -6,29 +6,29 @@
  */
 
 #include <linux/config.h>
+#include <linux/sort.h>
 
 #include <asm/uaccess.h>
 #include <asm/module.h>
 
-static inline int
-compare_entries (struct exception_table_entry *l, struct exception_table_entry *r)
+static int cmp_ex(const void *a, const void *b)
 {
+       const struct exception_table_entry *l = a, *r = b;
        u64 lip = (u64) &l->addr + l->addr;
        u64 rip = (u64) &r->addr + r->addr;
 
+       /* avoid overflow */
+       if (lip > rip)
+               return 1;
        if (lip < rip)
                return -1;
-       if (lip == rip)
-               return 0;
-       else
-               return 1;
+       return 0;
 }
 
-static inline void
-swap_entries (struct exception_table_entry *l, struct exception_table_entry *r)
+static void swap_ex(void *a, void *b, int size)
 {
+       struct exception_table_entry *l = a, *r = b, tmp;
        u64 delta = (u64) r - (u64) l;
-       struct exception_table_entry tmp;
 
        tmp = *l;
        l->addr = r->addr + delta;
@@ -38,23 +38,20 @@
 }
 
 /*
- * Sort the exception table.  It's usually already sorted, but there may be unordered
- * entries due to multiple text sections (such as the .init text section).  Note that the
- * exception-table-entries contain location-relative addresses, which requires a bit of
- * care during sorting to avoid overflows in the offset members (e.g., it would not be
- * safe to make a temporary copy of an exception-table entry on the stack, because the
- * stack may be more than 2GB away from the exception-table).
+ * Sort the exception table. It's usually already sorted, but there
+ * may be unordered entries due to multiple text sections (such as the
+ * .init text section). Note that the exception-table-entries contain
+ * location-relative addresses, which requires a bit of care during
+ * sorting to avoid overflows in the offset members (e.g., it would
+ * not be safe to make a temporary copy of an exception-table entry on
+ * the stack, because the stack may be more than 2GB away from the
+ * exception-table).
  */
-void
-sort_extable (struct exception_table_entry *start, struct exception_table_entry *finish)
+void sort_extable (struct exception_table_entry *start,
+                  struct exception_table_entry *finish)
 {
-       struct exception_table_entry *p, *q;
-
-       /* insertion sort */
-       for (p = start + 1; p < finish; ++p)
-               /* start .. p-1 is sorted; push p down to it's proper place */
-               for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q)
-                       swap_entries(&q[0], &q[-1]);
+       sort(start, finish - start, sizeof(struct exception_table_entry),
+            cmp_ex, swap_ex);
 }
 
 const struct exception_table_entry *
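
(Editorial illustration, not part of the changeset: the "avoid overflow" comment in
cmp_ex() above is the reason the comparator uses explicit > and < tests instead of
returning a subtraction.  The tiny standalone program below uses hypothetical 64-bit
addresses to show how narrowing the difference to the int that sort() expects flips
the sign.)

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t lip = 0xa000000100000000ULL;   /* hypothetical high address */
            uint64_t rip = 0x0000000000001000ULL;   /* hypothetical low address  */

            /* naive comparator: on common ABIs the cast keeps only the low
             * 32 bits, printing a negative number although lip > rip */
            printf("naive   : %d\n", (int)(lip - rip));

            /* what cmp_ex() does, folded into one expression: +1 here */
            printf("correct : %d\n", (lip > rip) - (lip < rip));
            return 0;
    }
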
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/ia64_ksyms.c
--- a/xen/arch/ia64/linux/ia64_ksyms.c  Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/ia64_ksyms.c  Wed Aug 31 20:32:27 2005
@@ -57,9 +57,6 @@
 EXPORT_SYMBOL(__strlen_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(__strnlen_user);
-
-#include <asm/unistd.h>
-EXPORT_SYMBOL(__ia64_syscall);
 
 /* from arch/ia64/lib */
 extern void __divsi3(void);
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/flush.S
--- a/xen/arch/ia64/linux/lib/flush.S   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/flush.S   Wed Aug 31 20:32:27 2005
@@ -1,39 +1,61 @@
 /*
  * Cache flushing routines.
  *
- * Copyright (C) 1999-2001 Hewlett-Packard Co
- * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
+ * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
+ *     David Mosberger-Tang <davidm@xxxxxxxxxx>
+ *
+ * 05/28/05 Zoltan Menyhart    Dynamic stride size
  */
+
 #include <asm/asmmacro.h>
-#include <asm/page.h>
+
 
        /*
         * flush_icache_range(start,end)
-        *      Must flush range from start to end-1 but nothing else (need to
+        *
+        *      Make i-cache(s) coherent with d-caches.
+        *
+        *      Must deal with range from start to end-1 but nothing else (need to
         *      be careful not to touch addresses that may be unmapped).
+        *
+        *      Note: "in0" and "in1" are preserved for debugging purposes.
         */
 GLOBAL_ENTRY(flush_icache_range)
+
        .prologue
-       alloc r2=ar.pfs,2,0,0,0
-       sub r8=in1,in0,1
+       alloc   r2=ar.pfs,2,0,0,0
+       movl    r3=ia64_i_cache_stride_shift
+       mov     r21=1
        ;;
-       shr.u r8=r8,5                   // we flush 32 bytes per iteration
-       .save ar.lc, r3
-       mov r3=ar.lc                    // save ar.lc
+       ld8     r20=[r3]                // r20: stride shift
+       sub     r22=in1,r0,1            // last byte address
+       ;;
+       shr.u   r23=in0,r20             // start / (stride size)
+       shr.u   r22=r22,r20             // (last byte address) / (stride size)
+       shl     r21=r21,r20             // r21: stride size of the i-cache(s)
+       ;;
+       sub     r8=r22,r23              // number of strides - 1
+       shl     r24=r23,r20             // r24: addresses for "fc.i" =
+                                       //      "start" rounded down to stride boundary
+       .save   ar.lc,r3
+       mov     r3=ar.lc                // save ar.lc
        ;;
 
        .body
-
-       mov ar.lc=r8
+       mov     ar.lc=r8
        ;;
-.Loop: fc in0                          // issuable on M0 only
-       add in0=32,in0
+       /*
+        * 32 byte aligned loop, even number of (actually 2) bundles
+        */
+.Loop: fc.i    r24                     // issuable on M0 only
+       add     r24=r21,r24             // we flush "stride size" bytes per iteration
+       nop.i   0
        br.cloop.sptk.few .Loop
        ;;
        sync.i
        ;;
        srlz.i
        ;;
-       mov ar.lc=r3                    // restore ar.lc
+       mov     ar.lc=r3                // restore ar.lc
        br.ret.sptk.many rp
 END(flush_icache_range)
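
(Editorial sketch, not part of the changeset: the address arithmetic the rewritten
flush_icache_range() performs, rendered in C.  The stride shift and the start/end
values below are hypothetical; the real routine loads the shift from
ia64_i_cache_stride_shift, issues fc.i for each address printed here, then ends with
sync.i/srlz.i.)

    #include <stdio.h>

    int main(void)
    {
            unsigned long shift  = 5;                    /* e.g. 32-byte strides */
            unsigned long stride = 1UL << shift;         /* r21 in the assembly  */
            unsigned long start  = 0xa0010UL, end = 0xa0075UL;

            unsigned long addr = (start >> shift) << shift;      /* r24: rounded-down start */
            unsigned long last = ((end - 1) >> shift) << shift;  /* last stride to touch    */

            for (; addr <= last; addr += stride)
                    printf("fc.i 0x%lx\n", addr);   /* one line made coherent per pass */
            return 0;
    }
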
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy_mck.S
--- a/xen/arch/ia64/linux/lib/memcpy_mck.S      Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/memcpy_mck.S      Wed Aug 31 20:32:27 2005
@@ -75,6 +75,7 @@
        mov     f6=f0
        br.cond.sptk .common_code
        ;;
+END(memcpy)
 GLOBAL_ENTRY(__copy_user)
        .prologue
 // check dest alignment
@@ -300,7 +301,7 @@
        add     src_pre_mem=0,src0      // prefetch src pointer
        add     dst_pre_mem=0,dst0      // prefetch dest pointer
        and     src0=-8,src0            // 1st src pointer
-(p7)   mov     ar.lc = r21
+(p7)   mov     ar.lc = cnt
 (p8)   mov     ar.lc = r0
        ;;
        TEXT_ALIGN(32)
@@ -524,7 +525,6 @@
 #undef B
 #undef C
 #undef D
-END(memcpy)
 
 /*
  * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memset.S
--- a/xen/arch/ia64/linux/lib/memset.S  Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/memset.S  Wed Aug 31 20:32:27 2005
@@ -57,10 +57,10 @@
 { .mmi
        .prologue
        alloc   tmp = ar.pfs, 3, 0, 0, 0
-       .body
        lfetch.nt1 [dest]                       //
        .save   ar.lc, save_lc
        mov.i   save_lc = ar.lc
+       .body
 } { .mmi
        mov     ret0 = dest                     // return value
        cmp.ne  p_nz, p_zr = value, r0          // use stf.spill if value is zero
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/pcdp.h
--- a/xen/arch/ia64/linux/pcdp.h        Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/pcdp.h        Wed Aug 31 20:32:27 2005
@@ -2,7 +2,7 @@
  * Definitions for PCDP-defined console devices
  *
  * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf
- * v2.0:  http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf
+ * v2.0:  http://www.dig64.org/specifications/DIG64_PCDPv20.pdf
  *
  * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
  *     Khalid Aziz <khalid.aziz@xxxxxx>
@@ -52,11 +52,36 @@
        u32                             clock_rate;
        u8                              pci_prog_intfc;
        u8                              flags;
-};
+       u16                             conout_index;
+       u32                             reserved;
+} __attribute__((packed));
+
+#define PCDP_IF_PCI    1
+
+/* pcdp_if_pci.trans */
+#define PCDP_PCI_TRANS_IOPORT  0x02
+#define PCDP_PCI_TRANS_MMIO    0x01
+
+struct pcdp_if_pci {
+       u8                      interconnect;
+       u8                      reserved;
+       u16                     length;
+       u8                      segment;
+       u8                      bus;
+       u8                      dev;
+       u8                      fun;
+       u16                     dev_id;
+       u16                     vendor_id;
+       u32                     acpi_interrupt;
+       u64                     mmio_tra;
+       u64                     ioport_tra;
+       u8                      flags;
+       u8                      trans;
+} __attribute__((packed));
 
 struct pcdp_vga {
        u8                      count;          /* address space descriptors */
-};
+} __attribute__((packed));
 
 /* pcdp_device.flags */
 #define PCDP_PRIMARY_CONSOLE   1
@@ -66,7 +91,9 @@
        u8                      flags;
        u16                     length;
        u16                     efi_index;
-};
+       /* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */
+       /* next data is device specific type (currently only pcdp_vga) */
+} __attribute__((packed));
 
 struct pcdp {
        u8                      signature[4];
@@ -81,4 +108,4 @@
        u32                     num_uarts;
        struct pcdp_uart        uart[0];        /* actual size is num_uarts */
        /* remainder of table is pcdp_device structures */
-};
+} __attribute__((packed));
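
(Editorial illustration, not part of the changeset: why the PCDP structures gain
__attribute__((packed)).  The firmware lays the table out byte for byte, so
compiler-inserted padding would shift every later field.  The struct below only mimics
the u8/u16/u32 tail added to pcdp_uart; it is not the real definition.)

    #include <stdio.h>
    #include <stdint.h>

    struct tail_unpacked { uint8_t flags; uint16_t conout_index; uint32_t reserved; };
    struct tail_packed   { uint8_t flags; uint16_t conout_index; uint32_t reserved; }
            __attribute__((packed));

    int main(void)
    {
            printf("without packed: %zu bytes\n", sizeof(struct tail_unpacked)); /* typically 8 */
            printf("with packed   : %zu bytes\n", sizeof(struct tail_packed));   /* 7 */
            return 0;
    }
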
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/minstate.h
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/minstate.h        Wed Aug 31 20:32:27 2005
@@ -0,0 +1,254 @@
+#include <linux/config.h>
+
+#include <asm/cache.h>
+
+#include "entry.h"
+
+/*
+ * For ivt.s we want to access the stack virtually so we don't have to disable translation
+ * on interrupts.
+ *
+ *  On entry:
+ *     r1:     pointer to current task (ar.k6)
+ */
+#define MINSTATE_START_SAVE_MIN_VIRT                                           
                \
+(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, 
little-endian, loadrs=0 */     \
+       ;;                                                                      
                \
+(pUStk)        mov.m r24=ar.rnat;                                              
                        \
+(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base 
of RBS */               \
+(pKStk) mov r1=sp;                                     /* get sp  */           
                \
+       ;;                                                                      
                \
+(pUStk) lfetch.fault.excl.nt1 [r22];                                           
                \
+(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base 
of memory stack */      \
+(pUStk)        mov r23=ar.bspstore;                            /* save 
ar.bspstore */                  \
+       ;;                                                                      
                \
+(pUStk)        mov ar.bspstore=r22;                            /* switch to 
kernel RBS */              \
+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, 
use sp (r12) */   \
+       ;;                                                                      
                \
+(pUStk)        mov r18=ar.bsp;                                                 
                        \
+(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, 
loadrs=0 */             \
+
+#define MINSTATE_END_SAVE_MIN_VIRT                                             
                \
+       bsw.1;                  /* switch back to bank 1 (must be last in insn 
group) */        \
+       ;;
+
+/*
+ * For mca_asm.S we want to access the stack physically since the state is saved before we
+ * go virtual and don't want to destroy the iip or ipsr.
+ */
+#define MINSTATE_START_SAVE_MIN_PHYS                                           
                \
+(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                         
                \
+(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                   
                \
+(pKStk) ld8 r3 = [r3];;                                                        
                        \
+(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                            
                \
+(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                          
                \
+(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, 
little-endian, loadrs=0 */     \
+(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of 
register backing store */    \
+       ;;                                                                      
                \
+(pUStk)        mov r24=ar.rnat;                                                
                        \
+(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base 
of memory stack */      \
+(pUStk)        mov r23=ar.bspstore;                            /* save 
ar.bspstore */                  \
+(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel 
virtual addr of RBS */        \
+       ;;                                                                      
                \
+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp 
(r12) */           \
+(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS 
*/                      \
+       ;;                                                                      
                \
+(pUStk)        mov r18=ar.bsp;                                                 
                        \
+(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, 
loadrs=0 */             \
+
+#define MINSTATE_END_SAVE_MIN_PHYS                                             
                \
+       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */  
                \
+       ;;
+
+#ifdef MINSTATE_VIRT
+# define MINSTATE_GET_CURRENT(reg)     \
+               movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
+               ld8 reg=[reg]
+# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
+# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
+#endif
+
+#ifdef MINSTATE_PHYS
+# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
+# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
+# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
+#endif
+
+/*
+ * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
+ * the minimum state necessary that allows us to turn psr.ic back
+ * on.
+ *
+ * Assumed state upon entry:
+ *     psr.ic: off
+ *     r31:    contains saved predicates (pr)
+ *
+ * Upon exit, the state is as follows:
+ *     psr.ic: off
+ *      r2 = points to &pt_regs.r16
+ *      r8 = contents of ar.ccv
+ *      r9 = contents of ar.csd
+ *     r10 = contents of ar.ssd
+ *     r11 = FPSR_DEFAULT
+ *     r12 = kernel sp (kernel virtual address)
+ *     r13 = points to current task_struct (kernel virtual address)
+ *     p15 = TRUE if psr.i is set in cr.ipsr
+ *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
+ *             preserved
+ *
+ * Note that psr.ic is NOT turned on by this macro.  This is so that
+ * we can pass interruption state as arguments to a handler.
+ */
+#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                      
                \
+       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                       
                \
+       mov r27=ar.rsc;                 /* M */                                 
                \
+       mov r20=r1;                     /* A */                                 
                \
+       mov r25=ar.unat;                /* M */                                 
                \
+       mov r29=cr.ipsr;                /* M */                                 
                \
+       mov r26=ar.pfs;                 /* I */                                 
                \
+       mov r28=cr.iip;                 /* M */                                 
                \
+       mov r21=ar.fpsr;                /* M */                                 
                \
+       COVER;                          /* B;; (or nothing) */                  
                \
+       ;;                                                                      
                \
+       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                         
                \
+       ;;                                                                      
                \
+       ld1 r17=[r16];                          /* load 
current->thread.on_ustack flag */       \
+       st1 [r16]=r0;                           /* clear 
current->thread.on_ustack flag */      \
+       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                          
                \
+       /* switch from user to kernel RBS: */                                   
                \
+       ;;                                                                      
                \
+       invala;                         /* M */                                 
                \
+       SAVE_IFS;                                                               
                \
+       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode 
already? */            \
+       ;;                                                                      
                \
+       MINSTATE_START_SAVE_MIN                                                 
                \
+       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line 
size */           \
+       adds r16=PT(CR_IPSR),r1;                                                
                \
+       ;;                                                                      
                \
+       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                             
                \
+       st8 [r16]=r29;          /* save cr.ipsr */                              
                \
+       ;;                                                                      
                \
+       lfetch.fault.excl.nt1 [r17];                                            
                \
+       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                      
                \
+       mov r29=b0                                                              
                \
+       ;;                                                                      
                \
+       adds r16=PT(R8),r1;     /* initialize first base pointer */             
                \
+       adds r17=PT(R9),r1;     /* initialize second base pointer */            
                \
+(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */           
                        \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r8,16;                                        
                        \
+.mem.offset 8,0; st8.spill [r17]=r9,16;                                        
                        \
+        ;;                                                                     
                \
+.mem.offset 0,0; st8.spill [r16]=r10,24;                                       
                \
+.mem.offset 8,0; st8.spill [r17]=r11,24;                                       
                \
+        ;;                                                                     
                \
+       st8 [r16]=r28,16;       /* save cr.iip */                               
                \
+       st8 [r17]=r30,16;       /* save cr.ifs */                               
                \
+(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                  
                        \
+       mov r8=ar.ccv;                                                          
                \
+       mov r9=ar.csd;                                                          
                \
+       mov r10=ar.ssd;                                                         
                \
+       movl r11=FPSR_DEFAULT;   /* L-unit */                                   
                \
+       ;;                                                                      
                \
+       st8 [r16]=r25,16;       /* save ar.unat */                              
                \
+       st8 [r17]=r26,16;       /* save ar.pfs */                               
                \
+       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */    
                \
+       ;;                                                                      
                \
+       st8 [r16]=r27,16;       /* save ar.rsc */                               
                \
+(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                      
                        \
+(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */           
                        \
+       ;;                      /* avoid RAW on r16 & r17 */                    
                \
+(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                  
                        \
+       st8 [r17]=r31,16;       /* save predicates */                           
                \
+(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */       
                        \
+       ;;                                                                      
                \
+       st8 [r16]=r29,16;       /* save b0 */                                   
                \
+       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */            
                \
+       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */      
                \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */          
                \
+.mem.offset 8,0; st8.spill [r17]=r12,16;                                       
                \
+       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes 
of scratch) */  \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r13,16;                                       
                \
+.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */              
                \
+       movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;                      
                \
+       ld8 r13=[r13];                  /* establish 'current' */               
                \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r15,16;                                       
                \
+.mem.offset 8,0; st8.spill [r17]=r14,16;                                       
                \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r2,16;                                        
                        \
+.mem.offset 8,0; st8.spill [r17]=r3,16;                                        
                        \
+       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                     
                \
+       ;;                                                                      
                \
+       EXTRA;                                                                  
                \
+       movl r1=__gp;           /* establish kernel global pointer */           
                \
+       ;;                                                                      
                \
+       MINSTATE_END_SAVE_MIN
+
+/*
+ * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
+ *
+ * Assumed state upon entry:
+ *     psr.ic: on
+ *     r2:     points to &pt_regs.r16
+ *     r3:     points to &pt_regs.r17
+ *     r8:     contents of ar.ccv
+ *     r9:     contents of ar.csd
+ *     r10:    contents of ar.ssd
+ *     r11:    FPSR_DEFAULT
+ *
+ * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
+ */
+#define SAVE_REST                              \
+.mem.offset 0,0; st8.spill [r2]=r16,16;                \
+.mem.offset 8,0; st8.spill [r3]=r17,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r18,16;                \
+.mem.offset 8,0; st8.spill [r3]=r19,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r20,16;                \
+.mem.offset 8,0; st8.spill [r3]=r21,16;                \
+       mov r18=b6;                             \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r22,16;                \
+.mem.offset 8,0; st8.spill [r3]=r23,16;                \
+       mov r19=b7;                             \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r24,16;                \
+.mem.offset 8,0; st8.spill [r3]=r25,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r26,16;                \
+.mem.offset 8,0; st8.spill [r3]=r27,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r28,16;                \
+.mem.offset 8,0; st8.spill [r3]=r29,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r30,16;                \
+.mem.offset 8,0; st8.spill [r3]=r31,32;                \
+       ;;                                      \
+       mov ar.fpsr=r11;        /* M-unit */    \
+       st8 [r2]=r8,8;          /* ar.ccv */    \
+       adds r24=PT(B6)-PT(F7),r3;              \
+       ;;                                      \
+       stf.spill [r2]=f6,32;                   \
+       stf.spill [r3]=f7,32;                   \
+       ;;                                      \
+       stf.spill [r2]=f8,32;                   \
+       stf.spill [r3]=f9,32;                   \
+       ;;                                      \
+       stf.spill [r2]=f10;                     \
+       stf.spill [r3]=f11;                     \
+       adds r25=PT(B7)-PT(F11),r3;             \
+       ;;                                      \
+       st8 [r24]=r18,16;       /* b6 */        \
+       st8 [r25]=r19,16;       /* b7 */        \
+       ;;                                      \
+       st8 [r24]=r9;           /* ar.csd */    \
+       st8 [r25]=r10;          /* ar.ssd */    \
+       ;;
+
+#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
+#define SAVE_MIN_WITH_COVER_R19        DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
+#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/sort.c
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/sort.c    Wed Aug 31 20:32:27 2005
@@ -0,0 +1,122 @@
+/*
+ * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ *
+ * Jan 23 2005  Matt Mackall <mpm@xxxxxxxxxxx>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#ifdef XEN
+#include <linux/types.h>
+#endif
+
+void u32_swap(void *a, void *b, int size)
+{
+       u32 t = *(u32 *)a;
+       *(u32 *)a = *(u32 *)b;
+       *(u32 *)b = t;
+}
+
+void generic_swap(void *a, void *b, int size)
+{
+       char t;
+
+       do {
+               t = *(char *)a;
+               *(char *)a++ = *(char *)b;
+               *(char *)b++ = t;
+       } while (--size > 0);
+}
+
+/*
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp: pointer to comparison function
+ * @swap: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+
+void sort(void *base, size_t num, size_t size,
+         int (*cmp)(const void *, const void *),
+         void (*swap)(void *, void *, int size))
+{
+       /* pre-scale counters for performance */
+       int i = (num/2) * size, n = num * size, c, r;
+
+       if (!swap)
+               swap = (size == 4 ? u32_swap : generic_swap);
+
+       /* heapify */
+       for ( ; i >= 0; i -= size) {
+               for (r = i; r * 2 < n; r  = c) {
+                       c = r * 2;
+                       if (c < n - size && cmp(base + c, base + c + size) < 0)
+                               c += size;
+                       if (cmp(base + r, base + c) >= 0)
+                               break;
+                       swap(base + r, base + c, size);
+               }
+       }
+
+       /* sort */
+       for (i = n - size; i >= 0; i -= size) {
+               swap(base, base + i, size);
+               for (r = 0; r * 2 < i; r = c) {
+                       c = r * 2;
+                       if (c < i - size && cmp(base + c, base + c + size) < 0)
+                               c += size;
+                       if (cmp(base + r, base + c) >= 0)
+                               break;
+                       swap(base + r, base + c, size);
+               }
+       }
+}
+
+EXPORT_SYMBOL(sort);
+
+#if 0
+/* a simple boot-time regression test */
+
+int cmpint(const void *a, const void *b)
+{
+       return *(int *)a - *(int *)b;
+}
+
+static int sort_test(void)
+{
+       int *a, i, r = 1;
+
+       a = kmalloc(1000 * sizeof(int), GFP_KERNEL);
+       BUG_ON(!a);
+
+       printk("testing sort()\n");
+
+       for (i = 0; i < 1000; i++) {
+               r = (r * 725861) % 6599;
+               a[i] = r;
+       }
+
+       sort(a, 1000, sizeof(int), cmpint, NULL);
+
+       for (i = 0; i < 999; i++)
+               if (a[i] > a[i+1]) {
+                       printk("sort() failed!\n");
+                       break;
+               }
+
+       kfree(a);
+
+       return 0;
+}
+
+module_init(sort_test);
+#endif
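
(Editorial usage sketch, not part of the changeset: the sort() comment above notes that
callers may pass a swap routine tuned to their element type.  The element type and
function names below are hypothetical, and the sketch assumes it is linked against the
sort() added above, with the prototype from linux/sort.h.)

    #include <stddef.h>

    struct range { unsigned long start, len; };      /* hypothetical element type */

    extern void sort(void *base, size_t num, size_t size,
                     int (*cmp)(const void *, const void *),
                     void (*swap)(void *, void *, int));

    static int cmp_range(const void *a, const void *b)
    {
            const struct range *l = a, *r = b;
            return (l->start > r->start) - (l->start < r->start);
    }

    static void swap_range(void *a, void *b, int size)
    {
            struct range tmp = *(struct range *)a;      /* one 16-byte copy beats the  */
            *(struct range *)a = *(struct range *)b;    /* byte loop in generic_swap() */
            *(struct range *)b = tmp;
            (void)size;                                 /* element size is fixed here  */
    }

    void sort_ranges(struct range *r, size_t n)
    {
            sort(r, n, sizeof(*r), cmp_range, swap_range);
    }
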
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/README.origin
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/README.origin Wed Aug 31 20:32:27 2005
@@ -0,0 +1,20 @@
+Source files in this directory are identical copies of linux-2.6.13 files:
+
+cmdline.c              -> linux/lib/cmdline.c
+efi_stub.S             -> linux/arch/ia64/efi_stub.S
+extable.c              -> linux/arch/ia64/mm/extable.c
+hpsim.S                        -> linux/arch/ia64/hp/sim/hpsim.S
+ia64_ksyms.c           -> linux/arch/ia64/kernel/ia64_ksyms.c
+linuxextable.c         -> linux/kernel/extable.c
+machvec.c              -> linux/arch/ia64/kernel/machvec.c
+patch.c                        -> linux/arch/ia64/kernel/patch.c
+pcdp.h                 -> drivers/firmware/pcdp.h
+lib/bitop.c            -> linux/arch/ia64/lib/bitop.c
+lib/clear_page.S       -> linux/arch/ia64/lib/clear_page.S
+lib/copy_page_mck.S    -> linux/arch/ia64/lib/copy_page_mck.S
+lib/flush.S            -> linux/arch/ia64/lib/flush.S
+lib/idiv32.S           -> linux/arch/ia64/lib/idiv32.S
+lib/idiv64.S           -> linux/arch/ia64/lib/idiv64.S
+lib/memcpy_mck.S       -> linux/arch/ia64/lib/memcpy_mck.S
+lib/memset.S           -> linux/arch/ia64/lib/memset.S
+lib/strlen.S           -> linux/arch/ia64/lib/strlen.S
diff -r 44316ce83277 -r b7276814008c xen/include/asm-ia64/linux/sort.h
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/include/asm-ia64/linux/sort.h Wed Aug 31 20:32:27 2005
@@ -0,0 +1,10 @@
+#ifndef _LINUX_SORT_H
+#define _LINUX_SORT_H
+
+#include <linux/types.h>
+
+void sort(void *base, size_t num, size_t size,
+         int (*cmp)(const void *, const void *),
+         void (*swap)(void *, void *, int));
+
+#endif
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/lib/Makefile
--- a/xen/arch/ia64/lib/Makefile        Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-#
-# Makefile for ia64-specific library routines..
-#
-
-include $(BASEDIR)/Rules.mk
-
-OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o                  \
-       __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o                   \
-       bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \
-       clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o   \
-       flush.o ip_fast_csum.o do_csum.o copy_user.o                    \
-       memset.o strlen.o memcpy.o 
-
-default: $(OBJS)
-       $(LD) -r -o ia64lib.o $(OBJS)
-
-AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
-
-__divdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__moddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umoddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__divsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__modsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umodsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-clean:
-       rm -f *.o *~
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/Makefile
--- a/xen/arch/ia64/linux/lib/Makefile  Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-#
-# Makefile for ia64-specific library routines..
-#
-
-include $(BASEDIR)/Rules.mk
-
-OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o                  \
-       __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o                   \
-       bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \
-       clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o   \
-       flush.o ip_fast_csum.o do_csum.o copy_user.o                    \
-       memset.o strlen.o memcpy.o 
-
-default: $(OBJS)
-       $(LD) -r -o ia64lib.o $(OBJS)
-
-AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
-
-__divdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__moddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umoddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__divsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__modsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umodsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-clean:
-       rm -f *.o *~
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/carta_random.S
--- a/xen/arch/ia64/linux/lib/carta_random.S    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,54 +0,0 @@
-/*
- * Fast, simple, yet decent quality random number generator based on
- * a paper by David G. Carta ("Two Fast Implementations of the
- * `Minimal Standard' Random Number Generator," Communications of the
- * ACM, January, 1990).
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-#define a      r2
-#define m      r3
-#define lo     r8
-#define hi     r9
-#define t0     r16
-#define t1     r17
-#define        seed    r32
-
-GLOBAL_ENTRY(carta_random32)
-       movl    a = (16807 << 16) | 16807
-       ;;
-       pmpyshr2.u t0 = a, seed, 0
-       pmpyshr2.u t1 = a, seed, 16
-       ;;
-       unpack2.l t0 = t1, t0
-       dep     m = -1, r0, 0, 31
-       ;;
-       zxt4    lo = t0
-       shr.u   hi = t0, 32
-       ;;
-       dep     t0 = 0, hi, 15, 49      // t0 = (hi & 0x7fff)
-       ;;
-       shl     t0 = t0, 16             // t0 = (hi & 0x7fff) << 16
-       shr     t1 = hi, 15             // t1 = (hi >> 15)
-       ;;
-       add     lo = lo, t0
-       ;;
-       cmp.gtu p6, p0 = lo, m
-       ;;
-(p6)   and     lo = lo, m
-       ;;
-(p6)   add     lo = 1, lo
-       ;;
-       add     lo = lo, t1
-       ;;
-       cmp.gtu p6, p0 = lo, m
-       ;;
-(p6)   and     lo = lo, m
-       ;;
-(p6)   add     lo = 1, lo
-       br.ret.sptk.many rp
-END(carta_random32)
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/checksum.c
--- a/xen/arch/ia64/linux/lib/checksum.c        Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,102 +0,0 @@
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Most of the code coming from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed..
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
-       /* add up 32-bit words for 33 bits */
-       x = (x & 0xffffffff) + (x >> 32);
-       /* add up 16-bit and 17-bit words for 17+c bits */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up 16-bit and 2-bit for 16+c bit */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up carry.. */
-       x = (x & 0xffff) + (x >> 16);
-       return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-unsigned short int
-csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
-                  unsigned short proto, unsigned int sum)
-{
-       return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
-                          ((unsigned long) proto << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-unsigned int
-csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
-                   unsigned short proto, unsigned int sum)
-{
-       unsigned long result;
-
-       result = (saddr + daddr + sum +
-                 ((unsigned long) ntohs(len) << 16) +
-                 ((unsigned long) proto << 8));
-
-       /* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
-       /* 64 to 33 */
-       result = (result & 0xffffffff) + (result >> 32);
-       /* 33 to 32 */
-       result = (result & 0xffffffff) + (result >> 32);
-       return result;
-}
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-unsigned int
-csum_partial (const unsigned char * buff, int len, unsigned int sum)
-{
-       unsigned long result = do_csum(buff, len);
-
-       /* add in old sum, and carry.. */
-       result += sum;
-       /* 32+c bits -> 32 bits */
-       result = (result & 0xffffffff) + (result >> 32);
-       return result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-unsigned short
-ip_compute_csum (unsigned char * buff, int len)
-{
-       return ~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/clear_user.S
--- a/xen/arch/ia64/linux/lib/clear_user.S      Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,209 +0,0 @@
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- *     in0:    address of buffer
- *     in1:    length of buffer in bytes
- * Outputs:
- *     r8:     number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf            r32
-#define len            r33
-
-//
-// local registers
-//
-#define cnt            r16
-#define buf2           r17
-#define saved_lc       r18
-#define saved_pfs      r19
-#define tmp            r20
-#define len2           r21
-#define len3           r22
-
-//
-// Theory of operations:
-//     - we check whether or not the buffer is small, i.e., less than 17
-//       in which case we do the byte by byte loop.
-//
-//     - Otherwise we go progressively from 1 byte store to 8byte store in
-//       the head part, the body is a 16byte store loop and we finish we the
-//       tail for the last 15 bytes.
-//       The good point about this breakdown is that the long buffer handling
-//       contains only 2 branches.
-//
-//     The reason for not using shifting & masking for both the head and the
-//     tail is to stay semantically correct. This routine is not supposed
-//     to write bytes outside of the buffer. While most of the time this would
-//     be ok, we can't tolerate a mistake. A classical example is the case
-//     of multithreaded code where the extra bytes touched are actually owned
-//     by another thread which runs concurrently to ours. Another, less likely,
-//     example is with device drivers where reading an I/O mapped location may
-//     have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc   saved_pfs=ar.pfs,2,0,0,0
-       cmp.eq p6,p0=r0,len             // check for zero length
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc              // preserve ar.lc (slow)
-       .body
-       ;;                              // avoid WAW on CFM
-       adds tmp=-1,len                 // br.ctop is repeat/until
-       mov ret0=len                    // return value is length at this point
-(p6)   br.ret.spnt.many rp
-       ;;
-       cmp.lt p6,p0=16,len             // if len > 16 then long memset
-       mov ar.lc=tmp                   // initialize lc for small count
-(p6)   br.cond.dptk .long_do_clear
-       ;;                              // WAR on ar.lc
-       //
-       // worst case 16 iterations, avg 8 iterations
-       //
-       // We could have played with the predicates to use the extra
-       // M slot for 2 stores/iteration but the cost of initializing
-       // the various counters compared to how long the loop is supposed
-       // to last on average does not make this solution viable.
-       //
-1:
-       EX( .Lexit1, st1 [buf]=r0,1 )
-       adds len=-1,len                 // countdown length using len
-       br.cloop.dptk 1b
-       ;;                              // avoid RAW on ar.lc
-       //
-       // .Lexit4: comes from byte by byte loop
-       //          len contains bytes left
-.Lexit1:
-       mov ret0=len                    // faster than using ar.lc
-       mov ar.lc=saved_lc
-       br.ret.sptk.many rp             // end of short clear_user
-
-
-       //
-       // At this point we know we have more than 16 bytes to copy
-       // so we focus on alignment (no branches required)
-       //
-       // The use of len/len2 for countdown of the number of bytes left
-       // instead of ret0 is due to the fact that the exception code
-       // changes the values of r8.
-       //
-.long_do_clear:
-       tbit.nz p6,p0=buf,0             // odd alignment (for long_do_clear)
-       ;;
-       EX( .Lexit3, (p6) st1 [buf]=r0,1 )      // 1-byte aligned
-(p6)   adds len=-1,len;;               // sync because buf is modified
-       tbit.nz p6,p0=buf,1
-       ;;
-       EX( .Lexit3, (p6) st2 [buf]=r0,2 )      // 2-byte aligned
-(p6)   adds len=-2,len;;
-       tbit.nz p6,p0=buf,2
-       ;;
-       EX( .Lexit3, (p6) st4 [buf]=r0,4 )      // 4-byte aligned
-(p6)   adds len=-4,len;;
-       tbit.nz p6,p0=buf,3
-       ;;
-       EX( .Lexit3, (p6) st8 [buf]=r0,8 )      // 8-byte aligned
-(p6)   adds len=-8,len;;
-       shr.u cnt=len,4         // number of 128-bit (2x64bit) words
-       ;;
-       cmp.eq p6,p0=r0,cnt
-       adds tmp=-1,cnt
-(p6)   br.cond.dpnt .dotail            // we have less than 16 bytes left
-       ;;
-       adds buf2=8,buf                 // setup second base pointer
-       mov ar.lc=tmp
-       ;;
-
-       //
-       // 16bytes/iteration core loop
-       //
-       // The second store can never generate a fault because
-       // we come into the loop only when we are 16-byte aligned.
-       // This means that if we cross a page then it will always be
-       // in the first store and never in the second.
-       //
-       //
-       //        We need to keep track of the remaining length. A possible (optimistic)
-       //        way would be to use ar.lc and derive how many bytes were left by
-       // doing : left= 16*ar.lc + 16.  this would avoid the addition at
-       // every iteration.
-       // However we need to keep the synchronization point. A template
-       // M;;MB does not exist and thus we can keep the addition at no
-       // extra cycle cost (use a nop slot anyway). It also simplifies the
-       // (unlikely)  error recovery code
-       //
-
-2:     EX(.Lexit3, st8 [buf]=r0,16 )
-       ;;                              // needed to get len correct when error
-       st8 [buf2]=r0,16
-       adds len=-16,len
-       br.cloop.dptk 2b
-       ;;
-       mov ar.lc=saved_lc
-       //
-       // tail correction based on len only
-       //
-       // We alternate the use of len3,len2 to allow parallelism and correct
-       // error handling. We also reuse p6/p7 to return correct value.
-       // The addition of len2/len3 does not cost anything more compared to
-       // the regular memset as we had empty slots.
-       //
-.dotail:
-       mov len2=len                    // for parallelization of error handling
-       mov len3=len
-       tbit.nz p6,p0=len,3
-       ;;
-       EX( .Lexit2, (p6) st8 [buf]=r0,8 )      // at least 8 bytes
-(p6)   adds len3=-8,len2
-       tbit.nz p7,p6=len,2
-       ;;
-       EX( .Lexit2, (p7) st4 [buf]=r0,4 )      // at least 4 bytes
-(p7)   adds len2=-4,len3
-       tbit.nz p6,p7=len,1
-       ;;
-       EX( .Lexit2, (p6) st2 [buf]=r0,2 )      // at least 2 bytes
-(p6)   adds len3=-2,len2
-       tbit.nz p7,p6=len,0
-       ;;
-       EX( .Lexit2, (p7) st1 [buf]=r0 )        // only 1 byte left
-       mov ret0=r0                             // success
-       br.ret.sptk.many rp                     // end of most likely path
-
-       //
-       // Outlined error handling code
-       //
-
-       //
-       // .Lexit3: comes from core loop, need restore pr/lc
-       //          len contains bytes left
-       //
-       //
-       // .Lexit2:
-       //      if p6 -> coming from st8 or st2 : len2 contains what's left
-       //      if p7 -> coming from st4 or st1 : len3 contains what's left
-       // We must restore lc/pr even though might not have been used.
-.Lexit2:
-       .pred.rel "mutex", p6, p7
-(p6)   mov len=len2
-(p7)   mov len=len3
-       ;;
-       //
-       // .Lexit4: comes from head, need not restore pr/lc
-       //          len contains bytes left
-       //
-.Lexit3:
-       mov ret0=len
-       mov ar.lc=saved_lc
-       br.ret.sptk.many rp
-END(__do_clear_user)
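
The theory-of-operations comment at the top of this file is easier to follow next to an equivalent, unoptimized C version. The sketch below is illustrative only and not part of the patch: it keeps the small-buffer byte loop, the 1/2/4/8-byte head stores that reach 16-byte alignment, the 16-byte body, and the byte tail, but it omits the EX() fault fixups and the bytes-not-cleared return value that make the real routine safe against user-space faults (clear_user_sketch() is a made-up name):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void clear_user_sketch(unsigned char *buf, size_t len)
{
        if (len <= 16) {                        /* small buffer: byte-by-byte loop */
                while (len--)
                        *buf++ = 0;
                return;
        }
        /* head: 1/2/4/8-byte stores until buf is 16-byte aligned */
        if ((uintptr_t)buf & 1) { *buf = 0;          buf += 1; len -= 1; }
        if ((uintptr_t)buf & 2) { memset(buf, 0, 2); buf += 2; len -= 2; }
        if ((uintptr_t)buf & 4) { memset(buf, 0, 4); buf += 4; len -= 4; }
        if ((uintptr_t)buf & 8) { memset(buf, 0, 8); buf += 8; len -= 8; }
        /* body: 16 bytes per iteration (two 8-byte stores in the real routine) */
        while (len >= 16) {
                memset(buf, 0, 16);
                buf += 16;
                len -= 16;
        }
        /* tail: at most 15 bytes left */
        while (len--)
                *buf++ = 0;
}
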
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_page.S
--- a/xen/arch/ia64/linux/lib/copy_page.S       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,98 +0,0 @@
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- *     in0:    address of target page
- *     in1:    address of source page
- * Output:
- *     no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *     David Mosberger <davidm@xxxxxxxxxx>
- *
- * 4/06/01 davidm      Tuned to make it perform well both for cached and uncached copies.
- */
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH     3
-#define EPI            p[PIPE_DEPTH-1]
-
-#define lcount         r16
-#define saved_pr       r17
-#define saved_lc       r18
-#define saved_pfs      r19
-#define src1           r20
-#define src2           r21
-#define tgt1           r22
-#define tgt2           r23
-#define srcf           r24
-#define tgtf           r25
-#define tgt_last       r26
-
-#define Nrot           ((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
-       .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
-             t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
-       .rotp p[PIPE_DEPTH]
-
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc
-       mov ar.ec=PIPE_DEPTH
-
-       mov lcount=PAGE_SIZE/64-1
-       .save pr, saved_pr
-       mov saved_pr=pr
-       mov pr.rot=1<<16
-
-       .body
-
-       mov src1=in1
-       adds src2=8,in1
-       mov tgt_last = PAGE_SIZE
-       ;;
-       adds tgt2=8,in0
-       add srcf=512,in1
-       mov ar.lc=lcount
-       mov tgt1=in0
-       add tgtf=512,in0
-       add tgt_last = tgt_last, in0
-       ;;
-1:
-(p[0]) ld8 t1[0]=[src1],16
-(EPI)  st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0]) ld8 t2[0]=[src2],16
-(EPI)  st8 [tgt2]=t2[PIPE_DEPTH-1],16
-       cmp.ltu p6,p0 = tgtf, tgt_last
-       ;;
-(p[0]) ld8 t3[0]=[src1],16
-(EPI)  st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0]) ld8 t4[0]=[src2],16
-(EPI)  st8 [tgt2]=t4[PIPE_DEPTH-1],16
-       ;;
-(p[0]) ld8 t5[0]=[src1],16
-(EPI)  st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0]) ld8 t6[0]=[src2],16
-(EPI)  st8 [tgt2]=t6[PIPE_DEPTH-1],16
-       ;;
-(p[0]) ld8 t7[0]=[src1],16
-(EPI)  st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0]) ld8 t8[0]=[src2],16
-(EPI)  st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6)   lfetch [srcf], 64
-(p6)   lfetch [tgtf], 64
-       br.ctop.sptk.few 1b
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000      // restore predicates
-       mov ar.pfs=saved_pfs
-       mov ar.lc=saved_lc
-       br.ret.sptk.many rp
-END(copy_page)
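
For readers unfamiliar with ia64 rotating registers, the data-movement pattern of the loop above corresponds roughly to the C sketch below (illustrative only, not part of the patch). The real routine additionally software-pipelines the loads and stores through the rotating t1..t8 registers and issues lfetch hints 512 bytes ahead of both pointers; SKETCH_PAGE_SIZE is an assumption standing in for PAGE_SIZE:

#include <stdint.h>
#include <stddef.h>

#define SKETCH_PAGE_SIZE 16384UL        /* assumption; the real code uses PAGE_SIZE */

static void copy_page_sketch(void *to, const void *from)
{
        uint64_t *t1 = (uint64_t *)to, *t2 = (uint64_t *)to + 1;
        const uint64_t *s1 = (const uint64_t *)from, *s2 = (const uint64_t *)from + 1;
        size_t i;

        for (i = 0; i < SKETCH_PAGE_SIZE / 64; i++) {
                /* eight 8-byte moves = 64 bytes, split across the two pointer pairs */
                t1[0] = s1[0]; t2[0] = s2[0];
                t1[2] = s1[2]; t2[2] = s2[2];
                t1[4] = s1[4]; t2[4] = s2[4];
                t1[6] = s1[6]; t2[6] = s2[6];
                t1 += 8; t2 += 8;
                s1 += 8; s2 += 8;
        }
}
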
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_user.S
--- a/xen/arch/ia64/linux/lib/copy_user.S       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,610 +0,0 @@
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy data across the kernel/user boundary.
- *
- * The source and destination are always on opposite sides of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- *     in0     address of source buffer
- *     in1     address of destination buffer
- *     in2     number of bytes to copy
- *
- * Outputs:
- *     ret0    0 in case of success. The number of bytes NOT copied in
- *             case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Fixme:
- *     - handle the case where we have more than 16 bytes and the alignments
- *       are different.
- *     - more benchmarking
- *     - fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK     16      // we do byte copy below (must be >=16)
-#define PIPE_DEPTH     21      // pipe depth
-
-#define EPI            p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst            in0
-#define src            in1
-#define len            in2
-
-//
-// local registers
-//
-#define t1             r2      // rshift in bytes
-#define t2             r3      // lshift in bytes
-#define rshift         r14     // right shift in bits
-#define lshift         r15     // left shift in bits
-#define word1          r16
-#define word2          r17
-#define cnt            r18
-#define len2           r19
-#define saved_lc       r20
-#define saved_pr       r21
-#define tmp            r22
-#define val            r23
-#define src1           r24
-#define dst1           r25
-#define src2           r26
-#define dst2           r27
-#define len1           r28
-#define enddst         r29
-#define endsrc         r30
-#define saved_pfs      r31
-
-GLOBAL_ENTRY(__copy_user)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
-       .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
-       .rotp p[PIPE_DEPTH]
-
-       adds len2=-1,len        // br.ctop is repeat/until
-       mov ret0=r0
-
-       ;;                      // RAW of cfm when len=0
-       cmp.eq p8,p0=r0,len     // check for zero length
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc      // preserve ar.lc (slow)
-(p8)   br.ret.spnt.many rp     // empty memcpy()
-       ;;
-       add enddst=dst,len      // first byte after end of destination
-       add endsrc=src,len      // first byte after end of source
-       .save pr, saved_pr
-       mov saved_pr=pr         // preserve predicates
-
-       .body
-
-       mov dst1=dst            // copy because of rotation
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-
-       mov src1=src            // copy because of rotation
-       mov ar.lc=len2          // initialize lc for small count
-       cmp.lt p10,p7=COPY_BREAK,len    // if len > COPY_BREAK then long copy
-
-       xor tmp=src,dst         // same alignment test prepare
-(p10)  br.cond.dptk .long_copy_user
-       ;;                      // RAW pr.rot/p16 ?
-       //
-       // Now we do the byte by byte loop with software pipeline
-       //
-       // p7 is necessarily false by now
-1:
-       EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-       EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-       br.ctop.dptk.few 1b
-       ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.pfs=saved_pfs            // restore ar.ec
-       br.ret.sptk.many rp             // end of short memcpy
-
-       //
-       // Not 8-byte aligned
-       //
-.diff_align_copy_user:
-       // At this point we know we have more than 16 bytes to copy
-       // and also that src and dest do _not_ have the same alignment.
-       and src2=0x7,src1                               // src offset
-       and dst2=0x7,dst1                               // dst offset
-       ;;
-       // The basic idea is that we copy byte-by-byte at the head so
-       // that we can reach 8-byte alignment for both src1 and dst1.
-       // Then copy the body using software pipelined 8-byte copy,
-       // shifting the two back-to-back words right and left, then copy
-       // the tail by copying byte-by-byte.
-       //
-       // Fault handling. If the byte-by-byte at the head fails on the
-       // load, then restart and finish the pipeline by copying zeros
-       // to the dst1. Then copy zeros for the rest of dst1.
-       // If 8-byte software pipeline fails on the load, do the same as
-       // failure_in3 does. If the byte-by-byte at the tail fails, it is
-       // handled simply by failure_in_pipe1.
-       //
-       // The case p14 represents the source has more bytes in
-       // the first word (by the shifted part), whereas the p15 needs to
-       // copy some bytes from the 2nd word of the source that has the
-       // tail of the 1st of the destination.
-       //
-
-       //
-       // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
-       // to copy the head to dst1, to start 8-byte copy software pipeline.
-       // We know src1 is not 8-byte aligned in this case.
-       //
-       cmp.eq p14,p15=r0,dst2
-(p15)  br.cond.spnt 1f
-       ;;
-       sub t1=8,src2
-       mov t2=src2
-       ;;
-       shl rshift=t2,3
-       sub len1=len,t1                                 // set len1
-       ;;
-       sub lshift=64,rshift
-       ;;
-       br.cond.spnt .word_copy_user
-       ;;
-1:
-       cmp.leu p14,p15=src2,dst2
-       sub t1=dst2,src2
-       ;;
-       .pred.rel "mutex", p14, p15
-(p14)  sub word1=8,src2                                // (8 - src offset)
-(p15)  sub t1=r0,t1                                    // absolute value
-(p15)  sub word1=8,dst2                                // (8 - dst offset)
-       ;;
-       // For the case p14, we don't need to copy the shifted part to
-       // the 1st word of destination.
-       sub t2=8,t1
-(p14)  sub word1=word1,t1
-       ;;
-       sub len1=len,word1                              // resulting len
-(p15)  shl rshift=t1,3                                 // in bits
-(p14)  shl rshift=t2,3
-       ;;
-(p14)  sub len1=len1,t1
-       adds cnt=-1,word1
-       ;;
-       sub lshift=64,rshift
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-       mov ar.lc=cnt
-       ;;
-2:
-       EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
-       EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-       br.ctop.dptk.few 2b
-       ;;
-       clrrrb
-       ;;
-.word_copy_user:
-       cmp.gtu p9,p0=16,len1
-(p9)   br.cond.spnt 4f                 // if (16 > len1) skip 8-byte copy
-       ;;
-       shr.u cnt=len1,3                // number of 64-bit words
-       ;;
-       adds cnt=-1,cnt
-       ;;
-       .pred.rel "mutex", p14, p15
-(p14)  sub src1=src1,t2
-(p15)  sub src1=src1,t1
-       //
-       // Now both src1 and dst1 point to an 8-byte aligned address. And
-       // we have more than 8 bytes to copy.
-       //
-       mov ar.lc=cnt
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-       ;;
-3:
-       //
-       // The pipeline consists of 3 stages:
-       // 1 (p16):     Load a word from src1
-       // 2 (EPI_1):   Shift right pair, saving to tmp
-       // 3 (EPI):     Store tmp to dst1
-       //
-       // To make it simple, use at least 2 (p16) loops to set up val1[n]
-       // because we need 2 back-to-back val1[] to get tmp.
-       // Note that this implies EPI_2 must be p18 or greater.
-       //
-
-#define EPI_1          p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift)    cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift)      \
-       (pred)  br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift)                                           \
-.copy_user_bit##rshift:                                                \
-1:                                                             \
-       EX(.failure_out,(EPI) st8 [dst1]=tmp,8);                \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
-       EX(3f,(p16) ld8 val1[1]=[src1],8);                      \
-(p16)  mov val1[0]=r0;                                         \
-       br.ctop.dptk 1b;                                        \
-       ;;                                                      \
-       br.cond.sptk.many .diff_align_do_tail;                  \
-2:                                                             \
-(EPI)  st8 [dst1]=tmp,8;                                       \
-(EPI_1)        shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;  \
-3:                                                             \
-(p16)  mov val1[1]=r0;                                         \
-(p16)  mov val1[0]=r0;                                         \
-       br.ctop.dptk 2b;                                        \
-       ;;                                                      \
-       br.cond.sptk.many .failure_in2
-
-       //
-       // Since the instruction 'shrp' requires a fixed 128-bit value
-       // specifying the bits to shift, we need to provide 7 cases
-       // below.
-       //
-       SWITCH(p6, 8)
-       SWITCH(p7, 16)
-       SWITCH(p8, 24)
-       SWITCH(p9, 32)
-       SWITCH(p10, 40)
-       SWITCH(p11, 48)
-       SWITCH(p12, 56)
-       ;;
-       CASE(p6, 8)
-       CASE(p7, 16)
-       CASE(p8, 24)
-       CASE(p9, 32)
-       CASE(p10, 40)
-       CASE(p11, 48)
-       CASE(p12, 56)
-       ;;
-       BODY(8)
-       BODY(16)
-       BODY(24)
-       BODY(32)
-       BODY(40)
-       BODY(48)
-       BODY(56)
-       ;;
-.diff_align_do_tail:
-       .pred.rel "mutex", p14, p15
-(p14)  sub src1=src1,t1
-(p14)  adds dst1=-8,dst1
-(p15)  sub dst1=dst1,t1
-       ;;
-4:
-       // Tail correction.
-       //
-       // The problem with this pipelined loop is that the last word is not
-       // loaded and thus part of the last word written is not correct.
-       // To fix that, we simply copy the tail byte by byte.
-
-       sub len1=endsrc,src1,1
-       clrrrb
-       ;;
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-       mov ar.lc=len1
-       ;;
-5:
-       EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-       EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-       br.ctop.dptk.few 5b
-       ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // Beginning of long memcpy (i.e. > 16 bytes)
-       //
-.long_copy_user:
-       tbit.nz p6,p7=src1,0    // odd alignment
-       and tmp=7,tmp
-       ;;
-       cmp.eq p10,p8=r0,tmp
-       mov len1=len            // copy because of rotation
-(p8)   br.cond.dpnt .diff_align_copy_user
-       ;;
-       // At this point we know we have more than 16 bytes to copy
-       // and also that both src and dest have the same alignment
-       // which may not be the one we want. So for now we must move
-       // forward slowly until we reach 16byte alignment: no need to
-       // worry about reaching the end of buffer.
-       //
-       EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)      // 1-byte aligned
-(p6)   adds len1=-1,len1;;
-       tbit.nz p7,p0=src1,1
-       ;;
-       EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)      // 2-byte aligned
-(p7)   adds len1=-2,len1;;
-       tbit.nz p8,p0=src1,2
-       ;;
-       //
-       // Stop bit not required after ld4 because if we fail on ld4
-       // we have never executed the ld1, therefore st1 is not executed.
-       //
-       EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)      // 4-byte aligned
-       ;;
-       EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
-       tbit.nz p9,p0=src1,3
-       ;;
-       //
-       // Stop bit not required after ld8 because if we fail on ld8
-       // we have never executed the ld2, therefore st2 is not executed.
-       //
-       EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)      // 8-byte aligned
-       EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8)   adds len1=-4,len1
-       ;;
-       EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9)   adds len1=-8,len1;;
-       shr.u cnt=len1,4                // number of 128-bit (2x64bit) words
-       ;;
-       EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
-       tbit.nz p6,p0=len1,3
-       cmp.eq p7,p0=r0,cnt
-       adds tmp=-1,cnt                 // br.ctop is repeat/until
-(p7)   br.cond.dpnt .dotail            // we have less than 16 bytes left
-       ;;
-       adds src2=8,src1
-       adds dst2=8,dst1
-       mov ar.lc=tmp
-       ;;
-       //
-       // 16bytes/iteration
-       //
-2:
-       EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16)  ld8 val2[0]=[src2],16
-
-       EX(.failure_out, (EPI)  st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI)  st8 [dst2]=val2[PIPE_DEPTH-1],16
-       br.ctop.dptk 2b
-       ;;                      // RAW on src1 when fall through from loop
-       //
-       // Tail correction based on len only
-       //
-       // No matter where we come from (loop or test) the src1 pointer
-       // is 16 byte aligned AND we have less than 16 bytes to copy.
-       //
-.dotail:
-       EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)      // at least 8 bytes
-       tbit.nz p7,p0=len1,2
-       ;;
-       EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)      // at least 4 bytes
-       tbit.nz p8,p0=len1,1
-       ;;
-       EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)      // at least 2 bytes
-       tbit.nz p9,p0=len1,0
-       ;;
-       EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
-       ;;
-       EX(.failure_in1,(p9) ld1 val2[1]=[src1])        // only 1 byte left
-       mov ar.lc=saved_lc
-       ;;
-       EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       EX(.failure_out, (p8)   st2 [dst1]=val2[0],2)
-       mov ar.pfs=saved_pfs
-       ;;
-       EX(.failure_out, (p9)   st1 [dst1]=val2[1])
-       br.ret.sptk.many rp
-
-
-       //
-       // Here we handle the case where the byte by byte copy fails
-       // on the load.
-       // Several factors make the zeroing of the rest of the buffer kind of
-       // tricky:
-       //      - the pipeline: loads/stores are not in sync (pipeline)
-       //
-       //        In the same loop iteration, the dst1 pointer does not directly
-       //        reflect where the faulty load was.
-       //
-       //      - pipeline effect
-       //        When you get a fault on load, you may have valid data from
-       //        previous loads not yet stored, still in transit. Such data must be
-       //        stored normally before moving on to zeroing the rest.
-       //
-       //      - single/multi dispersal independence.
-       //
-       // solution:
-       //      - we don't disrupt the pipeline, i.e. data in transit in
-       //        the software pipeline will eventually be moved to memory.
-       //        We simply replace the load with a simple mov and keep the
-       //        pipeline going. We can't really do this inline because
-       //        p16 is always reset to 1 when lc > 0.
-       //
-.failure_in_pipe1:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-1:
-(p16)  mov val1[0]=r0
-(EPI)  st1 [dst1]=val1[PIPE_DEPTH-1],1
-       br.ctop.dptk 1b
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // This is the case where the byte by byte copy fails on the load
-       // when we copy the head. We need to finish the pipeline and copy
-       // zeros for the rest of the destination. Since this happens
-       // at the top we still need to fill the body and tail.
-.failure_in_pipe2:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-2:
-(p16)  mov val1[0]=r0
-(EPI)  st1 [dst1]=val1[PIPE_DEPTH-1],1
-       br.ctop.dptk 2b
-       ;;
-       sub len=enddst,dst1,1           // precompute len
-       br.cond.dptk.many .failure_in1bis
-       ;;
-
-       //
-       // Here we handle the head & tail part when we check for alignment.
-       // The following code handles only the load failures. The
-       // main difficulty comes from the fact that loads/stores are
-       // scheduled. So when you fail on a load, the stores corresponding
-       // to previous successful loads must be executed.
-       //
-       // However some simplifications are possible given the way
-       // things work.
-       //
-       // 1) HEAD
-       // Theory of operation:
-       //
-       //  Page A   | Page B
-       //  ---------|-----
-       //          1|8 x
-       //        1 2|8 x
-       //          4|8 x
-       //        1 4|8 x
-       //        2 4|8 x
-       //      1 2 4|8 x
-       //           |1
-       //           |2 x
-       //           |4 x
-       //
-       // page_size >= 4k (2^12).  (x means 4, 2, 1)
-       // Here we suppose Page A exists and Page B does not.
-       //
-       // As we move towards eight byte alignment we may encounter faults.
-       // The numbers on each page show the size of the load (current alignment).
-       //
-       // Key point:
-       //      - if you fail on 1, 2, 4 then you have never executed any smaller
-       //        size loads, e.g. failing ld4 means no ld1 nor ld2 executed
-       //        before.
-       //
-       // This allows us to simplify the cleanup code, because basically you
-       // only have to worry about "pending" stores in the case of a failing
-       // ld8(). Given the way the code is written today, this means only
-       // worry about st2, st4. There we can use the information encapsulated
-       // into the predicates.
-       //
-       // Other key point:
-       //      - if you fail on the ld8 in the head, it means you went straight
-       //        to it, i.e. 8byte alignment within a non-existent page.
-       // Again this comes from the fact that if you crossed just for the ld8 then
-       // you are 8byte aligned but also 16byte align, therefore you would
-       // either go for the 16byte copy loop OR the ld8 in the tail part.
-       // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
-       // because it would mean you had 15bytes to copy in which case you
-       // would have defaulted to the byte by byte copy.
-       //
-       //
-       // 2) TAIL
-       // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
-       // aligned.
-       //
-       // Key point:
-       // This means that we either:
-       //              - are right on a page boundary
-       //      OR
-       //              - are at more than 16 bytes from a page boundary with
-       //                at most 15 bytes to copy: no chance of crossing.
-       //
-       // This allows us to assume that if we fail on a load we haven't possibly
-       // executed any of the previous (tail) ones, so we don't need to do
-       // any stores. For instance, if we fail on ld2, this means we had
-       // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
-       //
-       // This means that we are in a situation similar to a fault in the
-       // head part. That's nice!
-       //
-.failure_in1:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-       sub len=endsrc,src1,1
-       //
-       // we know that ret0 can never be zero at this point
-       // because we failed while trying to do a load, i.e. there is still
-       // some work to do.
-       // The failure_in1bis and length problem is taken care of at the
-       // calling side.
-       //
-       ;;
-.failure_in1bis:               // from (.failure_in3)
-       mov ar.lc=len           // Continue with a stupid byte store.
-       ;;
-5:
-       st1 [dst1]=r0,1
-       br.cloop.dptk 5b
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // Here we simply restart the loop but instead
-       // of doing loads we fill the pipeline with zeroes
-       // We can't simply store r0 because we may have valid
-       // data in transit in the pipeline.
-       // ar.lc and ar.ec are setup correctly at this point
-       //
-       // we MUST use src1/endsrc here and not dst1/enddst because
-       // of the pipeline effect.
-       //
-.failure_in3:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-       ;;
-2:
-(p16)  mov val1[0]=r0
-(p16)  mov val2[0]=r0
-(EPI)  st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI)  st8 [dst2]=val2[PIPE_DEPTH-1],16
-       br.ctop.dptk 2b
-       ;;
-       cmp.ne p6,p0=dst1,enddst        // Do we need to finish the tail ?
-       sub len=enddst,dst1,1           // precompute len
-(p6)   br.cond.dptk .failure_in1bis
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-.failure_in2:
-       sub ret0=endsrc,src1
-       cmp.ne p6,p0=dst1,enddst        // Do we need to finish the tail ?
-       sub len=enddst,dst1,1           // precompute len
-(p6)   br.cond.dptk .failure_in1bis
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // handling of failures on stores: that's the easy part
-       //
-.failure_out:
-       sub ret0=enddst,dst1
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-END(__copy_user)
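
The SWITCH()/CASE()/BODY() machinery above exists because shrp needs an immediate shift count, so the misaligned-copy body is instantiated once per possible byte offset. The underlying technique, building each destination word from two back-to-back source words, can be sketched in C as follows (illustrative only, not part of the patch; little-endian byte order and 0 < rshift < 64 are assumed, and shifted_copy_sketch() is a made-up name):

#include <stdint.h>
#include <stddef.h>

static void shifted_copy_sketch(uint64_t *dst, const uint64_t *src,
                                size_t nwords, unsigned rshift)
{
        /* src points at the aligned word that holds the first byte to copy;
         * rshift is 8 * (byte offset of that first byte within the word) */
        unsigned lshift = 64 - rshift;
        uint64_t lo = src[0];
        size_t i;

        for (i = 0; i < nwords; i++) {
                uint64_t hi = src[i + 1];
                /* combine the tail of the current word with the head of the
                 * next one; a single shrp does this on ia64 */
                dst[i] = (lo >> rshift) | (hi << lshift);
                lo = hi;
        }
}
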
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/csum_partial_copy.c
--- a/xen/arch/ia64/linux/lib/csum_partial_copy.c       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,151 +0,0 @@
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <asm/uaccess.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned
-short from64to16(unsigned long x)
-{
-       /* add up 32-bit words for 33 bits */
-       x = (x & 0xffffffff) + (x >> 32);
-       /* add up 16-bit and 17-bit words for 17+c bits */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up 16-bit and 2-bit for 16+c bit */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up carry.. */
-       x = (x & 0xffff) + (x >> 16);
-       return x;
-}
-
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
-       int odd, count;
-       unsigned long result = (unsigned long)psum;
-
-       if (len <= 0)
-               goto out;
-       odd = 1 & (unsigned long) buff;
-       if (odd) {
-               result = *buff << 8;
-               len--;
-               buff++;
-       }
-       count = len >> 1;               /* nr of 16-bit words.. */
-       if (count) {
-               if (2 & (unsigned long) buff) {
-                       result += *(unsigned short *) buff;
-                       count--;
-                       len -= 2;
-                       buff += 2;
-               }
-               count >>= 1;            /* nr of 32-bit words.. */
-               if (count) {
-                       if (4 & (unsigned long) buff) {
-                               result += *(unsigned int *) buff;
-                               count--;
-                               len -= 4;
-                               buff += 4;
-                       }
-                       count >>= 1;    /* nr of 64-bit words.. */
-                       if (count) {
-                               unsigned long carry = 0;
-                               do {
-                                       unsigned long w = *(unsigned long *) buff;
-                                       count--;
-                                       buff += 8;
-                                       result += carry;
-                                       result += w;
-                                       carry = (w > result);
-                               } while (count);
-                               result += carry;
-                               result = (result & 0xffffffff) + (result >> 32);
-                       }
-                       if (len & 4) {
-                               result += *(unsigned int *) buff;
-                               buff += 4;
-                       }
-               }
-               if (len & 2) {
-                       result += *(unsigned short *) buff;
-                       buff += 2;
-               }
-       }
-       if (len & 1)
-               result += *buff;
-
-       result = from64to16(result);
-
-       if (odd)
-               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-out:
-       return result;
-}
-
-/*
- * XXX Fixme
- *
- * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
- * But it's very tricky to get right even in C.
- */
-extern unsigned long do_csum(const unsigned char *, long);
-
-static unsigned int
-do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
-                               int len, unsigned int psum, int *errp)
-{
-       unsigned long result;
-
-       /* XXX Fixme
-        * for now we separate the copy from checksum for obvious
-        * alignment difficulties. Look at the Alpha code and you'll be
-        * scared.
-        */
-
-       if (__copy_from_user(dst, src, len) != 0 && errp)
-               *errp = -EFAULT;
-
-       result = do_csum(dst, len);
-
-       /* add in old sum, and carry.. */
-       result += psum;
-       /* 32+c bits -> 32 bits */
-       result = (result & 0xffffffff) + (result >> 32);
-       return result;
-}
-
-unsigned int
-csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
-                            int len, unsigned int sum, int *errp)
-{
-       if (!access_ok(VERIFY_READ, src, len)) {
-               *errp = -EFAULT;
-               memset(dst, 0, len);
-               return sum;
-       }
-
-       return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
-}
-
-unsigned int
-csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst,
-                         int len, unsigned int sum)
-{
-       return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
-}
-
-EXPORT_SYMBOL(csum_partial_copy_nocheck);
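
A caller-side sketch of the *errp convention implemented above (illustrative only, not part of the patch; recv_csum_sketch() and its buffers are hypothetical, and the prototype used is the one removed in this hunk):

#include <linux/types.h>
#include <asm/checksum.h>       /* assumed to provide the prototype removed above */

static int recv_csum_sketch(const unsigned char __user *usrc,
                            unsigned char *kbuf, int len)
{
        int err = 0;
        unsigned int csum = csum_partial_copy_from_user(usrc, kbuf, len, 0, &err);

        if (err)
                return err;             /* -EFAULT: the user copy faulted */
        return (int)csum;               /* 32-bit partial sum; fold before use */
}
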
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/dec_and_lock.c
--- a/xen/arch/ia64/linux/lib/dec_and_lock.c    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2003 Jerome Marchand, Bull S.A.
- *     Cleaned up by David Mosberger-Tang <davidm@xxxxxxxxxx>
- *
- * This file is released under the GPLv2, or at your option any later version.
- *
- * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction.  This
- * code is an adaptation of the x86 version of "atomic_dec_and_lock()".
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-
-/*
- * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock.  Both of these
- * operations have to be done atomically, so that the count doesn't drop to zero without
- * acquiring the spinlock first.
- */
-int
-_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock)
-{
-       int old, new;
-
-       do {
-               old = atomic_read(refcount);
-               new = old - 1;
-
-               if (unlikely (old == 1)) {
-                       /* oops, we may be decrementing to zero, do it the slow way... */
-                       spin_lock(lock);
-                       if (atomic_dec_and_test(refcount))
-                               return 1;
-                       spin_unlock(lock);
-                       return 0;
-               }
-       } while (cmpxchg(&refcount->counter, old, new) != old);
-       return 0;
-}
-
-EXPORT_SYMBOL(_atomic_dec_and_lock);
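
The usual caller pattern this helper serves, dropping a reference and taking a lock only when the count hits zero so that the final teardown is serialized, looks roughly like the sketch below (illustrative only, not part of the patch; struct obj_sketch and put_obj_sketch() are hypothetical names):

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <asm/atomic.h>

struct obj_sketch {
        atomic_t refcount;
        struct list_head node;
};

static void put_obj_sketch(struct obj_sketch *o, spinlock_t *list_lock)
{
        /* atomic_dec_and_lock() is the generic wrapper around
         * _atomic_dec_and_lock() above */
        if (atomic_dec_and_lock(&o->refcount, list_lock)) {
                /* count reached zero and list_lock is now held */
                list_del(&o->node);
                spin_unlock(list_lock);
                kfree(o);
        }
        /* otherwise the count was > 1 and has simply been decremented */
}
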
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/do_csum.S
--- a/xen/arch/ia64/linux/lib/do_csum.S Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,323 +0,0 @@
-/*
- *
- * Optimized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- *     in0: address of buffer to checksum (char *)
- *     in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * 02/04/22    Ken Chen <kenneth.w.chen@xxxxxxxxx>
- *             Data locality study on the checksum buffer.
- *             More optimization cleanup - remove excessive stop bits.
- * 02/04/08    David Mosberger <davidm@xxxxxxxxxx>
- *             More cleanup and tuning.
- * 01/04/18    Jun Nakajima <jun.nakajima@xxxxxxxxx>
- *             Clean up and optimize and the software pipeline, loading two
- *             Clean up and optimize the software pipeline, loading two
- *             for the loop. Support the cases where load latency = 1 or 2.
- *             Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-//     The goal is to go as quickly as possible to the point where
-//     we can checksum 16 bytes/loop. Before reaching that point we must
-//     take care of incorrect alignment of first byte.
-//
-//     The code hereafter also takes care of the "tail" part of the buffer
-//     before entering the core loop, if any. The checksum is a sum so it
-//     allows us to commute operations. So we do the "head" and "tail"
-//     first to finish at full speed in the body. Once we get the head and
-//     tail values, we feed them into the pipeline, very handy initialization.
-//
-//     Of course we deal with the special case where the whole buffer fits
-//     into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-//     We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-//     possible load latency and also to accommodate for head and tail.
-//
-//     The end of the function deals with folding the checksum from 64bits
-//     down to 16bits taking care of the carry.
-//
-//     This version avoids synchronization in the core loop by also using a
-//     pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-//      wordx[] (x=1,2)
-//     |---|
-//      |   | 0                        : new value loaded in pipeline
-//     |---|
-//      |   | -                        : in transit data
-//     |---|
-//      |   | LOAD_LATENCY     : current value to add to checksum
-//     |---|
-//      |   | LOAD_LATENCY+1   : previous value added to checksum
-//      |---|                  (previous iteration)
-//
-//     resultx[] (x=1,2)
-//     |---|
-//      |   | 0                        : initial value
-//     |---|
-//      |   | LOAD_LATENCY-1   : new checksum
-//     |---|
-//      |   | LOAD_LATENCY     : previous value of checksum
-//     |---|
-//      |   | LOAD_LATENCY+1   : final checksum when out of the loop
-//      |---|
-//
-//
-//     See RFC1071 "Computing the Internet Checksum" for various techniques for
-//     calculating the Internet checksum.
-//
-// NOT YET DONE:
-//     - Maybe another algorithm which would take care of the folding at the
-//       end in a different manner
-//     - Work with people more knowledgeable than me on the network stack
-//       to figure out if we could not split the function depending on the
-//       type of packet or alignment we get. Like the ip_fast_csum() routine
-//       where we know we have at least 20bytes worth of data to checksum.
-//     - Do a better job of handling small packets.
-//     - Note on prefetching: it was found that under various load, i.e. ftp read/write,
-//       nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
-//       on the data that buffer points to (partly because the checksum is often preceded by
-//       a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
-//       the data is already in the cache.
-//
-
-#define saved_pfs      r11
-#define hmask          r16
-#define tmask          r17
-#define first1         r18
-#define firstval       r19
-#define firstoff       r20
-#define last           r21
-#define lastval                r22
-#define lastoff                r23
-#define saved_lc       r24
-#define saved_pr       r25
-#define tmp1           r26
-#define tmp2           r27
-#define tmp3           r28
-#define carry1         r29
-#define carry2         r30
-#define first2         r31
-
-#define buf            in0
-#define len            in1
-
-#define LOAD_LATENCY   2       // XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH                     (LOAD_LATENCY+2)
-#define ELD    p[LOAD_LATENCY]         // end of load
-#define ELD_1  p[LOAD_LATENCY+1]       // and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,2,16,0,16
-       .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
-       .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
-       mov ret0=r0             // in case we have zero length
-       cmp.lt p0,p6=r0,len     // check for zero length or negative (32bit len)
-       ;;
-       add tmp1=buf,len        // last byte's address
-       .save pr, saved_pr
-       mov saved_pr=pr         // preserve predicates (rotation)
-(p6)   br.ret.spnt.many rp     // return if zero or negative length
-
-       mov hmask=-1            // initialize head mask
-       tbit.nz p15,p0=buf,0    // is buf an odd address?
-       and first1=-8,buf       // 8-byte align down address of first1 element
-
-       and firstoff=7,buf      // how many bytes off for first1 element
-       mov tmask=-1            // initialize tail mask
-
-       ;;
-       adds tmp2=-1,tmp1       // last-1
-       and lastoff=7,tmp1      // how many bytes off for last element
-       ;;
-       sub tmp1=8,lastoff      // complement to lastoff
-       and last=-8,tmp2        // address of word containing last byte
-       ;;
-       sub tmp3=last,first1    // tmp3=distance from first1 to last
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc      // save lc
-       cmp.eq p8,p9=last,first1        // everything fits in one word ?
-
-       ld8 firstval=[first1],8 // load, ahead of time, "first1" word
-       and tmp1=7, tmp1        // make sure that if tmp1==8 -> tmp1=0
-       shl tmp2=firstoff,3     // number of bits
-       ;;
-(p9)   ld8 lastval=[last]      // load, ahead of time, "last" word, if needed
-       shl tmp1=tmp1,3         // number of bits
-(p9)   adds tmp3=-8,tmp3       // effectively loaded
-       ;;
-(p8)   mov lastval=r0          // we don't need lastval if first1==last
-       shl hmask=hmask,tmp2    // build head mask, mask off [0,first1off[
-       shr.u tmask=tmask,tmp1  // build tail mask, mask off ]8,lastoff]
-       ;;
-       .body
-#define count tmp3
-
-(p8)   and hmask=hmask,tmask   // apply tail mask to head mask if 1 word only
-(p9)   and word2[0]=lastval,tmask      // mask last it as appropriate
-       shr.u count=count,3     // how many 8-byte?
-       ;;
-       // If count is odd, finish this 8-byte word so that we can
-       // load two back-to-back 8-byte words per loop thereafter.
-       and word1[0]=firstval,hmask     // and mask it as appropriate
-       tbit.nz p10,p11=count,0         // if (count is odd)
-       ;;
-(p8)   mov result1[0]=word1[0]
-(p9)   add result1[0]=word1[0],word2[0]
-       ;;
-       cmp.ltu p6,p0=result1[0],word1[0]       // check the carry
-       cmp.eq.or.andcm p8,p0=0,count           // exit if zero 8-byte
-       ;;
-(p6)   adds result1[0]=1,result1[0]
-(p8)   br.cond.dptk .do_csum_exit      // if (within an 8-byte word)
-(p11)  br.cond.dptk .do_csum16         // if (count is even)
-
-       // Here count is odd.
-       ld8 word1[1]=[first1],8         // load an 8-byte word
-       cmp.eq p9,p10=1,count           // if (count == 1)
-       adds count=-1,count             // loaded an 8-byte word
-       ;;
-       add result1[0]=result1[0],word1[1]
-       ;;
-       cmp.ltu p6,p0=result1[0],word1[1]
-       ;;
-(p6)   adds result1[0]=1,result1[0]
-(p9)   br.cond.sptk .do_csum_exit      // if (count == 1) exit
-       // Fall through to calculate the checksum, feeding result1[0] as
-       // the initial value in result1[0].
-       //
-       // Calculate the checksum loading two 8-byte words per loop.
-       //
-.do_csum16:
-       add first2=8,first1
-       shr.u count=count,1     // we do 16 bytes per loop
-       ;;
-       adds count=-1,count
-       mov carry1=r0
-       mov carry2=r0
-       brp.loop.imp 1f,2f
-       ;;
-       mov ar.ec=PIPE_DEPTH
-       mov ar.lc=count // set lc
-       mov pr.rot=1<<16
-       // result1[0] must be initialized in advance.
-       mov result2[0]=r0
-       ;;
-       .align 32
-1:
-(ELD_1)        cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1)        cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD)  add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD)  add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0]) ld8 word1[0]=[first1],16
-(p[0]) ld8 word2[0]=[first2],16
-       br.ctop.sptk 1b
-       ;;
-       // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1   // since we miss the last one
-(pC2[1])adds carry2=1,carry2
-       ;;
-       add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
-       add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
-       ;;
-       cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
-       cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
-       ;;
-(p6)   adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7)   adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
-       ;;
-       add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
-       ;;
-       cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
-       ;;
-(p6)   adds result1[0]=1,result1[0]
-       ;;
-.do_csum_exit:
-       //
-       // now fold 64 into 16 bits taking care of carry
-       // that's not very good because it has lots of sequentiality
-       //
-       mov tmp3=0xffff
-       zxt4 tmp1=result1[0]
-       shr.u tmp2=result1[0],32
-       ;;
-       add result1[0]=tmp1,tmp2
-       ;;
-       and tmp1=result1[0],tmp3
-       shr.u tmp2=result1[0],16
-       ;;
-       add result1[0]=tmp1,tmp2
-       ;;
-       and tmp1=result1[0],tmp3
-       shr.u tmp2=result1[0],16
-       ;;
-       add result1[0]=tmp1,tmp2
-       ;;
-       and tmp1=result1[0],tmp3
-       shr.u tmp2=result1[0],16
-       ;;
-       add ret0=tmp1,tmp2
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       // if buf was odd then swap bytes
-       mov ar.pfs=saved_pfs            // restore ar.ec
-(p15)  mux1 ret0=ret0,@rev             // reverse word
-       ;;
-       mov ar.lc=saved_lc
-(p15)  shr.u ret0=ret0,64-16   // + shift back to position = swap bytes
-       br.ret.sptk.many rp
-
-//     I (Jun Nakajima) wrote an equivalent code (see below), but it was
-//     not much better than the original. So keep the original there so that
-//     someone else can challenge.
-//
-//     shr.u word1[0]=result1[0],32
-//     zxt4 result1[0]=result1[0]
-//     ;;
-//     add result1[0]=result1[0],word1[0]
-//     ;;
-//     zxt2 result2[0]=result1[0]
-//     extr.u word1[0]=result1[0],16,16
-//     shr.u carry1=result1[0],32
-//     ;;
-//     add result2[0]=result2[0],word1[0]
-//     ;;
-//     add result2[0]=result2[0],carry1
-//     ;;
-//     extr.u ret0=result2[0],16,16
-//     ;;
-//     add ret0=ret0,result2[0]
-//     ;;
-//     zxt2 ret0=ret0
-//     mov ar.pfs=saved_pfs             // restore ar.ec
-//     mov pr=saved_pr,0xffffffffffff0000
-//     ;;
-//     // if buf was odd then swap bytes
-//     mov ar.lc=saved_lc
-//(p15)        mux1 ret0=ret0,@rev             // reverse word
-//     ;;
-//(p15)        shr.u ret0=ret0,64-16   // + shift back to position = swap bytes
-//     br.ret.sptk.many rp
-
-END(do_csum)
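
The head/tail masking described in the theory-of-operations comment can be expressed compactly in C. The sketch below is illustrative only and not part of the patch: it shows how hmask/tmask discard the bytes outside [buf, buf+len) so that only whole, aligned 8-byte words are summed, but it leaves out the software pipelining, the final 64-to-16-bit fold and the odd-address byte swap (little-endian byte order is assumed; do_csum_sketch() is a made-up name):

#include <stdint.h>

static uint64_t do_csum_sketch(const unsigned char *buf, long len)
{
        const uint64_t *first, *last, *p;
        unsigned firstoff, lastoff;
        uint64_t hmask, tmask, sum = 0, w;

        if (len <= 0)
                return 0;
        first    = (const uint64_t *)((uintptr_t)buf & ~(uintptr_t)7);
        last     = (const uint64_t *)(((uintptr_t)buf + len - 1) & ~(uintptr_t)7);
        firstoff = (uintptr_t)buf & 7;
        lastoff  = ((uintptr_t)buf + len - 1) & 7;
        hmask    = ~(uint64_t)0 << (8 * firstoff);        /* drop bytes before buf */
        tmask    = ~(uint64_t)0 >> (8 * (7 - lastoff));   /* drop bytes past the end */
        if (first == last)
                hmask &= tmask;                           /* whole buffer in one word */

        for (p = first; p <= last; p++) {
                w = *p;                                   /* aligned 8-byte load */
                if (p == first)
                        w &= hmask;
                else if (p == last)
                        w &= tmask;
                sum += w;
                if (sum < w)
                        sum++;                            /* end-around carry */
        }
        return sum;     /* caller still folds 64 -> 16 bits, swapping bytes for odd buf */
}
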
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/io.c
--- a/xen/arch/ia64/linux/lib/io.c      Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,165 +0,0 @@
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- * This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
-       char *dst = to;
-
-       while (count) {
-               count--;
-               *dst++ = readb(from++);
-       }
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- * This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
-       const char *src = from;
-
-       while (count) {
-               count--;
-               writeb(*src++, to++);
-       }
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
-       unsigned char ch = (char)(c & 0xff);
-
-       while (count) {
-               count--;
-               writeb(ch, dst);
-               dst++;
-       }
-}
-EXPORT_SYMBOL(memset_io);
-
-#ifdef CONFIG_IA64_GENERIC
-
-#undef __ia64_inb
-#undef __ia64_inw
-#undef __ia64_inl
-#undef __ia64_outb
-#undef __ia64_outw
-#undef __ia64_outl
-#undef __ia64_readb
-#undef __ia64_readw
-#undef __ia64_readl
-#undef __ia64_readq
-#undef __ia64_readb_relaxed
-#undef __ia64_readw_relaxed
-#undef __ia64_readl_relaxed
-#undef __ia64_readq_relaxed
-#undef __ia64_writeb
-#undef __ia64_writew
-#undef __ia64_writel
-#undef __ia64_writeq
-#undef __ia64_mmiowb
-
-unsigned int
-__ia64_inb (unsigned long port)
-{
-       return ___ia64_inb(port);
-}
-
-unsigned int
-__ia64_inw (unsigned long port)
-{
-       return ___ia64_inw(port);
-}
-
-unsigned int
-__ia64_inl (unsigned long port)
-{
-       return ___ia64_inl(port);
-}
-
-void
-__ia64_outb (unsigned char val, unsigned long port)
-{
-       ___ia64_outb(val, port);
-}
-
-void
-__ia64_outw (unsigned short val, unsigned long port)
-{
-       ___ia64_outw(val, port);
-}
-
-void
-__ia64_outl (unsigned int val, unsigned long port)
-{
-       ___ia64_outl(val, port);
-}
-
-unsigned char
-__ia64_readb (void __iomem *addr)
-{
-       return ___ia64_readb (addr);
-}
-
-unsigned short
-__ia64_readw (void __iomem *addr)
-{
-       return ___ia64_readw (addr);
-}
-
-unsigned int
-__ia64_readl (void __iomem *addr)
-{
-       return ___ia64_readl (addr);
-}
-
-unsigned long
-__ia64_readq (void __iomem *addr)
-{
-       return ___ia64_readq (addr);
-}
-
-unsigned char
-__ia64_readb_relaxed (void __iomem *addr)
-{
-       return ___ia64_readb (addr);
-}
-
-unsigned short
-__ia64_readw_relaxed (void __iomem *addr)
-{
-       return ___ia64_readw (addr);
-}
-
-unsigned int
-__ia64_readl_relaxed (void __iomem *addr)
-{
-       return ___ia64_readl (addr);
-}
-
-unsigned long
-__ia64_readq_relaxed (void __iomem *addr)
-{
-       return ___ia64_readq (addr);
-}
-
-void
-__ia64_mmiowb(void)
-{
-       ___ia64_mmiowb();
-}
-
-#endif /* CONFIG_IA64_GENERIC */
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/ip_fast_csum.S
--- a/xen/arch/ia64/linux/lib/ip_fast_csum.S    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,90 +0,0 @@
-/*
- * Optimized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- *      in0: address of buffer to checksum (char *)
- *      in1: length of the buffer (int)
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@xxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-/*
- * Since we know that most likely this function is called with buf aligned
- * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
- * versus calling the generic version of do_csum, which has lots of overhead in
- * handling various alignments and sizes.  However, due to the lack of constraints
- * put on the function input argument, cases with alignment not on 4-byte or
- * size not equal to 20 bytes will be handled by the generic do_csum function.
- */
-
-#define in0    r32
-#define in1    r33
-#define ret0   r8
-
-GLOBAL_ENTRY(ip_fast_csum)
-       .prologue
-       .body
-       cmp.ne  p6,p7=5,in1     // size other than 20 byte?
-       and     r14=3,in0       // is it aligned on 4-byte?
-       add     r15=4,in0       // second source pointer
-       ;;
-       cmp.ne.or.andcm p6,p7=r14,r0
-       ;;
-(p7)   ld4     r20=[in0],8
-(p7)   ld4     r21=[r15],8
-(p6)   br.spnt .generic
-       ;;
-       ld4     r22=[in0],8
-       ld4     r23=[r15],8
-       ;;
-       ld4     r24=[in0]
-       add     r20=r20,r21
-       add     r22=r22,r23
-       ;;
-       add     r20=r20,r22
-       ;;
-       add     r20=r20,r24
-       ;;
-       shr.u   ret0=r20,16     // now need to add the carry
-       zxt2    r20=r20
-       ;;
-       add     r20=ret0,r20
-       ;;
-       shr.u   ret0=r20,16     // add carry again
-       zxt2    r20=r20
-       ;;
-       add     r20=ret0,r20
-       ;;
-       shr.u   ret0=r20,16
-       zxt2    r20=r20
-       ;;
-       add     r20=ret0,r20
-       ;;
-       andcm   ret0=-1,r20
-       .restore sp             // reset frame state
-       br.ret.sptk.many b0
-       ;;
-
-.generic:
-       .prologue
-       .save ar.pfs, r35
-       alloc   r35=ar.pfs,2,2,2,0
-       .save rp, r34
-       mov     r34=b0
-       .body
-       dep.z   out1=in1,2,30
-       mov     out0=in0
-       ;;
-       br.call.sptk.many b0=do_csum
-       ;;
-       andcm   ret0=-1,ret0
-       mov     ar.pfs=r35
-       mov     b0=r34
-       br.ret.sptk.many b0
-END(ip_fast_csum)
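
The fast path above boils down to summing the five 32-bit words of a standard 20-byte IP header in a wide accumulator, folding to 16 bits, and complementing the result. A user-space C sketch, illustrative only and not part of the patch (ip_fast_csum_sketch() is a made-up name; 4-byte alignment and ihl == 5 are assumed, matching the fast path's own preconditions):

#include <stdint.h>

static uint16_t ip_fast_csum_sketch(const void *iph)
{
        const uint32_t *w = iph;                /* assumes 4-byte alignment, ihl == 5 */
        uint64_t sum = (uint64_t)w[0] + w[1] + w[2] + w[3] + w[4];

        sum = (sum & 0xffff) + (sum >> 16);     /* first fold, absorbs most of the carries */
        sum = (sum & 0xffff) + (sum >> 16);     /* second fold */
        sum = (sum & 0xffff) + (sum >> 16);     /* third fold catches the last carry */
        return (uint16_t)~sum;                  /* complemented 16-bit checksum */
}
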
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy.S
--- a/xen/arch/ia64/linux/lib/memcpy.S  Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,301 +0,0 @@
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- *     in0:    destination address
- *     in1:    source address
- *     in2:    number of bytes to copy
- * Output:
- *     no return value
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-#      define MEM_LAT  21              /* latency to memory */
-
-#      define dst      r2
-#      define src      r3
-#      define retval   r8
-#      define saved_pfs r9
-#      define saved_lc r10
-#      define saved_pr r11
-#      define cnt      r16
-#      define src2     r17
-#      define t0       r18
-#      define t1       r19
-#      define t2       r20
-#      define t3       r21
-#      define t4       r22
-#      define src_end  r23
-
-#      define N        (MEM_LAT + 4)
-#      define Nrot     ((N + 7) & ~7)
-
-       /*
-        * First, check if everything (src, dst, len) is a multiple of eight.  If
-        * so, we handle everything with no taken branches (other than the loop
-        * itself) and a small icache footprint.  Otherwise, we jump off to
-        * the more general copy routine handling arbitrary
-        * sizes/alignment etc.
-        */
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc
-       or t0=in0,in1
-       ;;
-
-       or t0=t0,in2
-       .save pr, saved_pr
-       mov saved_pr=pr
-
-       .body
-
-       cmp.eq p6,p0=in2,r0     // zero length?
-       mov retval=in0          // return dst
-(p6)   br.ret.spnt.many rp     // zero length, return immediately
-       ;;
-
-       mov dst=in0             // copy because of rotation
-       shr.u cnt=in2,3         // number of 8-byte words to copy
-       mov pr.rot=1<<16
-       ;;
-
-       adds cnt=-1,cnt         // br.ctop is repeat/until
-       cmp.gtu p7,p0=16,in2    // copying less than 16 bytes?
-       mov ar.ec=N
-       ;;
-
-       and t0=0x7,t0
-       mov ar.lc=cnt
-       ;;
-       cmp.ne p6,p0=t0,r0
-
-       mov src=in1             // copy because of rotation
-(p7)   br.cond.spnt.few .memcpy_short
-(p6)   br.cond.spnt.few .memcpy_long
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       nop.i   0
-       ;;
-       nop.m   0
-       ;;
-       .rotr val[N]
-       .rotp p[N]
-       .align 32
-1: { .mib
-(p[0]) ld8 val[0]=[src],8
-       nop.i 0
-       brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
-       nop.f 0
-       br.ctop.dptk.few 1b
-}
-       ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,-1
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       /*
-        * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
-        * copy loop.  This performs relatively poorly on Itanium, but it doesn't
-        * get used very often (gcc inlines small copies) and due to atomicity
-        * issues, we want to avoid read-modify-write of entire words.
-        */
-       .align 32
-.memcpy_short:
-       adds cnt=-1,in2         // br.ctop is repeat/until
-       mov ar.ec=MEM_LAT
-       brp.loop.imp 1f, 2f
-       ;;
-       mov ar.lc=cnt
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       nop.i   0
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       ;;
-       /*
-        * It is faster to put a stop bit in the loop here because it makes
-        * the pipeline shorter (and latency is what matters on short copies).
-        */
-       .align 32
-1: { .mib
-(p[0]) ld1 val[0]=[src],1
-       nop.i 0
-       brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
-       nop.f 0
-       br.ctop.dptk.few 1b
-} ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,-1
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       /*
-        * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
-        * an overriding concern here, but throughput is.  We first do
-        * sub-word copying until the destination is aligned, then we check
-        * if the source is also aligned.  If so, we do a simple load/store-loop
-        * until there are less than 8 bytes left over and then we do the tail,
-        * by storing the last few bytes using sub-word copying.  If the source
-        * is not aligned, we branch off to the non-congruent loop.
-        *
-        *   stage:   op:
-        *         0  ld
-        *         :
-        * MEM_LAT+3  shrp
-        * MEM_LAT+4  st
-        *
-        * On Itanium, the pipeline itself runs without stalls.  However, br.ctop
-        * seems to introduce an unavoidable bubble in the pipeline so the overall
-        * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
-        * of 4 byte/cycle.  Still not bad.
-        */
-#      undef N
-#      undef Nrot
-#      define N        (MEM_LAT + 5)           /* number of stages */
-#      define Nrot     ((N+1 + 2 + 7) & ~7)    /* number of rotating regs */
-
-#define LOG_LOOP_SIZE  6
-
-.memcpy_long:
-       alloc t3=ar.pfs,3,Nrot,0,Nrot   // resize register frame
-       and t0=-8,src           // t0 = src & ~7
-       and t2=7,src            // t2 = src & 7
-       ;;
-       ld8 t0=[t0]             // t0 = 1st source word
-       adds src2=7,src         // src2 = (src + 7)
-       sub t4=r0,dst           // t4 = -dst
-       ;;
-       and src2=-8,src2        // src2 = (src + 7) & ~7
-       shl t2=t2,3             // t2 = 8*(src & 7)
-       shl t4=t4,3             // t4 = 8*(dst & 7)
-       ;;
-       ld8 t1=[src2]           // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
-       sub t3=64,t2            // t3 = 64-8*(src & 7)
-       shr.u t0=t0,t2
-       ;;
-       add src_end=src,in2
-       shl t1=t1,t3
-       mov pr=t4,0x38          // (p5,p4,p3)=(dst & 7)
-       ;;
-       or t0=t0,t1
-       mov cnt=r0
-       adds src_end=-1,src_end
-       ;;
-(p3)   st1 [dst]=t0,1
-(p3)   shr.u t0=t0,8
-(p3)   adds cnt=1,cnt
-       ;;
-(p4)   st2 [dst]=t0,2
-(p4)   shr.u t0=t0,16
-(p4)   adds cnt=2,cnt
-       ;;
-(p5)   st4 [dst]=t0,4
-(p5)   adds cnt=4,cnt
-       and src_end=-8,src_end  // src_end = last word of source buffer
-       ;;
-
-       // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
-
-1:{    add src=cnt,src                 // make src point to remainder of source buffer
-       sub cnt=in2,cnt                 // cnt = number of bytes left to copy
-       mov t4=ip
-  }    ;;
-       and src2=-8,src                 // align source pointer
-       adds t4=.memcpy_loops-1b,t4
-       mov ar.ec=N
-
-       and t0=7,src                    // t0 = src & 7
-       shr.u t2=cnt,3                  // t2 = number of 8-byte words left to copy
-       shl cnt=cnt,3                   // move bits 0-2 to 3-5
-       ;;
-
-       .rotr val[N+1], w[2]
-       .rotp p[N]
-
-       cmp.ne p6,p0=t0,r0              // is src aligned, too?
-       shl t0=t0,LOG_LOOP_SIZE         // t0 = 8*(src & 7)
-       adds t2=-1,t2                   // br.ctop is repeat/until
-       ;;
-       add t4=t0,t4
-       mov pr=cnt,0x38                 // set (p5,p4,p3) to # of last-word bytes to copy
-       mov ar.lc=t2
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       nop.i   0
-       ;;
-       nop.m   0
-       ;;
-(p6)   ld8 val[1]=[src2],8             // prime the pump...
-       mov b6=t4
-       br.sptk.few b6
-       ;;
-
-.memcpy_tail:
-       // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
-       // less than 8) and t0 contains the last few bytes of the src buffer:
-(p5)   st4 [dst]=t0,4
-(p5)   shr.u t0=t0,32
-       mov ar.lc=saved_lc
-       ;;
-(p4)   st2 [dst]=t0,2
-(p4)   shr.u t0=t0,16
-       mov ar.pfs=saved_pfs
-       ;;
-(p3)   st1 [dst]=t0
-       mov pr=saved_pr,-1
-       br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
-       .align 64
-
-#define COPY(shift,index)                                                              \
- 1: { .mib                                                                             \
-       (p[0])          ld8 val[0]=[src2],8;                                            \
-       (p[MEM_LAT+3])  shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;            \
-                       brp.loop.imp 1b, 2f                                             \
-    };                                                                                 \
- 2: { .mfb                                                                             \
-       (p[MEM_LAT+4])  st8 [dst]=w[1],8;                                               \
-                       nop.f 0;                                                        \
-                       br.ctop.dptk.few 1b;                                            \
-    };                                                                                 \
-                       ;;                                                              \
-                       ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */    \
-                       ;;                                                              \
-                       shrp t0=val[N-1],val[N-index],shift;                            \
-                       br .memcpy_tail
-.memcpy_loops:
-       COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
-       COPY(8, 0)
-       COPY(16, 0)
-       COPY(24, 0)
-       COPY(32, 0)
-       COPY(40, 0)
-       COPY(48, 0)
-       COPY(56, 0)
-
-END(memcpy)
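
For reference, the strategy described by the removed memcpy.S comments can be sketched in C roughly as follows (hypothetical name; the byte loop below stands in for both the short-copy path and the software-pipelined unaligned path of the real code):

/* Sketch of the removed memcpy's dispatch: a fast 8-byte-word loop when
 * src, dst and len are all multiples of eight, a byte loop otherwise. */
#include <stddef.h>
#include <stdint.h>

void *memcpy_sketch(void *dst, const void *src, size_t len)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    if ((((uintptr_t)d | (uintptr_t)s | len) & 7) == 0) {
        /* Everything is a multiple of eight: copy whole words. */
        for (size_t i = 0; i < len; i += 8)
            *(uint64_t *)(d + i) = *(const uint64_t *)(s + i);
    } else {
        /* Short or unaligned copies: plain byte-at-a-time loop here;
         * the assembly instead uses separate pipelined paths. */
        for (size_t i = 0; i < len; i++)
            d[i] = s[i];
    }
    return dst;
}
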
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strlen_user.S
--- a/xen/arch/ia64/linux/lib/strlen_user.S     Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,198 +0,0 @@
-/*
- * Optimized version of the strlen_user() function
- *
- * Inputs:
- *     in0     address of buffer
- *
- * Outputs:
- *     ret0    0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * 01/19/99 S.Eranian heavily enhanced version (see details below)
- * 09/24/99 S.Eranian added speculation recovery code
- */
-
-#include <asm/asmmacro.h>
-
-//
-// int strlen_user(char *)
-// ------------------------
-// Returns:
-//     - length of string + 1
-//     - 0 in case an exception is raised
-//
-// This is an enhanced version of the basic strlen_user. It includes a
-// combination of compute zero index (czx), parallel comparisons, speculative
-// loads and loop unroll using rotating registers.
-//
-// General Ideas about the algorithm:
-//       The goal is to look at the string in chunks of 8 bytes.
-//       so we need to do a few extra checks at the beginning because the
-//       string may not be 8-byte aligned. In this case we load the 8byte
-//       quantity which includes the start of the string and mask the unused
-//       bytes with 0xff to avoid confusing czx.
-//       We use speculative loads and software pipelining to hide memory
-//       latency and do read ahead safely. This way we defer any exception.
-//
-//       Because we don't want the kernel to be relying on particular
-//       settings of the DCR register, we provide recovery code in case
-//       speculation fails. The recovery code is going to "redo" the work using
-//       only normal loads. If we still get a fault then we return an
-//       error (ret0=0). Otherwise we return the strlen+1 as usual.
-//       The fact that speculation may fail can be caused, for instance, by
-//       the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-//       a NaT bit will be set if the translation is not present. The normal
-//       load, on the other hand, will cause the translation to be inserted
-//       if the mapping exists.
-//
-//       It should be noted that we execute recovery code only when we need
-//       to use the data that has been speculatively loaded: we don't execute
-//       recovery code on pure read ahead data.
-//
-// Remarks:
-//     - the cmp r0,r0 is used as a fast way to initialize a predicate
-//       register to 1. This is required to make sure that we get the parallel
-//       compare correct.
-//
-//     - we don't use the epilogue counter to exit the loop but we need to set
-//       it to zero beforehand.
-//
-//     - after the loop we must test for Nat values because neither the
-//       czx nor cmp instruction raise a NaT consumption fault. We must be
-//       careful not to look too far for a Nat for which we don't care.
-//       For instance we don't need to look at a NaT in val2 if the zero byte
-//       was in val1.
-//
-//     - Clearly performance tuning is required.
-//
-
-#define saved_pfs      r11
-#define        tmp             r10
-#define base           r16
-#define orig           r17
-#define saved_pr       r18
-#define src            r19
-#define mask           r20
-#define val            r21
-#define val1           r22
-#define val2           r23
-
-GLOBAL_ENTRY(__strlen_user)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,11,0,0,8
-
-       .rotr v[2], w[2]        // declares our 4 aliases
-
-       extr.u tmp=in0,0,3      // tmp=least significant 3 bits
-       mov orig=in0            // keep track of initial byte address
-       dep src=0,in0,0,3       // src=8byte-aligned in0 address
-       .save pr, saved_pr
-       mov saved_pr=pr         // preserve predicates (rotation)
-       ;;
-
-       .body
-
-       ld8.s v[1]=[src],8      // load the initial 8bytes (must speculate)
-       shl tmp=tmp,3           // multiply by 8bits/byte
-       mov mask=-1             // our mask
-       ;;
-       ld8.s w[1]=[src],8      // load next 8 bytes in 2nd pipeline
-       cmp.eq p6,p0=r0,r0      // sets p6 (required because of // cmp.and)
-       sub tmp=64,tmp          // how many bits to shift our mask on the right
-       ;;
-       shr.u   mask=mask,tmp   // zero enough bits to hold v[1] valuable part
-       mov ar.ec=r0            // clear epilogue counter (saved in ar.pfs)
-       ;;
-       add base=-16,src        // keep track of aligned base
-       chk.s v[1], .recover    // if already NaT, then directly skip to recover
-       or v[1]=v[1],mask       // now we have a safe initial byte pattern
-       ;;
-1:
-       ld8.s v[0]=[src],8      // speculatively load next
-       czx1.r val1=v[1]        // search 0 byte from right
-       czx1.r val2=w[1]        // search 0 byte from right following 8bytes
-       ;;
-       ld8.s w[0]=[src],8      // speculatively load next to next
-       cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
-       cmp.eq.and p6,p0=8,val2 // p6 = p6 and val2==8
-(p6)   br.wtop.dptk.few 1b     // loop until p6 == 0
-       ;;
-       //
-       // We must return try the recovery code iff
-       // val1_is_nat || (val1==8 && val2_is_nat)
-       //
-       // XXX Fixme
-       //      - there must be a better way of doing the test
-       //
-       cmp.eq  p8,p9=8,val1    // p6 = val1 had zero (disambiguate)
-       tnat.nz p6,p7=val1      // test NaT on val1
-(p6)   br.cond.spnt .recover   // jump to recovery if val1 is NaT
-       ;;
-       //
-       // if we come here p7 is true, i.e., initialized for // cmp
-       //
-       cmp.eq.and  p7,p0=8,val1// val1==8?
-       tnat.nz.and p7,p0=val2  // test NaT if val2
-(p7)   br.cond.spnt .recover   // jump to recovery if val2 is NaT
-       ;;
-(p8)   mov val1=val2           // val2 contains the value
-(p8)   adds src=-16,src        // correct position when 3 ahead
-(p9)   adds src=-24,src        // correct position when 4 ahead
-       ;;
-       sub ret0=src,orig       // distance from origin
-       sub tmp=7,val1          // 7=8-1 because this strlen returns strlen+1
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       sub ret0=ret0,tmp       // length=now - back -1
-       mov ar.pfs=saved_pfs    // because of ar.ec, restore no matter what
-       br.ret.sptk.many rp     // end of normal execution
-
-       //
-       // Outlined recovery code when speculation failed
-       //
-       // This time we don't use speculation and rely on the normal exception
-       // mechanism. that's why the loop is not as good as the previous one
-       // because read ahead is not possible
-       //
-       // XXX Fixme
-       //      - today we restart from the beginning of the string instead
-       //        of trying to continue where we left off.
-       //
-.recover:
-       EX(.Lexit1, ld8 val=[base],8)   // load the initial bytes
-       ;;
-       or val=val,mask                 // remask first bytes
-       cmp.eq p0,p6=r0,r0              // nullify first ld8 in loop
-       ;;
-       //
-       // ar.ec is still zero here
-       //
-2:
-       EX(.Lexit1, (p6) ld8 val=[base],8)
-       ;;
-       czx1.r val1=val         // search 0 byte from right
-       ;;
-       cmp.eq p6,p0=8,val1     // val1==8 ?
-(p6)   br.wtop.dptk.few 2b     // loop until p6 == 0
-       ;;
-       sub ret0=base,orig      // distance from base
-       sub tmp=7,val1          // 7=8-1 because this strlen returns strlen+1
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       sub ret0=ret0,tmp       // length=now - back -1
-       mov ar.pfs=saved_pfs    // because of ar.ec, restore no matter what
-       br.ret.sptk.many rp     // end of successful recovery code
-
-       //
-       // We failed even on the normal load (called from exception handler)
-       //
-.Lexit1:
-       mov ret0=0
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.pfs=saved_pfs    // because of ar.ec, restore no matter what
-       br.ret.sptk.many rp
-END(__strlen_user)
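
The word-at-a-time scan that the removed __strlen_user performs with czx can be sketched in plain C roughly as below. The speculative loads and the .recover path have no portable C equivalent and are omitted, and unlike the kernel routine this sketch returns strlen rather than strlen+1 (names hypothetical, little-endian assumed as on IA-64 Linux):

/* Word-at-a-time strlen sketch: load 8 bytes at a time from the aligned
 * base, mask the bytes before the start of the string with 0xff so they
 * cannot look like NUL, then search each word for a zero byte.  Reading
 * past the NUL within an aligned word mirrors what the assembly does. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static size_t strlen_chunked(const char *s)
{
    const uintptr_t align = (uintptr_t)s & 7;
    const uint64_t *p = (const uint64_t *)((uintptr_t)s - align);
    uint64_t word;

    memcpy(&word, p, 8);
    if (align)
        word |= (1ULL << (8 * align)) - 1;  /* mask leading bytes to 0xff */

    for (;;) {
        for (unsigned i = 0; i < 8; i++) {
            if (((word >> (8 * i)) & 0xff) == 0)
                return (size_t)(((const char *)p + i) - s);
        }
        p++;
        memcpy(&word, p, 8);
    }
}
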
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strncpy_from_user.S
--- a/xen/arch/ia64/linux/lib/strncpy_from_user.S       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- *     in0:    address of destination buffer
- *     in1:    address of string to be copied
- *     in2:    length of buffer in bytes
- * Outputs:
- *     r8:     -EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
- *                      by Andreas Schwab <schwab@xxxxxxx>).
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
-       alloc r2=ar.pfs,3,0,0,0
-       mov r8=0
-       mov r9=in1
-       ;;
-       add r10=in1,in2
-       cmp.eq p6,p0=r0,in2
-(p6)   br.ret.spnt.many rp
-
-       // XXX braindead copy loop---this needs to be optimized
-.Loop1:
-       EX(.Lexit, ld1 r8=[in1],1)
-       ;;
-       EX(.Lexit, st1 [in0]=r8,1)
-       cmp.ne p6,p7=r8,r0
-       ;;
-(p6)   cmp.ne.unc p8,p0=in1,r10
-(p8)   br.cond.dpnt.few .Loop1
-       ;;
-(p6)   mov r8=in2              // buffer filled up---return buffer length
-(p7)   sub r8=in1,r9,1         // return string length (excluding NUL character)
-[.Lexit:]
-       br.ret.sptk.many rp
-END(__strncpy_from_user)
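
The return-value convention of the removed __strncpy_from_user (length excluding the NUL, or the buffer size if it fills up first; -EFAULT only via the exception tables) amounts to the following C sketch (hypothetical name, fault handling omitted):

#include <stddef.h>

static long strncpy_from_user_sketch(char *dst, const char *src, size_t count)
{
    size_t copied = 0;

    while (copied < count) {
        char c = src[copied];          /* EX(): a fault here would yield -EFAULT */
        dst[copied] = c;
        copied++;
        if (c == '\0')
            return (long)(copied - 1); /* string length excluding the NUL */
    }
    return (long)count;                /* buffer filled up */
}
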
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strnlen_user.S
--- a/xen/arch/ia64/linux/lib/strnlen_user.S    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,45 +0,0 @@
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen.
- *
- * Inputs:
- *     in0:    address of buffer
- *     in1:    string length limit N
- * Outputs:
- *     r8:     0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user)
-       .prologue
-       alloc r2=ar.pfs,2,0,0,0
-       .save ar.lc, r16
-       mov r16=ar.lc                   // preserve ar.lc
-
-       .body
-
-       add r3=-1,in1
-       ;;
-       mov ar.lc=r3
-       mov r9=0
-       ;;
-       // XXX braindead strlen loop---this needs to be optimized
-.Loop1:
-       EXCLR(.Lexit, ld1 r8=[in0],1)
-       add r9=1,r9
-       ;;
-       cmp.eq p6,p0=r8,r0
-(p6)   br.cond.dpnt .Lexit
-       br.cloop.dptk.few .Loop1
-
-       add r9=1,in1                    // NUL not found---return N+1
-       ;;
-.Lexit:
-       mov r8=r9
-       mov ar.lc=r16                   // restore ar.lc
-       br.ret.sptk.many rp
-END(__strnlen_user)
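
Likewise, the removed __strnlen_user semantics, minus the fault path (which relies on the EXCLR() exception-table macro), reduce to this C sketch (hypothetical name):

#include <stddef.h>

static unsigned long strnlen_user_sketch(const char *buf, unsigned long limit)
{
    unsigned long n = 0;

    while (n < limit) {
        char c = buf[n];        /* EXCLR(): a fault here would return 0 */
        n++;
        if (c == '\0')
            return n;           /* strlen + 1 (counts the NUL) */
    }
    return limit + 1;           /* NUL not found within the limit */
}
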
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/xor.S
--- a/xen/arch/ia64/linux/lib/xor.S     Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,184 +0,0 @@
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 3, 0, 13, 16
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       ;;
-       .rotr s1[6+1], s2[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[6+1])st8.nta [r8] = d[1], 8
-       nop.f 0
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 4, 0, 20, 24
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov r18 = in3
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       ;;
-       .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-       ;;
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], s3[6]
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 5, 0, 27, 32
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov r18 = in3
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       mov r19 = in4
-       ;;
-       .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r20 = s3[6], s4[6]
-       ;;
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r20
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 6, 0, 34, 40
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov r18 = in3
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       mov r19 = in4
-       mov r20 = in5
-       ;;
-       .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r21 = s3[6], s4[6]
-       ;;
-(p[0]) ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r21
-       ;;
-(p[6])   xor d[0] = d[0], s5[6]
-       nop.f 0
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_5)
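
The computation performed by the removed xor_ia64_N routines is simply a word-wise XOR of the source buffers into the first one; the assembly exists only to software-pipeline the memory traffic. A plain C equivalent of the two-buffer case (hypothetical name) is:

#include <stddef.h>

/* XOR `bytes` bytes of p2 into p1, eight bytes at a time, as used by the
 * RAID-5 checksumming code (unsigned long is 8 bytes on ia64). */
static void xor_2_sketch(size_t bytes, unsigned long *p1, const unsigned long *p2)
{
    for (size_t i = 0; i < bytes / sizeof(unsigned long); i++)
        p1[i] ^= p2[i];
}
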
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/minstate.h
--- a/xen/arch/ia64/linux/minstate.h    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,254 +0,0 @@
-#include <linux/config.h>
-
-#include <asm/cache.h>
-
-#include "entry.h"
-
-/*
- * For ivt.s we want to access the stack virtually so we don't have to disable translation
- * on interrupts.
- *
- *  On entry:
- *     r1:     pointer to current task (ar.k6)
- */
-#define MINSTATE_START_SAVE_MIN_VIRT                                                   \
-(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
-       ;;                                                                              \
-(pUStk)        mov.m r24=ar.rnat;                                                      \
-(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base of RBS */               \
-(pKStk) mov r1=sp;                                     /* get sp  */                   \
-       ;;                                                                              \
-(pUStk) lfetch.fault.excl.nt1 [r22];                                                   \
-(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
-(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
-       ;;                                                                              \
-(pUStk)        mov ar.bspstore=r22;                            /* switch to kernel RBS */              \
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, use sp (r12) */   \
-       ;;                                                                              \
-(pUStk)        mov r18=ar.bsp;                                                         \
-(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
-
-#define MINSTATE_END_SAVE_MIN_VIRT                                                     \
-       bsw.1;                  /* switch back to bank 1 (must be last in insn group) */        \
-       ;;
-
-/*
- * For mca_asm.S we want to access the stack physically since the state is 
saved before we
- * go virtual and don't want to destroy the iip or ipsr.
- */
-#define MINSTATE_START_SAVE_MIN_PHYS                                                   \
-(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                                 \
-(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                           \
-(pKStk) ld8 r3 = [r3];;                                                                \
-(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                                    \
-(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                                  \
-(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
-(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of register backing store */    \
-       ;;                                                                              \
-(pUStk)        mov r24=ar.rnat;                                                        \
-(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
-(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
-(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel virtual addr of RBS */        \
-       ;;                                                                              \
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp (r12) */           \
-(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS */                      \
-       ;;                                                                              \
-(pUStk)        mov r18=ar.bsp;                                                         \
-(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
-
-#define MINSTATE_END_SAVE_MIN_PHYS                                                     \
-       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */          \
-       ;;
-
-#ifdef MINSTATE_VIRT
-# define MINSTATE_GET_CURRENT(reg)     \
-               movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
-               ld8 reg=[reg]
-# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
-# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
-#endif
-
-#ifdef MINSTATE_PHYS
-# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
-# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
-# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
-#endif
-
-/*
- * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
- * the minimum state necessary that allows us to turn psr.ic back
- * on.
- *
- * Assumed state upon entry:
- *     psr.ic: off
- *     r31:    contains saved predicates (pr)
- *
- * Upon exit, the state is as follows:
- *     psr.ic: off
- *      r2 = points to &pt_regs.r16
- *      r8 = contents of ar.ccv
- *      r9 = contents of ar.csd
- *     r10 = contents of ar.ssd
- *     r11 = FPSR_DEFAULT
- *     r12 = kernel sp (kernel virtual address)
- *     r13 = points to current task_struct (kernel virtual address)
- *     p15 = TRUE if psr.i is set in cr.ipsr
- *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
- *             preserved
- *
- * Note that psr.ic is NOT turned on by this macro.  This is so that
- * we can pass interruption state as arguments to a handler.
- */
-#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                              \
-       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                               \
-       mov r27=ar.rsc;                 /* M */                                         \
-       mov r20=r1;                     /* A */                                         \
-       mov r25=ar.unat;                /* M */                                         \
-       mov r29=cr.ipsr;                /* M */                                         \
-       mov r26=ar.pfs;                 /* I */                                         \
-       mov r28=cr.iip;                 /* M */                                         \
-       mov r21=ar.fpsr;                /* M */                                         \
-       COVER;                          /* B;; (or nothing) */                          \
-       ;;                                                                              \
-       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                 \
-       ;;                                                                              \
-       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
-       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
-       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                  \
-       /* switch from user to kernel RBS: */                                           \
-       ;;                                                                              \
-       invala;                         /* M */                                         \
-       SAVE_IFS;                                                                       \
-       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */    \
-       ;;                                                                              \
-       MINSTATE_START_SAVE_MIN                                                         \
-       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */   \
-       adds r16=PT(CR_IPSR),r1;                                                        \
-       ;;                                                                              \
-       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                     \
-       st8 [r16]=r29;          /* save cr.ipsr */                                      \
-       ;;                                                                              \
-       lfetch.fault.excl.nt1 [r17];                                                    \
-       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                              \
-       mov r29=b0                                                                      \
-       ;;                                                                              \
-       adds r16=PT(R8),r1;     /* initialize first base pointer */                     \
-       adds r17=PT(R9),r1;     /* initialize second base pointer */                    \
-(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                   \
-       ;;                                                                              \
-.mem.offset 0,0; st8.spill [r16]=r8,16;                                                \
-.mem.offset 8,0; st8.spill [r17]=r9,16;                                                \
-        ;;                                                                             \
-.mem.offset 0,0; st8.spill [r16]=r10,24;                                               \
-.mem.offset 8,0; st8.spill [r17]=r11,24;                                               \
-        ;;                                                                             \
-       st8 [r16]=r28,16;       /* save cr.iip */                                       \
-       st8 [r17]=r30,16;       /* save cr.ifs */                                       \
-(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                          \
-       mov r8=ar.ccv;                                                                  \
-       mov r9=ar.csd;                                                                  \
-       mov r10=ar.ssd;                                                                 \
-       movl r11=FPSR_DEFAULT;   /* L-unit */                                           \
-       ;;                                                                              \
-       st8 [r16]=r25,16;       /* save ar.unat */                                      \
-       st8 [r17]=r26,16;       /* save ar.pfs */                                       \
-       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */            \
-       ;;                                                                              \
-       st8 [r16]=r27,16;       /* save ar.rsc */                                       \
-(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                              \
-(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                   \
-       ;;                      /* avoid RAW on r16 & r17 */                            \
-(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                          \
-       st8 [r17]=r31,16;       /* save predicates */                                   \
-(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */               \
-       ;;                                                                              \
-       st8 [r16]=r29,16;       /* save b0 */                                           \
-       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                    \
-       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */              \
-       ;;                                                                              \
-.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                  \
-.mem.offset 8,0; st8.spill [r17]=r12,16;                                               \
-       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
-       ;;                                                                              \
-.mem.offset 0,0; st8.spill [r16]=r13,16;                                               \
-.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                      \
-       movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;                              \
-       ld8 r13=[r13];                  /* establish 'current' */                       \
-       ;;                                                                              \
-.mem.offset 0,0; st8.spill [r16]=r15,16;                                               \
-.mem.offset 8,0; st8.spill [r17]=r14,16;                                               \
-       ;;                                                                              \
-.mem.offset 0,0; st8.spill [r16]=r2,16;                                                \
-.mem.offset 8,0; st8.spill [r17]=r3,16;                                                \
-       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                             \
-       ;;                                                                              \
-       EXTRA;                                                                          \
-       movl r1=__gp;           /* establish kernel global pointer */                   \
-       ;;                                                                              \
-       MINSTATE_END_SAVE_MIN
-
-/*
- * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
- *
- * Assumed state upon entry:
- *     psr.ic: on
- *     r2:     points to &pt_regs.r16
- *     r3:     points to &pt_regs.r17
- *     r8:     contents of ar.ccv
- *     r9:     contents of ar.csd
- *     r10:    contents of ar.ssd
- *     r11:    FPSR_DEFAULT
- *
- * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
- */
-#define SAVE_REST                              \
-.mem.offset 0,0; st8.spill [r2]=r16,16;                \
-.mem.offset 8,0; st8.spill [r3]=r17,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r18,16;                \
-.mem.offset 8,0; st8.spill [r3]=r19,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r20,16;                \
-.mem.offset 8,0; st8.spill [r3]=r21,16;                \
-       mov r18=b6;                             \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r22,16;                \
-.mem.offset 8,0; st8.spill [r3]=r23,16;                \
-       mov r19=b7;                             \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r24,16;                \
-.mem.offset 8,0; st8.spill [r3]=r25,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r26,16;                \
-.mem.offset 8,0; st8.spill [r3]=r27,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r28,16;                \
-.mem.offset 8,0; st8.spill [r3]=r29,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r30,16;                \
-.mem.offset 8,0; st8.spill [r3]=r31,32;                \
-       ;;                                      \
-       mov ar.fpsr=r11;        /* M-unit */    \
-       st8 [r2]=r8,8;          /* ar.ccv */    \
-       adds r24=PT(B6)-PT(F7),r3;              \
-       ;;                                      \
-       stf.spill [r2]=f6,32;                   \
-       stf.spill [r3]=f7,32;                   \
-       ;;                                      \
-       stf.spill [r2]=f8,32;                   \
-       stf.spill [r3]=f9,32;                   \
-       ;;                                      \
-       stf.spill [r2]=f10;                     \
-       stf.spill [r3]=f11;                     \
-       adds r25=PT(B7)-PT(F11),r3;             \
-       ;;                                      \
-       st8 [r24]=r18,16;       /* b6 */        \
-       st8 [r25]=r19,16;       /* b7 */        \
-       ;;                                      \
-       st8 [r24]=r9;           /* ar.csd */    \
-       st8 [r25]=r10;          /* ar.ssd */    \
-       ;;
-
-#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
-#define SAVE_MIN_WITH_COVER_R19        DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
-#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/pdb-stub.c
--- a/xen/arch/ia64/pdb-stub.c  Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,59 +0,0 @@
-
-/*
- * pervasive debugger
- * www.cl.cam.ac.uk/netos/pdb
- *
- * alex ho
- * 2004
- * university of cambridge computer laboratory
- *
- * code adapted originally from kgdb, nemesis, & gdbserver
- */
-
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <asm/ptrace.h>
-#include <xen/keyhandler.h> 
-#include <asm/processor.h>
-#include <asm/pdb.h>
-#include <xen/list.h>
-#include <xen/serial.h>
-
-#define __PDB_GET_VAL 1
-#define __PDB_SET_VAL 2
-
-/*
- * Read or write memory in an address space
- */
-int pdb_change_values(u_char *buffer, int length,
-                     unsigned long cr3, unsigned long addr, int rw)
-{
-       dummy();
-       return 0;
-}
-
-/*
- * Set memory in a domain's address space
- * Set "length" bytes at "address" from "domain" to the values in "buffer".
- * Return the number of bytes set, 0 if there was a problem.
- */
-
-int pdb_set_values(u_char *buffer, int length,
-                  unsigned long cr3, unsigned long addr)
-{
-    int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL);
-    return count;
-}
-
-/*
- * Read memory from a domain's address space.
- * Fetch "length" bytes at "address" from "domain" into "buffer".
- * Return the number of bytes read, 0 if there was a problem.
- */
-
-int pdb_get_values(u_char *buffer, int length,
-                  unsigned long cr3, unsigned long addr)
-{
-  return pdb_change_values(buffer, length, cr3, addr, __PDB_GET_VAL);
-}
-
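
For context, the removed pdb stubs expose pdb_get_values()/pdb_set_values() over an unimplemented pdb_change_values(). A hypothetical caller (illustrative only; on ia64 the stub always reports 0 bytes read, so this cannot succeed until pdb_change_values() is filled in) might look like:

/* Read 16 bytes at `addr` in the address space identified by `cr3` via the
 * pdb interface; assumes the Xen headers that declare u_char, printk and
 * the pdb prototypes are included. */
static int pdb_dump_guest_bytes(unsigned long cr3, unsigned long addr)
{
    u_char buf[16];
    int n = pdb_get_values(buf, sizeof(buf), cr3, addr);

    if (n != (int)sizeof(buf)) {
        printk("pdb: short read (%d bytes)\n", n);
        return -1;
    }
    /* ... hand buf to the gdb remote-protocol layer here ... */
    return 0;
}
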

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
