Using dcbz avoids having to read a cache line from memory before writing to
that line.
Timing results, in time base ticks (each run starts with a clean cache, i.e., no write-backs of dirty lines):
JS20:
elapsed time: 0x0000000000009f5e
elapsed time using dcbz: 0x000000000000569e
elapsed time: 0x0000000000009fe9
elapsed time using dcbz: 0x0000000000005765
JS21:
elapsed time: 0x000000000000089e
elapsed time using dcbz: 0x0000000000000439
elapsed time: 0x0000000000000886
elapsed time using dcbz: 0x0000000000000438
.........................................
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
/* Short aliases used throughout the benchmark. */
typedef unsigned char uchar;
typedef unsigned long ulong;
/* Stride, in bytes, between successive dcbz / cache-touching accesses. */
/* NOTE(review): presumably the data cache line size of the target CPUs — confirm. */
#define LINE_SIZE 128
/* Number of bytes moved by one copy_page() call. */
#define PAGE_SIZE 0x1000
/* Destination area: 64 pages, each filled once per timed run. */
#define BUF1_SIZE (PAGE_SIZE * 64)
/* Source area: a single page that is copied repeatedly. */
#define BUF2_SIZE (PAGE_SIZE)
/* Scratch area walked by clean_cache() before each timed run. */
#define BUF3_SIZE (0x800000)
/* Forward declarations; the definitions follow main(). */
static __inline__ ulong time_base(void);
static __inline__ void copy_page(void *dp, void *sp);
static __inline__ void cacheable_copy_page(void *dp, void *sp);
static __inline__ void cacheable_clear_page(void *addr);
static uchar clean_cache(uchar *buf3);
int main(int argc, char **argv){
int i;
ulong tb1, tb2;
uchar *buf1, *buf2, *buf3, *bufp;
buf1 = malloc(BUF1_SIZE + PAGE_SIZE);
buf2 = malloc(BUF2_SIZE + PAGE_SIZE);
buf3 = malloc(BUF3_SIZE + PAGE_SIZE);
buf1 = (uchar *)((ulong)(buf1 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));
buf2 = (uchar *)((ulong)(buf2 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));
buf3 = (uchar *)((ulong)(buf3 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));
memset(buf1, 1, BUF1_SIZE);
memset(buf2, 2, BUF2_SIZE);
memset(buf3, 3, BUF3_SIZE);
clean_cache(buf3);
tb1 = time_base();
for (bufp = buf1, i = 0; i < 4; i++, bufp += PAGE_SIZE*16){
copy_page(bufp, buf2);
copy_page(bufp+(PAGE_SIZE*1), buf2);
copy_page(bufp+(PAGE_SIZE*2), buf2);
copy_page(bufp+(PAGE_SIZE*3), buf2);
copy_page(bufp+(PAGE_SIZE*4), buf2);
copy_page(bufp+(PAGE_SIZE*5), buf2);
copy_page(bufp+(PAGE_SIZE*6), buf2);
copy_page(bufp+(PAGE_SIZE*7), buf2);
copy_page(bufp+(PAGE_SIZE*8), buf2);
copy_page(bufp+(PAGE_SIZE*9), buf2);
copy_page(bufp+(PAGE_SIZE*10), buf2);
copy_page(bufp+(PAGE_SIZE*11), buf2);
copy_page(bufp+(PAGE_SIZE*12), buf2);
copy_page(bufp+(PAGE_SIZE*13), buf2);
copy_page(bufp+(PAGE_SIZE*14), buf2);
copy_page(bufp+(PAGE_SIZE*15), buf2);
}
tb2 = time_base();
printf("elapsed time: 0x%016lx\n", tb2 - tb1);
clean_cache(buf3);
tb1 = time_base();
for (bufp = buf1, i = 0; i < 4; i++, bufp += PAGE_SIZE*16){
cacheable_copy_page(bufp, buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*1), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*2), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*3), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*4), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*5), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*6), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*7), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*8), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*9), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*10), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*11), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*12), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*13), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*14), buf2);
cacheable_copy_page(bufp+(PAGE_SIZE*15), buf2);
}
tb2 = time_base();
printf("elapsed time using dcbz: 0x%016lx\n", tb2 - tb1);
return(0);
}
/*
 * Read the processor's time base register with the mftb instruction and
 * return the value.  The asm is volatile so that successive reads are not
 * merged or removed by the compiler.
 */
static __inline__ ulong time_base(void)
{
	ulong ticks;

	__asm__ __volatile__("mftb %0 # read time base" : "=r" (ticks));

	return ticks;
}
/*
 * Issue dcbz on every LINE_SIZE-byte step of the PAGE_SIZE-byte region
 * starting at addr.
 *
 * addr: start of the region; main() only ever passes page-aligned addresses.
 *
 * NOTE(review): assumes the hardware dcbz block size equals LINE_SIZE; with a
 * smaller block size parts of the page would be left untouched — confirm for
 * the target CPU.
 */
static __inline__ void cacheable_clear_page(void *addr)
{
ulong lines, line_size;
line_size = LINE_SIZE;
/* Number of dcbz operations needed to cover one page. */
lines = PAGE_SIZE / line_size;
/*
 * Operands: %0 = addr (output, tied to input %2 through the "0" constraint,
 * so the asm may advance it), %1 = lines, %3 = line_size.
 * The loop loads CTR with the line count, then repeats
 * "dcbz at %0; %0 += line_size" until bdnz has counted CTR down to zero.
 * CTR and memory are listed as clobbered.
 */
__asm__ __volatile__(
"mtctr %1 # clear_page\n\
1: dcbz 0,%0\n\
add %0,%0,%3\n\
bdnz 1b"
: "=r" (addr)
: "r" (lines), "0" (addr), "r" (line_size)
: "%ctr", "memory");
}
/*
 * Copy one PAGE_SIZE-byte page from sp to dp, one 8-byte doubleword at a
 * time, using the count register as the loop counter.
 *
 * dp: destination page.
 * sp: source page.
 *
 * The first doubleword is moved with plain ld/std; the remaining
 * (PAGE_SIZE / 8) - 1 are moved with the update forms ldu/stdu, which
 * advance the base registers by 8 on every iteration.
 *
 * Operand constraints:
 *  - All three operands are declared read-write ("+") because the asm
 *    modifies them: ldu/stdu rewrite %1 and %0, and %2 is reused as the data
 *    register after its value has been moved into CTR.  Declaring them as
 *    input-only would let the compiler assume the registers still hold dp,
 *    sp and dwords afterwards, which breaks back-to-back inlined calls.
 *  - The pointers use "b" (base register) rather than "r" so that r0 is
 *    never chosen: r0 as the base of ld/std means the literal value 0, and
 *    is not a valid base for ldu/stdu.
 * Because dp and sp are by-value parameters, the caller's pointers are not
 * affected by the updates.
 */
static __inline__ void copy_page(void *dp, void *sp)
{
	ulong dwords, dword_size;

	dword_size = 8;
	/* One doubleword is copied before the loop, hence the "- 1". */
	dwords = (PAGE_SIZE / dword_size) - 1;

	__asm__ __volatile__(
		"mtctr	%2	# copy_page\n\t"
		"ld	%2,0(%1)\n\t"
		"std	%2,0(%0)\n"
		"1:\tldu	%2,8(%1)\n\t"
		"stdu	%2,8(%0)\n\t"
		"bdnz	1b"
		: "+b" (dp), "+b" (sp), "+r" (dwords)
		: /* no input-only operands */
		: "%ctr", "memory");
}
/*
 * Page copy variant that is timed against plain copy_page() in main():
 * the destination page is first run through cacheable_clear_page() (dcbz on
 * each line) and then filled from sp with copy_page().
 *
 * dp: destination page.
 * sp: source page.
 */
static __inline__ void cacheable_copy_page(void *dp, void *sp)
{
	/* Step 1: dcbz every line of the destination page. */
	cacheable_clear_page(dp);

	/* Step 2: store the source data over the destination page. */
	copy_page(dp, sp);
}
/*
 * Walk buf3, loading one byte from every LINE_SIZE-byte step across
 * BUF3_SIZE bytes, and return the (wrapping) sum of the bytes read.
 *
 * buf3: buffer of at least BUF3_SIZE bytes.
 *
 * The accumulator starts at zero; it was previously read uninitialised.
 * The loads go through a volatile pointer so the compiler must perform each
 * one even when the caller ignores the return value.
 *
 * NOTE(review): presumably BUF3_SIZE is chosen to exceed the cache size so
 * that the walk displaces the benchmark buffers from the cache — confirm.
 */
static uchar clean_cache(uchar *buf3)
{
	int i;
	uchar uc = 0;
	volatile uchar *ucp = buf3;

	for (i = 0; i < BUF3_SIZE / LINE_SIZE; i++){
		uc += *ucp;
		ucp += LINE_SIZE;
	}
	return(uc);
}
_______________________________________________
Xen-ppc-devel mailing list
Xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ppc-devel
|