# HG changeset patch # User diego # Date 1236679509 0 # Node ID b29169fccda9570dab8a2ce25946d75f8f5d47f3 # Parent 33a7261a0c308a2c44a2c746ad5b9d1b7b317921 Fix and restructure fastmemcpybench. It is now one binary that runs all available memcpy variants and prints benchmark results about them. diff -r 33a7261a0c30 -r b29169fccda9 Makefile --- a/Makefile Tue Mar 10 02:21:49 2009 +0000 +++ b/Makefile Tue Mar 10 10:05:09 2009 +0000 @@ -986,7 +986,7 @@ testsclean: -rm -f $(foreach file,$(TESTS),$(call ADD_ALL_EXESUFS,$(file))) -TOOLS = $(addprefix TOOLS/,alaw-gen asfinfo avi-fix avisubdump compare dump_mp4 movinfo netstream subrip vivodump) +TOOLS = $(addprefix TOOLS/,alaw-gen asfinfo avi-fix avisubdump compare dump_mp4 fastmemcpybench movinfo netstream subrip vivodump) ifdef ARCH_X86 TOOLS += TOOLS/modify_reg @@ -999,7 +999,7 @@ toolsclean: -rm -f $(foreach file,$(ALLTOOLS),$(call ADD_ALL_EXESUFS,$(file))) - -rm -f TOOLS/fastmem-* TOOLS/realcodecs/*.so.6.0 + -rm -f TOOLS/realcodecs/*.so.6.0 TOOLS/bmovl-test$(EXESUF): -lSDL_image @@ -1016,27 +1016,11 @@ TOOLS/netstream$(EXESUF) TOOLS/vivodump$(EXESUF): $(subst mplayer.o,mplayer-nomain.o,$(OBJS_MPLAYER)) $(filter-out %mencoder.o,$(OBJS_MENCODER)) $(OBJS_COMMON) $(COMMON_LIBS) $(CC) $(CFLAGS) -o $@ $^ $(EXTRALIBS_MPLAYER) $(EXTRALIBS_MENCODER) $(COMMON_LDFLAGS) -TOOLS/fastmem-c$(EXESUF): CFLAGS += -DHAVE_MMX=0 -DHAVE_AMD3DNOW=0 -DHAVE_MMX2=0 -DHAVE_SSE=0 -DNAME=\"C\" -TOOLS/fastmem-mmx$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=0 -DHAVE_MMX2=0 -DHAVE_SSE=0 -DNAME=\"MMX\" -TOOLS/fastmem-k6$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=1 -DHAVE_MMX2=0 -DHAVE_SSE=0 -DNAME=\"K6\" -TOOLS/fastmem-k7$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=1 -DHAVE_MMX2=1 -DHAVE_SSE=0 -DNAME=\"K7\" -TOOLS/fastmem-sse$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=0 -DHAVE_MMX2=1 -DHAVE_SSE=1 -DNAME=\"SSE\" -TOOLS/fastmem-mga-mmx$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=0 -DHAVE_MMX2=0 -DHAVE_SSE=0 -DNAME=\"MGA-MMX\" -DCONFIG_MGA -TOOLS/fastmem-mga-k6$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=1 -DHAVE_MMX2=0 -DHAVE_SSE=0 -DNAME=\"MGA-K6\" -DCONFIG_MGA -TOOLS/fastmem-mga-k7$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=1 -DHAVE_MMX2=1 -DHAVE_SSE=0 -DNAME=\"MGA-K7\" -DCONFIG_MGA -TOOLS/fastmem-mga-sse$(EXESUF): CFLAGS += -DHAVE_MMX=1 -DHAVE_AMD3DNOW=0 -DHAVE_MMX2=1 -DHAVE_SSE=1 -DNAME=\"MGA-SSE\" -DCONFIG_MGA - -fastmemcpybench: $(addsuffix $(EXESUF),$(addprefix TOOLS/fastmem-,c mmx k6 k7 sse mga-mmx mga-k6 mga-k7 mga-sse)) - -TOOLS/fastmem-%$(EXESUF): TOOLS/fastmemcpybench.c libvo/aclib.c - $(CC) $(CFLAGS) -o $@ $^ - REAL_SRCS = $(wildcard TOOLS/realcodecs/*.c) REAL_TARGETS = $(REAL_SRCS:.c=.so.6.0) realcodecs: $(REAL_TARGETS) - -fastmemcpybench realcodecs: CFLAGS += -g +realcodecs: CFLAGS += -g %.so.6.0: %.o ld -shared -o $@ $< -ldl -lc diff -r 33a7261a0c30 -r b29169fccda9 TOOLS/fastmem.sh --- a/TOOLS/fastmem.sh Tue Mar 10 02:21:49 2009 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ - -sync -sleep 2 -./fastmem-k6 -sleep 2 -./fastmem-k7 -sleep 2 -./fastmem-mmx -sleep 2 -./fastmem-sse -sleep 2 -./fastmem-c -sleep 2 -./fastmem2-k6 -sleep 2 -./fastmem2-k7 -sleep 2 -./fastmem2-mmx -sleep 2 -./fastmem2-sse -sleep 2 diff -r 33a7261a0c30 -r b29169fccda9 TOOLS/fastmemcpybench.c --- a/TOOLS/fastmemcpybench.c Tue Mar 10 02:21:49 2009 +0000 +++ b/TOOLS/fastmemcpybench.c Tue Mar 10 10:05:09 2009 +0000 @@ -7,8 +7,6 @@ * was not confirmed through testing. */ -/* According to Uoti this code is broken. */ - #include #include #include @@ -18,7 +16,92 @@ #include #include #include -#include "libvo/fastmemcpy.h" + +#include "config.h" +#include "cpudetect.h" + +#define BLOCK_SIZE 4096 +#define CONFUSION_FACTOR 0 + +#if HAVE_MMX +#define COMPILE_MMX +#endif + +#if HAVE_MMX2 +#define COMPILE_MMX2 +#endif + +#if HAVE_AMD3DNOW +#define COMPILE_AMD3DNOW +#endif + +#if HAVE_SSE +#define COMPILE_SSE +#endif + +#ifdef COMPILE_MMX +#undef RENAME +#undef HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_AMD3DNOW +#undef HAVE_SSE +#undef HAVE_SSE2 +#define HAVE_MMX 1 +#define HAVE_MMX2 0 +#define HAVE_AMD3DNOW 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define RENAME(a) a ## _MMX +#include "libvo/aclib_template.c" +#endif + +#ifdef COMPILE_MMX2 +#undef RENAME +#undef HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_AMD3DNOW +#undef HAVE_SSE +#undef HAVE_SSE2 +#define HAVE_MMX 1 +#define HAVE_MMX2 1 +#define HAVE_AMD3DNOW 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define RENAME(a) a ## _MMX2 +#include "libvo/aclib_template.c" +#endif + +#ifdef COMPILE_AMD3DNOW +#undef RENAME +#undef HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_AMD3DNOW +#undef HAVE_SSE +#undef HAVE_SSE2 +#define HAVE_MMX 1 +#define HAVE_MMX2 0 +#define HAVE_AMD3DNOW 1 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define RENAME(a) a ## _3DNow +#include "libvo/aclib_template.c" +#endif + +#ifdef COMPILE_SSE +#undef RENAME +#undef HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_AMD3DNOW +#undef HAVE_SSE +#undef HAVE_SSE2 +#define HAVE_MMX 1 +#define HAVE_MMX2 1 +#define HAVE_AMD3DNOW 0 +#define HAVE_SSE 1 +#define HAVE_SSE2 1 +#define RENAME(a) a ## _SSE +#include "libvo/aclib_template.c" +#endif //#define ARR_SIZE 100000 #define ARR_SIZE (1024*768*2) @@ -114,11 +197,60 @@ t = GetTimer(); v1 = read_tsc(); for (i = 0; i < 100; i++) - fast_memcpy(marr1, marr2, ARR_SIZE - 16); + memcpy(marr1, marr2, ARR_SIZE - 16); + v2 = read_tsc(); + t = GetTimer() - t; + // ARR_SIZE*100 / (1024*1024) / (t/1000000) = ARR_SIZE*95.36743 / t + printf("libc: CPU clocks=%llu = %dus (%5.3ffps) %5.1fMB/s\n", v2-v1, t, + 100000000.0f/(float)t, (float)ARR_SIZE*95.36743f/(float)t); + +#if HAVE_MMX + t = GetTimer(); + v1 = read_tsc(); + for (i = 0; i < 100; i++) + fast_memcpy_MMX(marr1, marr2, ARR_SIZE - 16); + v2 = read_tsc(); + t = GetTimer() - t; + // ARR_SIZE*100 / (1024*1024) / (t/1000000) = ARR_SIZE*95.36743 / t + printf("MMX: CPU clocks=%llu = %dus (%5.3ffps) %5.1fMB/s\n", v2-v1, t, + 100000000.0f/(float)t, (float)ARR_SIZE*95.36743f/(float)t); +#endif + +#if HAVE_AMD3DNOW + t = GetTimer(); + v1 = read_tsc(); + for (i = 0; i < 100; i++) + fast_memcpy_3DNow(marr1, marr2, ARR_SIZE - 16); v2 = read_tsc(); t = GetTimer() - t; // ARR_SIZE*100 / (1024*1024) / (t/1000000) = ARR_SIZE*95.36743 / t - printf(NAME ": CPU clocks=%llu = %dus (%5.3ffps) %5.1fMB/s\n", v2-v1, t, + printf("3DNow!: CPU clocks=%llu = %dus (%5.3ffps) %5.1fMB/s\n", v2-v1, t, + 100000000.0f/(float)t, (float)ARR_SIZE*95.36743f/(float)t); +#endif + +#if HAVE_MMX2 + t = GetTimer(); + v1 = read_tsc(); + for (i = 0; i < 100; i++) + fast_memcpy_MMX2(marr1, marr2, ARR_SIZE - 16); + v2 = read_tsc(); + t = GetTimer() - t; + // ARR_SIZE*100 / (1024*1024) / (t/1000000) = ARR_SIZE*95.36743 / t + printf("MMX2: CPU clocks=%llu = %dus (%5.3ffps) %5.1fMB/s\n", v2-v1, t, 100000000.0f/(float)t, (float)ARR_SIZE*95.36743f/(float)t); +#endif + +#if HAVE_SSE + t = GetTimer(); + v1 = read_tsc(); + for (i = 0; i < 100; i++) + fast_memcpy_SSE(marr1, marr2, ARR_SIZE - 16); + v2 = read_tsc(); + t = GetTimer() - t; + // ARR_SIZE*100 / (1024*1024) / (t/1000000) = ARR_SIZE*95.36743 / t + printf("SSE: CPU clocks=%llu = %dus (%5.3ffps) %5.1fMB/s\n", v2-v1, t, + 100000000.0f/(float)t, (float)ARR_SIZE*95.36743f/(float)t); +#endif + return 0; }