changeset 7220:a94b2cf78a2e libavcodec
Make LOAD4/STORE4 macros more generic.
Patch by Victor Pollex victor pollex web de
Original thread: [PATCH] mmx implementation of vc-1 inverse transformations
Date: 06/21/2008 03:37 PM
author    benoit
date      Tue, 08 Jul 2008 09:24:11 +0000
parents   f72ef5b28253
children  c36517d7608f
files     i386/dsputil_mmx.h i386/dsputilenc_mmx.c
diffstat  2 files changed, 18 insertions(+), 18 deletions(-)
--- a/i386/dsputil_mmx.h	Mon Jul 07 21:25:18 2008 +0000
+++ b/i386/dsputil_mmx.h	Tue Jul 08 09:24:11 2008 +0000
@@ -57,6 +57,18 @@
 extern const double ff_pd_1[2];
 extern const double ff_pd_2[2];
 
+#define LOAD4(stride,in,a,b,c,d)\
+    "movq 0*"#stride"+"#in", "#a"\n\t"\
+    "movq 1*"#stride"+"#in", "#b"\n\t"\
+    "movq 2*"#stride"+"#in", "#c"\n\t"\
+    "movq 3*"#stride"+"#in", "#d"\n\t"
+
+#define STORE4(stride,out,a,b,c,d)\
+    "movq "#a", 0*"#stride"+"#out"\n\t"\
+    "movq "#b", 1*"#stride"+"#out"\n\t"\
+    "movq "#c", 2*"#stride"+"#out"\n\t"\
+    "movq "#d", 3*"#stride"+"#out"\n\t"
+
 /* in/out: mma=mma+mmb, mmb=mmb-mma */
 #define SUMSUB_BA( a, b ) \
     "paddw "#b", "#a" \n\t"\
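As an illustration (not part of the changeset): the new macros stringize their arguments directly into the instruction text, so an invocation such as LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3) expands to the lines below, and the assembler folds each n*stride+offset displacement to a constant:

    "movq 0*8+0(%1), %%mm0\n\t"
    "movq 1*8+0(%1), %%mm1\n\t"
    "movq 2*8+0(%1), %%mm2\n\t"
    "movq 3*8+0(%1), %%mm3\n\t"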
--- a/i386/dsputilenc_mmx.c	Mon Jul 07 21:25:18 2008 +0000
+++ b/i386/dsputilenc_mmx.c	Tue Jul 08 09:24:11 2008 +0000
@@ -998,18 +998,6 @@
     "paddusw %%xmm1, %%xmm0 \n\t"
 #endif
 
-#define LOAD4(o, a, b, c, d)\
-    "movq "#o"(%1), "#a" \n\t"\
-    "movq "#o"+8(%1), "#b" \n\t"\
-    "movq "#o"+16(%1), "#c" \n\t"\
-    "movq "#o"+24(%1), "#d" \n\t"\
-
-#define STORE4(o, a, b, c, d)\
-    "movq "#a", "#o"(%1) \n\t"\
-    "movq "#b", "#o"+8(%1) \n\t"\
-    "movq "#c", "#o"+16(%1) \n\t"\
-    "movq "#d", "#o"+24(%1) \n\t"\
-
 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
  * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
  * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
@@ -1053,11 +1041,11 @@
         "movq %%mm7, 96(%1) \n\t"\
         \
         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
-        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
+        STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
         \
         "movq 96(%1), %%mm7 \n\t"\
         TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
-        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
+        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
         \
         : "=r" (sum)\
         : "r"(temp)\
@@ -1071,7 +1059,7 @@
         "movq %%mm7, 96(%1) \n\t"\
         \
         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
-        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
+        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
         \
         "movq 96(%1), %%mm7 \n\t"\
         TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
@@ -1079,7 +1067,7 @@
         "movq %%mm6, %%mm7 \n\t"\
         "movq %%mm0, %%mm6 \n\t"\
         \
-        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
         \
         HADAMARD48\
         "movq %%mm7, 64(%1) \n\t"\
@@ -1095,8 +1083,8 @@
         "paddusw %%mm1, %%mm0 \n\t"\
         "movq %%mm0, 64(%1) \n\t"\
         \
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
-        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
+        LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
         \
         HADAMARD48\
         "movq %%mm7, (%1) \n\t"\
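To double-check that the rewrite is purely mechanical, here is a minimal stand-alone sketch; the OLD_LOAD4 name is hypothetical and merely reproduces the macro removed above. It prints the removed form LOAD4(64, ...) next to the new form LOAD4(8, 64(%1), ...), which must emit the same four loads:

    #include <stdio.h>

    /* Removed form: byte offset only; stride 8 and base (%1) hard-coded.
     * OLD_LOAD4 is a hypothetical name used for side-by-side comparison. */
    #define OLD_LOAD4(o, a, b, c, d)\
        "movq "#o"(%1), "#a" \n\t"\
        "movq "#o"+8(%1), "#b" \n\t"\
        "movq "#o"+16(%1), "#c" \n\t"\
        "movq "#o"+24(%1), "#d" \n\t"

    /* New form from the patch: explicit stride and full memory operand. */
    #define LOAD4(stride,in,a,b,c,d)\
        "movq 0*"#stride"+"#in", "#a"\n\t"\
        "movq 1*"#stride"+"#in", "#b"\n\t"\
        "movq 2*"#stride"+"#in", "#c"\n\t"\
        "movq 3*"#stride"+"#in", "#d"\n\t"

    int main(void)
    {
        /* Both sequences address (%1)+64 .. (%1)+88 in 8-byte steps;
         * only the constant-expression spelling of the displacement differs. */
        puts(OLD_LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3));
        puts(LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3));
        return 0;
    }

Making the stride and base operand explicit is what allows callers other than this hadamard code to reuse the macros, presumably with the different strides needed by the VC-1 inverse transform patch from the original thread.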