comparison x86/dsputil_h264_template_mmx.c @ 8519:cc64e1343397 libavcodec

Use H264 MMX chroma functions to accelerate RV40 decoding. Patch by Mathieu Velten (matmaul A gmail)
author cehoyos
date Sun, 04 Jan 2009 01:36:11 +0000
parents 7768bdfd4f7b
children 98970e51365a
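
Both chroma templates compute the usual bilinear interpolation at the (x, y)/8 fractional position; the asm comments in the diff below spell out the per-row expression. As a plain-C reference for what the MMX code computes (an illustrative sketch only, not libavcodec's C fallback; the function name here is made up), the put variant of the 8-wide case looks like this:

#include <stdint.h>

/* Put variant only; the avg variant averages this result into dst instead.
 * bias is the rounding constant added before the final shift: 32 for normal
 * H.264 rounding (the old code picked ff_pw_32 or ff_pw_28 from the rnd flag;
 * after this patch the caller chooses the constants). */
static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               int stride, int h, int x, int y, int bias)
{
    const int A = (8 - x) * (8 - y);   /* weights for the four neighbours */
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      bias) >> 6;
        dst += stride;
        src += stride;
    }
}

The 1-D special case in the MMX code (x==0 or y==0) is the same computation with the redundant terms dropped, which is why it adds rnd >> 3 and shifts by 3 instead.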
--- a/x86/dsputil_h264_template_mmx.c    (changeset 8518:f2c406b05158)
+++ b/x86/dsputil_h264_template_mmx.c    (changeset 8519:cc64e1343397)
@@ -23,13 +23,12 @@
  * MMX optimized version of (put|avg)_h264_chroma_mc8.
  * H264_CHROMA_MC8_TMPL must be defined to the desired function name
  * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
  * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
-    const uint64_t *rnd_reg;
     DECLARE_ALIGNED_8(uint64_t, AA);
     DECLARE_ALIGNED_8(uint64_t, DD);
     int i;
 
     if(y==0 && x==0) {
@@ -43,21 +42,19 @@
     if(y==0 || x==0)
     {
         /* 1 dimensional filter only */
         const int dxy = x ? 1 : stride;
 
-        rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
-
         __asm__ volatile(
             "movd %0, %%mm5\n\t"
             "movq %1, %%mm4\n\t"
-            "movq %2, %%mm6\n\t" /* mm6 = rnd */
+            "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */
             "punpcklwd %%mm5, %%mm5\n\t"
             "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
             "pxor %%mm7, %%mm7\n\t"
             "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
-            :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
+            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
 
         for(i=0; i<h; i++) {
             __asm__ volatile(
                 /* mm0 = src[0..7], mm1 = src[1..8] */
                 "movq %0, %%mm0\n\t"
@@ -76,11 +73,11 @@
                 "pmullw %%mm4, %%mm0\n\t"
                 "pmullw %%mm4, %%mm1\n\t"
                 "pmullw %%mm5, %%mm2\n\t"
                 "pmullw %%mm5, %%mm3\n\t"
 
-                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
+                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
                 "paddw %%mm6, %%mm0\n\t"
                 "paddw %%mm6, %%mm1\n\t"
                 "paddw %%mm2, %%mm0\n\t"
                 "paddw %%mm3, %%mm1\n\t"
                 "psrlw $3, %%mm0\n\t"
@@ -95,11 +92,10 @@
         }
         return;
     }
 
     /* general case, bilinear */
-    rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
    __asm__ volatile("movd %2, %%mm4\n\t"
                      "movd %3, %%mm6\n\t"
                      "punpcklwd %%mm4, %%mm4\n\t"
                      "punpcklwd %%mm6, %%mm6\n\t"
                      "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
@@ -170,11 +166,11 @@
             "paddw %%mm4, %%mm3\n\t"
             "movq %0, %%mm0\n\t"
             : : "m" (src[0]), "m" (src[1]), "m" (DD));
 
         __asm__ volatile(
-            /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
+            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
             "paddw %1, %%mm2\n\t"
             "paddw %1, %%mm3\n\t"
             "psrlw $6, %%mm2\n\t"
             "psrlw $6, %%mm3\n\t"
             "packuswb %%mm3, %%mm2\n\t"
@@ -183,11 +179,11 @@
             : "=m" (dst[0]) : "m" (*rnd_reg));
         dst+= stride;
     }
 }
 
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
     __asm__ volatile(
         "pxor %%mm7, %%mm7 \n\t"
         "movd %5, %%mm2 \n\t"
         "movd %6, %%mm3 \n\t"
@@ -247,11 +243,11 @@
         "movd %%mm1, (%0) \n\t"
         "add %3, %0 \n\t"
         "sub $2, %2 \n\t"
         "jnz 1b \n\t"
         : "+r"(dst), "+r"(src), "+r"(h)
-        : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
+        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
         );
 }
 
 #ifdef H264_CHROMA_MC2_TMPL
 static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
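
With this change the rounding bias is no longer selected inside the template from an int rnd flag: the caller hands in a pointer to packed-word constants, where rnd_reg[0] feeds the 2-D (bilinear) path and the MC4 template, and rnd_reg[1] feeds the 1-D path. A minimal caller-side sketch, with hypothetical table and wrapper names (the actual dsputil_mmx.c symbols differ):

#include <stdint.h>

/* Replicate a 16-bit constant into all four words of a 64-bit MMX operand. */
#define PW(x) ((uint64_t)(x) * 0x0001000100010001ULL)

/* [0]: bias added before ">> 6" in the bilinear path (and in the MC4 code);
 * [1]: bias added before ">> 3" in the 1-D path.
 * These are the plain H.264 values; an RV40 caller supplies its own table. */
static const uint64_t h264_chroma_rnd[2] = { PW(32), PW(4) };

/* Whatever H264_CHROMA_MC8_TMPL expands to when the template is included
 * (name assumed here; in libavcodec it is a static function in the same file). */
void put_h264_chroma_mc8_mmx(uint8_t *dst, uint8_t *src,
                             int stride, int h, int x, int y,
                             const uint64_t *rnd_reg);

/* Hypothetical wrapper keeping the old six-argument signature: it simply
 * forwards to the template together with the chosen rounding table. */
static void put_h264_chroma_mc8(uint8_t *dst, uint8_t *src,
                                int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, h264_chroma_rnd);
}

An RV40 wrapper would do the same with RV40's own bias constants, which is what lets the same MMX routines serve both decoders, as the commit message describes.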