libavcodec: comparison of x86/dsputil_h264_template_mmx.c @ 8519:cc64e1343397
Use H264 MMX chroma functions to accelerate RV40 decoding.
Patch by Mathieu Velten (matmaul A gmail)
author    cehoyos
date      Sun, 04 Jan 2009 01:36:11 +0000
parents   7768bdfd4f7b
children  98970e51365a
comparing 8518:f2c406b05158 with 8519:cc64e1343397
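The substance of the change: both chroma templates stop taking an int rnd flag and choosing their own rounding constants (ff_pw_4/ff_pw_3 for the 1-D path, ff_pw_32/ff_pw_28 for the bilinear path), and instead take a pointer to a pair of packed-word constants supplied by the caller: rnd_reg[0] feeds the bilinear path and rnd_reg[1] (equal to rnd_reg[0] >> 3 per word) feeds the 1-D path. That is what lets RV40 reuse the same code with its own rounding values. A minimal sketch of a caller under that contract; the table and wrapper names below are illustrative, not taken from this hunk:

    /* Illustrative rounding table: two {bilinear, 1-D} pairs. The names are
     * hypothetical; only the layout is implied by how the template indexes
     * rnd_reg. */
    DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = {
        0x0020002000200020ULL,   /* rnd_reg[0] = 32, bilinear, rnd = 1 */
        0x0004000400040004ULL,   /* rnd_reg[1] =  4 = 32 >> 3          */
        0x001C001C001C001CULL,   /* 28, bilinear, rnd = 0              */
        0x0003000300030003ULL,   /*  3 = 28 >> 3                       */
    };

    static void put_h264_chroma_mc8_mmx(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y)
    {
        /* old rnd = 1 behaviour: pass the {32, 4} pair;
         * a no-rounding caller would pass h264_rnd_reg + 2 instead */
        put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y,
                                        h264_rnd_reg);
    }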
@@ -23,13 +23,12 @@
  * MMX optimized version of (put|avg)_h264_chroma_mc8.
  * H264_CHROMA_MC8_TMPL must be defined to the desired function name
  * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
  * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
-    const uint64_t *rnd_reg;
     DECLARE_ALIGNED_8(uint64_t, AA);
     DECLARE_ALIGNED_8(uint64_t, DD);
     int i;
 
     if(y==0 && x==0) {
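For orientation, the macros named in the header comment are defined by the including file before this template is pulled in; schematically (the concrete names and the put/avg instantiations live in dsputil_mmx.c, so treat these values as an assumption):

    #define H264_CHROMA_OP(a, b)    /* empty for put; pavgb/pavgusb for avg */
    #define H264_CHROMA_MC8_TMPL    put_h264_chroma_generic_mc8_mmx
    #define H264_CHROMA_MC4_TMPL    put_h264_chroma_generic_mc4_mmx
    #define H264_CHROMA_MC8_MV0     put_pixels8_mmx
    #include "dsputil_h264_template_mmx.c"
    #undef H264_CHROMA_OP
    #undef H264_CHROMA_MC8_TMPL
    #undef H264_CHROMA_MC4_TMPL
    #undef H264_CHROMA_MC8_MV0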
@@ -43,21 +42,19 @@
     if(y==0 || x==0)
     {
         /* 1 dimensional filter only */
         const int dxy = x ? 1 : stride;
 
-        rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
-
         __asm__ volatile(
             "movd %0, %%mm5\n\t"
             "movq %1, %%mm4\n\t"
-            "movq %2, %%mm6\n\t"         /* mm6 = rnd */
+            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 */
             "punpcklwd %%mm5, %%mm5\n\t"
             "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
             "pxor %%mm7, %%mm7\n\t"
             "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
-            :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
+            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
 
         for(i=0; i<h; i++) {
             __asm__ volatile(
                 /* mm0 = src[0..7], mm1 = src[1..8] */
                 "movq %0, %%mm0\n\t"
76 "pmullw %%mm4, %%mm0\n\t" | 73 "pmullw %%mm4, %%mm0\n\t" |
77 "pmullw %%mm4, %%mm1\n\t" | 74 "pmullw %%mm4, %%mm1\n\t" |
78 "pmullw %%mm5, %%mm2\n\t" | 75 "pmullw %%mm5, %%mm2\n\t" |
79 "pmullw %%mm5, %%mm3\n\t" | 76 "pmullw %%mm5, %%mm3\n\t" |
80 | 77 |
81 /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */ | 78 /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */ |
82 "paddw %%mm6, %%mm0\n\t" | 79 "paddw %%mm6, %%mm0\n\t" |
83 "paddw %%mm6, %%mm1\n\t" | 80 "paddw %%mm6, %%mm1\n\t" |
84 "paddw %%mm2, %%mm0\n\t" | 81 "paddw %%mm2, %%mm0\n\t" |
85 "paddw %%mm3, %%mm1\n\t" | 82 "paddw %%mm3, %%mm1\n\t" |
86 "psrlw $3, %%mm0\n\t" | 83 "psrlw $3, %%mm0\n\t" |
@@ -95,11 +92,10 @@
         }
         return;
     }
 
     /* general case, bilinear */
-    rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
     __asm__ volatile("movd %2, %%mm4\n\t"
                      "movd %3, %%mm6\n\t"
                      "punpcklwd %%mm4, %%mm4\n\t"
                      "punpcklwd %%mm6, %%mm6\n\t"
                      "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
170 "paddw %%mm4, %%mm3\n\t" | 166 "paddw %%mm4, %%mm3\n\t" |
171 "movq %0, %%mm0\n\t" | 167 "movq %0, %%mm0\n\t" |
172 : : "m" (src[0]), "m" (src[1]), "m" (DD)); | 168 : : "m" (src[0]), "m" (src[1]), "m" (DD)); |
173 | 169 |
174 __asm__ volatile( | 170 __asm__ volatile( |
175 /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */ | 171 /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */ |
176 "paddw %1, %%mm2\n\t" | 172 "paddw %1, %%mm2\n\t" |
177 "paddw %1, %%mm3\n\t" | 173 "paddw %1, %%mm3\n\t" |
178 "psrlw $6, %%mm2\n\t" | 174 "psrlw $6, %%mm2\n\t" |
179 "psrlw $6, %%mm3\n\t" | 175 "psrlw $6, %%mm3\n\t" |
180 "packuswb %%mm3, %%mm2\n\t" | 176 "packuswb %%mm3, %%mm2\n\t" |
@@ -183,11 +179,11 @@
             : "=m" (dst[0]) : "m" (*rnd_reg));
         dst+= stride;
     }
 }
 
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
     __asm__ volatile(
         "pxor %%mm7, %%mm7 \n\t"
         "movd %5, %%mm2 \n\t"
         "movd %6, %%mm3 \n\t"
247 "movd %%mm1, (%0) \n\t" | 243 "movd %%mm1, (%0) \n\t" |
248 "add %3, %0 \n\t" | 244 "add %3, %0 \n\t" |
249 "sub $2, %2 \n\t" | 245 "sub $2, %2 \n\t" |
250 "jnz 1b \n\t" | 246 "jnz 1b \n\t" |
251 : "+r"(dst), "+r"(src), "+r"(h) | 247 : "+r"(dst), "+r"(src), "+r"(h) |
252 : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y) | 248 : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y) |
253 ); | 249 ); |
254 } | 250 } |
255 | 251 |
256 #ifdef H264_CHROMA_MC2_TMPL | 252 #ifdef H264_CHROMA_MC2_TMPL |
257 static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) | 253 static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) |
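Note that the MC2 template at the end of the hunk keeps its old signature in this diff; only the MC8 and MC4 templates are converted. With the rounding constants hoisted out, an RV40 caller can reuse them by supplying its own constant pairs. A purely hypothetical sketch; the names are placeholders and the actual RV40 rounding values are defined at the call site, not in this file:

    DECLARE_ALIGNED_8(static const uint64_t, rv40_rnd_reg[2]) = {
        0x0020002000200020ULL,   /* placeholder bilinear rounding words  */
        0x0004000400040004ULL,   /* placeholder 1-D words (= above >> 3) */
    };

    static void put_rv40_chroma_mc8_mmx(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y)
    {
        put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y,
                                        rv40_rnd_reg);
    }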