Mercurial > libavcodec.hg

comparison i386/h264dsp_mmx.c @ 6320:ffb2a7b80d6d libavcodec

ff_h264_idct8_add_sse2.
compared to mmx, 217->126 cycles on core2, 262->220 on k8.

author    lorenm
date      Sun, 03 Feb 2008 07:05:11 +0000
parents   4089a1ae6558
children  57bd93f81a14
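
For orientation: the function this changeset adds computes the H.264 8x8 inverse integer transform (two 1D passes plus rounding) and adds the result to the prediction already in dst. Below is a plain-C sketch of that computation, written here for reference; it mirrors the even/odd butterfly of the spec and of the C fallback, but the names and layout are illustrative, not the file's code.

#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* one 8-point 1D inverse transform over src[0], src[step], ..., src[7*step] */
static void idct8_1d(int16_t *src, int step)
{
    /* even half: coefficients 0, 2, 4, 6 */
    const int a0 =  src[0*step] + src[4*step];
    const int a2 =  src[0*step] - src[4*step];
    const int a4 = (src[2*step] >> 1) - src[6*step];
    const int a6 = (src[6*step] >> 1) + src[2*step];
    /* odd half: coefficients 1, 3, 5, 7 */
    const int a1 = -src[3*step] + src[5*step] - src[7*step] - (src[7*step] >> 1);
    const int a3 =  src[1*step] + src[7*step] - src[3*step] - (src[3*step] >> 1);
    const int a5 = -src[1*step] + src[7*step] + src[5*step] + (src[5*step] >> 1);
    const int a7 =  src[3*step] + src[5*step] + src[1*step] + (src[1*step] >> 1);

    const int b0 = a0 + a6,        b6 = a0 - a6;
    const int b2 = a2 + a4,        b4 = a2 - a4;
    const int b1 = (a7 >> 2) + a1, b7 = a7 - (a1 >> 2);
    const int b3 = a3 + (a5 >> 2), b5 = (a3 >> 2) - a5;

    src[0*step] = b0 + b7;  src[7*step] = b0 - b7;
    src[1*step] = b2 + b5;  src[6*step] = b2 - b5;
    src[2*step] = b4 + b3;  src[5*step] = b4 - b3;
    src[3*step] = b6 + b1;  src[4*step] = b6 - b1;
}

static void idct8_add_c(uint8_t *dst, int16_t *block, int stride)
{
    int i, j;
    block[0] += 32;                                   /* rounding term for the final >>6 */
    for (i = 0; i < 8; i++) idct8_1d(block + 8*i, 1); /* rows */
    for (i = 0; i < 8; i++) idct8_1d(block + i,   8); /* columns */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            dst[i*stride + j] = clip_uint8(dst[i*stride + j] + (block[8*i + j] >> 6));
}

The SSE2 version in the diff does the same work eight coefficients at a time: one 1D pass across all eight columns at once, a TRANSPOSE8, the +32 rounding folded into the register carrying the DC row, a second 1D pass, then a clipped add to dst via STORE_DIFF_8P.
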
diff -r 4089a1ae6558 -r ffb2a7b80d6d i386/h264dsp_mmx.c
--- a/i386/h264dsp_mmx.c	(6319:4089a1ae6558)
+++ b/i386/h264dsp_mmx.c	(6320:ffb2a7b80d6d)
@@ -73,11 +73,11 @@
 
 /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
 
 "pxor %%mm7, %%mm7 \n\t"
-:: "m"(ff_pw_32));
+:: "m"(*ff_pw_32));
 
 asm volatile(
 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
 "add %1, %0 \n\t"
 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
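
The only change to the pre-existing MMX code, here and in the qpel hunks further down, is dereferencing the rounding constant in the asm operand list. That is consistent with ff_pw_32 being widened in this changeset from a single 8-byte constant to a 16-byte one (an array), so that the new SSE2 code can movdqa/paddw it. The declaration itself is outside this diff, so treat the following sketch as an inference about its shape, not the file's actual code:

#include <stdint.h>

/* hypothetical shape of the widened constant: two identical 8-byte
   halves, 16-byte aligned, so both MMX (8-byte) and SSE2 (16-byte)
   loads see a vector of 16-bit 32s */
static const uint64_t pw_32[2] __attribute__((aligned(16))) =
    { 0x0020002000200020ULL, 0x0020002000200020ULL };

void use_pw_32(void)
{
    /* "m"(*pw_32) names the memory at pw_32[0]; once the symbol is an
       array, "m"(pw_32) would hand the asm the array object itself,
       which GCC may reject or treat differently */
    __asm__ volatile("" :: "m"(*pw_32));
}
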
207 : "memory" | 207 : "memory" |
208 ); | 208 ); |
209 } | 209 } |
210 | 210 |
211 add_pixels_clamped_mmx(b2, dst, stride); | 211 add_pixels_clamped_mmx(b2, dst, stride); |
212 } | |
213 | |
214 #define STORE_DIFF_8P( p, d, t, z )\ | |
215 "movq "#d", "#t" \n"\ | |
216 "psraw $6, "#p" \n"\ | |
217 "punpcklbw "#z", "#t" \n"\ | |
218 "paddsw "#t", "#p" \n"\ | |
219 "packuswb "#p", "#p" \n"\ | |
220 "movq "#p", "#d" \n" | |
221 | |
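
STORE_DIFF_8P is the 8-wide analogue of the existing STORE_DIFF_4P: shift eight residual words down by 6, widen eight destination pixels to words, add with saturation, and repack with unsigned saturation. In scalar C (a sketch; paddsw's intermediate word-range saturation is elided):

#include <stdint.h>

/* one STORE_DIFF_8P expansion, for one row of 8 pixels */
static void store_diff_8p_c(uint8_t *dst, const int16_t *p)
{
    for (int i = 0; i < 8; i++) {
        int v = dst[i] + (p[i] >> 6);           /* punpcklbw + psraw $6 + paddsw */
        dst[i] = v < 0 ? 0 : v > 255 ? 255 : v; /* packuswb clamps to 0..255 */
    }
}
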
+#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
+"movdqa "#c", "#a" \n"\
+"movdqa "#g", "#e" \n"\
+"psraw $1, "#c" \n"\
+"psraw $1, "#g" \n"\
+"psubw "#e", "#c" \n"\
+"paddw "#a", "#g" \n"\
+"movdqa "#b", "#e" \n"\
+"psraw $1, "#e" \n"\
+"paddw "#b", "#e" \n"\
+"paddw "#d", "#e" \n"\
+"paddw "#f", "#e" \n"\
+"movdqa "#f", "#a" \n"\
+"psraw $1, "#a" \n"\
+"paddw "#f", "#a" \n"\
+"paddw "#h", "#a" \n"\
+"psubw "#b", "#a" \n"\
+"psubw "#d", "#b" \n"\
+"psubw "#d", "#f" \n"\
+"paddw "#h", "#b" \n"\
+"psubw "#h", "#f" \n"\
+"psraw $1, "#d" \n"\
+"psraw $1, "#h" \n"\
+"psubw "#d", "#b" \n"\
+"psubw "#h", "#f" \n"\
+"movdqa "#e", "#d" \n"\
+"movdqa "#a", "#h" \n"\
+"psraw $2, "#d" \n"\
+"psraw $2, "#h" \n"\
+"paddw "#f", "#d" \n"\
+"paddw "#b", "#h" \n"\
+"psraw $2, "#f" \n"\
+"psraw $2, "#b" \n"\
+"psubw "#f", "#e" \n"\
+"psubw "#a", "#b" \n"\
+"movdqa 0x00(%1), "#a" \n"\
+"movdqa 0x40(%1), "#f" \n"\
+SUMSUB_BA(f, a)\
+SUMSUB_BA(g, f)\
+SUMSUB_BA(c, a)\
+SUMSUB_BA(e, g)\
+SUMSUB_BA(b, c)\
+SUMSUB_BA(h, a)\
+SUMSUB_BA(d, f)
+
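
The macro computes the odd half (inputs b, d, f, h, i.e. rows 1/3/5/7) entirely in registers and loads the two even inputs from memory (0x00 and 0x40) only at the end: on 32-bit x86 there are just eight XMM registers, so rows 0 and 4 never get registers of their own up front. SUMSUB_BA is defined earlier in this file and is not part of this hunk; the combining stage reads naturally if it is taken as the in-place butterfly sketched below (an assumption about that macro's semantics):

/* assumed semantics of SUMSUB_BA(a, b): a and b are replaced by their
   sum and difference, with old values on the right-hand side:
       a' = a + b;
       b' = b - a;
   With that reading, the seven calls merge the even outputs (held in
   f, a, g, c after the loads) with the odd outputs, leaving the eight
   1D results in the permuted register order e, b, h, d, f, a, c, g,
   which is exactly the order passed to TRANSPOSE8 in the function
   below. */
static void sumsub_ba(int *a, int *b)
{
    int s = *a + *b, d = *b - *a;
    *a = s;
    *b = d;
}
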
+static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+{
+asm volatile(
+"movdqa 0x10(%1), %%xmm1 \n"
+"movdqa 0x20(%1), %%xmm2 \n"
+"movdqa 0x30(%1), %%xmm3 \n"
+"movdqa 0x50(%1), %%xmm5 \n"
+"movdqa 0x60(%1), %%xmm6 \n"
+"movdqa 0x70(%1), %%xmm7 \n"
+H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
+"paddw %4, %%xmm4 \n"
+"movdqa %%xmm4, 0x00(%1) \n"
+"movdqa %%xmm2, 0x40(%1) \n"
+H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
+"movdqa %%xmm6, 0x60(%1) \n"
+"movdqa %%xmm7, 0x70(%1) \n"
+"pxor %%xmm7, %%xmm7 \n"
+STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
+STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
+STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
+STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
+"lea (%0,%2,4), %0 \n"
+STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
+STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
+"movdqa 0x60(%1), %%xmm0 \n"
+"movdqa 0x70(%1), %%xmm1 \n"
+STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
+STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
+:"+r"(dst)
+:"r"(block), "r"((long)stride), "r"(3L*stride), "m"(*ff_pw_32)
+);
 }
 
 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 {
 int dc = (block[0] + 32) >> 6;
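
ff_h264_idct_dc_add_mmx2, unchanged by this diff apart from its new neighbour, handles the common case where only the DC coefficient is nonzero: the transform of such a block is a constant, so one rounded value is added to every pixel of the 4x4 block instead of running the full IDCT. A scalar sketch of that path:

#include <stdint.h>

/* DC-only shortcut for a 4x4 block (sketch) */
static void idct_dc_add_c(uint8_t *dst, const int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;   /* same rounding as the full IDCT path */
    for (int i = 0; i < 4; i++, dst += stride)
        for (int j = 0; j < 4; j++) {
            int v = dst[j] + dc;
            dst[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}
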
837 "add $24, %0 \n\t"\ | 924 "add $24, %0 \n\t"\ |
838 "add %3, %1 \n\t"\ | 925 "add %3, %1 \n\t"\ |
839 "decl %2 \n\t"\ | 926 "decl %2 \n\t"\ |
840 " jnz 1b \n\t"\ | 927 " jnz 1b \n\t"\ |
841 : "+a"(tmp), "+c"(dst), "+m"(h)\ | 928 : "+a"(tmp), "+c"(dst), "+m"(h)\ |
842 : "S"((long)dstStride), "m"(ff_pw_32)\ | 929 : "S"((long)dstStride), "m"(*ff_pw_32)\ |
843 : "memory"\ | 930 : "memory"\ |
844 );\ | 931 );\ |
845 }\ | 932 }\ |
846 \ | 933 \ |
847 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 934 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
1111 "add $48, %0 \n\t"\ | 1198 "add $48, %0 \n\t"\ |
1112 "add %3, %1 \n\t"\ | 1199 "add %3, %1 \n\t"\ |
1113 "decl %2 \n\t"\ | 1200 "decl %2 \n\t"\ |
1114 " jnz 1b \n\t"\ | 1201 " jnz 1b \n\t"\ |
1115 : "+a"(tmp), "+c"(dst), "+m"(h)\ | 1202 : "+a"(tmp), "+c"(dst), "+m"(h)\ |
1116 : "S"((long)dstStride), "m"(ff_pw_32)\ | 1203 : "S"((long)dstStride), "m"(*ff_pw_32)\ |
1117 : "memory"\ | 1204 : "memory"\ |
1118 );\ | 1205 );\ |
1119 tmp += 8 - size*24;\ | 1206 tmp += 8 - size*24;\ |
1120 dst += 8 - size*dstStride;\ | 1207 dst += 8 - size*dstStride;\ |
1121 }while(w--);\ | 1208 }while(w--);\ |
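
The last two hunks touch the luma qpel lowpass macros only to pick up the same *ff_pw_32 operand change. For context, the filter those macros implement is the spec's 6-tap half-pel interpolator; here is a one-output scalar sketch of it (the macros' own unrolling, 16-bit intermediate staging via tmp, and two-pass rounding, where ff_pw_32 actually enters, are not shown in these hunks and not reproduced here):

#include <stdint.h>

/* H.264 6-tap luma half-pel filter, taps 1,-5,20,20,-5,1;
   single-pass output is rounded with +16 and shifted by 5 */
static uint8_t qpel_lowpass_1(const uint8_t *src)
{
    int v = src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3];
    v = (v + 16) >> 5;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}
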