comparison x86/vp3dsp_mmx.c @ 11637:f7281af560fe libavcodec

vp3: DC-only IDCT 2-4% faster overall decode
author conrad
date Sat, 17 Apr 2010 02:04:30 +0000
parents b57409c0c286
children 7dd2a45249a9
comparison
equal deleted inserted replaced
11636:a9e758788a12 11637:f7281af560fe
393 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) 393 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
394 { 394 {
395 ff_vp3_idct_mmx(block); 395 ff_vp3_idct_mmx(block);
396 add_pixels_clamped_mmx(block, dest, line_size); 396 add_pixels_clamped_mmx(block, dest, line_size);
397 } 397 }
398
399 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
400 {
401 int dc = block[0];
402 dc = (46341*dc)>>16;
403 dc = (46341*dc + (8<<16))>>20;
404
405 __asm__ volatile(
406 "movd %3, %%mm0 \n\t"
407 "pshufw $0, %%mm0, %%mm0 \n\t"
408 "pxor %%mm1, %%mm1 \n\t"
409 "psubw %%mm0, %%mm1 \n\t"
410 "packuswb %%mm0, %%mm0 \n\t"
411 "packuswb %%mm1, %%mm1 \n\t"
412
413 #define DC_ADD \
414 "movq (%0), %%mm2 \n\t" \
415 "movq (%0,%1), %%mm3 \n\t" \
416 "paddusb %%mm0, %%mm2 \n\t" \
417 "movq (%0,%1,2), %%mm4 \n\t" \
418 "paddusb %%mm0, %%mm3 \n\t" \
419 "movq (%0,%2), %%mm5 \n\t" \
420 "paddusb %%mm0, %%mm4 \n\t" \
421 "paddusb %%mm0, %%mm5 \n\t" \
422 "psubusb %%mm1, %%mm2 \n\t" \
423 "psubusb %%mm1, %%mm3 \n\t" \
424 "movq %%mm2, (%0) \n\t" \
425 "psubusb %%mm1, %%mm4 \n\t" \
426 "movq %%mm3, (%0,%1) \n\t" \
427 "psubusb %%mm1, %%mm5 \n\t" \
428 "movq %%mm4, (%0,%1,2) \n\t" \
429 "movq %%mm5, (%0,%2) \n\t"
430
431 DC_ADD
432 "lea (%0,%1,4), %0 \n\t"
433 DC_ADD
434
435 : "+r"(dest)
436 : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
437 );
438 }