Mercurial > libavcodec.hg
comparison x86/vp3dsp_mmx.c @ 11637:f7281af560fe libavcodec
vp3: DC-only IDCT
2-4% faster overall decode
author | conrad |
---|---|
date | Sat, 17 Apr 2010 02:04:30 +0000 |
parents | b57409c0c286 |
children | 7dd2a45249a9 |
comparison
equal
deleted
inserted
replaced
11636:a9e758788a12 | 11637:f7281af560fe |
---|---|
393 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) | 393 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
394 { | 394 { |
395 ff_vp3_idct_mmx(block); | 395 ff_vp3_idct_mmx(block); |
396 add_pixels_clamped_mmx(block, dest, line_size); | 396 add_pixels_clamped_mmx(block, dest, line_size); |
397 } | 397 } |
398 | |
399 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block) | |
400 { | |
401 int dc = block[0]; | |
402 dc = (46341*dc)>>16; | |
403 dc = (46341*dc + (8<<16))>>20; | |
404 | |
405 __asm__ volatile( | |
406 "movd %3, %%mm0 \n\t" | |
407 "pshufw $0, %%mm0, %%mm0 \n\t" | |
408 "pxor %%mm1, %%mm1 \n\t" | |
409 "psubw %%mm0, %%mm1 \n\t" | |
410 "packuswb %%mm0, %%mm0 \n\t" | |
411 "packuswb %%mm1, %%mm1 \n\t" | |
412 | |
413 #define DC_ADD \ | |
414 "movq (%0), %%mm2 \n\t" \ | |
415 "movq (%0,%1), %%mm3 \n\t" \ | |
416 "paddusb %%mm0, %%mm2 \n\t" \ | |
417 "movq (%0,%1,2), %%mm4 \n\t" \ | |
418 "paddusb %%mm0, %%mm3 \n\t" \ | |
419 "movq (%0,%2), %%mm5 \n\t" \ | |
420 "paddusb %%mm0, %%mm4 \n\t" \ | |
421 "paddusb %%mm0, %%mm5 \n\t" \ | |
422 "psubusb %%mm1, %%mm2 \n\t" \ | |
423 "psubusb %%mm1, %%mm3 \n\t" \ | |
424 "movq %%mm2, (%0) \n\t" \ | |
425 "psubusb %%mm1, %%mm4 \n\t" \ | |
426 "movq %%mm3, (%0,%1) \n\t" \ | |
427 "psubusb %%mm1, %%mm5 \n\t" \ | |
428 "movq %%mm4, (%0,%1,2) \n\t" \ | |
429 "movq %%mm5, (%0,%2) \n\t" | |
430 | |
431 DC_ADD | |
432 "lea (%0,%1,4), %0 \n\t" | |
433 DC_ADD | |
434 | |
435 : "+r"(dest) | |
436 : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc) | |
437 ); | |
438 } |