Mercurial > libavcodec.hg
comparison x86/h264dsp_mmx.c @ 8510:cea216e44ee3 libavcodec
Add x264 SSE2 iDCT functions to H.264 decoder.
author | darkshikari |
---|---|
date | Sat, 03 Jan 2009 00:46:17 +0000 |
parents | 7768bdfd4f7b |
children | cc64e1343397 |
comparison
equal
deleted
inserted
replaced
8509:5dffd656f606 | 8510:cea216e44ee3 |
---|---|
469 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | 469 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); |
470 else if(block[i*16]) | 470 else if(block[i*16]) |
471 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | 471 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); |
472 } | 472 } |
473 } | 473 } |
474 | |
475 #if defined(CONFIG_GPL) && defined(HAVE_YASM) | |
/**
 * Add the rounded DC terms of two adjacent coefficient blocks to an 8x4
 * area of pixels.
 *
 * The DC of the left 4 columns is taken from block[0], the DC of the right
 * 4 columns from block[16]. Each is biased with ff_pw_32 (presumably a
 * vector of 16-bit words equal to 32 -- confirm against its definition) and
 * arithmetically shifted right by 6 before being added to the pixels.
 *
 * @param dst    top-left pixel of the area; four rows of 8 bytes are read
 *               and written at dst + 0*stride .. dst + 3*stride
 * @param block  coefficients; only block[0..1] and block[16..17] are loaded,
 *               and only block[0] and block[16] influence the result
 * @param stride distance in bytes between two consecutive rows of dst
 *
 * NOTE(review): the first asm statement leaves its results in mm0/mm1 and
 * the second one consumes them, but neither statement declares these
 * registers as operands or clobbers. The code therefore relies on the
 * compiler not touching MMX registers between the two statements.
 */
476 static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride) | |
477 { | |
// Step 1: build two byte vectors from the two DC values.
//   mm0 = max(dc, 0) replicated per byte (left DC in low 4 bytes, right DC in high 4)
//   mm1 = max(-dc, 0) replicated the same way
// packuswb saturates to unsigned bytes, which is what splits the signed DC
// into a non-negative "amount to add" and a non-negative "amount to subtract".
478 __asm__ volatile( | |
479 "movd %0, %%mm0 \n\t" // 0 0 X D | |
480 "punpcklwd %1, %%mm0 \n\t" // x X d D | |
481 "paddsw %2, %%mm0 \n\t" | |
482 "psraw $6, %%mm0 \n\t" | |
483 "punpcklwd %%mm0, %%mm0 \n\t" // d d D D | |
484 "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0 | |
485 "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D | |
486 "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D | |
487 "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D | |
488 "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D | |
489 ::"m"(block[ 0]), | |
490 "m"(block[16]), | |
491 "m"(ff_pw_32) | |
492 ); | |
// Step 2: apply the DC to four 8-pixel rows.
// Unsigned saturating add of mm0 followed by unsigned saturating subtract of
// mm1; for any one byte at most one of the two is non-zero, so the pixel is
// moved by the signed DC and clamped to the 0..255 range.
493 __asm__ volatile( | |
494 "movq %0, %%mm2 \n\t" | |
495 "movq %1, %%mm3 \n\t" | |
496 "movq %2, %%mm4 \n\t" | |
497 "movq %3, %%mm5 \n\t" | |
498 "paddusb %%mm0, %%mm2 \n\t" | |
499 "paddusb %%mm0, %%mm3 \n\t" | |
500 "paddusb %%mm0, %%mm4 \n\t" | |
501 "paddusb %%mm0, %%mm5 \n\t" | |
502 "psubusb %%mm1, %%mm2 \n\t" | |
503 "psubusb %%mm1, %%mm3 \n\t" | |
504 "psubusb %%mm1, %%mm4 \n\t" | |
505 "psubusb %%mm1, %%mm5 \n\t" | |
506 "movq %%mm2, %0 \n\t" | |
507 "movq %%mm3, %1 \n\t" | |
508 "movq %%mm4, %2 \n\t" | |
509 "movq %%mm5, %3 \n\t" | |
510 :"+m"(*(uint64_t*)(dst+0*stride)), | |
511 "+m"(*(uint64_t*)(dst+1*stride)), | |
512 "+m"(*(uint64_t*)(dst+2*stride)), | |
513 "+m"(*(uint64_t*)(dst+3*stride)) | |
514 ); | |
515 } | |
516 | |
517 extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride); | |
518 | |
519 static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
520 int i; | |
521 for(i=0; i<16; i+=2) | |
522 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) | |
523 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); | |
524 } | |
525 | |
526 static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
527 int i; | |
528 for(i=0; i<16; i+=2){ | |
529 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) | |
530 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); | |
531 else if(block[i*16]|block[i*16+16]) | |
532 ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride); | |
533 } | |
534 } | |
535 | |
536 static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
537 int i; | |
538 for(i=16; i<16+8; i++){ | |
539 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) | |
540 ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | |
541 else if(block[i*16]|block[i*16+16]) | |
542 ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | |
543 } | |
544 } | |
545 #endif | |
474 | 546 |
475 /***********************************/ | 547 /***********************************/ |
476 /* deblocking */ | 548 /* deblocking */ |
477 | 549 |
478 // out: o = |x-y|>a | 550 // out: o = |x-y|>a |