# HG changeset patch # User darkshikari # Date 1277490349 0 # Node ID afee30fe8c262c06bff1f9addfe968c4ff2fcf36 # Parent 56aba5a9761ceea6de9e45eaefdae9368c5925f3 16x16 and 8x8c x86 SIMD intra pred functions for VP8 and H.264 diff -r 56aba5a9761c -r afee30fe8c26 h264pred.c --- a/h264pred.c Fri Jun 25 18:14:07 2010 +0000 +++ b/h264pred.c Fri Jun 25 18:25:49 2010 +0000 @@ -1299,4 +1299,5 @@ h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c; if (ARCH_ARM) ff_h264_pred_init_arm(h, codec_id); + if (ARCH_X86) ff_h264_pred_init_x86(h, codec_id); } diff -r 56aba5a9761c -r afee30fe8c26 h264pred.h --- a/h264pred.h Fri Jun 25 18:14:07 2010 +0000 +++ b/h264pred.h Fri Jun 25 18:25:49 2010 +0000 @@ -87,5 +87,6 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id); void ff_h264_pred_init_arm(H264PredContext *h, int codec_id); +void ff_h264_pred_init_x86(H264PredContext *h, int codec_id); #endif /* AVCODEC_H264PRED_H */ diff -r 56aba5a9761c -r afee30fe8c26 x86/Makefile --- a/x86/Makefile Fri Jun 25 18:14:07 2010 +0000 +++ b/x86/Makefile Fri Jun 25 18:25:49 2010 +0000 @@ -8,6 +8,7 @@ $(YASM-OBJS-FFT-yes) YASM-OBJS-$(CONFIG_GPL) += x86/h264_deblock_sse2.o \ x86/h264_idct_sse2.o \ + x86/h264_intrapred.o \ MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o MMX-OBJS-$(CONFIG_MP1FLOAT_DECODER) += x86/mpegaudiodec_mmx.o diff -r 56aba5a9761c -r afee30fe8c26 x86/dsputil_mmx.c --- a/x86/dsputil_mmx.c Fri Jun 25 18:14:07 2010 +0000 +++ b/x86/dsputil_mmx.c Fri Jun 25 18:25:49 2010 +0000 @@ -62,7 +62,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL}; DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; diff -r 56aba5a9761c -r afee30fe8c26 x86/dsputil_mmx.h --- a/x86/dsputil_mmx.h Fri Jun 25 18:14:07 2010 +0000 +++ b/x86/dsputil_mmx.h Fri Jun 25 18:25:49 2010 +0000 @@ -48,7 +48,7 @@ extern const uint64_t ff_pw_255; extern const uint64_t ff_pb_1; -extern const uint64_t ff_pb_3; +extern const xmm_reg ff_pb_3; extern const uint64_t ff_pb_7; extern const uint64_t ff_pb_1F; extern const uint64_t ff_pb_3F; diff -r 56aba5a9761c -r afee30fe8c26 x86/h264_intrapred.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/h264_intrapred.asm Fri Jun 25 18:25:49 2010 +0000 @@ -0,0 +1,486 @@ +;****************************************************************************** +;* H.264 intra prediction asm optimizations +;* Copyright (c) 2010 Jason Garrett-Glaser +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA + +tm_shuf: times 8 db 0x03, 0x80 + +SECTION .text + +cextern pb_3 + +;----------------------------------------------------------------------------- +; void pred16x16_vertical(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +cglobal pred16x16_vertical_mmx, 2,3 + sub r0, r1 + mov r2, 8 + movq mm0, [r0+0] + movq mm1, [r0+8] +.loop: + movq [r0+r1*1+0], mm0 + movq [r0+r1*1+8], mm1 + movq [r0+r1*2+0], mm0 + movq [r0+r1*2+8], mm1 + lea r0, [r0+r1*2] + dec r2 + jg .loop + REP_RET + +cglobal pred16x16_vertical_sse, 2,3 + sub r0, r1 + mov r2, 4 + movaps xmm0, [r0] +.loop: + movaps [r0+r1*1], xmm0 + movaps [r0+r1*2], xmm0 + lea r0, [r0+r1*2] + movaps [r0+r1*1], xmm0 + movaps [r0+r1*2], xmm0 + lea r0, [r0+r1*2] + dec r2 + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void pred16x16_horizontal(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +%macro PRED16x16_H 1 +cglobal pred16x16_horizontal_%1, 2,3 + mov r2, 8 +%ifidn %1, ssse3 + mova m2, [pb_3] +%endif +.loop: + movd m0, [r0+r1*0-4] + movd m1, [r0+r1*1-4] + +%ifidn %1, ssse3 + pshufb m0, m2 + pshufb m1, m2 +%else + punpcklbw m0, m0 + punpcklbw m1, m1 +%ifidn %1, mmxext + pshufw m0, m0, 0xff + pshufw m1, m1, 0xff +%else + punpckhwd m0, m0 + punpckhwd m1, m1 + punpckhdq m0, m0 + punpckhdq m1, m1 +%endif + mova [r0+r1*0+8], m0 + mova [r0+r1*1+8], m1 +%endif + + mova [r0+r1*0], m0 + mova [r0+r1*1], m1 + lea r0, [r0+r1*2] + dec r2 + jg .loop + REP_RET +%endmacro + +INIT_MMX +PRED16x16_H mmx +PRED16x16_H mmxext +INIT_XMM +PRED16x16_H ssse3 + +;----------------------------------------------------------------------------- +; void pred16x16_dc(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +%macro PRED16x16_DC 2 +cglobal pred16x16_dc_%1, 2,7 + mov r4, r0 + sub r0, r1 + pxor mm0, mm0 + pxor mm1, mm1 + psadbw mm0, [r0+0] + psadbw mm1, [r0+8] + dec r0 + movzx r5d, byte [r0+r1*1] + paddw mm0, mm1 + movd r6d, mm0 + lea r0, [r0+r1*2] +%rep 7 + movzx r2d, byte [r0+r1*0] + movzx r3d, byte [r0+r1*1] + add r5d, r2d + add r6d, r3d + lea r0, [r0+r1*2] +%endrep + movzx r2d, byte [r0+r1*0] + add r5d, r6d + lea r2d, [r2+r5+16] + shr r2d, 5 +%ifidn %1, mmx + movd m0, r2d + punpcklbw m0, m0 + punpcklwd m0, m0 + punpckldq m0, m0 +%elifidn %1, mmxext + movd m0, r2d + punpcklbw m0, m0 + pshufw m0, m0, 0 +%elifidn %1, sse + imul r2d, 0x01010101 + movd m0, r2d + shufps m0, m0, 0 +%elifidn %1, sse2 + movd m0, r2d + punpcklbw m0, m0 + pshuflw m0, m0, 0 + punpcklqdq m0, m0 +%elifidn %1, ssse3 + pxor m1, m1 + movd m0, r2d + pshufb m0, m1 +%endif + +%if mmsize==8 + mov r3d, 8 +.loop: + %2 [r4+r1*0+0], m0 + %2 [r4+r1*0+8], m0 + %2 [r4+r1*1+0], m0 + %2 [r4+r1*1+8], m0 +%else + mov r3d, 4 +.loop: + %2 [r4+r1*0], m0 + %2 [r4+r1*1], m0 + lea r4, [r4+r1*2] + %2 [r4+r1*0], m0 + %2 [r4+r1*1], m0 +%endif + lea r4, [r4+r1*2] + dec r3d + jg .loop + REP_RET +%endmacro + +INIT_MMX +PRED16x16_DC mmx, movq +PRED16x16_DC mmxext, movq +INIT_XMM +PRED16x16_DC sse, movaps +PRED16x16_DC sse2, movdqa +PRED16x16_DC ssse3, movdqa + 
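+; The VP8 "TM" (TrueMotion) predictors below compute, for each pixel,
+;     pred[y][x] = clip_uint8(top[x] + left[y] - topleft)
+; The top row is widened to 16-bit words once outside the loop; each row then
+; adds its (left[y] - topleft) delta and lets packuswb saturate back to bytes.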
+;----------------------------------------------------------------------------- +; void pred16x16_tm_vp8(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +%macro PRED16x16_TM_MMX 1 +cglobal pred16x16_tm_vp8_%1, 2,5 + sub r0, r1 + pxor mm7, mm7 + movq mm0, [r0+0] + movq mm2, [r0+8] + movq mm1, mm0 + movq mm3, mm2 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + movzx r3d, byte [r0-1] + mov r4d, 16 +.loop: + movzx r2d, byte [r0+r1-1] + sub r2d, r3d + movd mm4, r2d +%ifidn %1, mmx + punpcklwd mm4, mm4 + punpckldq mm4, mm4 +%else + pshufw mm4, mm4, 0 +%endif + movq mm5, mm4 + movq mm6, mm4 + movq mm7, mm4 + paddw mm4, mm0 + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 + packuswb mm4, mm5 + packuswb mm6, mm7 + movq [r0+r1+0], mm4 + movq [r0+r1+8], mm6 + add r0, r1 + dec r4d + jg .loop + REP_RET +%endmacro + +PRED16x16_TM_MMX mmx +PRED16x16_TM_MMX mmxext + +cglobal pred16x16_tm_vp8_sse2, 2,6,6 + sub r0, r1 + pxor xmm2, xmm2 + movdqa xmm0, [r0] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movzx r4d, byte [r0-1] + mov r5d, 8 +.loop: + movzx r2d, byte [r0+r1*1-1] + movzx r3d, byte [r0+r1*2-1] + sub r2d, r4d + sub r3d, r4d + movd xmm2, r2d + movd xmm4, r3d + pshuflw xmm2, xmm2, 0 + pshuflw xmm4, xmm4, 0 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm4, xmm4 + movdqa xmm3, xmm2 + movdqa xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm3, xmm1 + paddw xmm4, xmm0 + paddw xmm5, xmm1 + packuswb xmm2, xmm3 + packuswb xmm4, xmm5 + movdqa [r0+r1*1], xmm2 + movdqa [r0+r1*2], xmm4 + lea r0, [r0+r1*2] + dec r5d + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void pred8x8_vertical(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +cglobal pred8x8_vertical_mmx, 2,2 + sub r0, r1 + movq mm0, [r0] +%rep 3 + movq [r0+r1*1], mm0 + movq [r0+r1*2], mm0 + lea r0, [r0+r1*2] +%endrep + movq [r0+r1*1], mm0 + movq [r0+r1*2], mm0 + RET + +;----------------------------------------------------------------------------- +; void pred8x8_horizontal(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8_H 1 +cglobal pred8x8_horizontal_%1, 2,3 + mov r2, 4 +%ifidn %1, ssse3 + mova m2, [pb_3] +%endif +.loop: + movd m0, [r0+r1*0-4] + movd m1, [r0+r1*1-4] +%ifidn %1, ssse3 + pshufb m0, m2 + pshufb m1, m2 +%else + punpcklbw m0, m0 + punpcklbw m1, m1 +%ifidn %1, mmxext + pshufw m0, m0, 0xff + pshufw m1, m1, 0xff +%else + punpckhwd m0, m0 + punpckhwd m1, m1 + punpckhdq m0, m0 + punpckhdq m1, m1 +%endif +%endif + mova [r0+r1*0], m0 + mova [r0+r1*1], m1 + lea r0, [r0+r1*2] + dec r2 + jg .loop + REP_RET +%endmacro + +INIT_MMX +PRED8x8_H mmx +PRED8x8_H mmxext +PRED8x8_H ssse3 + +;----------------------------------------------------------------------------- +; void pred8x8_dc_rv40(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8_DC 1 +cglobal pred8x8_dc_rv40_%1, 2,7 + mov r4, r0 + sub r0, r1 + pxor mm0, mm0 + psadbw mm0, [r0] + dec r0 + movzx r5d, byte [r0+r1*1] + movd r6d, mm0 + lea r0, [r0+r1*2] +%rep 3 + movzx r2d, byte [r0+r1*0] + movzx r3d, byte [r0+r1*1] + add r5d, r2d + add r6d, r3d + lea r0, [r0+r1*2] +%endrep + movzx r2d, byte [r0+r1*0] + add r5d, r6d + lea r2d, [r2+r5+8] + shr r2d, 4 +%ifidn %1, mmx + movd mm0, r2d + punpcklbw mm0, mm0 + punpcklwd mm0, mm0 + punpckldq mm0, 
mm0 +%else + movd mm0, r2d + punpcklbw mm0, mm0 + pshufw mm0, mm0, 0 +%endif + mov r3d, 4 +.loop: + movq [r4+r1*0], mm0 + movq [r4+r1*1], mm0 + lea r4, [r4+r1*2] + dec r3d + jg .loop + REP_RET +%endmacro + + +PRED8x8_DC mmx +PRED8x8_DC mmxext + +;----------------------------------------------------------------------------- +; void pred8x8_tm_vp8(uint8_t *src, int stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8_TM_MMX 1 +cglobal pred8x8_tm_vp8_%1, 2,6 + sub r0, r1 + pxor mm7, mm7 + movq mm0, [r0] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movzx r4d, byte [r0-1] + mov r5d, 4 +.loop: + movzx r2d, byte [r0+r1*1-1] + movzx r3d, byte [r0+r1*2-1] + sub r2d, r4d + sub r3d, r4d + movd mm2, r2d + movd mm4, r3d +%ifidn %1, mmx + punpcklwd mm2, mm2 + punpcklwd mm4, mm4 + punpckldq mm2, mm2 + punpckldq mm4, mm4 +%else + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%endif + movq mm3, mm2 + movq mm5, mm4 + paddw mm2, mm0 + paddw mm3, mm1 + paddw mm4, mm0 + paddw mm5, mm1 + packuswb mm2, mm3 + packuswb mm4, mm5 + movq [r0+r1*1], mm2 + movq [r0+r1*2], mm4 + lea r0, [r0+r1*2] + dec r5d + jg .loop + REP_RET +%endmacro + +PRED8x8_TM_MMX mmx +PRED8x8_TM_MMX mmxext + +cglobal pred8x8_tm_vp8_sse2, 2,6,4 + sub r0, r1 + pxor xmm1, xmm1 + movq xmm0, [r0] + punpcklbw xmm0, xmm1 + movzx r4d, byte [r0-1] + mov r5d, 4 +.loop: + movzx r2d, byte [r0+r1*1-1] + movzx r3d, byte [r0+r1*2-1] + sub r2d, r4d + sub r3d, r4d + movd xmm2, r2d + movd xmm3, r3d + pshuflw xmm2, xmm2, 0 + pshuflw xmm3, xmm3, 0 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + paddw xmm2, xmm0 + paddw xmm3, xmm0 + packuswb xmm2, xmm3 + movq [r0+r1*1], xmm2 + movhps [r0+r1*2], xmm2 + lea r0, [r0+r1*2] + dec r5d + jg .loop + REP_RET + +cglobal pred8x8_tm_vp8_ssse3, 2,3,6 + sub r0, r1 + movdqa xmm4, [tm_shuf] + pxor xmm1, xmm1 + movq xmm0, [r0] + punpcklbw xmm0, xmm1 + movd xmm5, [r0-4] + pshufb xmm5, xmm4 + mov r2d, 4 +.loop: + movd xmm2, [r0+r1*1-4] + movd xmm3, [r0+r1*2-4] + pshufb xmm2, xmm4 + pshufb xmm3, xmm4 + psubw xmm2, xmm5 + psubw xmm3, xmm5 + paddw xmm2, xmm0 + paddw xmm3, xmm0 + packuswb xmm2, xmm3 + movq [r0+r1*1], xmm2 + movhps [r0+r1*2], xmm2 + lea r0, [r0+r1*2] + dec r2d + jg .loop + REP_RET diff -r 56aba5a9761c -r afee30fe8c26 x86/h264dsp_mmx.c --- a/x86/h264dsp_mmx.c Fri Jun 25 18:14:07 2010 +0000 +++ b/x86/h264dsp_mmx.c Fri Jun 25 18:25:49 2010 +0000 @@ -19,6 +19,7 @@ */ #include "dsputil_mmx.h" +#include "libavcodec/h264pred.h" DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; @@ -2322,3 +2323,77 @@ H264_WEIGHT( 4, 4) H264_WEIGHT( 4, 2) +void ff_pred16x16_vertical_mmx (uint8_t *src, int stride); +void ff_pred16x16_vertical_sse (uint8_t *src, int stride); +void ff_pred16x16_horizontal_mmx (uint8_t *src, int stride); +void ff_pred16x16_horizontal_mmxext(uint8_t *src, int stride); +void ff_pred16x16_horizontal_ssse3 (uint8_t *src, int stride); +void ff_pred16x16_dc_mmx (uint8_t *src, int stride); +void ff_pred16x16_dc_mmxext (uint8_t *src, int stride); +void ff_pred16x16_dc_sse (uint8_t *src, int stride); +void ff_pred16x16_dc_sse2 (uint8_t *src, int stride); +void ff_pred16x16_dc_ssse3 (uint8_t *src, int stride); +void ff_pred16x16_tm_vp8_mmx (uint8_t *src, int stride); +void ff_pred16x16_tm_vp8_mmxext (uint8_t *src, int stride); +void ff_pred16x16_tm_vp8_sse2 (uint8_t *src, int stride); +void ff_pred8x8_dc_rv40_mmx (uint8_t *src, int stride); +void 
ff_pred8x8_dc_rv40_mmxext (uint8_t *src, int stride); +void ff_pred8x8_vertical_mmx (uint8_t *src, int stride); +void ff_pred8x8_horizontal_mmx (uint8_t *src, int stride); +void ff_pred8x8_horizontal_mmxext (uint8_t *src, int stride); +void ff_pred8x8_horizontal_ssse3 (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_mmx (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_mmxext (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_sse2 (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_ssse3 (uint8_t *src, int stride); + +void ff_h264_pred_init_x86(H264PredContext *h, int codec_id) +{ +#if HAVE_YASM + if (mm_flags & FF_MM_MMX) { + h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_mmx; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmx; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmx; + h->pred8x8 [VERT_PRED8x8] = ff_pred8x8_vertical_mmx; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx; + if (codec_id == CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmx; + h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_mmx; + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmx; + } + } + + if (mm_flags & FF_MM_MMXEXT) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmxext; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmxext; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext; + if (codec_id == CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmxext; + h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_mmxext; + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmxext; + } + } + + if (mm_flags & FF_MM_SSE) { + h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_sse; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_sse; + } + + if (mm_flags & FF_MM_SSE2) { + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_sse2; + if (codec_id == CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_sse2; + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_sse2; + } + } + + if (mm_flags & FF_MM_SSSE3) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_ssse3; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_ssse3; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_ssse3; + if (codec_id == CODEC_ID_VP8) { + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_ssse3; + } + } +#endif +}
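For reference, a scalar C sketch of the two less obvious predictors optimized by this changeset: the 16x16 DC average and the VP8 TrueMotion ("TM") predictor. It is not part of the patch; the function names here are made up, edge-availability handling is omitted (the real C versions live in h264pred.c), and it only illustrates what the asm above computes.

#include <stdint.h>

/* DC: fill the 16x16 block with the rounded average of the 16 top and
 * 16 left neighbours; the asm obtains the top-row sum with psadbw. */
static void ref_pred16x16_dc(uint8_t *src, int stride)
{
    int sum = 16;                            /* rounding term: (sum + 16) >> 5 */
    for (int i = 0; i < 16; i++)
        sum += src[i - stride] + src[i * stride - 1];
    int dc = sum >> 5;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] = dc;
}

/* TM: pred[y][x] = clip(top[x] + left[y] - topleft); packuswb performs
 * the clipping to [0,255] in the SIMD versions. */
static void ref_pred16x16_tm(uint8_t *src, int stride)
{
    const uint8_t *top = src - stride;
    int topleft = top[-1];
    for (int y = 0; y < 16; y++) {
        int delta = src[y * stride - 1] - topleft;
        for (int x = 0; x < 16; x++) {
            int v = top[x] + delta;
            src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
}

psadbw against an all-zero register sums eight byte values in a single instruction, which is why the asm needs no explicit horizontal-add chain for the top-row sum in the DC paths.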