# HG changeset patch # User rbultje # Date 1283374576 0 # Node ID 4c3e6ff1237efc0c65b20bacbf63062d98f5f289 # Parent 3941687b4fa9435cc9c2add4f5e949f57076fdc9 Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square biweight code to sse2/ssse3; add sse2 weight code; and use that same code to create mmx2 functions also, so that the inline asm in h264dsp_mmx.c can be removed. OK'ed by Jason on IRC. diff -r 3941687b4fa9 -r 4c3e6ff1237e x86/Makefile --- a/x86/Makefile Wed Sep 01 20:48:59 2010 +0000 +++ b/x86/Makefile Wed Sep 01 20:56:16 2010 +0000 @@ -10,7 +10,7 @@ MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock_sse2.o \ - x86/h264_weight_sse2.o \ + x86/h264_weight.o \ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o diff -r 3941687b4fa9 -r 4c3e6ff1237e x86/h264_weight.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/h264_weight.asm Wed Sep 01 20:56:16 2010 +0000 @@ -0,0 +1,375 @@ +;***************************************************************************** +;* SSE2-optimized weighted prediction code +;***************************************************************************** +;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2010 Eli Friedman +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; biweight pred:
+;
+; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
+;                               int log2_denom, int weightd, int weights,
+;                               int offset);
+; and
+; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
+;                             int log2_denom, int weight,
+;                             int offset);
+;-----------------------------------------------------------------------------
+
+%macro WEIGHT_SETUP 0                  ; constants for the weight functions; roles below assume rN = argument N of the weight prototype above (r2=log2_denom, r3=weight, r4=offset)
+    add        r4, r4                  ; r4 = 2*offset
+    inc        r4                      ; r4 = 2*offset + 1
+    movd       m3, r3                  ; m3 = weight
+    movd       m5, r4
+    movd       m6, r2                  ; m6 = log2_denom, used as the shift count
+    pslld      m5, m6
+    psrld      m5, 1                   ; m5 = ((2*offset + 1) << log2_denom) >> 1
+%if mmsize == 16
+    pshuflw    m3, m3, 0               ; replicate word 0 across the low 4 words ...
+    pshuflw    m5, m5, 0
+    punpcklqdq m3, m3                  ; ... and then across all 8 words
+    punpcklqdq m5, m5
+%else
+    pshufw     m3, m3, 0               ; replicate word 0 across all 4 words
+    pshufw     m5, m5, 0
+%endif
+    pxor       m7, m7                  ; m7 = 0, used to widen bytes to words
+%endmacro
+
+%macro WEIGHT_OP 2                     ; %1, %2 = offsets from r0 of the two pixel groups; result is left packed in m0
+    movh       m0, [r0+%1]
+    movh       m1, [r0+%2]
+    punpcklbw  m0, m7                  ; bytes -> words
+    punpcklbw  m1, m7
+    pmullw     m0, m3                  ; pixel * weight
+    pmullw     m1, m3
+    paddsw     m0, m5                  ; + rounding/offset term (signed saturating add)
+    paddsw     m1, m5
+    psraw      m0, m6                  ; >> log2_denom
+    psraw      m1, m6
+    packuswb   m0, m1                  ; words -> bytes with unsigned saturation
+%endmacro
+
+%macro WEIGHT_FUNC_DBL_MM 1            ; %1 = block height; each 16-byte row is handled as two 8-byte halves
+cglobal h264_weight_16x%1_mmx2, 5, 5, 0
+    WEIGHT_SETUP
+    mov        r2, %1                  ; r2 becomes the row counter (its old value was copied to m6)
+%if %1 == 16
+.nextrow
+    WEIGHT_OP 0, 4
+    mova     [r0  ], m0
+    WEIGHT_OP 8, 12
+    mova     [r0+8], m0
+    add        r0, r1                  ; next row
+    dec        r2
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_weight_16x16_mmx2.nextrow ; other heights reuse the 16x16 row loop; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_MMX
+WEIGHT_FUNC_DBL_MM 16
+WEIGHT_FUNC_DBL_MM  8
+
+%macro WEIGHT_FUNC_MM 4                ; %1 = width, %2 = height, %3 = value for cglobal's last argument (0 for mmx2, 8 for sse2 at the call sites), %4 = function suffix
+cglobal h264_weight_%1x%2_%4, 5, 5, %3 ; 5 arguments as in the weight prototype above; only r0-r4 are used, same counts as the sibling weight macros
+    WEIGHT_SETUP
+    mov        r2, %2                  ; r2 becomes the row counter
+%if %2 == 16
+.nextrow
+    WEIGHT_OP 0, mmsize/2              ; one full register width of pixels per row
+    mova     [r0], m0
+    add        r0, r1                  ; next row
+    dec        r2
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_weight_%1x16_%4.nextrow ; other heights reuse the height-16 row loop; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_MMX
+WEIGHT_FUNC_MM  8, 16,  0, mmx2
+WEIGHT_FUNC_MM  8,  8,  0, mmx2
+WEIGHT_FUNC_MM  8,  4,  0, mmx2
+INIT_XMM
+WEIGHT_FUNC_MM 16, 16,  8, sse2
+WEIGHT_FUNC_MM 16,  8,  8, sse2
+
+%macro WEIGHT_FUNC_HALF_MM 5           ; %1 = width, %2 = height, %3 = height of the variant that owns the loop, %4 = value for cglobal's last argument, %5 = function suffix
+cglobal h264_weight_%1x%2_%5, 5, 5, %4
+    WEIGHT_SETUP
+    mov        r2, %2/2                ; two rows per iteration, so the counter is height/2
+    lea        r3, [r1*2]              ; r3 = 2*stride (r3's old value was already copied to m3)
+%if %2 == mmsize
+.nextrow
+    WEIGHT_OP 0, r1                    ; weights row 0 and row 1 (at [r0] and [r0+r1]) together
+    movh     [r0], m0                  ; low half of m0 -> first row
+%if mmsize == 16
+    movhps   [r0+r1], m0               ; high half of m0 -> second row
+%else
+    psrlq      m0, 32                  ; bring the high 4 bytes down
+    movh     [r0+r1], m0               ; -> second row
+%endif
+    add        r0, r3                  ; advance two rows
+    dec        r2
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_weight_%1x%3_%5.nextrow ; reuse the loop of the %1x%3 variant; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_MMX
+WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+INIT_XMM
+WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+
+%macro BIWEIGHT_SETUP 0                ; constants for the biweight functions; roles below assume rN = argument N of the biweight prototype at the top of this file (r3=log2_denom, r4=weightd, r5=weights, r6=offset)
+    add        r6, 1
+    or         r6, 1                   ; r6 = (offset + 1) | 1
+    add        r3, 1                   ; r3 = log2_denom + 1
+    movd       m3, r4                  ; m3 = weightd
+    movd       m4, r5                  ; m4 = weights
+    movd       m5, r6
+    movd       m6, r3                  ; m6 = shift count
+    pslld      m5, m6
+    psrld      m5, 1                   ; m5 = (((offset + 1) | 1) << (log2_denom + 1)) >> 1
+%if mmsize == 16
+    pshuflw    m3, m3, 0               ; replicate word 0 across the low 4 words ...
+    pshuflw    m4, m4, 0
+    pshuflw    m5, m5, 0
+    punpcklqdq m3, m3                  ; ... and then across all 8 words
+    punpcklqdq m4, m4
+    punpcklqdq m5, m5
+%else
+    pshufw     m3, m3, 0               ; replicate word 0 across all 4 words
+    pshufw     m4, m4, 0
+    pshufw     m5, m5, 0
+%endif
+    pxor       m7, m7                  ; m7 = 0, used to widen bytes to words
+%endmacro
+
+%macro BIWEIGHT_STEPA 3                ; %1 = result register index, %2 = scratch register index, %3 = offset applied to both r0 and r1
+    movh       m%1, [r0+%3]
+    movh       m%2, [r1+%3]
+    punpcklbw  m%1, m7                 ; bytes -> words
+    punpcklbw  m%2, m7
+    pmullw     m%1, m3                 ; [r0] pixels * m3
+    pmullw     m%2, m4                 ; [r1] pixels * m4
+    paddsw     m%1, m%2                ; sum of the two weighted inputs (signed saturating add)
+%endmacro
+
+%macro BIWEIGHT_STEPB 0                ; finishes two STEPA results held in m0 and m1
+    paddsw     m0, m5                  ; + rounding/offset term
+    paddsw     m1, m5
+    psraw      m0, m6                  ; >> (log2_denom + 1)
+    psraw      m1, m6
+    packuswb   m0, m1                  ; words -> bytes with unsigned saturation, result in m0
+%endmacro
+
+%macro BIWEIGHT_FUNC_DBL_MM 1          ; %1 = block height; each 16-byte row is handled as two 8-byte halves
+cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
+    BIWEIGHT_SETUP
+    mov        r3, %1                  ; r3 becomes the row counter (the shift count is already in m6)
+%if %1 == 16
+.nextrow
+    BIWEIGHT_STEPA 0, 1, 0
+    BIWEIGHT_STEPA 1, 2, 4
+    BIWEIGHT_STEPB
+    mova       [r0], m0                ; bytes 0-7 of the row
+    BIWEIGHT_STEPA 0, 1, 8
+    BIWEIGHT_STEPA 1, 2, 12
+    BIWEIGHT_STEPB
+    mova     [r0+8], m0                ; bytes 8-15 of the row
+    add        r0, r2                  ; advance both pointers by one row
+    add        r1, r2
+    dec        r3
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_biweight_16x16_mmx2.nextrow ; other heights reuse the 16x16 row loop; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_MMX
+BIWEIGHT_FUNC_DBL_MM 16
+BIWEIGHT_FUNC_DBL_MM  8
+
+%macro BIWEIGHT_FUNC_MM 4              ; %1 = width, %2 = height, %3 = value for cglobal's last argument, %4 = function suffix
+cglobal h264_biweight_%1x%2_%4, 7, 7, %3
+    BIWEIGHT_SETUP
+    mov        r3, %2                  ; r3 becomes the row counter
+%if %2 == 16
+.nextrow
+    BIWEIGHT_STEPA 0, 1, 0
+    BIWEIGHT_STEPA 1, 2, mmsize/2
+    BIWEIGHT_STEPB
+    mova       [r0], m0                ; one full register width of pixels per row
+    add        r0, r2                  ; advance both pointers by one row
+    add        r1, r2
+    dec        r3
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_biweight_%1x16_%4.nextrow ; other heights reuse the height-16 row loop; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_MMX
+BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
+BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
+BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
+INIT_XMM
+BIWEIGHT_FUNC_MM 16, 16,  8, sse2
+BIWEIGHT_FUNC_MM 16,  8,  8, sse2
+
+%macro BIWEIGHT_FUNC_HALF_MM 5         ; %1 = width, %2 = height, %3 = height of the variant that owns the loop, %4 = value for cglobal's last argument, %5 = function suffix
+cglobal h264_biweight_%1x%2_%5, 7, 7, %4
+    BIWEIGHT_SETUP
+    mov        r3, %2/2                ; two rows per iteration, so the counter is height/2
+    lea        r4, [r2*2]              ; r4 = 2*stride (r4's old value was already copied to m3)
+%if %2 == mmsize
+.nextrow
+    BIWEIGHT_STEPA 0, 1, 0             ; first row
+    BIWEIGHT_STEPA 1, 2, r2            ; second row (offset by the stride)
+    BIWEIGHT_STEPB
+    movh       [r0], m0                ; low half of m0 -> first row
+%if mmsize == 16
+    movhps     [r0+r2], m0             ; high half of m0 -> second row
+%else
+    psrlq      m0, 32                  ; bring the high 4 bytes down
+    movh     [r0+r2], m0               ; -> second row
+%endif
+    add        r0, r4                  ; advance both pointers by two rows
+    add        r1, r4
+    dec        r3
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_biweight_%1x%3_%5.nextrow ; reuse the loop of the %1x%3 variant; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_MMX
+BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+INIT_XMM
+BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
+BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
+BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+
+%macro BIWEIGHT_SSSE3_SETUP 0          ; like BIWEIGHT_SETUP, but both weights are packed as a byte pair for pmaddubsw
+    add        r6, 1
+    or         r6, 1                   ; r6 = (offset + 1) | 1
+    add        r3, 1                   ; r3 = log2_denom + 1
+    movd       m4, r4
+    movd       m0, r5
+    movd       m5, r6
+    movd       m6, r3                  ; m6 = shift count
+    pslld      m5, m6
+    psrld      m5, 1                   ; m5 = (((offset + 1) | 1) << (log2_denom + 1)) >> 1
+    punpcklbw  m4, m0                  ; interleave the low bytes of r4 and r5 into one word
+    pshuflw    m4, m4, 0               ; replicate that word (and m5's word 0) across the low 4 words ...
+    pshuflw    m5, m5, 0
+    punpcklqdq m4, m4                  ; ... and then across all 8 words
+    punpcklqdq m5, m5
+%endmacro
+
+%macro BIWEIGHT_SSSE3_OP 0             ; expects byte-interleaved [r0]/[r1] pixels in m0 and m2; result is left packed in m0
+    pmaddubsw  m0, m4                  ; per byte pair: multiply by the paired weights and add the two products
+    pmaddubsw  m2, m4
+    paddsw     m0, m5                  ; + rounding/offset term
+    paddsw     m2, m5
+    psraw      m0, m6                  ; >> (log2_denom + 1)
+    psraw      m2, m6
+    packuswb   m0, m2                  ; words -> bytes with unsigned saturation
+%endmacro
+
+%macro BIWEIGHT_SSSE3_16 1             ; %1 = block height
+cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
+    BIWEIGHT_SSSE3_SETUP
+    mov        r3, %1                  ; r3 becomes the row counter
+
+%if %1 == 16
+.nextrow
+    movh       m0, [r0]
+    movh       m2, [r0+8]
+    movh       m3, [r1+8]
+    punpcklbw  m0, [r1]                ; interleave [r0] and [r1] bytes 0-7 (memory operand read directly)
+    punpcklbw  m2, m3                  ; interleave [r0] and [r1] bytes 8-15
+    BIWEIGHT_SSSE3_OP
+    mova       [r0], m0
+    add        r0, r2                  ; advance both pointers by one row
+    add        r1, r2
+    dec        r3
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_biweight_16x16_ssse3.nextrow ; other heights reuse the 16x16 row loop; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_XMM
+BIWEIGHT_SSSE3_16 16
+BIWEIGHT_SSSE3_16  8
+
+%macro BIWEIGHT_SSSE3_8 1              ; %1 = block height; two 8-byte rows are processed per iteration
+cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
+    BIWEIGHT_SSSE3_SETUP
+    mov        r3, %1/2                ; two rows per iteration, so the counter is height/2
+    lea        r4, [r2*2]              ; r4 = 2*stride (r4's old value was already copied to m4)
+
+%if %1 == 16
+.nextrow
+    movh       m0, [r0]
+    movh       m1, [r1]
+    movh       m2, [r0+r2]
+    movh       m3, [r1+r2]
+    punpcklbw  m0, m1                  ; interleave [r0]/[r1] bytes of the first row
+    punpcklbw  m2, m3                  ; interleave [r0]/[r1] bytes of the second row
+    BIWEIGHT_SSSE3_OP
+    movh       [r0], m0                ; low half of m0 -> first row
+    movhps     [r0+r2], m0             ; high half of m0 -> second row
+    add        r0, r4                  ; advance both pointers by two rows
+    add        r1, r4
+    dec        r3
+    jnz .nextrow
+    REP_RET
+%else
+    jmp _ff_h264_biweight_8x16_ssse3.nextrow ; other heights reuse the 8x16 row loop; NOTE(review): hard-coded "_" symbol prefix - confirm it matches cglobal's name mangling on every target
+%endif
+%endmacro
+
+INIT_XMM
+BIWEIGHT_SSSE3_8 16
+BIWEIGHT_SSSE3_8  8
+BIWEIGHT_SSSE3_8  4
diff -r 3941687b4fa9 -r 4c3e6ff1237e x86/h264_weight_sse2.asm
--- a/x86/h264_weight_sse2.asm	Wed Sep 01 20:48:59 2010 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,170 +0,0 @@
-;*****************************************************************************
-;* SSE2-optimized weighted prediction code
-;*****************************************************************************
-;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
-;* Copyright (C) 2010 Eli Friedman
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86inc.asm" - -SECTION .text -INIT_XMM - -;----------------------------------------------------------------------------- -; biweight pred: -; -; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, -; int log2_denom, int weightd, int weights, -; int offset); -;----------------------------------------------------------------------------- - -%macro BIWEIGHT_SSE2_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m3, r4 - movd m4, r5 - movd m5, r6 - movd m6, r3 - pslld m5, m6 - psrld m5, 1 - pshuflw m3, m3, 0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m3, m3 - punpcklqdq m4, m4 - punpcklqdq m5, m5 - pxor m7, m7 -%endmacro - -%macro BIWEIGHT_SSE2_STEPA 3 - movh m%1, [r0+%3] - movh m%2, [r1+%3] - punpcklbw m%1, m7 - punpcklbw m%2, m7 - pmullw m%1, m3 - pmullw m%2, m4 - paddsw m%1, m%2 -%endmacro - -%macro BIWEIGHT_SSE2_STEPB 0 - paddsw m0, m5 - paddsw m1, m5 - psraw m0, m6 - psraw m1, m6 - packuswb m0, m1 -%endmacro - -cglobal h264_biweight_16x16_sse2, 7, 7, 8 - BIWEIGHT_SSE2_SETUP - mov r3, 16 - -.nextrow - BIWEIGHT_SSE2_STEPA 0, 1, 0 - BIWEIGHT_SSE2_STEPA 1, 2, 8 - BIWEIGHT_SSE2_STEPB - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3 - jnz .nextrow - REP_RET - -cglobal h264_biweight_8x8_sse2, 7, 7, 8 - BIWEIGHT_SSE2_SETUP - mov r3, 4 - lea r4, [r2*2] - -.nextrow - BIWEIGHT_SSE2_STEPA 0, 1, 0 - BIWEIGHT_SSE2_STEPA 1, 2, r2 - BIWEIGHT_SSE2_STEPB - movh [r0], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3 - jnz .nextrow - REP_RET - -%macro BIWEIGHT_SSSE3_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m4, r4 - movd m0, r5 - movd m5, r6 - movd m6, r3 - pslld m5, m6 - psrld m5, 1 - punpcklbw m4, m0 - pshuflw m4, m4, 0 -
pshuflw m5, m5, 0 - punpcklqdq m4, m4 - punpcklqdq m5, m5 -%endmacro - -%macro BIWEIGHT_SSSE3_OP 0 - pmaddubsw m0, m4 - pmaddubsw m2, m4 - paddsw m0, m5 - paddsw m2, m5 - psraw m0, m6 - psraw m2, m6 - packuswb m0, m2 -%endmacro - -cglobal h264_biweight_16x16_ssse3, 7, 7, 8 - BIWEIGHT_SSSE3_SETUP - mov r3, 16 - -.nextrow - movh m0, [r0] - movh m2, [r0+8] - movh m3, [r1+8] - punpcklbw m0, [r1] - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3 - jnz .nextrow - REP_RET - -cglobal h264_biweight_8x8_ssse3, 7, 7, 8 - BIWEIGHT_SSSE3_SETUP - mov r3, 4 - lea r4, [r2*2] - -.nextrow - movh m0, [r0] - movh m1, [r1] - movh m2, [r0+r2] - movh m3, [r1+r2] - punpcklbw m0, m1 - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - movh [r0], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3 - jnz .nextrow - REP_RET diff -r 3941687b4fa9 -r 4c3e6ff1237e x86/h264dsp_mmx.c --- a/x86/h264dsp_mmx.c Wed Sep 01 20:48:59 2010 +0000 +++ b/x86/h264dsp_mmx.c Wed Sep 01 20:56:16 2010 +0000 @@ -921,115 +921,33 @@ /***********************************/ /* weighted prediction */ -static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) -{ - int x, y; - offset <<= log2_denom; - offset += (1 << log2_denom) >> 1; - __asm__ volatile( - "movd %0, %%mm4 \n\t" - "movd %1, %%mm5 \n\t" - "movd %2, %%mm6 \n\t" - "pshufw $0, %%mm4, %%mm4 \n\t" - "pshufw $0, %%mm5, %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - :: "g"(weight), "g"(offset), "g"(log2_denom) - ); - for(y=0; yh264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; + } + if(mm_flags & FF_MM_SSE2){ + c->h264_idct8_add = ff_h264_idct8_add_sse2; + c->h264_idct8_add4= ff_h264_idct8_add4_sse2; + } +#if HAVE_YASM + if (mm_flags & FF_MM_MMX2){ +#if ARCH_X86_32 + c->h264_v_loop_filter_luma_intra = 
ff_x264_deblock_v_luma_intra_mmxext; + c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; +#endif c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; @@ -1094,21 +1023,20 @@ c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; - } - if(mm_flags & FF_MM_SSE2){ - c->h264_idct8_add = ff_h264_idct8_add_sse2; - c->h264_idct8_add4= ff_h264_idct8_add4_sse2; - } -#if HAVE_YASM - if (mm_flags & FF_MM_MMX2){ -#if ARCH_X86_32 - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; -#endif if( mm_flags&FF_MM_SSE2 ){ + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; + #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; @@ -1123,7 +1051,10 @@ } if ( mm_flags&FF_MM_SSSE3 ){ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; } } 
#endif