Mercurial > libavcodec.hg
changeset 12367:06bdd447f4f7 libavcodec
Add file missing in r24702
author | darkshikari |
---|---|
date | Thu, 05 Aug 2010 00:49:48 +0000 |
parents | 09a31ef6ed58 |
children | ba14e3adeccd |
files | x86/h264_weight_sse2.asm |
diffstat | 1 files changed, 170 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/h264_weight_sse2.asm Thu Aug 05 00:49:48 2010 +0000 @@ -0,0 +1,170 @@ +;***************************************************************************** +;* SSE2-optimized weighted prediction code +;***************************************************************************** +;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION .text +INIT_XMM + +;----------------------------------------------------------------------------- +; biweight pred: +; +; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, +; int log2_denom, int weightd, int weights, +; int offset); +;----------------------------------------------------------------------------- + +%macro BIWEIGHT_SSE2_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m3, r4 + movd m4, r5 + movd m5, r6 + movd m6, r3 + pslld m5, m6 + psrld m5, 1 + pshuflw m3, m3, 0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m3, m3 + punpcklqdq m4, m4 + punpcklqdq m5, m5 + pxor m7, m7 +%endmacro + +%macro BIWEIGHT_SSE2_STEPA 3 + movh m%1, [r0+%3] + movh m%2, [r1+%3] + punpcklbw m%1, m7 + punpcklbw m%2, m7 + pmullw m%1, m3 + pmullw m%2, m4 + paddsw m%1, m%2 +%endmacro + +%macro BIWEIGHT_SSE2_STEPB 0 + paddsw m0, m5 + paddsw m1, m5 + psraw m0, m6 + psraw m1, m6 + packuswb m0, m1 +%endmacro + +cglobal h264_biweight_16x16_sse2, 7, 7, 8 + BIWEIGHT_SSE2_SETUP + mov r3, 16 + +.nextrow + BIWEIGHT_SSE2_STEPA 0, 1, 0 + BIWEIGHT_SSE2_STEPA 1, 2, 8 + BIWEIGHT_SSE2_STEPB + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET + +cglobal h264_biweight_8x8_sse2, 7, 7, 8 + BIWEIGHT_SSE2_SETUP + mov r3, 4 + lea r4, [r2*2] + +.nextrow + BIWEIGHT_SSE2_STEPA 0, 1, 0 + BIWEIGHT_SSE2_STEPA 1, 2, r2 + BIWEIGHT_SSE2_STEPB + movh [r0], m0 + movhps [r0+r2], m0 + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET + +%macro BIWEIGHT_SSSE3_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m4, r4 + movd m0, r5 + movd m5, r6 + movd m6, r3 + pslld m5, m6 + psrld m5, 1 + punpcklbw m4, m0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m4, m4 + punpcklqdq m5, m5 +%endmacro + +%macro BIWEIGHT_SSSE3_OP 0 + pmaddubsw m0, m4 + pmaddubsw m2, m4 + paddsw m0, m5 + paddsw m2, m5 + psraw m0, m6 + psraw m2, m6 + packuswb m0, m2 +%endmacro + +cglobal h264_biweight_16x16_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, 16 + +.nextrow + movh m0, [r0] + movh m2, [r0+8] + movh m3, [r1+8] + punpcklbw m0, [r1] + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET + +cglobal h264_biweight_8x8_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, 4 + lea r4, [r2*2] + +.nextrow + movh m0, [r0] + movh m1, [r1] + movh m2, [r0+r2] + movh m3, [r1+r2] + punpcklbw m0, m1 + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + movh [r0], m0 + movhps [r0+r2], m0 + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET