comparison x86/h264_weight_sse2.asm @ 12367:06bdd447f4f7 libavcodec

Add file missing in r24702
author darkshikari
date Thu, 05 Aug 2010 00:49:48 +0000
parents
children
comparison
equal deleted inserted replaced
12366:09a31ef6ed58 12367:06bdd447f4f7
1 ;*****************************************************************************
2 ;* SSE2-optimized weighted prediction code
3 ;*****************************************************************************
4 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6 ;*
7 ;* This file is part of FFmpeg.
8 ;*
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
13 ;*
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
18 ;*
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
23
24 %include "x86inc.asm"
25
26 SECTION .text
27 INIT_XMM
28
29 ;-----------------------------------------------------------------------------
30 ; biweight pred:
31 ;
32 ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
33 ; int log2_denom, int weightd, int weights,
34 ; int offset);
35 ;-----------------------------------------------------------------------------
36
;-----------------------------------------------------------------------------
; BIWEIGHT_SSE2_SETUP
; Builds the loop-invariant constants used by the SSE2 biweight kernels.
;
; In:    r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
;        (argument order of the biweight prototype documented in this file)
; Out:   m3 = low 16 bits of weightd, replicated to all 8 words
;        m4 = low 16 bits of weights, replicated to all 8 words
;        m5 = low 16 bits of ((((offset + 1) | 1) << (log2_denom + 1)) >> 1),
;             replicated to all 8 words (rounding/offset term)
;        m6 = log2_denom + 1 (shift count for pslld/psraw)
;        m7 = 0 (zero source for byte -> word unpacking)
; Clobb: r3, r6, flags
;
; All integer arguments are accessed through their 32-bit names. With the
; full-width names, movd transfers 64 bits on x86-64; the upper half of an
; int argument is undefined there, and the vector shifts read the whole low
; qword of m6 as their count.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSE2_SETUP 0
    add        r6d, 1
    or         r6d, 1               ; r6d = (offset + 1) | 1
    add        r3d, 1               ; r3d = log2_denom + 1
    movd       m3, r4d
    movd       m4, r5d
    movd       m5, r6d
    movd       m6, r3d              ; shift count, upper bits guaranteed zero
    pslld      m5, m6
    psrld      m5, 1
    pshuflw    m3, m3, 0            ; replicate word 0 across the low qword
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3               ; copy the low qword into the high qword
    punpcklqdq m4, m4
    punpcklqdq m5, m5
    pxor       m7, m7
%endmacro
55
;-----------------------------------------------------------------------------
; BIWEIGHT_SSE2_STEPA acc, tmp, byteoffset
; Weighted sum of 8 pixels from each of the two buffers:
;   m<acc> = [r0+byteoffset] * m3  +  [r1+byteoffset] * m4
; computed in 16-bit lanes (pmullw keeps the low 16 bits of each product,
; paddsw saturates the sum).
;
; In:    r0, r1 = row pointers; m3, m4 = per-word multipliers; m7 = 0
; Out:   m<acc> = 8 words of weighted sums
; Clobb: m<tmp>
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSE2_STEPA 3
    movh       m%1, [r0+%3]         ; pixels from the r0 buffer
    movh       m%2, [r1+%3]         ; pixels from the r1 buffer
    punpcklbw  m%1, m7              ; zero-extend the low 8 bytes to words
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro
65
;-----------------------------------------------------------------------------
; BIWEIGHT_SSE2_STEPB
; Finishes two word vectors produced by BIWEIGHT_SSE2_STEPA: adds the m5
; term, shifts right arithmetically by the count in m6, and packs both
; vectors into 16 bytes with unsigned saturation.
;
; In:    m0, m1 = word sums; m5 = per-word addend; m6 = shift count
; Out:   m0 = packed bytes (m0 words in the low half, m1 words in the high half)
; Clobb: m1
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSE2_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro
73
;-----------------------------------------------------------------------------
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int log2_denom, int weightd, int weights,
;                               int offset);
;
; Processes 16 rows of 16 pixels; each result row overwrites the row at dst.
; In:    r0 = dst, r1 = src, r2 = stride, r3..r6 = weighting parameters
;        (consumed by BIWEIGHT_SSE2_SETUP)
; Regs:  r3 is reused as the row counter once the setup macro has run.
; Note:  rows are stored with mova, x86inc's aligned move, so every dst row
;        is presumably expected to be 16-byte aligned - confirm with callers.
;-----------------------------------------------------------------------------
cglobal h264_biweight_16x16_sse2, 7, 7, 8
    BIWEIGHT_SSE2_SETUP
%ifdef ARCH_X86_64
    movsxd     r2, r2d              ; stride is an int: sign-extend before pointer math
%endif
    mov        r3d, 16              ; rows remaining

.nextrow:
    BIWEIGHT_SSE2_STEPA 0, 1, 0     ; m0 = sums for bytes 0..7 of the row
    BIWEIGHT_SSE2_STEPA 1, 2, 8     ; m1 = sums for bytes 8..15 of the row
    BIWEIGHT_SSE2_STEPB             ; m0 = 16 finished bytes
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
88
;-----------------------------------------------------------------------------
; h264_biweight_8x8_sse2 - same arguments as h264_biweight_16x16_sse2.
;
; Processes 8 rows of 8 pixels, two rows per loop iteration; each result row
; overwrites the row at dst.
; In:    r0 = dst, r1 = src, r2 = stride, r3..r6 = weighting parameters
;        (consumed by BIWEIGHT_SSE2_SETUP)
; Regs:  after setup, r3 is reused as the iteration counter and r4 holds
;        2*stride.
;-----------------------------------------------------------------------------
cglobal h264_biweight_8x8_sse2, 7, 7, 8
    BIWEIGHT_SSE2_SETUP
%ifdef ARCH_X86_64
    movsxd     r2, r2d              ; stride is an int: sign-extend before pointer math
%endif
    mov        r3d, 4               ; 4 iterations of 2 rows each
    lea        r4, [r2+r2]          ; r4 = 2*stride

.nextrow:
    BIWEIGHT_SSE2_STEPA 0, 1, 0     ; m0 = sums for the current row
    BIWEIGHT_SSE2_STEPA 1, 2, r2    ; m1 = sums for the following row
    BIWEIGHT_SSE2_STEPB             ; m0 = current row (low 8 bytes), next row (high 8)
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
105
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_SETUP
; Builds the loop-invariant constants used by the SSSE3 biweight kernels.
;
; In:    r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
;        (argument order of the biweight prototype documented in this file)
; Out:   m4 = byte pair (low byte of weightd, low byte of weights),
;             replicated to all 8 words; multiplier operand for pmaddubsw
;        m5 = low 16 bits of ((((offset + 1) | 1) << (log2_denom + 1)) >> 1),
;             replicated to all 8 words (rounding/offset term)
;        m6 = log2_denom + 1 (shift count for pslld/psraw)
; Clobb: r3, r6, m0, flags
;
; All integer arguments are accessed through their 32-bit names. With the
; full-width names, movd transfers 64 bits on x86-64; the upper half of an
; int argument is undefined there, and the vector shifts read the whole low
; qword of m6 as their count.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_SETUP 0
    add        r6d, 1
    or         r6d, 1               ; r6d = (offset + 1) | 1
    add        r3d, 1               ; r3d = log2_denom + 1
    movd       m4, r4d
    movd       m0, r5d
    movd       m5, r6d
    movd       m6, r3d              ; shift count, upper bits guaranteed zero
    pslld      m5, m6
    psrld      m5, 1
    punpcklbw  m4, m0               ; word 0 = weightd byte | weights byte << 8
    pshuflw    m4, m4, 0            ; replicate word 0 across the low qword
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4               ; copy the low qword into the high qword
    punpcklqdq m5, m5
%endmacro
122
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
; Weights, rounds, shifts and packs two vectors of interleaved pixel bytes.
; pmaddubsw multiplies each unsigned byte of m0/m2 by the matching signed
; byte of m4 and adds adjacent products into one saturated word, so each
; byte pair yields a single weighted sum.
;
; In:    m0, m2 = interleaved byte pairs; m4 = multiplier byte pairs;
;        m5 = per-word addend; m6 = shift count
; Out:   m0 = packed bytes (m0 results in the low half, m2 results in the
;             high half), unsigned-saturated
; Clobb: m2
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro
132
;-----------------------------------------------------------------------------
; h264_biweight_16x16_ssse3 - same arguments as h264_biweight_16x16_sse2.
;
; Processes 16 rows of 16 pixels; each result row overwrites the row at dst.
; In:    r0 = dst, r1 = src, r2 = stride, r3..r6 = weighting parameters
;        (consumed by BIWEIGHT_SSSE3_SETUP)
; Regs:  r3 is reused as the row counter once the setup macro has run.
; Note:  "punpcklbw m0, [r1]" takes a full 16-byte memory operand, which a
;        non-VEX SSE instruction requires to be 16-byte aligned; the row is
;        also stored with mova (x86inc's aligned move). Both pointers are
;        presumably 16-byte aligned on every row - confirm with callers.
;-----------------------------------------------------------------------------
cglobal h264_biweight_16x16_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
%ifdef ARCH_X86_64
    movsxd     r2, r2d              ; stride is an int: sign-extend before pointer math
%endif
    mov        r3d, 16              ; rows remaining

.nextrow:
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]             ; interleave bytes 0..7 of both rows
    punpcklbw  m2, m3               ; interleave bytes 8..15 of both rows
    BIWEIGHT_SSSE3_OP               ; m0 = 16 finished bytes
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
150
;-----------------------------------------------------------------------------
; h264_biweight_8x8_ssse3 - same arguments as h264_biweight_16x16_sse2.
;
; Processes 8 rows of 8 pixels, two rows per loop iteration; each result row
; overwrites the row at dst.
; In:    r0 = dst, r1 = src, r2 = stride, r3..r6 = weighting parameters
;        (consumed by BIWEIGHT_SSSE3_SETUP)
; Regs:  after setup, r3 is reused as the iteration counter and r4 holds
;        2*stride.
;-----------------------------------------------------------------------------
cglobal h264_biweight_8x8_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
%ifdef ARCH_X86_64
    movsxd     r2, r2d              ; stride is an int: sign-extend before pointer math
%endif
    mov        r3d, 4               ; 4 iterations of 2 rows each
    lea        r4, [r2+r2]          ; r4 = 2*stride

.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1               ; interleave the current row of both buffers
    punpcklbw  m2, m3               ; interleave the following row of both buffers
    BIWEIGHT_SSSE3_OP               ; m0 = current row (low 8 bytes), next row (high 8)
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET