Mercurial > libavcodec.hg
comparison x86/h264_weight.asm @ 12451:4c3e6ff1237e libavcodec
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
biweight code to sse2/ssse3; add sse2 weight code; and use that same code to
create mmx2 functions also, so that the inline asm in h264dsp_mmx.c can be
removed. OK'ed by Jason on IRC.
author | rbultje |
---|---|
date | Wed, 01 Sep 2010 20:56:16 +0000 |
parents | |
children | 2c28e7d75e9c |
comparison
equal
deleted
inserted
replaced
12450:3941687b4fa9 | 12451:4c3e6ff1237e |
---|---|
1 ;***************************************************************************** | |
2 ;* SSE2-optimized weighted prediction code | |
3 ;***************************************************************************** | |
4 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt | |
5 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> | |
6 ;* | |
7 ;* This file is part of FFmpeg. | |
8 ;* | |
9 ;* FFmpeg is free software; you can redistribute it and/or | |
10 ;* modify it under the terms of the GNU Lesser General Public | |
11 ;* License as published by the Free Software Foundation; either | |
12 ;* version 2.1 of the License, or (at your option) any later version. | |
13 ;* | |
14 ;* FFmpeg is distributed in the hope that it will be useful, | |
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 ;* Lesser General Public License for more details. | |
18 ;* | |
19 ;* You should have received a copy of the GNU Lesser General Public | |
20 ;* License along with FFmpeg; if not, write to the Free Software | |
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 ;****************************************************************************** | |
23 | |
24 %include "x86inc.asm" | |
25 | |
26 SECTION .text | |
27 | |
28 ;----------------------------------------------------------------------------- | |
29 ; biweight pred: | |
30 ; | |
31 ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, | |
32 ; int log2_denom, int weightd, int weights, | |
33 ; int offset); | |
34 ; and | |
35 ; void h264_weight_16x16_sse2(uint8_t *dst, int stride, | |
36 ; int log2_denom, int weight, | |
37 ; int offset); | |
38 ;----------------------------------------------------------------------------- | |
39 | |
; WEIGHT_SETUP: load the constants used by the unidirectional weight loops.
; Going by the h264_weight prototype in the comment block above, on entry
; r2 = log2_denom, r3 = weight and r4 = offset.
; On exit:
;   m3 = low word of r3, replicated into every word lane
;   m5 = low word of ((2*r4 + 1) << r2) >> 1, replicated into every word lane
;   m6 = r2 (used as the shift count)
;   m7 = 0
; r4 is modified; r2 and r3 are free for reuse afterwards.
40 %macro WEIGHT_SETUP 0 | |
; r4 = 2*r4 + 1
41 add r4, r4 | |
42 inc r4 | |
43 movd m3, r3 | |
44 movd m5, r4 | |
45 movd m6, r2 | |
; m5 = (r4 << r2) >> 1, computed as a dword before it is narrowed to a word
46 pslld m5, m6 | |
47 psrld m5, 1 | |
48 %if mmsize == 16 | |
; 16-byte registers: splat word 0 over the low qword, then copy it to the high qword
49 pshuflw m3, m3, 0 | |
50 pshuflw m5, m5, 0 | |
51 punpcklqdq m3, m3 | |
52 punpcklqdq m5, m5 | |
53 %else | |
; 8-byte registers: a single word shuffle splats word 0 over all four lanes
54 pshufw m3, m3, 0 | |
55 pshufw m5, m5, 0 | |
56 %endif | |
; zero register used to widen bytes to words in WEIGHT_OP
57 pxor m7, m7 | |
58 %endmacro | |
59 | |
; WEIGHT_OP off_a, off_b
; Weights two groups of pixels located at [r0+off_a] and [r0+off_b].
; Expects the registers prepared by WEIGHT_SETUP (m3, m5, m6, m7).
; Per pixel: (pixel * m3 + m5) >> m6, then clamped to a byte by packuswb.
; Result: m0 holds the bytes from off_a in its low half and the bytes from
; off_b in its high half. Clobbers m1.
60 %macro WEIGHT_OP 2 | |
61 movh m0, [r0+%1] | |
62 movh m1, [r0+%2] | |
; widen bytes to words by interleaving with zero (m7)
63 punpcklbw m0, m7 | |
64 punpcklbw m1, m7 | |
65 pmullw m0, m3 | |
66 pmullw m1, m3 | |
; signed saturating add of the rounding/offset term
67 paddsw m0, m5 | |
68 paddsw m1, m5 | |
69 psraw m0, m6 | |
70 psraw m1, m6 | |
; narrow both word vectors back to bytes with unsigned saturation
71 packuswb m0, m1 | |
72 %endmacro | |
73 | |
; WEIGHT_FUNC_DBL_MM rows
; Emits h264_weight_16x<rows>_mmx2 (5 arguments, 5 general-purpose registers).
; r0 = dst, r1 = stride; r2 is reused as the row counter once WEIGHT_SETUP has
; copied its original value into m6.
; Each row is handled as two WEIGHT_OP calls followed by an 8-byte-offset pair
; of stores ([r0] and [r0+8]).
; Only the 16-row instantiation contains the loop; other heights do their own
; setup and then jump into that loop, so the 16-row variant must be
; instantiated as well.
74 %macro WEIGHT_FUNC_DBL_MM 1 | |
75 cglobal h264_weight_16x%1_mmx2, 5, 5, 0 | |
76 WEIGHT_SETUP | |
77 mov r2, %1 | |
78 %if %1 == 16 | |
79 .nextrow | |
80 WEIGHT_OP 0, 4 | |
81 mova [r0 ], m0 | |
82 WEIGHT_OP 8, 12 | |
83 mova [r0+8], m0 | |
84 add r0, r1 | |
85 dec r2 | |
86 jnz .nextrow | |
87 REP_RET | |
88 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
89 jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) | |
90 %endif | |
91 %endmacro | |
92 | |
93 INIT_MMX | |
94 WEIGHT_FUNC_DBL_MM 16 | |
95 WEIGHT_FUNC_DBL_MM 8 | |
96 | |
; WEIGHT_FUNC_MM width, rows, xmm_regs, cpu_suffix
; Emits h264_weight_<width>x<rows>_<cpu_suffix>, handling one full-register
; row per iteration (WEIGHT_OP on the two halves at 0 and mmsize/2, then one
; mova store).
; r0 = dst, r1 = stride; r2 is reused as the row counter once WEIGHT_SETUP has
; copied its original value into m6.
; The weight prototype has five arguments, so cglobal declares 5 args and
; 5 registers (matching the other weight macros); %3 is the number of vector
; registers the function uses (0 for the mmx2 instantiations, 8 for sse2).
; Only the 16-row instantiation contains the loop; other heights do their own
; setup and then jump into that loop, so the 16-row variant of each
; width/cpu pair must be instantiated as well.
97 %macro WEIGHT_FUNC_MM 4 | |
98 cglobal h264_weight_%1x%2_%4, 5, 5, %3 | |
99 WEIGHT_SETUP | |
100 mov r2, %2 | |
101 %if %2 == 16 | |
102 .nextrow | |
103 WEIGHT_OP 0, mmsize/2 | |
104 mova [r0], m0 | |
105 add r0, r1 | |
106 dec r2 | |
107 jnz .nextrow | |
108 REP_RET | |
109 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
110 jmp mangle(ff_h264_weight_%1x16_%4.nextrow) | |
111 %endif | |
112 %endmacro | |
113 | |
114 INIT_MMX | |
115 WEIGHT_FUNC_MM 8, 16, 0, mmx2 | |
116 WEIGHT_FUNC_MM 8, 8, 0, mmx2 | |
117 WEIGHT_FUNC_MM 8, 4, 0, mmx2 | |
118 INIT_XMM | |
119 WEIGHT_FUNC_MM 16, 16, 8, sse2 | |
120 WEIGHT_FUNC_MM 16, 8, 8, sse2 | |
121 | |
; WEIGHT_FUNC_HALF_MM width, rows, loop_rows, xmm_regs, cpu_suffix
; Emits h264_weight_<width>x<rows>_<cpu_suffix> for blocks whose row is only
; half a vector register wide, so two rows are processed per iteration.
; r0 = dst, r1 = stride, r2 = rows/2 (iteration counter), r3 = 2*stride.
; r2 and r3 are reused only after WEIGHT_SETUP has consumed their original
; values.
; The loop body is emitted only when rows == mmsize; every other height jumps
; into the loop of the <width>x<loop_rows> variant, which therefore must be
; instantiated as well.
122 %macro WEIGHT_FUNC_HALF_MM 5 | |
123 cglobal h264_weight_%1x%2_%5, 5, 5, %4 | |
124 WEIGHT_SETUP | |
125 mov r2, %2/2 | |
126 lea r3, [r1*2] | |
127 %if %2 == mmsize | |
128 .nextrow | |
; row n goes to the low half of m0, row n+1 to the high half
129 WEIGHT_OP 0, r1 | |
130 movh [r0], m0 | |
131 %if mmsize == 16 | |
132 movhps [r0+r1], m0 | |
133 %else | |
; 8-byte registers: shift the second row down before storing it
134 psrlq m0, 32 | |
135 movh [r0+r1], m0 | |
136 %endif | |
137 add r0, r3 | |
138 dec r2 | |
139 jnz .nextrow | |
140 REP_RET | |
141 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
142 jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) | |
143 %endif | |
144 %endmacro | |
145 | |
146 INIT_MMX | |
147 WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 | |
148 WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 | |
149 WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 | |
150 INIT_XMM | |
151 WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 | |
152 WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 | |
153 WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |
154 | |
; BIWEIGHT_SETUP: load the constants used by the bidirectional weight loops.
; Going by the h264_biweight prototype in the comment block above, on entry
; r3 = log2_denom, r4 = weightd, r5 = weights and r6 = offset.
; On exit:
;   m3 = low word of r4, replicated into every word lane
;   m4 = low word of r5, replicated into every word lane
;   m5 = low word of ((((r6 + 1) | 1) << (r3 + 1)) >> 1), replicated likewise
;   m6 = r3 + 1 (used as the shift count)
;   m7 = 0
; r3 and r6 are modified; r3..r6 are free for reuse afterwards.
155 %macro BIWEIGHT_SETUP 0 | |
; r6 = (r6 + 1) | 1
156 add r6, 1 | |
157 or r6, 1 | |
; the shift count is one more than the value passed in r3
158 add r3, 1 | |
159 movd m3, r4 | |
160 movd m4, r5 | |
161 movd m5, r6 | |
162 movd m6, r3 | |
; m5 = (r6 << r3) >> 1, computed as a dword before it is narrowed to a word
163 pslld m5, m6 | |
164 psrld m5, 1 | |
165 %if mmsize == 16 | |
; 16-byte registers: splat word 0 over the low qword, then copy it to the high qword
166 pshuflw m3, m3, 0 | |
167 pshuflw m4, m4, 0 | |
168 pshuflw m5, m5, 0 | |
169 punpcklqdq m3, m3 | |
170 punpcklqdq m4, m4 | |
171 punpcklqdq m5, m5 | |
172 %else | |
; 8-byte registers: a single word shuffle splats word 0 over all four lanes
173 pshufw m3, m3, 0 | |
174 pshufw m4, m4, 0 | |
175 pshufw m5, m5, 0 | |
176 %endif | |
; zero register used to widen bytes to words in BIWEIGHT_STEPA
177 pxor m7, m7 | |
178 %endmacro | |
179 | |
; BIWEIGHT_STEPA acc, tmp, off
; Computes the weighted sum of one group of pixels from both pointers:
;   m<acc> = [r0+off] * m3 + [r1+off] * m4   (words, signed saturating add)
; Expects m3, m4 and m7 as prepared by BIWEIGHT_SETUP. Clobbers m<tmp>.
180 %macro BIWEIGHT_STEPA 3 | |
181 movh m%1, [r0+%3] | |
182 movh m%2, [r1+%3] | |
; widen bytes to words by interleaving with zero (m7)
183 punpcklbw m%1, m7 | |
184 punpcklbw m%2, m7 | |
185 pmullw m%1, m3 | |
186 pmullw m%2, m4 | |
187 paddsw m%1, m%2 | |
188 %endmacro | |
189 | |
; BIWEIGHT_STEPB: finish two word vectors produced by BIWEIGHT_STEPA.
; Adds m5 to m0 and m1 (signed saturating), shifts both right arithmetically
; by m6, and packs them into m0 as bytes with unsigned saturation
; (m0 -> low half, m1 -> high half).
190 %macro BIWEIGHT_STEPB 0 | |
191 paddsw m0, m5 | |
192 paddsw m1, m5 | |
193 psraw m0, m6 | |
194 psraw m1, m6 | |
195 packuswb m0, m1 | |
196 %endmacro | |
197 | |
; BIWEIGHT_FUNC_DBL_MM rows
; Emits h264_biweight_16x<rows>_mmx2 (7 arguments, 7 general-purpose registers).
; r0 = dst, r1 = src, r2 = stride; r3 is reused as the row counter once
; BIWEIGHT_SETUP has copied its value into m6.
; Each row is handled as two STEPA/STEPA/STEPB groups, stored at [r0] and
; [r0+8]. In each group the first STEPA uses m1 as scratch and the second one
; then produces the final m1 using m2 as scratch.
; Only the 16-row instantiation contains the loop; other heights do their own
; setup and then jump into that loop, so the 16-row variant must be
; instantiated as well.
198 %macro BIWEIGHT_FUNC_DBL_MM 1 | |
199 cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 | |
200 BIWEIGHT_SETUP | |
201 mov r3, %1 | |
202 %if %1 == 16 | |
203 .nextrow | |
204 BIWEIGHT_STEPA 0, 1, 0 | |
205 BIWEIGHT_STEPA 1, 2, 4 | |
206 BIWEIGHT_STEPB | |
207 mova [r0], m0 | |
208 BIWEIGHT_STEPA 0, 1, 8 | |
209 BIWEIGHT_STEPA 1, 2, 12 | |
210 BIWEIGHT_STEPB | |
211 mova [r0+8], m0 | |
212 add r0, r2 | |
213 add r1, r2 | |
214 dec r3 | |
215 jnz .nextrow | |
216 REP_RET | |
217 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
218 jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) | |
219 %endif | |
220 %endmacro | |
221 | |
222 INIT_MMX | |
223 BIWEIGHT_FUNC_DBL_MM 16 | |
224 BIWEIGHT_FUNC_DBL_MM 8 | |
225 | |
; BIWEIGHT_FUNC_MM width, rows, xmm_regs, cpu_suffix
; Emits h264_biweight_<width>x<rows>_<cpu_suffix>, handling one full-register
; row per iteration (two BIWEIGHT_STEPA halves at 0 and mmsize/2, one
; BIWEIGHT_STEPB, one mova store).
; r0 = dst, r1 = src, r2 = stride; r3 is reused as the row counter once
; BIWEIGHT_SETUP has copied its value into m6.
; Only the 16-row instantiation contains the loop; other heights do their own
; setup and then jump into that loop, so the 16-row variant of each
; width/cpu pair must be instantiated as well.
226 %macro BIWEIGHT_FUNC_MM 4 | |
227 cglobal h264_biweight_%1x%2_%4, 7, 7, %3 | |
228 BIWEIGHT_SETUP | |
229 mov r3, %2 | |
230 %if %2 == 16 | |
231 .nextrow | |
232 BIWEIGHT_STEPA 0, 1, 0 | |
233 BIWEIGHT_STEPA 1, 2, mmsize/2 | |
234 BIWEIGHT_STEPB | |
235 mova [r0], m0 | |
236 add r0, r2 | |
237 add r1, r2 | |
238 dec r3 | |
239 jnz .nextrow | |
240 REP_RET | |
241 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
242 jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) | |
243 %endif | |
244 %endmacro | |
245 | |
246 INIT_MMX | |
247 BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 | |
248 BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 | |
249 BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 | |
250 INIT_XMM | |
251 BIWEIGHT_FUNC_MM 16, 16, 8, sse2 | |
252 BIWEIGHT_FUNC_MM 16, 8, 8, sse2 | |
253 | |
; BIWEIGHT_FUNC_HALF_MM width, rows, loop_rows, xmm_regs, cpu_suffix
; Emits h264_biweight_<width>x<rows>_<cpu_suffix> for blocks whose row is only
; half a vector register wide, so two rows are processed per iteration.
; r0 = dst, r1 = src, r2 = stride, r3 = rows/2 (iteration counter),
; r4 = 2*stride. r3 and r4 are reused only after BIWEIGHT_SETUP has consumed
; their original values.
; The loop body is emitted only when rows == mmsize; every other height jumps
; into the loop of the <width>x<loop_rows> variant, which therefore must be
; instantiated as well.
254 %macro BIWEIGHT_FUNC_HALF_MM 5 | |
255 cglobal h264_biweight_%1x%2_%5, 7, 7, %4 | |
256 BIWEIGHT_SETUP | |
257 mov r3, %2/2 | |
258 lea r4, [r2*2] | |
259 %if %2 == mmsize | |
260 .nextrow | |
; row n ends up in the low half of m0, row n+1 in the high half
261 BIWEIGHT_STEPA 0, 1, 0 | |
262 BIWEIGHT_STEPA 1, 2, r2 | |
263 BIWEIGHT_STEPB | |
264 movh [r0], m0 | |
265 %if mmsize == 16 | |
266 movhps [r0+r2], m0 | |
267 %else | |
; 8-byte registers: shift the second row down before storing it
268 psrlq m0, 32 | |
269 movh [r0+r2], m0 | |
270 %endif | |
271 add r0, r4 | |
272 add r1, r4 | |
273 dec r3 | |
274 jnz .nextrow | |
275 REP_RET | |
276 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
277 jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) | |
278 %endif | |
279 %endmacro | |
280 | |
281 INIT_MMX | |
282 BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 | |
283 BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 | |
284 BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 | |
285 INIT_XMM | |
286 BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 | |
287 BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 | |
288 BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |
289 | |
; BIWEIGHT_SSSE3_SETUP: constants for the pmaddubsw-based biweight loops.
; Going by the h264_biweight prototype in the comment block above, on entry
; r3 = log2_denom, r4 = weightd, r5 = weights and r6 = offset.
; On exit:
;   m4 = byte pair (low byte of r4, low byte of r5) replicated into every word
;   m5 = low word of ((((r6 + 1) | 1) << (r3 + 1)) >> 1), in every word lane
;   m6 = r3 + 1 (used as the shift count)
; Clobbers m0; modifies r3 and r6. Unlike BIWEIGHT_SETUP, m7 is not zeroed.
290 %macro BIWEIGHT_SSSE3_SETUP 0 | |
; r6 = (r6 + 1) | 1
291 add r6, 1 | |
292 or r6, 1 | |
293 add r3, 1 | |
294 movd m4, r4 | |
295 movd m0, r5 | |
296 movd m5, r6 | |
297 movd m6, r3 | |
; m5 = (r6 << r3) >> 1, computed as a dword before it is narrowed to a word
298 pslld m5, m6 | |
299 psrld m5, 1 | |
; interleave the two weights so that word 0 of m4 holds both as adjacent bytes
300 punpcklbw m4, m0 | |
; splat word 0 over the low qword, then copy it to the high qword
301 pshuflw m4, m4, 0 | |
302 pshuflw m5, m5, 0 | |
303 punpcklqdq m4, m4 | |
304 punpcklqdq m5, m5 | |
305 %endmacro | |
306 | |
; BIWEIGHT_SSSE3_OP: weight two vectors of byte-interleaved pixel pairs.
; Expects m0 and m2 to hold interleaved bytes from the two sources, and
; m4/m5/m6 as prepared by BIWEIGHT_SSSE3_SETUP.
; pmaddubsw multiplies each pixel byte by the matching weight byte in m4 and
; sums each adjacent pair into one word; m5 is then added (signed saturating),
; the result is shifted right arithmetically by m6, and both vectors are packed
; into m0 as bytes with unsigned saturation (m0 -> low half, m2 -> high half).
307 %macro BIWEIGHT_SSSE3_OP 0 | |
308 pmaddubsw m0, m4 | |
309 pmaddubsw m2, m4 | |
310 paddsw m0, m5 | |
311 paddsw m2, m5 | |
312 psraw m0, m6 | |
313 psraw m2, m6 | |
314 packuswb m0, m2 | |
315 %endmacro | |
316 | |
; BIWEIGHT_SSSE3_16 rows
; Emits h264_biweight_16x<rows>_ssse3 (7 arguments, 7 general-purpose
; registers, 8 vector registers declared).
; r0 = dst, r1 = src, r2 = stride; r3 is reused as the row counter once
; BIWEIGHT_SSSE3_SETUP has copied its value into m6.
; Per row, the low and high 8 bytes of dst and src are byte-interleaved into
; m0 and m2, weighted by BIWEIGHT_SSSE3_OP, and stored back as one 16-byte row.
; Only the 16-row instantiation contains the loop; other heights do their own
; setup and then jump into that loop, so the 16-row variant must be
; instantiated as well.
317 %macro BIWEIGHT_SSSE3_16 1 | |
318 cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 | |
319 BIWEIGHT_SSSE3_SETUP | |
320 mov r3, %1 | |
321 | |
322 %if %1 == 16 | |
323 .nextrow | |
324 movh m0, [r0] | |
325 movh m2, [r0+8] | |
326 movh m3, [r1+8] | |
; NOTE: the memory-operand form reads a full 16 bytes from [r1], which the
; legacy SSE encoding requires to be 16-byte aligned.
327 punpcklbw m0, [r1] | |
328 punpcklbw m2, m3 | |
329 BIWEIGHT_SSSE3_OP | |
330 mova [r0], m0 | |
331 add r0, r2 | |
332 add r1, r2 | |
333 dec r3 | |
334 jnz .nextrow | |
335 REP_RET | |
336 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
337 jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) | |
338 %endif | |
339 %endmacro | |
340 | |
341 INIT_XMM | |
342 BIWEIGHT_SSSE3_16 16 | |
343 BIWEIGHT_SSSE3_16 8 | |
344 | |
; BIWEIGHT_SSSE3_8 rows
; Emits h264_biweight_8x<rows>_ssse3 (7 arguments, 7 general-purpose
; registers, 8 vector registers declared). Two 8-pixel rows are processed per
; iteration.
; r0 = dst, r1 = src, r2 = stride, r3 = rows/2 (iteration counter),
; r4 = 2*stride. r3 and r4 are reused only after BIWEIGHT_SSSE3_SETUP has
; consumed their original values.
; Only the 16-row instantiation contains the loop; other heights do their own
; setup and then jump into that loop, so the 16-row variant must be
; instantiated as well.
345 %macro BIWEIGHT_SSSE3_8 1 | |
346 cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 | |
347 BIWEIGHT_SSSE3_SETUP | |
348 mov r3, %1/2 | |
349 lea r4, [r2*2] | |
350 | |
351 %if %1 == 16 | |
352 .nextrow | |
; m0 = row n (dst/src interleaved), m2 = row n+1 (dst/src interleaved)
353 movh m0, [r0] | |
354 movh m1, [r1] | |
355 movh m2, [r0+r2] | |
356 movh m3, [r1+r2] | |
357 punpcklbw m0, m1 | |
358 punpcklbw m2, m3 | |
359 BIWEIGHT_SSSE3_OP | |
; low half of m0 is row n, high half is row n+1
360 movh [r0], m0 | |
361 movhps [r0+r2], m0 | |
362 add r0, r4 | |
363 add r1, r4 | |
364 dec r3 | |
365 jnz .nextrow | |
366 REP_RET | |
367 %else | |
; Use mangle() (assumed to be provided by x86inc.asm) so the target carries the
; same symbol prefix that cglobal generated, instead of a hard-coded leading
; underscore that only exists on some object formats.
368 jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) | |
369 %endif | |
370 %endmacro | |
371 | |
372 INIT_XMM | |
373 BIWEIGHT_SSSE3_8 16 | |
374 BIWEIGHT_SSSE3_8 8 | |
375 BIWEIGHT_SSSE3_8 4 | |