comparison x86/h264_weight.asm @ 12451:4c3e6ff1237e libavcodec

Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square biweight code to sse2/ssse3; add sse2 weight code; and use that same code to create mmx2 functions also, so that the inline asm in h264dsp_mmx.c can be removed. OK'ed by Jason on IRC.
author rbultje
date Wed, 01 Sep 2010 20:56:16 +0000
parents
children 2c28e7d75e9c
comparison
equal deleted inserted replaced
12450:3941687b4fa9 12451:4c3e6ff1237e
1 ;*****************************************************************************
2 ;* MMX2/SSE2/SSSE3-optimized weighted prediction code
3 ;*****************************************************************************
4 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6 ;*
7 ;* This file is part of FFmpeg.
8 ;*
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
13 ;*
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
18 ;*
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
23
24 %include "x86inc.asm"
25
26 SECTION .text
27
28 ;-----------------------------------------------------------------------------
29 ; biweight pred:
30 ;
31 ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
32 ; int log2_denom, int weightd, int weights,
33 ; int offset);
34 ; and
35 ; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
36 ; int log2_denom, int weight,
37 ; int offset);
38 ;-----------------------------------------------------------------------------
39
; WEIGHT_SETUP
; Builds the per-call constants consumed by WEIGHT_OP.
; Inputs: r2, r3, r4. Going by the h264_weight prototype in the header
; comment these would be log2_denom, weight and offset, assuming x86inc maps
; rN to the Nth argument -- TODO confirm against x86inc.
; Outputs:
;   m3 = low 16 bits of r3 replicated into every word lane
;   m5 = low 16 bits of (((2*r4 + 1) << r2) >> 1) replicated into every lane
;   m6 = r2, kept as the shift count for the later psraw
;   m7 = 0
; Clobbers: r4.
40 %macro WEIGHT_SETUP 0
41 add r4, r4
42 inc r4 ; r4 = 2*r4 + 1
43 movd m3, r3
44 movd m5, r4
45 movd m6, r2
46 pslld m5, m6
47 psrld m5, 1 ; m5 = ((2*r4 + 1) << r2) >> 1
48 %if mmsize == 16
49 pshuflw m3, m3, 0 ; replicate word 0 across the low four words ...
50 pshuflw m5, m5, 0
51 punpcklqdq m3, m3 ; ... then copy the low qword into the high qword
52 punpcklqdq m5, m5
53 %else
54 pshufw m3, m3, 0 ; replicate word 0 across all four words
55 pshufw m5, m5, 0
56 %endif
57 pxor m7, m7 ; zero register used by WEIGHT_OP for byte->word unpacking
58 %endmacro
59
; WEIGHT_OP off0, off1
; Weights two groups of pixels addressed relative to r0.
;   %1 = offset (immediate or register) of the first group
;   %2 = offset (immediate or register) of the second group
; Each group is loaded with movh, widened from bytes to words against the
; zero in m7, multiplied by m3, offset by m5 with signed saturation and
; shifted right arithmetically by m6. Both results are then packed back to
; bytes with unsigned saturation into m0 (group %1 in the low half, group %2
; in the high half).
; Expects m3, m5, m6 and m7 as left by WEIGHT_SETUP. Clobbers m0 and m1.
60 %macro WEIGHT_OP 2
61 movh m0, [r0+%1]
62 movh m1, [r0+%2]
63 punpcklbw m0, m7 ; bytes -> words (m7 is zero)
64 punpcklbw m1, m7
65 pmullw m0, m3 ; pixel * weight, low 16 bits kept
66 pmullw m1, m3
67 paddsw m0, m5 ; add the pre-shifted offset/rounding term, signed-saturating
68 paddsw m1, m5
69 psraw m0, m6
70 psraw m1, m6
71 packuswb m0, m1 ; clamp to 0..255 and merge both groups into m0
72 %endmacro
73
; WEIGHT_FUNC_DBL_MM height
; Emits h264_weight_16x<height>_mmx2.
;   %1 = number of rows to process
; Every row is handled as two WEIGHT_OP/store pairs: offsets 0 and 4 are
; stored at [r0], offsets 8 and 12 are stored at [r0+8].
; Only the height-16 instance contains the row loop; any other height runs
; the setup, loads its own row count into r2 and jumps into that loop.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
74 %macro WEIGHT_FUNC_DBL_MM 1
75 cglobal h264_weight_16x%1_mmx2, 5, 5, 0
76 WEIGHT_SETUP
77 mov r2, %1 ; row counter; WEIGHT_SETUP already copied the incoming r2 into m6
78 %if %1 == 16
79 .nextrow
80 WEIGHT_OP 0, 4
81 mova [r0 ], m0
82 WEIGHT_OP 8, 12
83 mova [r0+8], m0
84 add r0, r1 ; step r0 to the next row by r1
85 dec r2
86 jnz .nextrow
87 REP_RET
88 %else
89 jmp _ff_h264_weight_16x16_mmx2.nextrow
90 %endif
91 %endmacro
92
93 INIT_MMX
94 WEIGHT_FUNC_DBL_MM 16
95 WEIGHT_FUNC_DBL_MM 8
96
; WEIGHT_FUNC_MM width, height, xmm_regs, suffix
; Emits h264_weight_<width>x<height>_<suffix>, processing one full row per
; loop iteration with a single WEIGHT_OP (offsets 0 and mmsize/2).
;   %1 = block width, used in the function name and the jump target
;   %2 = number of rows to process
;   %3 = last cglobal operand (0 for the MMX instances, 8 for the SSE2 ones)
;   %4 = function-name suffix (mmx2 / sse2)
; The weight functions take five arguments (see the prototype in the header
; comment) and use only r0-r4, so cglobal is given 5, 5 like the other
; h264_weight_* macros in this file.
; Only the height-16 instance contains the row loop; any other height runs
; the setup, loads its own row count into r2 and jumps into that loop.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
97 %macro WEIGHT_FUNC_MM 4
98 cglobal h264_weight_%1x%2_%4, 5, 5, %3
99 WEIGHT_SETUP
100 mov r2, %2 ; row counter; WEIGHT_SETUP already copied the incoming r2 into m6
101 %if %2 == 16
102 .nextrow
103 WEIGHT_OP 0, mmsize/2
104 mova [r0], m0
105 add r0, r1 ; step r0 to the next row by r1
106 dec r2
107 jnz .nextrow
108 REP_RET
109 %else
110 jmp _ff_h264_weight_%1x16_%4.nextrow
111 %endif
112 %endmacro
113
114 INIT_MMX
115 WEIGHT_FUNC_MM 8, 16, 0, mmx2
116 WEIGHT_FUNC_MM 8, 8, 0, mmx2
117 WEIGHT_FUNC_MM 8, 4, 0, mmx2
118 INIT_XMM
119 WEIGHT_FUNC_MM 16, 16, 8, sse2
120 WEIGHT_FUNC_MM 16, 8, 8, sse2
121
; WEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, suffix
; Emits h264_weight_<width>x<height>_<suffix> for blocks whose row is half a
; register wide, so two rows are weighted per loop iteration.
;   %1 = block width, used in the function name and the jump target
;   %2 = number of rows; the loop counter is %2/2
;   %3 = height of the instance that owns the loop (jump target)
;   %4 = last cglobal operand
;   %5 = function-name suffix (mmx2 / sse2)
; The instance whose height equals mmsize contains the loop; the others run
; the setup, load their own counter into r2 and jump into it.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
122 %macro WEIGHT_FUNC_HALF_MM 5
123 cglobal h264_weight_%1x%2_%5, 5, 5, %4
124 WEIGHT_SETUP
125 mov r2, %2/2 ; iteration count: two rows per pass
126 lea r3, [r1*2] ; r3 = 2*r1, the per-iteration pointer step (r3 is free after WEIGHT_SETUP)
127 %if %2 == mmsize
128 .nextrow
129 WEIGHT_OP 0, r1 ; row at r0 -> low half of m0, row at r0+r1 -> high half
130 movh [r0], m0
131 %if mmsize == 16
132 movhps [r0+r1], m0 ; store the high 8 bytes to the second row
133 %else
134 psrlq m0, 32 ; bring the upper 32 bits down for the second row
135 movh [r0+r1], m0
136 %endif
137 add r0, r3
138 dec r2
139 jnz .nextrow
140 REP_RET
141 %else
142 jmp _ff_h264_weight_%1x%3_%5.nextrow
143 %endif
144 %endmacro
145
146 INIT_MMX
147 WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
148 WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
149 WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
150 INIT_XMM
151 WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
152 WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
153 WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
154
; BIWEIGHT_SETUP
; Builds the per-call constants consumed by BIWEIGHT_STEPA / BIWEIGHT_STEPB.
; Inputs: r3, r4, r5, r6. Going by the h264_biweight prototype in the header
; comment these would be log2_denom, weightd, weights and offset, assuming
; x86inc maps rN to the Nth argument -- TODO confirm against x86inc.
; Outputs:
;   m3 = low 16 bits of r4 replicated into every word lane
;   m4 = low 16 bits of r5 replicated into every word lane
;   m5 = low 16 bits of ((((r6 + 1) | 1) << (r3 + 1)) >> 1) in every lane
;   m6 = r3 + 1, kept as the shift count for the later psraw
;   m7 = 0
; Clobbers: r3, r6.
155 %macro BIWEIGHT_SETUP 0
156 add r6, 1
157 or r6, 1 ; r6 = (r6 + 1) | 1
158 add r3, 1 ; shift count becomes r3 + 1
159 movd m3, r4
160 movd m4, r5
161 movd m5, r6
162 movd m6, r3
163 pslld m5, m6
164 psrld m5, 1 ; m5 = (r6 << r3) >> 1 using the adjusted r6 and r3
165 %if mmsize == 16
166 pshuflw m3, m3, 0 ; replicate word 0 across the low four words ...
167 pshuflw m4, m4, 0
168 pshuflw m5, m5, 0
169 punpcklqdq m3, m3 ; ... then copy the low qword into the high qword
170 punpcklqdq m4, m4
171 punpcklqdq m5, m5
172 %else
173 pshufw m3, m3, 0 ; replicate word 0 across all four words
174 pshufw m4, m4, 0
175 pshufw m5, m5, 0
176 %endif
177 pxor m7, m7 ; zero register used by BIWEIGHT_STEPA for byte->word unpacking
178 %endmacro
179
; BIWEIGHT_STEPA acc, tmp, off
; Computes the weighted sum of one group of pixels from both pointers.
;   %1 = index of the register that receives the sum
;   %2 = index of a scratch register (clobbered)
;   %3 = offset (immediate or register) applied to both r0 and r1
; Result: m%1 = widen([r0+%3]) * m3 + widen([r1+%3]) * m4, added with signed
; saturation. Expects m3, m4 and m7 as left by BIWEIGHT_SETUP.
180 %macro BIWEIGHT_STEPA 3
181 movh m%1, [r0+%3]
182 movh m%2, [r1+%3]
183 punpcklbw m%1, m7 ; bytes -> words (m7 is zero)
184 punpcklbw m%2, m7
185 pmullw m%1, m3
186 pmullw m%2, m4
187 paddsw m%1, m%2
188 %endmacro
189
; BIWEIGHT_STEPB
; Finishes two weighted sums held in m0 and m1: adds m5 with signed
; saturation, shifts right arithmetically by m6 and packs both registers to
; bytes with unsigned saturation into m0 (m0's words in the low half, m1's
; words in the high half). Expects m5 and m6 as left by BIWEIGHT_SETUP.
190 %macro BIWEIGHT_STEPB 0
191 paddsw m0, m5
192 paddsw m1, m5
193 psraw m0, m6
194 psraw m1, m6
195 packuswb m0, m1 ; clamp to 0..255 and merge both halves into m0
196 %endmacro
197
; BIWEIGHT_FUNC_DBL_MM height
; Emits h264_biweight_16x<height>_mmx2.
;   %1 = number of rows to process
; Every row is handled as two STEPA/STEPA/STEPB/store groups: offsets 0 and 4
; are stored at [r0], offsets 8 and 12 are stored at [r0+8].
; Only the height-16 instance contains the row loop; any other height runs
; the setup, loads its own row count into r3 and jumps into that loop.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
198 %macro BIWEIGHT_FUNC_DBL_MM 1
199 cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
200 BIWEIGHT_SETUP
201 mov r3, %1 ; row counter; BIWEIGHT_SETUP already copied r3 into m6
202 %if %1 == 16
203 .nextrow
204 BIWEIGHT_STEPA 0, 1, 0 ; sum in m0, m1 is scratch
205 BIWEIGHT_STEPA 1, 2, 4 ; sum in m1, m2 is scratch
206 BIWEIGHT_STEPB
207 mova [r0], m0
208 BIWEIGHT_STEPA 0, 1, 8
209 BIWEIGHT_STEPA 1, 2, 12
210 BIWEIGHT_STEPB
211 mova [r0+8], m0
212 add r0, r2 ; step both pointers to the next row by r2
213 add r1, r2
214 dec r3
215 jnz .nextrow
216 REP_RET
217 %else
218 jmp _ff_h264_biweight_16x16_mmx2.nextrow
219 %endif
220 %endmacro
221
222 INIT_MMX
223 BIWEIGHT_FUNC_DBL_MM 16
224 BIWEIGHT_FUNC_DBL_MM 8
225
; BIWEIGHT_FUNC_MM width, height, xmm_regs, suffix
; Emits h264_biweight_<width>x<height>_<suffix>, processing one full row per
; loop iteration (offsets 0 and mmsize/2, stored together at [r0]).
;   %1 = block width, used in the function name and the jump target
;   %2 = number of rows to process
;   %3 = last cglobal operand (0 for the MMX instances, 8 for the SSE2 ones)
;   %4 = function-name suffix (mmx2 / sse2)
; Only the height-16 instance contains the row loop; any other height runs
; the setup, loads its own row count into r3 and jumps into that loop.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
226 %macro BIWEIGHT_FUNC_MM 4
227 cglobal h264_biweight_%1x%2_%4, 7, 7, %3
228 BIWEIGHT_SETUP
229 mov r3, %2 ; row counter; BIWEIGHT_SETUP already copied r3 into m6
230 %if %2 == 16
231 .nextrow
232 BIWEIGHT_STEPA 0, 1, 0 ; sum in m0, m1 is scratch
233 BIWEIGHT_STEPA 1, 2, mmsize/2 ; sum in m1, m2 is scratch
234 BIWEIGHT_STEPB
235 mova [r0], m0
236 add r0, r2 ; step both pointers to the next row by r2
237 add r1, r2
238 dec r3
239 jnz .nextrow
240 REP_RET
241 %else
242 jmp _ff_h264_biweight_%1x16_%4.nextrow
243 %endif
244 %endmacro
245
246 INIT_MMX
247 BIWEIGHT_FUNC_MM 8, 16, 0, mmx2
248 BIWEIGHT_FUNC_MM 8, 8, 0, mmx2
249 BIWEIGHT_FUNC_MM 8, 4, 0, mmx2
250 INIT_XMM
251 BIWEIGHT_FUNC_MM 16, 16, 8, sse2
252 BIWEIGHT_FUNC_MM 16, 8, 8, sse2
253
; BIWEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, suffix
; Emits h264_biweight_<width>x<height>_<suffix> for blocks whose row is half
; a register wide, so two rows are processed per loop iteration.
;   %1 = block width, used in the function name and the jump target
;   %2 = number of rows; the loop counter is %2/2
;   %3 = height of the instance that owns the loop (jump target)
;   %4 = last cglobal operand
;   %5 = function-name suffix (mmx2 / sse2)
; The instance whose height equals mmsize contains the loop; the others run
; the setup, load their own counter into r3 and jump into it.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
254 %macro BIWEIGHT_FUNC_HALF_MM 5
255 cglobal h264_biweight_%1x%2_%5, 7, 7, %4
256 BIWEIGHT_SETUP
257 mov r3, %2/2 ; iteration count: two rows per pass
258 lea r4, [r2*2] ; r4 = 2*r2, the per-iteration pointer step (r4 was already copied to m3)
259 %if %2 == mmsize
260 .nextrow
261 BIWEIGHT_STEPA 0, 1, 0 ; rows at r0/r1 -> m0
262 BIWEIGHT_STEPA 1, 2, r2 ; rows at r0+r2/r1+r2 -> m1
263 BIWEIGHT_STEPB
264 movh [r0], m0
265 %if mmsize == 16
266 movhps [r0+r2], m0 ; store the high 8 bytes to the second row
267 %else
268 psrlq m0, 32 ; bring the upper 32 bits down for the second row
269 movh [r0+r2], m0
270 %endif
271 add r0, r4
272 add r1, r4
273 dec r3
274 jnz .nextrow
275 REP_RET
276 %else
277 jmp _ff_h264_biweight_%1x%3_%5.nextrow
278 %endif
279 %endmacro
280
281 INIT_MMX
282 BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
283 BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
284 BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
285 INIT_XMM
286 BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
287 BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
288 BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
289
; BIWEIGHT_SSSE3_SETUP
; Builds the per-call constants consumed by BIWEIGHT_SSSE3_OP.
; Inputs: r3, r4, r5, r6. Going by the h264_biweight prototype in the header
; comment these would be log2_denom, weightd, weights and offset, assuming
; x86inc maps rN to the Nth argument -- TODO confirm against x86inc.
; Outputs:
;   m4 = byte pair (low byte of r4, low byte of r5) replicated into every
;        16-bit lane, the layout pmaddubsw needs for interleaved pixels
;   m5 = low 16 bits of ((((r6 + 1) | 1) << (r3 + 1)) >> 1) in every lane
;   m6 = r3 + 1, kept as the shift count for the later psraw
; Clobbers: r3, r6, m0.
; NOTE(review): only the low byte of each weight survives, and pmaddubsw
; treats those bytes as signed -- presumably callers select this version only
; when both weights fit in a signed byte; confirm at the call site.
290 %macro BIWEIGHT_SSSE3_SETUP 0
291 add r6, 1
292 or r6, 1 ; r6 = (r6 + 1) | 1
293 add r3, 1 ; shift count becomes r3 + 1
294 movd m4, r4
295 movd m0, r5
296 movd m5, r6
297 movd m6, r3
298 pslld m5, m6
299 psrld m5, 1 ; m5 = (r6 << r3) >> 1 using the adjusted r6 and r3
300 punpcklbw m4, m0 ; interleave: byte 0 from r4, byte 1 from r5
301 pshuflw m4, m4, 0 ; replicate word 0 across the low four words ...
302 pshuflw m5, m5, 0
303 punpcklqdq m4, m4 ; ... then copy the low qword into the high qword
304 punpcklqdq m5, m5
305 %endmacro
306
; BIWEIGHT_SSSE3_OP
; Weights the interleaved pixel pairs held in m0 and m2.
; pmaddubsw multiplies each unsigned byte of m0/m2 by the matching signed
; byte of m4 and adds adjacent products with signed saturation, giving one
; weighted sum per word. m5 is then added with signed saturation, the words
; are shifted right arithmetically by m6, and both registers are packed to
; bytes with unsigned saturation into m0 (m0's words in the low half, m2's in
; the high half). Expects m4, m5 and m6 as left by BIWEIGHT_SSSE3_SETUP.
; Clobbers m2.
307 %macro BIWEIGHT_SSSE3_OP 0
308 pmaddubsw m0, m4
309 pmaddubsw m2, m4
310 paddsw m0, m5
311 paddsw m2, m5
312 psraw m0, m6
313 psraw m2, m6
314 packuswb m0, m2 ; clamp to 0..255 and merge both halves into m0
315 %endmacro
316
; BIWEIGHT_SSSE3_16 height
; Emits h264_biweight_16x<height>_ssse3, one 16-byte row per iteration.
;   %1 = number of rows to process
; Bytes from r0 and r1 are interleaved (r0 byte first) so that
; BIWEIGHT_SSSE3_OP can weight each pair with a single pmaddubsw.
; Only the height-16 instance contains the row loop; any other height runs
; the setup, loads its own row count into r3 and jumps into that loop.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
317 %macro BIWEIGHT_SSSE3_16 1
318 cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
319 BIWEIGHT_SSSE3_SETUP
320 mov r3, %1 ; row counter; the setup already copied r3 into m6
321
322 %if %1 == 16
323 .nextrow
324 movh m0, [r0]
325 movh m2, [r0+8]
326 movh m3, [r1+8]
327 punpcklbw m0, [r1] ; NOTE(review): memory-operand form needs [r1] 16-byte aligned -- presumably guaranteed by callers
328 punpcklbw m2, m3
329 BIWEIGHT_SSSE3_OP
330 mova [r0], m0
331 add r0, r2 ; step both pointers to the next row by r2
332 add r1, r2
333 dec r3
334 jnz .nextrow
335 REP_RET
336 %else
337 jmp _ff_h264_biweight_16x16_ssse3.nextrow
338 %endif
339 %endmacro
340
341 INIT_XMM
342 BIWEIGHT_SSSE3_16 16
343 BIWEIGHT_SSSE3_16 8
344
; BIWEIGHT_SSSE3_8 height
; Emits h264_biweight_8x<height>_ssse3, two 8-byte rows per iteration.
;   %1 = number of rows; the loop counter is %1/2
; m0 holds the interleaved bytes of the rows at r0/r1 and m2 those of the
; rows at r0+r2/r1+r2. After BIWEIGHT_SSSE3_OP the low half of m0 is stored
; to the first row and the high half to the second.
; Only the height-16 instance contains the loop; any other height runs the
; setup, loads its own counter into r3 and jumps into that loop.
; NOTE(review): the jump target hard-codes the "_ff_" symbol prefix; verify
; that it matches the name cglobal actually emits on every target.
345 %macro BIWEIGHT_SSSE3_8 1
346 cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
347 BIWEIGHT_SSSE3_SETUP
348 mov r3, %1/2 ; iteration count: two rows per pass
349 lea r4, [r2*2] ; r4 = 2*r2, the per-iteration pointer step (r4 was already copied to m4)
350
351 %if %1 == 16
352 .nextrow
353 movh m0, [r0]
354 movh m1, [r1]
355 movh m2, [r0+r2]
356 movh m3, [r1+r2]
357 punpcklbw m0, m1 ; interleave first-row bytes from r0 and r1
358 punpcklbw m2, m3 ; interleave second-row bytes from r0+r2 and r1+r2
359 BIWEIGHT_SSSE3_OP
360 movh [r0], m0
361 movhps [r0+r2], m0 ; store the high 8 bytes to the second row
362 add r0, r4
363 add r1, r4
364 dec r3
365 jnz .nextrow
366 REP_RET
367 %else
368 jmp _ff_h264_biweight_8x16_ssse3.nextrow
369 %endif
370 %endmacro
371
372 INIT_XMM
373 BIWEIGHT_SSSE3_8 16
374 BIWEIGHT_SSSE3_8 8
375 BIWEIGHT_SSSE3_8 4