Mercurial > libavcodec.hg
annotate x86/dsputil_yasm.asm @ 10644:5da7180afadf libavcodec
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author | lorenm |
---|---|
date | Sat, 05 Dec 2009 15:09:10 +0000 |
parents | 66242b8fbd32 |
children | bb14c1586891 |
rev | line source |
---|---|
8430 | 1 ;****************************************************************************** |
2 ;* MMX optimized DSP utils | |
3 ;* Copyright (c) 2008 Loren Merritt | |
4 ;* | |
5 ;* This file is part of FFmpeg. | |
6 ;* | |
7 ;* FFmpeg is free software; you can redistribute it and/or | |
8 ;* modify it under the terms of the GNU Lesser General Public | |
9 ;* License as published by the Free Software Foundation; either | |
10 ;* version 2.1 of the License, or (at your option) any later version. | |
11 ;* | |
12 ;* FFmpeg is distributed in the hope that it will be useful, | |
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 ;* Lesser General Public License for more details. | |
16 ;* | |
17 ;* You should have received a copy of the GNU Lesser General Public | |
18 ;* License along with FFmpeg; if not, write to the Free Software | |
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 ;****************************************************************************** | |
21 | |
22 %include "x86inc.asm" | |
23 | |
; Read-only data: pshufb control masks and splat constants for the
; add_hfyu_left_prediction routines below.  In pshufb, a -1 control byte
; writes zero, so the "zz" parts of the names mean "zero this lane".
SECTION_RODATA
pb_f:                times 16 db 15   ; splat lane 15 (broadcast last byte, xmm)
pb_zzzzzzzz77777777: times 8  db -1   ; low half zeros; the high half of a
                                      ; 16-byte load here is the adjacent
                                      ; pb_7 data (7,7,...) just below
pb_7:                times 8  db 7    ; splat lane 7 (broadcast last byte, mm)
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16
32 | |
; PSWAPD_SSE dst, src: swap the two dwords of an MMX register using the
; SSE1 pshufw instruction (imm 0x4e = 01,00,11,10 -> words 2,3,0,1).
%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
%endmacro
; PSWAPD_3DN1 dst, src: dword swap for 3DNow! CPUs that lack the pswapd
; instruction (it was added in the 3DNow! extensions).
; NOTE: dst must be a different register than src.
%macro PSWAPD_3DN1 2
    movq      %1, %2
    psrlq     %1, 32     ; dst low dword  = src high dword
    punpckldq %1, %2     ; dst high dword = src low dword
%endmacro
41 | |
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
; Convert 6 planar float channels to interleaved int16.  Each iteration
; consumes 2 samples per channel and emits 12 interleaved int16s
; (24 bytes); assumes len is even -- TODO confirm caller contract.
; cvtps2pi/pswapd are %define'd per ISA at the instantiation site below.
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m          ; x86_32: keep len in its stack slot
%endif
    ; Load the 6 channel pointers, then turn channels 1-5 into offsets
    ; relative to channel 0, so only srcq needs incrementing in the loop.
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    ; 2 floats -> 2 int32 per channel
    cvtps2pi mm0, [srcq]
    cvtps2pi mm1, [srcq+src1q]
    cvtps2pi mm2, [srcq+src2q]
    cvtps2pi mm3, [srcq+src3q]
    cvtps2pi mm4, [srcq+src4q]
    cvtps2pi mm5, [srcq+src5q]
    ; saturating pack to int16: mm0={c0,c3} mm1={c1,c4} mm2={c2,c5}
    packssdw mm0, mm3
    packssdw mm1, mm4
    packssdw mm2, mm5
    ; word/dword shuffle transposes 6x2 samples into interleaved order
    pswapd    mm3, mm0
    punpcklwd mm0, mm1
    punpckhwd mm1, mm2
    punpcklwd mm2, mm3
    pswapd    mm3, mm0
    punpckldq mm0, mm2
    punpckhdq mm2, mm1
    punpckldq mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8                     ; 2 floats per channel consumed
    add dstq, 24                    ; 12 int16s written
    sub lend, 2
    jg .loop
    emms                            ; MMX regs used: restore x87 state
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
90 | |
; Instantiate the interleave routine for each ISA:
;   sse:   real cvtps2pi + pshufw-based dword swap
;   3dnow: pf2id + shift/unpack swap (no pswapd instruction available)
;   3dn2:  pf2id + the native pswapd instruction (3DNow! extensions)
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
99 | |
8760 | 100 |
101 | |
; Emits the mmx2 (INIT_MMX) or sse2 (INIT_XMM) versions of the two
; int16 scalar-product kernels used by the APE decoder.
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
; Returns (sum of v1[i]*v2[i]) >> shift.  order is in int16 elements;
; the loop consumes 2*mmsize bytes per pass, so order*2 is assumed to be
; a multiple of 2*mmsize -- TODO confirm caller contract.
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1                   ; element count -> byte count
    add v1q, orderq                 ; point both vectors past their end and
    add v2q, orderq                 ; index back with negative orderq, so the
    neg orderq                      ; loop counter doubles as the offset
    movd m3, shiftm
    pxor m2, m2                     ; m2 = packed dword accumulators
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]      ; pairwise 16x16->32 multiply-add
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
; horizontal sum of the dword partials; the arithmetic shift is applied
; to the partials before the last fold (this is the defined behavior)
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    psrad m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad m2, m3
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
; Fused kernel: returns sum of v1[i]*v2[i] while also updating
; v1[i] += mul * v3[i] (16-bit wraparound arithmetic).
; v1 must be mmsize-aligned (mova); v2/v3 may be unaligned (movu).
cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
    shl orderq, 1                   ; element count -> byte count
    movd m7, mulm                   ; broadcast mul to every 16-bit lane
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6                     ; m6 = dot-product accumulator
    add v1q, orderq                 ; same end-relative negative indexing
    add v2q, orderq                 ; as scalarproduct_int16 above
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4                  ; v1*v2 partial products
    pmaddwd m1, m5
    pmullw m2, m7                   ; mul*v3 (low 16 bits)
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4                    ; v1 += mul*v3
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
; horizontal sum of the dword partials (no shift in this variant)
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro
178 | |
; mmx2 version works on 8-byte mm regs, sse2 on 16-byte xmm regs.
INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
183 | |
; SCALARPRODUCT_LOOP misalign
; One loop body of scalarproduct_and_madd_int16_ssse3, specialized for a
; given byte misalignment %1 (even, 0..14) of v2/v3 within a 16-byte line.
; Instead of unaligned loads, aligned loads + palignr reconstruct the
; misaligned data, avoiding cacheline-split penalties.  m4/m5 carry the
; previously loaded aligned chunk of v2/v3 across iterations; m7 holds
; the broadcast mul, m6 the dot-product accumulator.  The loop walks
; orderq (byte count) downward to 0, then jumps to the shared .end.
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    ; rebuild the two misaligned 16-byte chunks from three aligned ones
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
    pmaddwd m0, [v1q + orderq]          ; v1*v2 partial products
    pmaddwd m1, [v1q + orderq + mmsize]
    pmullw m2, m7                       ; mul*v3
    pmullw m3, m7
    paddw m2, [v1q + orderq]            ; v1 += mul*v3
    paddw m3, [v1q + orderq + mmsize]
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end                            ; the %1==0 body falls through to .end
%endif
%endmacro
220 | |
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
; SSSE3 version: aligns v2/v3 down to 16 bytes and dispatches to a
; palignr loop specialized for the misalignment, avoiding cacheline-split
; unaligned loads.  v1 must be 16-byte aligned, and v2/v3 are assumed to
; share the same misalignment mod 16 (only v2's offset is measured but
; the same palignr shift is applied to both) -- TODO confirm callers
; guarantee this.
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
    shl orderq, 1                   ; element count -> byte count
    movd m7, mulm                   ; broadcast mul to all 16-bit lanes
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6                     ; dot-product accumulator
    mov r4d, v2d
    and r4d, 15                     ; r4d = misalignment of v2 (bytes)
    and v2q, ~15                    ; align both pointers down
    and v3q, ~15
    mova m4, [v2q + orderq]         ; preload the chunk past the end;
    mova m5, [v3q + orderq]         ; the loop walks orderq down to 0
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    ; fall through: misalignment 14; each loop other than 0 jumps to .end
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
    ; horizontal sum of the 4 dword partials
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET
264 | |
10633 | 265 |
266 | |
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
; HuffYUV median prediction: dst[i] = median(l, t, l+t-tl) + diff[i],
; where t/tl come from the row above and l is the previous output byte.
; The median of 3 is computed branchlessly with pmaxub/pminub.  Because
; each output byte feeds the next prediction, the 8 bytes of every mmx
; chunk are processed serially by the %rep block, shifting the vectors
; right one byte per step and packing results into mm7.  On return,
; *left and *left_top are updated for the next call.
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]                ; t
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0                   ; mm1 = t (kept for the median)
    por mm4, mm2                    ; tl = t shifted left one byte, with
                                    ; *left_top filled into byte 0
    movd mm3, [leftq]               ; mm3 byte 0 = l carried in
    psubb mm0, mm4                  ; t - tl
    add dstq, wq                    ; end-relative negative indexing
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip                       ; first chunk's t/tl already set up
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1                    ; tl (previous chunk's last t in byte 0)
    movq mm1, mm0                   ; t
    psubb mm0, mm4                  ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    ; serial per-byte step: each result byte becomes the next l (mm3)
    movq mm4, mm0
    paddb mm4, mm3                  ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1                 ; max(l,t)
    pminub mm5, mm1                 ; min(l,t)
    pminub mm3, mm4
    pmaxub mm3, mm5                 ; median
    paddb mm3, mm2                  ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56                   ; start packing outputs into mm7
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6                    ; append this byte to mm7
%endif
%if i<7
    psrlq mm0, 8                    ; advance to the next byte of the chunk
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    ; carry the last output and last top byte out to *left / *left_top
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
10430 | 325 |
326 | |
; ADD_HFYU_LEFT_LOOP is_aligned
; Core loop of add_hfyu_left_prediction: per vector, builds a byte-wise
; running prefix sum of src with log2(mmsize) doubling steps (shift/
; pshufb + paddb add the value 1, 2, 4 and -- for xmm -- 8 lanes to the
; left), adds the carried-in previous total (broadcast in m0 via mask
; m5), and stores to dst.  Expects m3/m4/(m6)/m5 preloaded with the
; pshufb masks and m0 with the incoming `left` in its top byte.
; Returns the final left value in eax by pshufb-extracting the last
; processed byte (index built from w's residue).
; %1: 1 = dst is mmsize-aligned (mova store), 0 = unaligned store.
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add srcq, wq                ; end-relative negative indexing
    add dstq, wq
    neg wq
%%.loop:
    mova m1, [srcq+wq]
    mova m2, m1
    psllw m1, 8                 ; step 1: add neighbor 1 byte to the left
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3               ; step 2: add value 2 bytes to the left
    paddb m1, m2
    pshufb m0, m5               ; broadcast previous chunk's last total
    mova m2, m1
    pshufb m1, m4               ; step 3: add value 4 bytes to the left
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6               ; step 4 (xmm only): 8 bytes to the left
    paddb m1, m2
%endif
    paddb m0, m1                ; add carried-in total -> running sums
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    ; extract the last valid output byte as the return value
    mov eax, mmsize-1
    sub eax, wd                 ; index of last byte written (w residue)
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro
364 | |
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
; SSSE3-on-MMX version (8 bytes per iteration); also serves as the
; unaligned-src fallback for the sse4 version via .skip_prologue.
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7 GLOBAL]                  ; broadcast-last-byte mask (mm)
    mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]   ; add-4-lanes-left mask
    mova m3, [pb_zz11zz55zz99zzdd GLOBAL]   ; add-2-lanes-left mask
    movd m0, leftm
    psllq m0, 56                            ; left -> top byte of m0
    ADD_HFYU_LEFT_LOOP 1                    ; 8-byte loads; alignment is moot
375 | |
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
; 16-bytes-per-iteration version.  Falls back to the mmx/ssse3 body when
; src is not 16-byte aligned, and picks the aligned or unaligned store
; loop depending on dst's alignment.
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova m5, [pb_f GLOBAL]                  ; broadcast-last-byte mask (xmm)
    mova m6, [pb_zzzzzzzz77777777 GLOBAL]   ; add-8-lanes-left mask
    mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]   ; add-4-lanes-left mask
    mova m3, [pb_zz11zz55zz99zzdd GLOBAL]   ; add-2-lanes-left mask
    movd m0, leftm
    pslldq m0, 15                           ; left -> top byte of m0
    test srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue ; unaligned src: use 8-byte path
    test dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1                    ; ends with RET inside the macro
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
391 |