annotate: x86/dsputil_yasm.asm @ 11339:a82cdda1f507 (libavcodec)
summary:  Merge weight & offset tables, 15 cpu cycles faster.
author:   michael
date:     Wed, 03 Mar 2010 14:41:43 +0000
parents:  abb3b23bda35
children: 980030a3e315

;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times  8 db -1
pb_7:                times  8 db  7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1, 3, 3, 3, 3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1, 1, 1,-1,-1, 5, 5,-1,-1, 9, 9,-1,-1,13,13

section .text align=16

; PSWAPD: swap the two 32-bit halves of an mmx register
%macro PSWAPD_SSE 2
    pshufw    %1, %2, 0x4e
%endmacro
%macro PSWAPD_3DN1 2 ; plain 3DNow! has no pshufw
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endmacro

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov     srcq,  [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
.loop:
    cvtps2pi  mm0, [srcq]
    cvtps2pi  mm1, [srcq+src1q]
    cvtps2pi  mm2, [srcq+src2q]
    cvtps2pi  mm3, [srcq+src3q]
    cvtps2pi  mm4, [srcq+src4q]
    cvtps2pi  mm5, [srcq+src5q]
    packssdw  mm0, mm3
    packssdw  mm1, mm4
    packssdw  mm2, mm5
    pswapd    mm3, mm0
    punpcklwd mm0, mm1
    punpckhwd mm1, mm2
    punpcklwd mm2, mm3
    pswapd    mm3, mm0
    punpckldq mm0, mm2
    punpckhdq mm2, mm1
    punpckldq mm1, mm3
    movq    [dstq   ], mm0
    movq    [dstq+16], mm2
    movq    [dstq+ 8], mm1
    add     srcq, 8
    add     dstq, 24
    sub     lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id       ; 3DNow! float->int conversion
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd                ; extended 3DNow! has a native pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi

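; Reference sketch (added commentary, not part of the original source): a plain
; C version of what the 6-channel interleave above is assumed to compute. The
; SIMD code converts with cvtps2pi (SSE, current rounding mode) or pf2id
; (3DNow!, truncation) and saturates via packssdw, so the exact rounding of
; this sketch (lrintf) may differ. float_to_int16_interleave6_ref is an
; illustrative name only.
;
;   #include <stdint.h>
;   #include <math.h>
;
;   static void float_to_int16_interleave6_ref(int16_t *dst,
;                                              const float **src, int len)
;   {
;       for (int i = 0; i < len; i++)
;           for (int c = 0; c < 6; c++) {
;               long v = lrintf(src[c][i]);
;               if (v >  32767) v =  32767;   /* saturate like packssdw */
;               if (v < -32768) v = -32768;
;               dst[6 * i + c] = (int16_t)v;
;           }
;   }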

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
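; Reference sketches (added commentary, not part of the original source) of the
; two functions generated by SCALARPRODUCT above, following the usual FFmpeg C
; fallbacks. The SIMD code accumulates 32-bit partial sums with pmaddwd and, in
; scalarproduct_int16, applies the shift to partial sums rather than to each
; product, so overflow/rounding corner cases can differ from these sketches.
; The _ref names are illustrative only.
;
;   #include <stdint.h>
;
;   static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
;                                          int order, int shift)
;   {
;       int32_t res = 0;
;       while (order--)
;           res += (*v1++ * *v2++) >> shift;
;       return res;
;   }
;
;   static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
;                                                   const int16_t *v3, int order,
;                                                   int mul)
;   {
;       int32_t res = 0;
;       while (order--) {
;           res   += *v1 * *v2++;
;           *v1++ += mul * *v3++;   /* wraps mod 2^16, like pmullw/paddw */
;       }
;       return res;
;   }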

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET
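; Note (added commentary, not in the original source): the ssse3 variant only
; ever loads v2/v3 through 16-byte aligned mova. r4d keeps the low 4 bits of
; the original v2 pointer, and each SCALARPRODUCT_LOOP %1 instance rebuilds the
; logically unaligned vectors from two adjacent aligned loads with palignr by
; that byte offset. This assumes v1 is 16-byte aligned and that v2 and v3 share
; the same misalignment, which is why a single offset selects the loop variant.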


; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
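; Reference sketch (added commentary, not part of the original source) of the
; HuffYUV-style median prediction this routine is assumed to implement: each
; output byte is the median of left, top and left+top-topleft, plus the
; residual, with byte-wise wraparound as in paddb/psubb. mid_pred3 and the
; _ref name are illustrative helpers, not FFmpeg identifiers.
;
;   #include <stdint.h>
;
;   static int mid_pred3(int a, int b, int c)   /* median of three */
;   {
;       if (a > b) { int t = a; a = b; b = t; }
;       if (c < b) b = c;
;       return a > b ? a : b;
;   }
;
;   static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
;                                              const uint8_t *diff, int w,
;                                              int *left, int *left_top)
;   {
;       int l = *left, lt = *left_top;
;       for (int i = 0; i < w; i++) {
;           l      = mid_pred3(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i];
;           l     &= 0xFF;                     /* bytes wrap mod 256 */
;           lt     = top[i];
;           dst[i] = (uint8_t)l;
;       }
;       *left     = l;
;       *left_top = lt;
;   }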


%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f GLOBAL]
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
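; Reference sketch (added commentary, not part of the original source) of the
; running left prediction this routine is assumed to implement: each output
; byte is the byte-wise running sum of the input, seeded with 'left', and the
; last accumulator byte is returned. The _ref name is illustrative only.
;
;   #include <stdint.h>
;
;   static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                           int w, int left)
;   {
;       int acc = left & 0xFF;
;       for (int i = 0; i < w; i++) {
;           acc    = (acc + src[i]) & 0xFF;   /* wraps like paddb */
;           dst[i] = (uint8_t)acc;
;       }
;       return acc;
;   }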


; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg     offsetq
    shl     offsetq, 2
    sub     v1q, offsetq
    sub     v2q, offsetq
    xorps   xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    movd    r0m, xmm0
    fld     dword r0m
%endif
    RET
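; Reference sketch (added commentary, not part of the original source): a plain
; C version of the float dot product above. The SSE code keeps four partial
; sums and requires len to be a multiple of 4, so floating-point rounding may
; differ slightly from this strictly sequential sum. The _ref name is
; illustrative only.
;
;   static float scalarproduct_float_ref(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       for (int i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }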