x86/h264_idct.asm @ 12492:58a960d6e34c (libavcodec)
Rename h264_idct_sse2.asm to h264_idct.asm; move inline IDCT asm from
h264dsp_mmx.c to h264_idct.asm (as yasm code). Because the loops are now
coded in asm instead of C, this is (depending on the function) up to 50%
faster for cases where gcc didn't do a great job at looping.
Since h264_idct_add8() is now faster than the manual loop setup in h264.c,
in-asm idct calling can now be enabled for chroma as well (see r16207). For
MMX, this is 5% faster. For SSE2 (which isn't used for chroma when h264.c does
the looping), this makes it up to 50% faster. Overall speed gain is ~0.5-1.0%.
author   | rbultje
date     | Tue, 14 Sep 2010 13:36:26 +0000
parents  |
children | ef2f2db5b7be
comparison: 12491:990f8a5fc8af -> 12492:58a960d6e34c
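
For context only (not part of this changeset): the yasm functions below are exported with the C prototypes given in the comments throughout the file and are plugged into the decoder's dsp function-pointer table by the x86 init code. A minimal sketch of that hookup follows; the struct and init-function names here are simplified stand-ins for FFmpeg's H264DSPContext / x86 dsp init, not the real code, and DCTELEM is written as int16_t.

    #include <stdint.h>

    /* Prototypes as documented in the comments of h264_idct.asm below. */
    void ff_h264_idct_add_mmx (uint8_t *dst, int16_t *block, int stride);
    void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
                               int16_t *block, int stride,
                               const uint8_t nnzc[6*8]);

    /* Simplified, illustrative stand-in for the real dsp context. */
    typedef struct {
        void (*h264_idct_add) (uint8_t *dst, int16_t *block, int stride);
        void (*h264_idct_add8)(uint8_t **dest, const int *block_offset,
                               int16_t *block, int stride,
                               const uint8_t nnzc[6*8]);
    } IDCTFuncs;

    static void set_mmx_idct(IDCTFuncs *c)
    {
        c->h264_idct_add  = ff_h264_idct_add_mmx;
        /* with this change the chroma loop runs inside the asm as well */
        c->h264_idct_add8 = ff_h264_idct_add8_mmx;
    }
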
;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

cextern pw_32

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
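; The 4x4 add: 1-D IDCT on the rows, transpose, add the rounding bias of 32
; to row 0 (it propagates into every output of the final >>6), 1-D IDCT on
; the columns, then add the result to dst with unsigned saturation.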
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq m0, [%2]
    movq m1, [%2+8]
    movq m2, [%2+16]
    movq m3, [%2+24]

    IDCT4_1D 0, 1, 2, 3, 4, 5
    mova m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw m0, m6
    IDCT4_1D 0, 1, 2, 3, 4, 5
    pxor m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_mmx, 3, 3, 0
    IDCT4_ADD r0, r1, r2
    RET

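; One 1-D pass of the 8-point transform. The odd outputs are built from
; m1/m3/m5/m7, the even ones from m2/m6 plus rows 0 and 4, which are passed
; as memory operands %1/%2; the final SUMSUB_BA/SWAP butterflies leave the
; eight results in m0-m7 in order.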
%macro IDCT8_1D 2
    mova m4, m5
    mova m0, m1
    psraw m4, 1
    psraw m1, 1
    paddw m4, m5
    paddw m1, m0
    paddw m4, m7
    paddw m1, m5
    psubw m4, m0
    paddw m1, m3

    psubw m0, m3
    psubw m5, m3
    paddw m0, m7
    psubw m5, m7
    psraw m3, 1
    psraw m7, 1
    psubw m0, m3
    psubw m5, m7

    mova m3, m4
    mova m7, m1
    psraw m1, 2
    psraw m3, 2
    paddw m3, m0
    psraw m0, 2
    paddw m1, m5
    psraw m5, 2
    psubw m0, m4
    psubw m7, m5

    mova m4, m2
    mova m5, m6
    psraw m4, 1
    psraw m6, 1
    psubw m4, m5
    paddw m6, m2

    mova m2, %1
    mova m5, %2
    SUMSUB_BA m5, m2
    SUMSUB_BA m6, m5
    SUMSUB_BA m4, m2
    SUMSUB_BA m7, m6
    SUMSUB_BA m0, m4
    SUMSUB_BA m3, m2
    SUMSUB_BA m1, m5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova m7, [%1+112]
    mova m6, [%1+ 96]
    mova m5, [%1+ 80]
    mova m3, [%1+ 48]
    mova m2, [%1+ 32]
    mova m1, [%1+ 16]
    IDCT8_1D [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
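; The MMX 8x8 path handles 4 columns (one mmx register width) at a time:
; _START runs the first 1-D pass on an 8x4 half and transposes it into a
; scratch buffer, _END runs the second pass on that buffer and adds the
; result to dst; callers invoke each macro twice to cover the full 8x8 block.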
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova m7, [%1]
    mova [%2 ], m0
    mova [%2+16], m1
    mova [%2+32], m2
    mova [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova [%2+ 8], m4
    mova [%2+24], m5
    mova [%2+40], m6
    mova [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova [%2 ], m5
    mova [%2+16], m6
    mova [%2+32], m7

    pxor m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova m0, [%2 ]
    mova m1, [%2+16]
    mova m2, [%2+32]
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_mmx, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    add word [r1], 32
    IDCT8_ADD_MMX_START r1 , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea r3, [r0+4]
    IDCT8_ADD_MMX_END r0 , rsp, r2
    IDCT8_ADD_MMX_END r3 , rsp+8, r2

    ADD rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
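; SSE2 8x8 path: the whole block fits in xmm registers. On x86-64 the two
; extra registers m8/m9 carry rows 0 and 4 across the second pass; on x86-32
; they are spilled to the block buffer instead.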
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw m0, [pw_32]

%ifndef ARCH_X86_64
    mova [%2 ], m0
    mova [%2+16], m4
    IDCT8_1D [%2], [%2+ 16]
    mova [%2 ], m6
    mova [%2+16], m7
%else
    SWAP 0, 8
    SWAP 4, 9
    IDCT8_1D m8, m9
    SWAP 6, 8
    SWAP 7, 9
%endif

    pxor m7, m7
    lea %4, [%3*3]
    STORE_DIFF m0, m6, m7, [%1 ]
    STORE_DIFF m1, m6, m7, [%1+%3 ]
    STORE_DIFF m2, m6, m7, [%1+%3*2]
    STORE_DIFF m3, m6, m7, [%1+%4 ]
%ifndef ARCH_X86_64
    mova m0, [%2 ]
    mova m1, [%2+16]
%else
    SWAP 0, 8
    SWAP 1, 9
%endif
    lea %1, [%1+%3*4]
    STORE_DIFF m4, m6, m7, [%1 ]
    STORE_DIFF m5, m6, m7, [%1+%3 ]
    STORE_DIFF m0, m6, m7, [%1+%3*2]
    STORE_DIFF m1, m6, m7, [%1+%4 ]
%endmacro

INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

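; DC-only shortcut: _INIT computes (dc + 32) >> 6, broadcasts it and its
; negation as packed bytes into m0/m1; _OP then applies them to four rows of
; pixels at a time with saturating paddusb/psubusb.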
%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
    movsx %1, word [%1]
    add %1, 32
    sar %1, 6
    movd m0, %1
    lea %1, [%2*3]
%else
    add %3, 32
    sar %3, 6
    movd m0, %3
    lea %3, [%2*3]
%endif
    pshufw m0, m0, 0
    pxor m1, m1
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
%endmacro

%macro DC_ADD_MMX2_OP 3-4
    %1 m2, [%2 ]
    %1 m3, [%2+%3 ]
    %1 m4, [%2+%3*2]
    %1 m5, [%2+%4 ]
    paddusb m2, m0
    paddusb m3, m0
    paddusb m4, m0
    paddusb m5, m0
    psubusb m2, m1
    psubusb m3, m1
    psubusb m4, m1
    psubusb m5, m1
    %1 [%2 ], m2
    %1 [%2+%3 ], m3
    %1 [%2+%3*2], m4
    %1 [%2+%4 ], m5
%endmacro

INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP mova, r0, r2, r1
    lea r0, [r0+r2*4]
    DC_ADD_MMX2_OP mova, r0, r2, r1
    RET

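; The add16/add8/add4 entry points below loop over the blocks of a
; macroblock: scan8 maps the block index to the layout of the nnzc[] array
; (r4), and blocks whose nnzc entry is zero are skipped without touching dst.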
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    add word [r2], 32
    IDCT8_ADD_MMX_START r2 , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END r6 , rsp, r3
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6+4]
    IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    ADD rsp, pad
    RET

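; The mmx2 variant adds a DC shortcut: if nnzc reports exactly one
; coefficient (cmp r6, 1) and the DC itself is non-zero, the block is handled
; by the cheap DC_ADD path instead of the full 4x4 transform.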
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx2, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg r10
%define dst_regd r10d
%else
%define dst_reg r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    lea dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

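; In the *intra* variants the DC can be non-zero even when the nnzc entry is
; 0, so a block is processed whenever either its nnzc entry or its DC
; coefficient is non-zero.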
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    or r6w, word [r2]
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx2, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .try_dc
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx r6, word [r2]
    test r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg r10
%define dst_regd r10d
%else
%define dst_reg r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    lea dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx2, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg r10
%define dst_regd r10d
%else
%define dst_reg r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    lea dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock

    ADD rsp, pad
    RET
.no_dc
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    add word [r2], 32
    IDCT8_ADD_MMX_START r2 , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END r6 , rsp, r3
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6+4]
    IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock

    ADD rsp, pad
    RET

INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_sse2, 5, 7, 10
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg r10
%define dst_regd r10d
%else
%define dst_reg r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    lea dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov dst_regd, dword [r1+r5*4]
    lea dst_reg, [r0+dst_reg]
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
.skipblock
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    REP_RET

INIT_MMX
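; Chroma plane helper: processes the 4 blocks of one chroma plane. Since the
; add8 functions take uint8_t **dest, the plane's dst pointer is fetched from
; that array (held in r10 on x86-64, reloaded from the stack on x86-32); the
; add8 entry points call the helper once per plane, stepping dest by gprsize.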
h264_idct_add8_mmx_plane:
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    or r6w, word [r2]
    test r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
.skipblock
    inc r5
    add r2, 32
    test r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx, 5, 7, 0
    mov r5, 16
    add r2, 512
%ifdef PIC
    lea r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    call h264_idct_add8_mmx_plane
%ifdef ARCH_X86_64
    add r10, gprsize
%else
    add r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmx2_plane:
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
    inc r5
    add r2, 32
    test r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx r6, word [r2]
    test r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc r5
    add r2, 32
    test r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx2, 5, 7, 0
    mov r5, 16
    add r2, 512
%ifdef ARCH_X86_64
    mov r10, r0
%endif
%ifdef PIC
    lea r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
%ifdef ARCH_X86_64
    add r10, gprsize
%else
    add r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET

INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
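; Adds the DCs of two horizontally adjacent 4x4 blocks ([r2] and [r2+32]) in
; one pass: both are rounded with pw_32, shifted right by 6, expanded to
; bytes and applied to an 8x4 pixel area through DC_ADD_MMX2_OP.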
h264_idct_dc_add8_mmx2:
    movd m0, [r2 ]          ; 0 0 X D
    punpcklwd m0, [r2+32]   ; x X d D
    paddsw m0, [pw_32]
    psraw m0, 6
    punpcklwd m0, m0        ; d d D D
    pxor m1, m1             ; 0 0 0 0
    psubw m1, m0            ; -d-d-D-D
    packuswb m0, m1         ; -d-d-D-D d d D D
    pshufw m1, m0, 0xFA     ; -d-d-d-d-D-D-D-D
    punpcklwd m0, m0        ; d d d d D D D D
    lea r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
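; Two 4x4 IDCTs side by side: the coefficients of two adjacent blocks are
; packed into the low and high halves of each xmm register (movq + movhps),
; transformed together, and the result is added to an 8x4 area of dst.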
x264_add8x4_idct_sse2:
    movq m0, [r2+ 0]
    movq m1, [r2+ 8]
    movq m2, [r2+16]
    movq m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D 0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D 0,1,2,3,4,5
    pxor m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

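; %1 is the pair index, %2 the scan8 offset of a pair of 4x4 blocks in
; nnzc[]: the word load tests both bytes at once, and x264_add8x4_idct_sse2
; then handles the two blocks together.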
%macro add16_sse2_cycle 2
    movzx r0, word [r4+%2]
    test r0, r0
    jz .cycle%1end
    mov r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add r0, r10
%else
    add r0, r0m
%endif
    call x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add r2, 64
%endif
%endmacro

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_sse2, 5, 7, 8
    xor r5, r5
%ifdef ARCH_X86_64
    mov r10, r0
%endif
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.next2blocks
    movzx r0, byte [scan8+r5]
    movzx r0, word [r4+r0]
    test r0, r0
    jz .try_dc
    mov r0d, dword [r1+r5*4]
%ifdef ARCH_X86_64
    add r0, r10
%else
    add r0, r0m
%endif
    call x264_add8x4_idct_sse2
    add r5, 2
    add r2, 64
    cmp r5, 16
    jl .next2blocks
    REP_RET
.try_dc
    movsx r0, word [r2 ]
    or r0w, word [r2+32]
    jz .skip2blocks
    mov r0d, dword [r1+r5*4]
%ifdef ARCH_X86_64
    add r0, r10
%else
    add r0, r0m
%endif
    call h264_idct_dc_add8_mmx2
.skip2blocks
    add r5, 2
    add r2, 64
    cmp r5, 16
    jl .next2blocks
    REP_RET

h264_idct_add8_sse2_plane:
.next2blocks
    movzx r0, byte [scan8+r5]
    movzx r0, word [r4+r0]
    test r0, r0
    jz .try_dc
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    call x264_add8x4_idct_sse2
    add r5, 2
    add r2, 64
    test r5, 3
    jnz .next2blocks
    rep ret
.try_dc
    movsx r0, word [r2 ]
    or r0w, word [r2+32]
    jz .skip2blocks
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    call h264_idct_dc_add8_mmx2
.skip2blocks
    add r5, 2
    add r2, 64
    test r5, 3
    jnz .next2blocks
    rep ret

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
    mov r5, 16
    add r2, 512
%ifdef PIC
    lea r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    call h264_idct_add8_sse2_plane
%ifdef ARCH_X86_64
    add r10, gprsize
%else
    add r0mp, gprsize
%endif
    call h264_idct_add8_sse2_plane
    RET