comparison x86/h264_idct.asm @ 12492:58a960d6e34c libavcodec

Rename h264_idct_sse2.asm to h264_idct.asm; move inline IDCT asm from h264dsp_mmx.c to h264_idct.asm (as yasm code). Because the loops are now coded in asm instead of C, this is (depending on the function) up to 50% faster for cases where gcc didn't do a great job at looping. Since h264_idct_add8() is now faster than the manual loop setup in h264.c, in-asm idct calling can now be enabled for chroma as well (see r16207). For MMX, this is 5% faster. For SSE2 (which is not used for chroma when h264.c does the looping), this is up to 50% faster. Overall speed gain is ~0.5-1.0%. (A rough C sketch of the per-block loop that moved into assembly follows the changeset header below.)
author rbultje
date Tue, 14 Sep 2010 13:36:26 +0000
parents
children ef2f2db5b7be
12491:990f8a5fc8af 12492:58a960d6e34c
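
For context, the per-block dispatch that this change moves from C into assembly looks roughly like the sketch below. It is a hedged illustration, not the exact FFmpeg code: add16_loop_sketch, idct_add and idct_dc_add are placeholder names for the dsp function pointers, while scan8[], block_offset[] and nnzc[] follow the conventions used by the assembly in this file.

    #include <stdint.h>

    /* Hedged sketch of the C-side loop over the 16 luma 4x4 blocks: skip
     * blocks with no nonzero coefficients and take the cheap DC-only path
     * when only block[0] of a block is set.  Each 4x4 block occupies
     * 16 int16_t coefficients (32 bytes). */
    static const uint8_t scan8[16] = {       /* same values as scan8_mem below */
        4 + 1 * 8, 5 + 1 * 8, 4 + 2 * 8, 5 + 2 * 8,
        6 + 1 * 8, 7 + 1 * 8, 6 + 2 * 8, 7 + 2 * 8,
        4 + 3 * 8, 5 + 3 * 8, 4 + 4 * 8, 5 + 4 * 8,
        6 + 3 * 8, 7 + 3 * 8, 6 + 4 * 8, 7 + 4 * 8,
    };

    static void add16_loop_sketch(uint8_t *dst, const int *block_offset,
                                  int16_t *block, int stride,
                                  const uint8_t nnzc[6 * 8],
                                  void (*idct_add)(uint8_t *, int16_t *, int),
                                  void (*idct_dc_add)(uint8_t *, int16_t *, int))
    {
        for (int i = 0; i < 16; i++) {
            int nnz = nnzc[scan8[i]];           /* nonzero-coefficient count */
            if (!nnz)
                continue;                       /* all-zero block: nothing to add */
            if (nnz == 1 && block[i * 16])      /* DC-only block */
                idct_dc_add(dst + block_offset[i], block + i * 16, stride);
            else                                /* full 4x4 inverse transform */
                idct_add(dst + block_offset[i], block + i * 16, stride);
        }
    }

Keeping this loop in C means a call per 4x4 block with the branch and setup overhead in compiler-generated code; coding it in the .text section below lets the asm keep dst, block, stride and the scan8 pointer in registers across all 16 blocks.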
1 ;*****************************************************************************
2 ;* MMX/SSE2-optimized H.264 iDCT
3 ;*****************************************************************************
4 ;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
5 ;* Copyright (C) 2003-2008 x264 project
6 ;*
7 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Loren Merritt <lorenm@u.washington.edu>
9 ;* Holger Lubitz <hal@duncan.ol.sub.de>
10 ;* Min Chen <chenm001.163.com>
11 ;*
12 ;* This file is part of FFmpeg.
13 ;*
14 ;* FFmpeg is free software; you can redistribute it and/or
15 ;* modify it under the terms of the GNU Lesser General Public
16 ;* License as published by the Free Software Foundation; either
17 ;* version 2.1 of the License, or (at your option) any later version.
18 ;*
19 ;* FFmpeg is distributed in the hope that it will be useful,
20 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 ;* Lesser General Public License for more details.
23 ;*
24 ;* You should have received a copy of the GNU Lesser General Public
25 ;* License along with FFmpeg; if not, write to the Free Software
26 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 ;*****************************************************************************
28
29 %include "x86inc.asm"
30 %include "x86util.asm"
31
32 SECTION_RODATA
33
34 ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
35 scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
36 db 6+1*8, 7+1*8, 6+2*8, 7+2*8
37 db 4+3*8, 5+3*8, 4+4*8, 5+4*8
38 db 6+3*8, 7+3*8, 6+4*8, 7+4*8
39 db 1+1*8, 2+1*8
40 db 1+2*8, 2+2*8
41 db 1+4*8, 2+4*8
42 db 1+5*8, 2+5*8
43 %ifdef PIC
44 %define scan8 r11
45 %else
46 %define scan8 scan8_mem
47 %endif
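; scan8 maps a block index (0-15 luma, 16-23 chroma U/V) to its byte position
; in the 8x8 non_zero_count cache, so nnzc[scan8[i]] is the number of nonzero
; coefficients of block i.  When assembling as PIC, each entry point loads the
; table address into r11 (lea r11, [scan8_mem]) before its loop; otherwise the
; label is addressed directly.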
48
49 cextern pw_32
50
51 SECTION .text
52
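; A 4x4 block is reconstructed as: one 1-D pass over the rows, a transpose,
; the rounding bias 32 added to row 0, a second 1-D pass, then STORE_DIFFx2
; shifts each result right by 6 and adds it to the destination pixels with
; clamping to 0..255.  IDCT4_1D (from x86util.asm) is the usual H.264
; butterfly:
;   e0 = s0 + s2          e1 = s0 - s2
;   e2 = (s1 >> 1) - s3   e3 = s1 + (s3 >> 1)
;   d0 = e0 + e3   d1 = e1 + e2   d2 = e1 - e2   d3 = e0 - e3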
53 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
54 %macro IDCT4_ADD 3
55 ; Load dct coeffs
56 movq m0, [%2]
57 movq m1, [%2+8]
58 movq m2, [%2+16]
59 movq m3, [%2+24]
60
61 IDCT4_1D 0, 1, 2, 3, 4, 5
62 mova m6, [pw_32]
63 TRANSPOSE4x4W 0, 1, 2, 3, 4
64 paddw m0, m6
65 IDCT4_1D 0, 1, 2, 3, 4, 5
66 pxor m7, m7
67
68 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
69 lea %1, [%1+%3*2]
70 STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
71 %endmacro
72
73 INIT_MMX
74 ; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
75 cglobal h264_idct_add_mmx, 3, 3, 0
76 IDCT4_ADD r0, r1, r2
77 RET
78
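; One 1-D pass of the 8-point transform.  Only eight mm registers exist, so
; input rows 0 and 4 are passed in through %1/%2 (memory operands, or the
; spare xmm registers m8/m9 in the 64-bit SSE2 path) while rows 1,2,3,5,6,7
; are expected in m1,m2,m3,m5,m6,m7 (loaded by IDCT8_1D_FULL below); m0 and
; m4 serve as temporaries.  The SUMSUB_BA butterflies combine the even and
; odd halves, and the trailing SWAP restores natural output order in m0..m7.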
79 %macro IDCT8_1D 2
80 mova m4, m5
81 mova m0, m1
82 psraw m4, 1
83 psraw m1, 1
84 paddw m4, m5
85 paddw m1, m0
86 paddw m4, m7
87 paddw m1, m5
88 psubw m4, m0
89 paddw m1, m3
90
91 psubw m0, m3
92 psubw m5, m3
93 paddw m0, m7
94 psubw m5, m7
95 psraw m3, 1
96 psraw m7, 1
97 psubw m0, m3
98 psubw m5, m7
99
100 mova m3, m4
101 mova m7, m1
102 psraw m1, 2
103 psraw m3, 2
104 paddw m3, m0
105 psraw m0, 2
106 paddw m1, m5
107 psraw m5, 2
108 psubw m0, m4
109 psubw m7, m5
110
111 mova m4, m2
112 mova m5, m6
113 psraw m4, 1
114 psraw m6, 1
115 psubw m4, m5
116 paddw m6, m2
117
118 mova m2, %1
119 mova m5, %2
120 SUMSUB_BA m5, m2
121 SUMSUB_BA m6, m5
122 SUMSUB_BA m4, m2
123 SUMSUB_BA m7, m6
124 SUMSUB_BA m0, m4
125 SUMSUB_BA m3, m2
126 SUMSUB_BA m1, m5
127 SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
128 %endmacro
129
130 %macro IDCT8_1D_FULL 1
131 mova m7, [%1+112]
132 mova m6, [%1+ 96]
133 mova m5, [%1+ 80]
134 mova m3, [%1+ 48]
135 mova m2, [%1+ 32]
136 mova m1, [%1+ 16]
137 IDCT8_1D [%1], [%1+ 64]
138 %endmacro
139
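; With MMX an 8x8 block does not fit in registers, so it is processed as two
; 4-coefficient-wide halves.  IDCT8_ADD_MMX_START runs the first 1-D pass on
; one half and writes the transposed 64-byte result into a stack buffer;
; IDCT8_ADD_MMX_END runs the second pass on a half of that buffer and adds
; the >>6-shifted result to dst.  The callers reserve the 128-byte scratch
; area (%assign pad 128+4-(stack_offset&7); SUB rsp, pad, which also keeps
; rsp suitably aligned) and add the +32 rounding bias to the DC coefficient
; once before the transform.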
140 ; %1=int16_t *block, %2=int16_t *dstblock
141 %macro IDCT8_ADD_MMX_START 2
142 IDCT8_1D_FULL %1
143 mova [%1], m7
144 TRANSPOSE4x4W 0, 1, 2, 3, 7
145 mova m7, [%1]
146 mova [%2 ], m0
147 mova [%2+16], m1
148 mova [%2+32], m2
149 mova [%2+48], m3
150 TRANSPOSE4x4W 4, 5, 6, 7, 3
151 mova [%2+ 8], m4
152 mova [%2+24], m5
153 mova [%2+40], m6
154 mova [%2+56], m7
155 %endmacro
156
157 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
158 %macro IDCT8_ADD_MMX_END 3
159 IDCT8_1D_FULL %2
160 mova [%2 ], m5
161 mova [%2+16], m6
162 mova [%2+32], m7
163
164 pxor m7, m7
165 STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
166 lea %1, [%1+%3*2]
167 STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
168 mova m0, [%2 ]
169 mova m1, [%2+16]
170 mova m2, [%2+32]
171 lea %1, [%1+%3*2]
172 STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
173 lea %1, [%1+%3*2]
174 STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
175 %endmacro
176
177 INIT_MMX
178 ; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
179 cglobal h264_idct8_add_mmx, 3, 4, 0
180 %assign pad 128+4-(stack_offset&7)
181 SUB rsp, pad
182
183 add word [r1], 32
184 IDCT8_ADD_MMX_START r1 , rsp
185 IDCT8_ADD_MMX_START r1+8, rsp+64
186 lea r3, [r0+4]
187 IDCT8_ADD_MMX_END r0 , rsp, r2
188 IDCT8_ADD_MMX_END r3 , rsp+8, r2
189
190 ADD rsp, pad
191 RET
192
193 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
194 %macro IDCT8_ADD_SSE 4
195 IDCT8_1D_FULL %2
196 %ifdef ARCH_X86_64
197 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
198 %else
199 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
200 %endif
201 paddw m0, [pw_32]
202
203 %ifndef ARCH_X86_64
204 mova [%2 ], m0
205 mova [%2+16], m4
206 IDCT8_1D [%2], [%2+ 16]
207 mova [%2 ], m6
208 mova [%2+16], m7
209 %else
210 SWAP 0, 8
211 SWAP 4, 9
212 IDCT8_1D m8, m9
213 SWAP 6, 8
214 SWAP 7, 9
215 %endif
216
217 pxor m7, m7
218 lea %4, [%3*3]
219 STORE_DIFF m0, m6, m7, [%1 ]
220 STORE_DIFF m1, m6, m7, [%1+%3 ]
221 STORE_DIFF m2, m6, m7, [%1+%3*2]
222 STORE_DIFF m3, m6, m7, [%1+%4 ]
223 %ifndef ARCH_X86_64
224 mova m0, [%2 ]
225 mova m1, [%2+16]
226 %else
227 SWAP 0, 8
228 SWAP 1, 9
229 %endif
230 lea %1, [%1+%3*4]
231 STORE_DIFF m4, m6, m7, [%1 ]
232 STORE_DIFF m5, m6, m7, [%1+%3 ]
233 STORE_DIFF m0, m6, m7, [%1+%3*2]
234 STORE_DIFF m1, m6, m7, [%1+%4 ]
235 %endmacro
236
237 INIT_XMM
238 ; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
239 cglobal h264_idct8_add_sse2, 3, 4, 10
240 IDCT8_ADD_SSE r0, r1, r2, r3
241 RET
242
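; DC-only shortcut: when a block carries just a DC coefficient, the inverse
; transform collapses to adding (dc + 32) >> 6 to every pixel of the block.
; DC_ADD_MMX2_INIT computes that value, broadcasts +dc into m0 and -dc into
; m1 as packed bytes, and leaves 3*stride in the scratch register for the
; row addressing below; DC_ADD_MMX2_OP then applies it to four rows using
; saturating unsigned adds/subs (paddusb/psubusb), which clamps correctly
; for either sign of dc.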
243 %macro DC_ADD_MMX2_INIT 2-3
244 %if %0 == 2
245 movsx %1, word [%1]
246 add %1, 32
247 sar %1, 6
248 movd m0, %1
249 lea %1, [%2*3]
250 %else
251 add %3, 32
252 sar %3, 6
253 movd m0, %3
254 lea %3, [%2*3]
255 %endif
256 pshufw m0, m0, 0
257 pxor m1, m1
258 psubw m1, m0
259 packuswb m0, m0
260 packuswb m1, m1
261 %endmacro
262
263 %macro DC_ADD_MMX2_OP 3-4
264 %1 m2, [%2 ]
265 %1 m3, [%2+%3 ]
266 %1 m4, [%2+%3*2]
267 %1 m5, [%2+%4 ]
268 paddusb m2, m0
269 paddusb m3, m0
270 paddusb m4, m0
271 paddusb m5, m0
272 psubusb m2, m1
273 psubusb m3, m1
274 psubusb m4, m1
275 psubusb m5, m1
276 %1 [%2 ], m2
277 %1 [%2+%3 ], m3
278 %1 [%2+%3*2], m4
279 %1 [%2+%4 ], m5
280 %endmacro
281
282 INIT_MMX
283 ; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
284 cglobal h264_idct_dc_add_mmx2, 3, 3, 0
285 DC_ADD_MMX2_INIT r1, r2
286 DC_ADD_MMX2_OP movh, r0, r2, r1
287 RET
288
289 ; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
290 cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
291 DC_ADD_MMX2_INIT r1, r2
292 DC_ADD_MMX2_OP mova, r0, r2, r1
293 lea r0, [r0+r2*4]
294 DC_ADD_MMX2_OP mova, r0, r2, r1
295 RET
296
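; Loop convention for the add16/add16intra/add8 entry points below: r5 is
; the block index, nnzc[scan8[r5]] (r4) decides whether a block is skipped,
; block_offset[r5] (r1) gives the pixel offset into dst (r0), stride is in
; r3, and the coefficient pointer r2 advances by 32 bytes per 4x4 block
; (128 bytes per 8x8 block).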
297 ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
298 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
299 cglobal h264_idct_add16_mmx, 5, 7, 0
300 xor r5, r5
301 %ifdef PIC
302 lea r11, [scan8_mem]
303 %endif
304 .nextblock
305 movzx r6, byte [scan8+r5]
306 movzx r6, byte [r4+r6]
307 test r6, r6
308 jz .skipblock
309 mov r6d, dword [r1+r5*4]
310 lea r6, [r0+r6]
311 IDCT4_ADD r6, r2, r3
312 .skipblock
313 inc r5
314 add r2, 32
315 cmp r5, 16
316 jl .nextblock
317 REP_RET
318
319 ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
320 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
321 cglobal h264_idct8_add4_mmx, 5, 7, 0
322 %assign pad 128+4-(stack_offset&7)
323 SUB rsp, pad
324
325 xor r5, r5
326 %ifdef PIC
327 lea r11, [scan8_mem]
328 %endif
329 .nextblock
330 movzx r6, byte [scan8+r5]
331 movzx r6, byte [r4+r6]
332 test r6, r6
333 jz .skipblock
334 mov r6d, dword [r1+r5*4]
335 lea r6, [r0+r6]
336 add word [r2], 32
337 IDCT8_ADD_MMX_START r2 , rsp
338 IDCT8_ADD_MMX_START r2+8, rsp+64
339 IDCT8_ADD_MMX_END r6 , rsp, r3
340 mov r6d, dword [r1+r5*4]
341 lea r6, [r0+r6+4]
342 IDCT8_ADD_MMX_END r6 , rsp+8, r3
343 .skipblock
344 add r5, 4
345 add r2, 128
346 cmp r5, 16
347 jl .nextblock
348 ADD rsp, pad
349 RET
350
351 ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
352 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
353 cglobal h264_idct_add16_mmx2, 5, 7, 0
354 xor r5, r5
355 %ifdef PIC
356 lea r11, [scan8_mem]
357 %endif
358 .nextblock
359 movzx r6, byte [scan8+r5]
360 movzx r6, byte [r4+r6]
361 test r6, r6
362 jz .skipblock
363 cmp r6, 1
364 jnz .no_dc
365 movsx r6, word [r2]
366 test r6, r6
367 jz .no_dc
368 DC_ADD_MMX2_INIT r2, r3, r6
369 %ifdef ARCH_X86_64
370 %define dst_reg r10
371 %define dst_regd r10d
372 %else
373 %define dst_reg r1
374 %define dst_regd r1d
375 %endif
376 mov dst_regd, dword [r1+r5*4]
377 lea dst_reg, [r0+dst_reg]
378 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
379 %ifndef ARCH_X86_64
380 mov r1, r1m
381 %endif
382 inc r5
383 add r2, 32
384 cmp r5, 16
385 jl .nextblock
386 REP_RET
387 .no_dc
388 mov r6d, dword [r1+r5*4]
389 lea r6, [r0+r6]
390 IDCT4_ADD r6, r2, r3
391 .skipblock
392 inc r5
393 add r2, 32
394 cmp r5, 16
395 jl .nextblock
396 REP_RET
397
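; The *intra* variants check the DC coefficient of each block in addition to
; nnzc[scan8[i]]: in intra 16x16 macroblocks the luma DC values go through a
; separate transform and are not reflected in the non_zero_count cache, so a
; block can need a DC-only add even when its nnzc entry is zero.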
398 ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
399 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
400 cglobal h264_idct_add16intra_mmx, 5, 7, 0
401 xor r5, r5
402 %ifdef PIC
403 lea r11, [scan8_mem]
404 %endif
405 .nextblock
406 movzx r6, byte [scan8+r5]
407 movzx r6, byte [r4+r6]
408 or r6w, word [r2]
409 test r6, r6
410 jz .skipblock
411 mov r6d, dword [r1+r5*4]
412 lea r6, [r0+r6]
413 IDCT4_ADD r6, r2, r3
414 .skipblock
415 inc r5
416 add r2, 32
417 cmp r5, 16
418 jl .nextblock
419 REP_RET
420
421 ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
422 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
423 cglobal h264_idct_add16intra_mmx2, 5, 7, 0
424 xor r5, r5
425 %ifdef PIC
426 lea r11, [scan8_mem]
427 %endif
428 .nextblock
429 movzx r6, byte [scan8+r5]
430 movzx r6, byte [r4+r6]
431 test r6, r6
432 jz .try_dc
433 mov r6d, dword [r1+r5*4]
434 lea r6, [r0+r6]
435 IDCT4_ADD r6, r2, r3
436 inc r5
437 add r2, 32
438 cmp r5, 16
439 jl .nextblock
440 REP_RET
441 .try_dc
442 movsx r6, word [r2]
443 test r6, r6
444 jz .skipblock
445 DC_ADD_MMX2_INIT r2, r3, r6
446 %ifdef ARCH_X86_64
447 %define dst_reg r10
448 %define dst_regd r10d
449 %else
450 %define dst_reg r1
451 %define dst_regd r1d
452 %endif
453 mov dst_regd, dword [r1+r5*4]
454 lea dst_reg, [r0+dst_reg]
455 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
456 %ifndef ARCH_X86_64
457 mov r1, r1m
458 %endif
459 .skipblock
460 inc r5
461 add r2, 32
462 cmp r5, 16
463 jl .nextblock
464 REP_RET
465
466 ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
467 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
468 cglobal h264_idct8_add4_mmx2, 5, 7, 0
469 %assign pad 128+4-(stack_offset&7)
470 SUB rsp, pad
471
472 xor r5, r5
473 %ifdef PIC
474 lea r11, [scan8_mem]
475 %endif
476 .nextblock
477 movzx r6, byte [scan8+r5]
478 movzx r6, byte [r4+r6]
479 test r6, r6
480 jz .skipblock
481 cmp r6, 1
482 jnz .no_dc
483 movsx r6, word [r2]
484 test r6, r6
485 jz .no_dc
486 DC_ADD_MMX2_INIT r2, r3, r6
487 %ifdef ARCH_X86_64
488 %define dst_reg r10
489 %define dst_regd r10d
490 %else
491 %define dst_reg r1
492 %define dst_regd r1d
493 %endif
494 mov dst_regd, dword [r1+r5*4]
495 lea dst_reg, [r0+dst_reg]
496 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
497 lea dst_reg, [dst_reg+r3*4]
498 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
499 %ifndef ARCH_X86_64
500 mov r1, r1m
501 %endif
502 add r5, 4
503 add r2, 128
504 cmp r5, 16
505 jl .nextblock
506
507 ADD rsp, pad
508 RET
509 .no_dc
510 mov r6d, dword [r1+r5*4]
511 lea r6, [r0+r6]
512 add word [r2], 32
513 IDCT8_ADD_MMX_START r2 , rsp
514 IDCT8_ADD_MMX_START r2+8, rsp+64
515 IDCT8_ADD_MMX_END r6 , rsp, r3
516 mov r6d, dword [r1+r5*4]
517 lea r6, [r0+r6+4]
518 IDCT8_ADD_MMX_END r6 , rsp+8, r3
519 .skipblock
520 add r5, 4
521 add r2, 128
522 cmp r5, 16
523 jl .nextblock
524
525 ADD rsp, pad
526 RET
527
528 INIT_XMM
529 ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
530 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
531 cglobal h264_idct8_add4_sse2, 5, 7, 10
532 xor r5, r5
533 %ifdef PIC
534 lea r11, [scan8_mem]
535 %endif
536 .nextblock
537 movzx r6, byte [scan8+r5]
538 movzx r6, byte [r4+r6]
539 test r6, r6
540 jz .skipblock
541 cmp r6, 1
542 jnz .no_dc
543 movsx r6, word [r2]
544 test r6, r6
545 jz .no_dc
546 INIT_MMX
547 DC_ADD_MMX2_INIT r2, r3, r6
548 %ifdef ARCH_X86_64
549 %define dst_reg r10
550 %define dst_regd r10d
551 %else
552 %define dst_reg r1
553 %define dst_regd r1d
554 %endif
555 mov dst_regd, dword [r1+r5*4]
556 lea dst_reg, [r0+dst_reg]
557 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
558 lea dst_reg, [dst_reg+r3*4]
559 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
560 %ifndef ARCH_X86_64
561 mov r1, r1m
562 %endif
563 add r5, 4
564 add r2, 128
565 cmp r5, 16
566 jl .nextblock
567 REP_RET
568 .no_dc
569 INIT_XMM
570 mov dst_regd, dword [r1+r5*4]
571 lea dst_reg, [r0+dst_reg]
572 IDCT8_ADD_SSE dst_reg, r2, r3, r6
573 %ifndef ARCH_X86_64
574 mov r1, r1m
575 %endif
576 .skipblock
577 add r5, 4
578 add r2, 128
579 cmp r5, 16
580 jl .nextblock
581 REP_RET
582
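; Chroma handling: for the add8 entry points dest is a uint8_t ** with one
; pointer per chroma plane.  The wrappers below skip the 16 luma blocks
; (add r2, 512; r5 starts at 16) and call the per-plane helper twice,
; advancing the plane pointer in between (r10 on x86-64, the r0 stack slot
; on x86-32); each helper call handles the four 4x4 blocks of one plane.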
583 INIT_MMX
584 h264_idct_add8_mmx_plane:
585 .nextblock
586 movzx r6, byte [scan8+r5]
587 movzx r6, byte [r4+r6]
588 or r6w, word [r2]
589 test r6, r6
590 jz .skipblock
591 %ifdef ARCH_X86_64
592 mov r0d, dword [r1+r5*4]
593 add r0, [r10]
594 %else
595 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
596 mov r0, [r0]
597 add r0, dword [r1+r5*4]
598 %endif
599 IDCT4_ADD r0, r2, r3
600 .skipblock
601 inc r5
602 add r2, 32
603 test r5, 3
604 jnz .nextblock
605 rep ret
606
607 ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
608 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
609 cglobal h264_idct_add8_mmx, 5, 7, 0
610 mov r5, 16
611 add r2, 512
612 %ifdef PIC
613 lea r11, [scan8_mem]
614 %endif
615 %ifdef ARCH_X86_64
616 mov r10, r0
617 %endif
618 call h264_idct_add8_mmx_plane
619 %ifdef ARCH_X86_64
620 add r10, gprsize
621 %else
622 add r0mp, gprsize
623 %endif
624 call h264_idct_add8_mmx_plane
625 RET
626
627 h264_idct_add8_mmx2_plane:
628 .nextblock
629 movzx r6, byte [scan8+r5]
630 movzx r6, byte [r4+r6]
631 test r6, r6
632 jz .try_dc
633 %ifdef ARCH_X86_64
634 mov r0d, dword [r1+r5*4]
635 add r0, [r10]
636 %else
637 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
638 mov r0, [r0]
639 add r0, dword [r1+r5*4]
640 %endif
641 IDCT4_ADD r0, r2, r3
642 inc r5
643 add r2, 32
644 test r5, 3
645 jnz .nextblock
646 rep ret
647 .try_dc
648 movsx r6, word [r2]
649 test r6, r6
650 jz .skipblock
651 DC_ADD_MMX2_INIT r2, r3, r6
652 %ifdef ARCH_X86_64
653 mov r0d, dword [r1+r5*4]
654 add r0, [r10]
655 %else
656 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
657 mov r0, [r0]
658 add r0, dword [r1+r5*4]
659 %endif
660 DC_ADD_MMX2_OP movh, r0, r3, r6
661 .skipblock
662 inc r5
663 add r2, 32
664 test r5, 3
665 jnz .nextblock
666 rep ret
667
668 ; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
669 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
670 cglobal h264_idct_add8_mmx2, 5, 7, 0
671 mov r5, 16
672 add r2, 512
673 %ifdef ARCH_X86_64
674 mov r10, r0
675 %endif
676 %ifdef PIC
677 lea r11, [scan8_mem]
678 %endif
679 call h264_idct_add8_mmx2_plane
680 %ifdef ARCH_X86_64
681 add r10, gprsize
682 %else
683 add r0mp, gprsize
684 %endif
685 call h264_idct_add8_mmx2_plane
686 RET
687
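; Adds the DC values of two horizontally adjacent 4x4 blocks (at r2 and
; r2+32) to an 8x4 pixel region in one go: the two rounded DC values are
; replicated into the low and high halves of the register (with negated
; copies in m1), then applied to four 8-pixel rows with the same
; paddusb/psubusb scheme as above.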
688 INIT_MMX
689 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
690 h264_idct_dc_add8_mmx2:
691 movd m0, [r2 ] ; 0 0 X D
692 punpcklwd m0, [r2+32] ; x X d D
693 paddsw m0, [pw_32]
694 psraw m0, 6
695 punpcklwd m0, m0 ; d d D D
696 pxor m1, m1 ; 0 0 0 0
697 psubw m1, m0 ; -d-d-D-D
698 packuswb m0, m1 ; -d-d-D-D d d D D
699 pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
700 punpcklwd m0, m0 ; d d d d D D D D
701 lea r6, [r3*3]
702 DC_ADD_MMX2_OP movq, r0, r3, r6
703 ret
704
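; The SSE2 path transforms two horizontally adjacent 4x4 blocks per call by
; packing them into the low and high halves of the xmm registers (movq +
; movhps loads, TRANSPOSE2x4x4W between the passes).  This is why the SSE2
; add16/add16intra/add8 loops below step through the nnzc cache two blocks
; at a time (word loads, r5 += 2, r2 += 64).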
705 ALIGN 16
706 INIT_XMM
707 ; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
708 x264_add8x4_idct_sse2:
709 movq m0, [r2+ 0]
710 movq m1, [r2+ 8]
711 movq m2, [r2+16]
712 movq m3, [r2+24]
713 movhps m0, [r2+32]
714 movhps m1, [r2+40]
715 movhps m2, [r2+48]
716 movhps m3, [r2+56]
717 IDCT4_1D 0,1,2,3,4,5
718 TRANSPOSE2x4x4W 0,1,2,3,4
719 paddw m0, [pw_32]
720 IDCT4_1D 0,1,2,3,4,5
721 pxor m7, m7
722 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
723 lea r0, [r0+r3*2]
724 STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
725 ret
726
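; Fully unrolled loop over the eight pairs of 4x4 luma blocks.  %2 is the
; precomputed scan8 offset of block 2*%1 (e.g. 0xc = 4+1*8 = scan8[0]), so
; the word load from [r4+%2] fetches the nnzc bytes of blocks 2*%1 and
; 2*%1+1 at once, scan8 placing horizontal neighbours in adjacent cache
; bytes.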
727 %macro add16_sse2_cycle 2
728 movzx r0, word [r4+%2]
729 test r0, r0
730 jz .cycle%1end
731 mov r0d, dword [r1+%1*8]
732 %ifdef ARCH_X86_64
733 add r0, r10
734 %else
735 add r0, r0m
736 %endif
737 call x264_add8x4_idct_sse2
738 .cycle%1end
739 %if %1 < 7
740 add r2, 64
741 %endif
742 %endmacro
743
744 ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
745 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
746 cglobal h264_idct_add16_sse2, 5, 5, 8
747 %ifdef ARCH_X86_64
748 mov r10, r0
749 %endif
750 ; unrolling of the loop leads to an average performance gain of
751 ; 20-25%
752 add16_sse2_cycle 0, 0xc
753 add16_sse2_cycle 1, 0x14
754 add16_sse2_cycle 2, 0xe
755 add16_sse2_cycle 3, 0x16
756 add16_sse2_cycle 4, 0x1c
757 add16_sse2_cycle 5, 0x24
758 add16_sse2_cycle 6, 0x1e
759 add16_sse2_cycle 7, 0x26
760 RET
761
762 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
763 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
764 cglobal h264_idct_add16intra_sse2, 5, 7, 8
765 xor r5, r5
766 %ifdef ARCH_X86_64
767 mov r10, r0
768 %endif
769 %ifdef PIC
770 lea r11, [scan8_mem]
771 %endif
772 .next2blocks
773 movzx r0, byte [scan8+r5]
774 movzx r0, word [r4+r0]
775 test r0, r0
776 jz .try_dc
777 mov r0d, dword [r1+r5*4]
778 %ifdef ARCH_X86_64
779 add r0, r10
780 %else
781 add r0, r0m
782 %endif
783 call x264_add8x4_idct_sse2
784 add r5, 2
785 add r2, 64
786 cmp r5, 16
787 jl .next2blocks
788 REP_RET
789 .try_dc
790 movsx r0, word [r2 ]
791 or r0w, word [r2+32]
792 jz .skip2blocks
793 mov r0d, dword [r1+r5*4]
794 %ifdef ARCH_X86_64
795 add r0, r10
796 %else
797 add r0, r0m
798 %endif
799 call h264_idct_dc_add8_mmx2
800 .skip2blocks
801 add r5, 2
802 add r2, 64
803 cmp r5, 16
804 jl .next2blocks
805 REP_RET
806
807 h264_idct_add8_sse2_plane:
808 .next2blocks
809 movzx r0, byte [scan8+r5]
810 movzx r0, word [r4+r0]
811 test r0, r0
812 jz .try_dc
813 %ifdef ARCH_X86_64
814 mov r0d, dword [r1+r5*4]
815 add r0, [r10]
816 %else
817 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
818 mov r0, [r0]
819 add r0, dword [r1+r5*4]
820 %endif
821 call x264_add8x4_idct_sse2
822 add r5, 2
823 add r2, 64
824 test r5, 3
825 jnz .next2blocks
826 rep ret
827 .try_dc
828 movsx r0, word [r2 ]
829 or r0w, word [r2+32]
830 jz .skip2blocks
831 %ifdef ARCH_X86_64
832 mov r0d, dword [r1+r5*4]
833 add r0, [r10]
834 %else
835 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
836 mov r0, [r0]
837 add r0, dword [r1+r5*4]
838 %endif
839 call h264_idct_dc_add8_mmx2
840 .skip2blocks
841 add r5, 2
842 add r2, 64
843 test r5, 3
844 jnz .next2blocks
845 rep ret
846
847 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
848 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
849 cglobal h264_idct_add8_sse2, 5, 7, 8
850 mov r5, 16
851 add r2, 512
852 %ifdef PIC
853 lea r11, [scan8_mem]
854 %endif
855 %ifdef ARCH_X86_64
856 mov r10, r0
857 %endif
858 call h264_idct_add8_sse2_plane
859 %ifdef ARCH_X86_64
860 add r10, gprsize
861 %else
862 add r0mp, gprsize
863 %endif
864 call h264_idct_add8_sse2_plane
865 RET