comparison x86/fft_mmx.asm @ 12399:020540442072 libavcodec

Convert ff_imdct_half_sse() to yasm. This is to avoid split asm sections that attempt to preserve some registers between sections.
author alexc
date Sun, 22 Aug 2010 14:39:58 +0000
parents 6f064ab48463
children f61e22f8cf28
comparing 12398:31736ceab3aa → 12399:020540442072
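With the whole routine in one yasm function, nothing has to survive across separate inline-asm statements any more; the symbol simply keeps the usual C-callable interface. As a hedged sketch (the prototype matches the cglobal in the diff below; the init-time hookup lives in the C side of libavcodec and is shown only for orientation):

    void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);

    /* roughly what the CPU-detection code does when SSE is available: */
    /* s->imdct_half = ff_imdct_half_sse; */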
@@ -26,10 +26,27 @@
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

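In other words, where the C code sees an array of interleaved re/im pairs, the SSE code keeps each group of four complex values as a block of four reals followed by four imaginaries (two of each for 3DNow). A minimal C sketch of the two layouts (FFTComplex/FFTSample are the real libavcodec types; the block struct is purely illustrative):

    typedef float FFTSample;

    typedef struct FFTComplex {          /* C layout: interleaved re/im pairs */
        FFTSample re, im;
    } FFTComplex;

    typedef struct FFTComplexBlockSSE {  /* SSE layout: {4x real, 4x imaginary} */
        FFTSample re[4];
        FFTSample im[4];
    } FFTComplexBlockSSE;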
%include "x86inc.asm"
+
+%ifdef ARCH_X86_64
+%define pointer resq
+%else
+%define pointer resd
+%endif
+
+struc FFTContext
+.nbits:    resd 1
+.reverse:  resd 1
+.revtab:   pointer 1
+.tmpbuf:   pointer 1
+.mdctsize: resd 1
+.mdctbits: resd 1
+.tcos:     pointer 1
+.tsin:     pointer 1
+endstruc
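This struc only has to mirror the leading members of the C-side FFTContext so the byte offsets used further down (nbits, revtab, mdctsize, tcos, tsin) line up; it is not a full definition. A hedged sketch of the corresponding C prefix, approximately as declared in libavcodec's fft.h of this period:

    struct FFTContext {
        int         nbits;
        int         inverse;    /* ".reverse" in the struc above */
        uint16_t   *revtab;
        FFTComplex *tmp_buf;
        int         mdct_size;  /* n */
        int         mdct_bits;  /* log2(n) */
        FFTSample  *tcos;       /* pre/post rotation tables */
        FFTSample  *tsin;
        /* function pointers and further members follow; only the offsets
           of the fields above are needed by the asm */
    };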

SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2: times 4 dd M_SQRT1_2

@@ -426,10 +443,20 @@
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

+%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
+    lea      r2, [dispatch_tab%1]
+    mov      r2, [r2 + (%2q-2)*gprsize]
+%ifdef PIC
+    lea      r3, [$$]
+    add      r2, r3
+%endif
+    call     r2
+%endmacro ; FFT_DISPATCH
+
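FFT_DISPATCH picks the transform by table lookup: dispatch_tab%1 holds section-relative entries for fft4, fft8, fft16, ..., so the index is nbits-2, and under PIC the section base ($$) is added back before the indirect call. In C terms the idea is roughly a function-pointer table; a purely illustrative sketch with hypothetical stand-in names:

    typedef void (*fft_fn)(void);

    /* stand-ins for the asm leaf transforms (illustrative only) */
    static void fft4_sse(void)  {}
    static void fft8_sse(void)  {}
    static void fft16_sse(void) {}

    static fft_fn const dispatch_tab_sse[] = { fft4_sse, fft8_sse, fft16_sse /* ... */ };

    static void fft_dispatch(int nbits)
    {
        dispatch_tab_sse[nbits - 2]();   /* nbits==2 -> fft4, 3 -> fft8, ... */
    }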
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

@@ -462,22 +489,183 @@
section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
-    lea      r2, [dispatch_tab%3%2]
-    mov      r2, [r2 + (nbitsq-2)*gprsize]
-%ifdef PIC
-    lea      r3, [$$]
-    add      r2, r3
-%endif
-    call     r2
+    FFT_DISPATCH %3%2, nbits
    RET
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave

+INIT_XMM
+%undef mulps
+%undef addps
+%undef subps
+%undef unpcklps
+%undef unpckhps
+
+%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
+    movaps   xmm0, [%3+%2*4]
+    movaps   xmm1, [%3+%1*4-0x10]
+    movaps   xmm2, xmm0
+    shufps   xmm0, xmm1, 0x88
+    shufps   xmm1, xmm2, 0x77
+    movlps   xmm4, [%4+%2*2]
+    movlps   xmm5, [%5+%2*2+0x0]
+    movhps   xmm4, [%4+%1*2-0x8]
+    movhps   xmm5, [%5+%1*2-0x8]
+    movaps   xmm2, xmm0
+    movaps   xmm3, xmm1
+    mulps    xmm0, xmm5
+    mulps    xmm1, xmm4
+    mulps    xmm2, xmm4
+    mulps    xmm3, xmm5
+    subps    xmm1, xmm0
+    addps    xmm2, xmm3
+    movaps   xmm0, xmm1
+    unpcklps xmm1, xmm2
+    unpckhps xmm0, xmm2
+%endmacro
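Each PREROTATER invocation handles four output points: it gathers stride-2 samples from both ends of the input block, multiplies them by the tcos/tsin twiddles, and leaves the rotated values in xmm0/xmm1 for PREROTATEW to scatter through revtab (see the .pre loop below). A hedged scalar sketch of the pre-rotation, one index at a time and written in the style of the C IMDCT (reusing the types sketched earlier):

    /* z is the FFT work buffer (the "output" argument) seen as complex pairs */
    static void pre_rotate(FFTComplex *z, const FFTSample *input,
                           const FFTSample *tcos, const FFTSample *tsin,
                           const uint16_t *revtab, int n)
    {
        const FFTSample *in1 = input;                /* walks up in steps of 2   */
        const FFTSample *in2 = input + (n >> 1) - 1; /* walks down in steps of 2 */
        for (int k = 0; k < n >> 2; k++) {
            int j = revtab[k];
            z[j].re = *in2 * tcos[k] - *in1 * tsin[k];
            z[j].im = *in2 * tsin[k] + *in1 * tcos[k];
            in1 += 2;
            in2 -= 2;
        }
    }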
+
+%macro PREROTATEW 3 ;addr1, addr2, xmm
+    movlps   %1, %3
+    movhps   %2, %3
+%endmacro
+
+%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
+    movaps   xmm6, [%4+%1*2]
+    movaps   %2,   [%4+%1*2+0x10]
+    movaps   %3,   xmm6
+    movaps   xmm7, %2
+    mulps    xmm6, [%5+%1*1]
+    mulps    %2,   [%6+%1*1]
+    mulps    %3,   [%6+%1*1]
+    mulps    xmm7, [%5+%1*1]
+    subps    %2,   xmm6
+    addps    %3,   xmm7
+%endmacro
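CMUL operates on one {4x real, 4x imaginary} block of FFT output: after it runs, %2 holds im*tsin - re*tcos and %3 holds re*tsin + im*tcos for each of the four lanes. Per element that is (scalar sketch):

    /* one lane of CMUL: a = im*s - re*c, b = re*s + im*c */
    static void cmul_lane(FFTSample re, FFTSample im, FFTSample c, FFTSample s,
                          FFTSample *a, FFTSample *b)
    {
        *a = im * s - re * c;
        *b = re * s + im * c;
    }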
+
+%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
+.post:
+    CMUL     %1,   xmm0, xmm1, %3, %4, %5
+    CMUL     %2,   xmm4, xmm5, %3, %4, %5
+    shufps   xmm1, xmm1, 0x1b
+    shufps   xmm5, xmm5, 0x1b
+    movaps   xmm6, xmm4
+    unpckhps xmm4, xmm1
+    unpcklps xmm6, xmm1
+    movaps   xmm2, xmm0
+    unpcklps xmm0, xmm5
+    unpckhps xmm2, xmm5
+    movaps   [%3+%2*2],      xmm6
+    movaps   [%3+%2*2+0x10], xmm4
+    movaps   [%3+%1*2],      xmm0
+    movaps   [%3+%1*2+0x10], xmm2
+    sub      %2, 0x10
+    add      %1, 0x10
+    jl       .post
+%endmacro
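POSROTATESHUF walks from both ends of the buffer toward the middle, applies CMUL to each side, and the shufps/unpck sequence turns the block layout back into interleaved re/im pairs as the results are stored. A hedged scalar sketch of the post-rotation in the style of the C version, using cmul_lane() from above (n8 = n/8):

    static void post_rotate(FFTComplex *z, const FFTSample *tcos,
                            const FFTSample *tsin, int n8)
    {
        for (int k = 0; k < n8; k++) {
            FFTSample r0, i0, r1, i1;
            cmul_lane(z[n8-k-1].re, z[n8-k-1].im, tcos[n8-k-1], tsin[n8-k-1], &r0, &i1);
            cmul_lane(z[n8+k  ].re, z[n8+k  ].im, tcos[n8+k  ], tsin[n8+k  ], &r1, &i0);
            z[n8-k-1].re = r0;
            z[n8-k-1].im = i0;
            z[n8+k  ].re = r1;
            z[n8+k  ].im = i1;
        }
    }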
+
+cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
+%ifdef ARCH_X86_64
+%define rrevtab r10
+%define rtcos   r11
+%define rtsin   r12
+    push     r10
+    push     r11
+    push     r12
+    push     r13
+    push     r14
+%else
+%define rrevtab r6
+%define rtsin   r6
+%define rtcos   r5
+%endif
+    mov      r3d, [r0+FFTContext.mdctsize]
+    add      r2, r3
+    shr      r3, 1
+    mov      rtcos, [r0+FFTContext.tcos]
+    mov      rtsin, [r0+FFTContext.tsin]
+    add      rtcos, r3
+    add      rtsin, r3
+%ifndef ARCH_X86_64
+    push     rtcos
+    push     rtsin
+%endif
+    shr      r3, 1
+    mov      rrevtab, [r0+FFTContext.revtab]
+    add      rrevtab, r3
+%ifndef ARCH_X86_64
+    push     rrevtab
+%endif
+
+    sub      r3, 4
+%ifdef ARCH_X86_64
+    xor      r4, r4
+    sub      r4, r3
+%endif
+.pre:
+%ifndef ARCH_X86_64
+;unspill
+    xor      r4, r4
+    sub      r4, r3
+    mov      rtsin, [esp+4]
+    mov      rtcos, [esp+8]
+%endif
+
+    PREROTATER r4, r3, r2, rtcos, rtsin
+%ifdef ARCH_X86_64
+    movzx    r5,  word [rrevtab+r4*1-4]
+    movzx    r6,  word [rrevtab+r4*1-2]
+    movzx    r13, word [rrevtab+r3*1]
+    movzx    r14, word [rrevtab+r3*1+2]
+    PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0
+    PREROTATEW [r1+r13*8], [r1+r14*8], xmm1
+    add      r4, 4
+%else
+    mov      r6, [esp]
+    movzx    r5, word [r6+r4*1-4]
+    movzx    r4, word [r6+r4*1-2]
+    PREROTATEW [r1+r5*8], [r1+r4*8], xmm0
+    movzx    r5, word [r6+r3*1]
+    movzx    r4, word [r6+r3*1+2]
+    PREROTATEW [r1+r5*8], [r1+r4*8], xmm1
+%endif
+    sub      r3, 4
+    jns      .pre
+
+    mov      r5, r0
+    mov      r6, r1
+    mov      r0, r1
+    mov      r1d, [r5+FFTContext.nbits]
+
+    FFT_DISPATCH _sse, r1
+
+    mov      r0d, [r5+FFTContext.mdctsize]
+    add      r6, r0
+    shr      r0, 1
+%ifndef ARCH_X86_64
+%define rtcos r2
+%define rtsin r3
+    mov      rtcos, [esp+8]
+    mov      rtsin, [esp+4]
+%endif
+    neg      r0
+    mov      r1, -16
+    sub      r1, r0
+    POSROTATESHUF r0, r1, r6, rtcos, rtsin
+%ifdef ARCH_X86_64
+    pop      r14
+    pop      r13
+    pop      r12
+    pop      r11
+    pop      r10
+%else
+    add      esp, 12
+%endif
+    RET
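The routine thus follows the same three stages as the scalar C version: pre-rotate the input into output[revtab[k]], run the in-place FFT chosen by FFT_DISPATCH, then post-rotate in place. A hedged sketch of how a caller reaches this code through the MDCT API (function names from libavcodec's fft.h; error checking omitted and exact parameters may differ):

    #include "fft.h"   /* libavcodec-internal header */

    /* compute the first half of an inverse MDCT; dst and src each hold n/2 samples */
    static void imdct_half_example(FFTSample *dst, const FFTSample *src, int nbits)
    {
        FFTContext mdct;
        ff_mdct_init(&mdct, nbits, 1, 1.0);   /* inverse transform, unit scale */
        mdct.imdct_half(&mdct, dst, src);     /* dispatches to ff_imdct_half_sse
                                                 when SSE was detected at init */
        ff_mdct_end(&mdct);
    }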