Mercurial > libavcodec.hg
comparison x86/h264_idct.asm @ 12511:41ebcc0afb40 libavcodec
Unroll the loop in h264_idct_add16intra_sse2(). Basically identical to r25171: this
inlines the scan8[] lookups and removes the loop setup. The function is 15% faster, 0.4% faster overall.
See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the mailing list (ML).
author | rbultje |
---|---|
date | Fri, 24 Sep 2010 14:07:23 +0000 |
parents | ef2f2db5b7be |
children | 0b6bd91bbe57 |
comparison
equal
deleted
inserted
replaced
12510:ef2f2db5b7be | 12511:41ebcc0afb40 |
---|---|
757 add16_sse2_cycle 5, 0x24 | 757 add16_sse2_cycle 5, 0x24 |
758 add16_sse2_cycle 6, 0x1e | 758 add16_sse2_cycle 6, 0x1e |
759 add16_sse2_cycle 7, 0x26 | 759 add16_sse2_cycle 7, 0x26 |
760 RET | 760 RET |
761 | 761 |
762 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, | 762 %macro add16intra_sse2_cycle 2 |
763 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | |
764 cglobal h264_idct_add16intra_sse2, 5, 7, 8 | |
765 xor r5, r5 | |
766 %ifdef ARCH_X86_64 | |
767 mov r10, r0 | |
768 %endif | |
769 %ifdef PIC | |
770 lea r11, [scan8_mem] | |
771 %endif | |
772 .next2blocks | |
773 movzx r0, byte [scan8+r5] | |
774 movzx r0, word [r4+r0] | |
775 test r0, r0 | |
776 jz .try_dc | |
777 mov r0d, dword [r1+r5*4] | |
778 %ifdef ARCH_X86_64 | |
779 add r0, r10 | |
780 %else | |
781 add r0, r0m | |
782 %endif | |
783 call x264_add8x4_idct_sse2 | |
784 add r5, 2 | |
785 add r2, 64 | |
786 cmp r5, 16 | |
787 jl .next2blocks | |
788 REP_RET | |
789 .try_dc | |
790 movsx r0, word [r2 ] | |
791 or r0w, word [r2+32] | |
792 jz .skip2blocks | |
793 mov r0d, dword [r1+r5*4] | |
794 %ifdef ARCH_X86_64 | |
795 add r0, r10 | |
796 %else | |
797 add r0, r0m | |
798 %endif | |
799 call h264_idct_dc_add8_mmx2 | |
800 .skip2blocks | |
801 add r5, 2 | |
802 add r2, 64 | |
803 cmp r5, 16 | |
804 jl .next2blocks | |
805 REP_RET | |
806 | |
807 %macro add8_sse2_cycle 2 | |
808 movzx r0, word [r4+%2] | 763 movzx r0, word [r4+%2] |
809 test r0, r0 | 764 test r0, r0 |
810 jz .try%1dc | 765 jz .try%1dc |
811 %ifdef ARCH_X86_64 | 766 mov r0d, dword [r1+%1*8] |
812 mov r0d, dword [r1+%1*8+64] | 767 %ifdef ARCH_X86_64 |
813 add r0, [r10] | 768 add r0, r10 |
814 %else | 769 %else |
815 mov r0, r0m | 770 add r0, r0m |
816 mov r0, [r0] | |
817 add r0, dword [r1+%1*8+64] | |
818 %endif | 771 %endif |
819 call x264_add8x4_idct_sse2 | 772 call x264_add8x4_idct_sse2 |
820 jmp .cycle%1end | 773 jmp .cycle%1end |
821 .try%1dc | 774 .try%1dc |
822 movsx r0, word [r2 ] | 775 movsx r0, word [r2 ] |
823 or r0w, word [r2+32] | 776 or r0w, word [r2+32] |
824 jz .cycle%1end | 777 jz .cycle%1end |
778 mov r0d, dword [r1+%1*8] | |
779 %ifdef ARCH_X86_64 | |
780 add r0, r10 | |
781 %else | |
782 add r0, r0m | |
783 %endif | |
784 call h264_idct_dc_add8_mmx2 | |
785 .cycle%1end | |
786 %if %1 < 7 | |
787 add r2, 64 | |
788 %endif | |
789 %endmacro | |
790 | |
791 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, | |
792 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | |
793 cglobal h264_idct_add16intra_sse2, 5, 7, 8 | |
794 %ifdef ARCH_X86_64 | |
795 mov r10, r0 | |
796 %endif | |
797 add16intra_sse2_cycle 0, 0xc | |
798 add16intra_sse2_cycle 1, 0x14 | |
799 add16intra_sse2_cycle 2, 0xe | |
800 add16intra_sse2_cycle 3, 0x16 | |
801 add16intra_sse2_cycle 4, 0x1c | |
802 add16intra_sse2_cycle 5, 0x24 | |
803 add16intra_sse2_cycle 6, 0x1e | |
804 add16intra_sse2_cycle 7, 0x26 | |
805 RET | |
806 | |
807 %macro add8_sse2_cycle 2 | |
808 movzx r0, word [r4+%2] | |
809 test r0, r0 | |
810 jz .try%1dc | |
811 %ifdef ARCH_X86_64 | |
812 mov r0d, dword [r1+%1*8+64] | |
813 add r0, [r10] | |
814 %else | |
815 mov r0, r0m | |
816 mov r0, [r0] | |
817 add r0, dword [r1+%1*8+64] | |
818 %endif | |
819 call x264_add8x4_idct_sse2 | |
820 jmp .cycle%1end | |
821 .try%1dc | |
822 movsx r0, word [r2 ] | |
823 or r0w, word [r2+32] | |
824 jz .cycle%1end | |
825 %ifdef ARCH_X86_64 | 825 %ifdef ARCH_X86_64 |
826 mov r0d, dword [r1+%1*8+64] | 826 mov r0d, dword [r1+%1*8+64] |
827 add r0, [r10] | 827 add r0, [r10] |
828 %else | 828 %else |
829 mov r0, r0m | 829 mov r0, r0m |