comparison x86/h264_idct.asm @ 12511:41ebcc0afb40 libavcodec

Unroll the loop in h264_idct_add16intra_sse2(). Basically identical to r25171: this inlines scan8[] and removes the loop setup. 15% faster, 0.4% overall. See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the mailing list.
author rbultje
date Fri, 24 Sep 2010 14:07:23 +0000
parents ef2f2db5b7be
children 0b6bd91bbe57
comparison
equal deleted inserted replaced
12510:ef2f2db5b7be 12511:41ebcc0afb40
757 add16_sse2_cycle 5, 0x24 757 add16_sse2_cycle 5, 0x24
758 add16_sse2_cycle 6, 0x1e 758 add16_sse2_cycle 6, 0x1e
759 add16_sse2_cycle 7, 0x26 759 add16_sse2_cycle 7, 0x26
760 RET 760 RET
761 761
762 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, 762 %macro add16intra_sse2_cycle 2
763 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
764 cglobal h264_idct_add16intra_sse2, 5, 7, 8
765 xor r5, r5
766 %ifdef ARCH_X86_64
767 mov r10, r0
768 %endif
769 %ifdef PIC
770 lea r11, [scan8_mem]
771 %endif
772 .next2blocks
773 movzx r0, byte [scan8+r5]
774 movzx r0, word [r4+r0]
775 test r0, r0
776 jz .try_dc
777 mov r0d, dword [r1+r5*4]
778 %ifdef ARCH_X86_64
779 add r0, r10
780 %else
781 add r0, r0m
782 %endif
783 call x264_add8x4_idct_sse2
784 add r5, 2
785 add r2, 64
786 cmp r5, 16
787 jl .next2blocks
788 REP_RET
789 .try_dc
790 movsx r0, word [r2 ]
791 or r0w, word [r2+32]
792 jz .skip2blocks
793 mov r0d, dword [r1+r5*4]
794 %ifdef ARCH_X86_64
795 add r0, r10
796 %else
797 add r0, r0m
798 %endif
799 call h264_idct_dc_add8_mmx2
800 .skip2blocks
801 add r5, 2
802 add r2, 64
803 cmp r5, 16
804 jl .next2blocks
805 REP_RET
806
807 %macro add8_sse2_cycle 2
808 movzx r0, word [r4+%2] 763 movzx r0, word [r4+%2]
809 test r0, r0 764 test r0, r0
810 jz .try%1dc 765 jz .try%1dc
811 %ifdef ARCH_X86_64 766 mov r0d, dword [r1+%1*8]
812 mov r0d, dword [r1+%1*8+64] 767 %ifdef ARCH_X86_64
813 add r0, [r10] 768 add r0, r10
814 %else 769 %else
815 mov r0, r0m 770 add r0, r0m
816 mov r0, [r0]
817 add r0, dword [r1+%1*8+64]
818 %endif 771 %endif
819 call x264_add8x4_idct_sse2 772 call x264_add8x4_idct_sse2
820 jmp .cycle%1end 773 jmp .cycle%1end
821 .try%1dc 774 .try%1dc
822 movsx r0, word [r2 ] 775 movsx r0, word [r2 ]
823 or r0w, word [r2+32] 776 or r0w, word [r2+32]
824 jz .cycle%1end 777 jz .cycle%1end
778 mov r0d, dword [r1+%1*8]
779 %ifdef ARCH_X86_64
780 add r0, r10
781 %else
782 add r0, r0m
783 %endif
784 call h264_idct_dc_add8_mmx2
785 .cycle%1end
786 %if %1 < 7
787 add r2, 64
788 %endif
789 %endmacro
790
791 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
792 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
793 cglobal h264_idct_add16intra_sse2, 5, 7, 8
794 %ifdef ARCH_X86_64
795 mov r10, r0
796 %endif
797 add16intra_sse2_cycle 0, 0xc
798 add16intra_sse2_cycle 1, 0x14
799 add16intra_sse2_cycle 2, 0xe
800 add16intra_sse2_cycle 3, 0x16
801 add16intra_sse2_cycle 4, 0x1c
802 add16intra_sse2_cycle 5, 0x24
803 add16intra_sse2_cycle 6, 0x1e
804 add16intra_sse2_cycle 7, 0x26
805 RET
806
807 %macro add8_sse2_cycle 2
808 movzx r0, word [r4+%2]
809 test r0, r0
810 jz .try%1dc
811 %ifdef ARCH_X86_64
812 mov r0d, dword [r1+%1*8+64]
813 add r0, [r10]
814 %else
815 mov r0, r0m
816 mov r0, [r0]
817 add r0, dword [r1+%1*8+64]
818 %endif
819 call x264_add8x4_idct_sse2
820 jmp .cycle%1end
821 .try%1dc
822 movsx r0, word [r2 ]
823 or r0w, word [r2+32]
824 jz .cycle%1end
825 %ifdef ARCH_X86_64 825 %ifdef ARCH_X86_64
826 mov r0d, dword [r1+%1*8+64] 826 mov r0d, dword [r1+%1*8+64]
827 add r0, [r10] 827 add r0, [r10]
828 %else 828 %else
829 mov r0, r0m 829 mov r0, r0m