comparison x86/vp8dsp.asm @ 12013:2ae70e2c31a4 libavcodec

MMX idct_add for VP8.
author rbultje
date Tue, 29 Jun 2010 14:43:11 +0000
parents d584c7373a64
children 6fe72dbf2c7b
comparison
equal deleted inserted replaced
12012:2d70a8b0ec8a 12013:2ae70e2c31a4
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
140 140
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
144
145 pw_20091: times 4 dw 20091 ; four words of 20091; loaded into m6 by vp8_idct_add_mmx
146 pw_17734: times 4 dw 17734 ; four words of 17734 (= 35468/2); loaded into m7 by vp8_idct_add_mmx
144 147
145 cextern pw_3 148 cextern pw_3
146 cextern pw_4 149 cextern pw_4
147 cextern pw_64 150 cextern pw_64
148 151
922 pextrd [r1], xmm2, 2 925 pextrd [r1], xmm2, 2
923 pextrd [r1+r2], xmm2, 3 926 pextrd [r1+r2], xmm2, 3
924 RET 927 RET
925 928
926 ;----------------------------------------------------------------------------- 929 ;-----------------------------------------------------------------------------
930 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
931 ;-----------------------------------------------------------------------------
932
933 ; calculate %1=%2+%1; %2=%2-%1 (right-hand sides use the input values), with %3=temp register
934 %macro SUMSUB 3 ; NOTE(review): the code visible in this excerpt calls SUMSUB_BA instead -- confirm this macro is used
935 mova %3, %1 ; keep the input %1, which the next instruction overwrites
936 paddw %1, %2 ; %1 = %1 + %2 (packed 16-bit words)
937 psubw %2, %3 ; %2 = %2 - input %1
938 %endmacro
939
940 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) (right-hand sides use the input values)
941 ; this macro assumes that m6/m7 have words for 20091/17734 loaded; %3/%4 are overwritten as temporaries
942 %macro VP8_MULTIPLY_SUMSUB 4
943 mova %3, %1 ; %3 = copy of input %1
944 mova %4, %2 ; %4 = copy of input %2
945 pmulhw %3, m6 ; %3 = high word of input %1 * 20091
946 pmulhw %4, m6 ; %4 = high word of input %2 * 20091
947 paddw %3, %1 ; %3 = mul_20091(%1): high-word product plus the input itself
948 paddw %4, %2 ; %4 = mul_20091(%2)
949 psllw %1, 1 ; double the input so that multiplying by 17734 acts as 35468 (= 2*17734)
950 psllw %2, 1 ; same for %2; 35468 itself does not fit in a signed 16-bit word
951 pmulhw %1, m7 ; %1 = mul_35468(input %1)
952 pmulhw %2, m7 ; %2 = mul_35468(input %2)
953 psubw %1, %4 ; %1 = mul_35468(%1) - mul_20091(%2)
954 paddw %2, %3 ; %2 = mul_35468(%2) + mul_20091(%1)
955 %endmacro
956
957 ; calculate x0=%1+%3; x1=%1-%3
958 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
959 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
960 ; %5/%6 are temporary registers; %1-%6 are register numbers (used as m%N), not register names
961 ; we assume m6/m7 have constant words 20091/17734 loaded in them
962 %macro VP8_IDCT_TRANSFORM4x4_1D 6
963 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 (x0/x1 above); SUMSUB_BA is defined outside this excerpt
964 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 (x2/x3 above), left in m%2/m%4
965 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
966 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
967 SWAP %4, %1 ; SWAP is defined outside this excerpt; presumably these two renames
968 SWAP %4, %3 ; put tmp0..tmp3 into %1..%4 as listed above -- TODO confirm
969 %endmacro
970
971 ; transpose a 4x4 table of words; %1-%4 = input rows (called a,b,c,d below), %5 = scratch; lane comments assume 4 words per register
972 %macro TRANSPOSE4x4 5 ; output in %1/%4/%5/%3 before the SWAPs; NOTE(review): the visible function calls TRANSPOSE4x4W instead -- confirm this macro is used
973 mova m%5, m%1 ; copy row a before it is overwritten
974 punpcklwd m%1, m%2 ; m%1 = a0 b0 a1 b1
975 punpckhwd m%5, m%2 ; m%5 = a2 b2 a3 b3
976 mova m%2, m%3 ; copy row c (row b is no longer needed)
977 punpcklwd m%3, m%4 ; m%3 = c0 d0 c1 d1
978 punpckhwd m%2, m%4 ; m%2 = c2 d2 c3 d3
979 mova m%4, m%1 ; copy a0 b0 a1 b1 (row d is no longer needed)
980 punpckldq m%1, m%3 ;col0 = a0 b0 c0 d0
981 punpckhdq m%4, m%3 ;col1 = a1 b1 c1 d1
982 mova m%3, m%5 ; copy a2 b2 a3 b3
983 punpckldq m%5, m%2 ;col2 = a2 b2 c2 d2
984 punpckhdq m%3, m%2 ;col3 = a3 b3 c3 d3
985 SWAP %4, %2 ; SWAP is defined outside this excerpt; presumably these three renames
986 SWAP %4, %5 ; leave col0..col3 in %1..%4 and the scratch register in %5
987 SWAP %4, %3 ; -- TODO confirm against the SWAP definition
988 %endmacro
989
990 INIT_MMX ; defined outside this excerpt; presumably selects 64-bit MMX registers for m0-m7
991 cglobal vp8_idct_add_mmx, 3, 3 ; presumably r0=dst, r1=block, r2=stride per the prototype above -- TODO confirm
992 ; load block data: four 8-byte loads covering 32 bytes at r1, plus the two multiplier constants
993 movq m0, [r1]
994 movq m1, [r1+8]
995 movq m2, [r1+16]
996 movq m3, [r1+24]
997 movq m6, [pw_20091] ; m6/m7 constants are required by VP8_IDCT_TRANSFORM4x4_1D
998 movq m7, [pw_17734]
999
1000 ; actual IDCT: two 1-D passes, each followed by a transpose
1001 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1002 TRANSPOSE4x4W 0, 1, 2, 3, 4 ; TRANSPOSE4x4W is defined outside this excerpt
1003 paddw m0, [pw_4] ; NOTE(review): presumably a rounding bias for the shift by 3 passed to STORE_DIFFx2 -- confirm
1004 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1005 TRANSPOSE4x4W 0, 1, 2, 3, 4
1006
1007 ; store
1008 pxor m4, m4 ; m4 = 0
1009 lea r1, [r0+2*r2] ; r1 = r0 + 2*r2; the block pointer in r1 is no longer needed
1010 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 ; STORE_DIFFx2 is defined outside this excerpt; presumably adds m0/m1 to the pixels at r0 and stores them -- confirm
1011 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 ; same for m2/m3 at r1 (= r0 + 2*r2)
1012
1013 RET
1014
1015 ;-----------------------------------------------------------------------------
927 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) 1016 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
928 ;----------------------------------------------------------------------------- 1017 ;-----------------------------------------------------------------------------
929 1018
930 %macro SCATTER_WHT 1 1019 %macro SCATTER_WHT 1
931 pextrw r1d, m0, %1 1020 pextrw r1d, m0, %1