Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12013:2ae70e2c31a4 libavcodec
MMX idct_add for VP8.
author | rbultje |
---|---|
date | Tue, 29 Jun 2010 14:43:11 +0000 |
parents | d584c7373a64 |
children | 6fe72dbf2c7b |
comparison
equal
deleted
inserted
replaced
12012:2d70a8b0ec8a | 12013:2ae70e2c31a4 |
---|---|
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 | 139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 |
140 | 140 |
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | 142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |
144 | |
145 pw_20091: times 4 dw 20091 | |
146 pw_17734: times 4 dw 17734 | |
144 | 147 |
145 cextern pw_3 | 148 cextern pw_3 |
146 cextern pw_4 | 149 cextern pw_4 |
147 cextern pw_64 | 150 cextern pw_64 |
148 | 151 |
922 pextrd [r1], xmm2, 2 | 925 pextrd [r1], xmm2, 2 |
923 pextrd [r1+r2], xmm2, 3 | 926 pextrd [r1+r2], xmm2, 3 |
924 RET | 927 RET |
925 | 928 |
926 ;----------------------------------------------------------------------------- | 929 ;----------------------------------------------------------------------------- |
930 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
931 ;----------------------------------------------------------------------------- | |
932 | |
933 ; calculate %1=%1+%2; %2=%2-%1 (the original %1) on packed words; %3 is a temp register, left holding the original %1 | |
; NOTE(review): nothing in the visible code invokes SUMSUB (the 1-D transform below uses SUMSUB_BA) - confirm this macro is still needed | |
934 %macro SUMSUB 3 | |
935 mova %3, %1 | |
936 paddw %1, %2 | |
937 psubw %2, %3 | |
938 %endmacro | |
939 | |
940 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) | |
; (both right-hand sides use the values %1/%2 held on entry; %3/%4 are temp registers and are clobbered) | |
; mul_20091(x) is formed as pmulhw(x, m6) + x; mul_35468(x) as pmulhw(x << 1, m7), since 17734 = 35468/2 | |
941 ; this macro assumes that m6/m7 have words for 20091/17734 loaded | |
942 %macro VP8_MULTIPLY_SUMSUB 4 | |
943 mova %3, %1 | |
944 mova %4, %2 | |
945 pmulhw %3, m6 ;20091(1): high word of %1*20091 | |
946 pmulhw %4, m6 ;20091(2): high word of %2*20091 | |
947 paddw %3, %1 | |
948 paddw %4, %2 | |
949 psllw %1, 1 | |
950 psllw %2, 1 | |
951 pmulhw %1, m7 ;35468(1): high word of (2*%1)*17734 | |
952 pmulhw %2, m7 ;35468(2): high word of (2*%2)*17734 | |
953 psubw %1, %4 | |
954 paddw %2, %3 | |
955 %endmacro | |
956 | |
; one 1-D pass of the VP8 inverse transform; all six arguments are register numbers (the macro adds the m prefix) | |
957 ; calculate x0=%1+%3; x1=%1-%3 | |
958 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) | |
959 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) | |
960 ; %5/%6 are temporary registers (clobbered) | |
961 ; we assume m6/m7 have constant words 20091/17734 loaded in them | |
; NOTE(review): SUMSUB_BA and SWAP are not defined in the visible code; the output mapping above depends on their definitions - confirm | |
962 %macro VP8_IDCT_TRANSFORM4x4_1D 6 | |
963 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 | |
964 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 | |
965 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 | |
966 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 | |
967 SWAP %4, %1 | |
968 SWAP %4, %3 | |
969 %endmacro | |
970 | |
971 ; transpose a 4x4 table of words held in m%1..m%4 (one row per register); %5 is a temp register number | |
; NOTE(review): the visible code invokes TRANSPOSE4x4W, not this macro - confirm whether TRANSPOSE4x4 is still needed | |
972 %macro TRANSPOSE4x4 5 ; before the trailing SWAPs the columns sit in m%1/m%4/m%5/m%3 | |
973 mova m%5, m%1 | |
974 punpcklwd m%1, m%2 | |
975 punpckhwd m%5, m%2 | |
976 mova m%2, m%3 | |
977 punpcklwd m%3, m%4 | |
978 punpckhwd m%2, m%4 | |
979 mova m%4, m%1 | |
980 punpckldq m%1, m%3 ;col0 | |
981 punpckhdq m%4, m%3 ;col1 | |
982 mova m%3, m%5 | |
983 punpckldq m%5, m%2 ;col2 | |
984 punpckhdq m%3, m%2 ;col3 | |
; presumably the SWAPs below rename registers so that col0..col3 end up as %1..%4 and the scratch register as %5 - confirm against SWAP's definition | |
985 SWAP %4, %2 | |
986 SWAP %4, %5 | |
987 SWAP %4, %3 | |
988 %endmacro | |
989 | |
; MMX version of vp8_idct_add; presumably r0=dst, r1=block, r2=stride per the prototype comment above - confirm against cglobal's argument mapping | |
990 INIT_MMX | |
991 cglobal vp8_idct_add_mmx, 3, 3 | |
992 ; load block data: four 8-byte loads from r1 into m0-m3, then the 20091/17734 multiplier words into m6/m7 as the transform macros expect | |
993 movq m0, [r1] | |
994 movq m1, [r1+8] | |
995 movq m2, [r1+16] | |
996 movq m3, [r1+24] | |
997 movq m6, [pw_20091] | |
998 movq m7, [pw_17734] | |
999 | |
1000 ; actual IDCT: two 1-D passes, each followed by a 4x4 word transpose; pw_4 is added to m0 between the passes (m4/m5 are scratch) | |
1001 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1002 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1003 paddw m0, [pw_4] | |
1004 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1005 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1006 | |
1007 ; store: m4 is zeroed and r1 is set to r0+2*r2; presumably each STORE_DIFFx2 (defined elsewhere) shifts by 3 and adds two rows into the destination - confirm | |
; NOTE(review): the prototype declares stride as int but r2 is used at full register width in the address - confirm it is sign-extended on x86-64 | |
1008 pxor m4, m4 | |
1009 lea r1, [r0+2*r2] | |
1010 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 | |
1011 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | |
1012 | |
1013 RET | |
1014 | |
1015 ;----------------------------------------------------------------------------- | |
927 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) | 1016 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
928 ;----------------------------------------------------------------------------- | 1017 ;----------------------------------------------------------------------------- |
929 | 1018 |
930 %macro SCATTER_WHT 1 | 1019 %macro SCATTER_WHT 1 |
931 pextrw r1d, m0, %1 | 1020 pextrw r1d, m0, %1 |