comparison x86/vp8dsp.asm @ 12241:c7f6ddcc5c01 libavcodec

VP8: optimize DC-only chroma case in the same way as luma. Add MMX idct_dc_add4uv function for this case. ~40% faster chroma idct.
author darkshikari
date Fri, 23 Jul 2010 06:02:52 +0000
parents 13b1ad24a4b1
children 48d6738904a9
comparison
equal deleted inserted replaced
12240:e6ade5e849c9 12241:c7f6ddcc5c01
974 pextrd [r1], m2, 2 974 pextrd [r1], m2, 2
975 pextrd [r1+r2], m2, 3 975 pextrd [r1+r2], m2, 3
976 RET 976 RET
977 977
978 ;----------------------------------------------------------------------------- 978 ;-----------------------------------------------------------------------------
979 ; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); 979 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
980 ;----------------------------------------------------------------------------- 980 ;-----------------------------------------------------------------------------
981 981
982 INIT_MMX 982 INIT_MMX
983 cglobal vp8_idct_dc_add4_mmx, 3, 3 983 cglobal vp8_idct_dc_add4y_mmx, 3, 3
984 ; load data 984 ; load data
985 movd m0, [r1+32*0] ; A 985 movd m0, [r1+32*0] ; A
986 movd m1, [r1+32*2] ; C 986 movd m1, [r1+32*2] ; C
987 punpcklwd m0, [r1+32*1] ; A B 987 punpcklwd m0, [r1+32*1] ; A B
988 punpcklwd m1, [r1+32*3] ; C D 988 punpcklwd m1, [r1+32*3] ; C D
1013 ADD_DC m0, m6, 0, mova 1013 ADD_DC m0, m6, 0, mova
1014 ADD_DC m1, m7, 8, mova 1014 ADD_DC m1, m7, 8, mova
1015 RET 1015 RET
1016 1016
1017 INIT_XMM 1017 INIT_XMM
1018 cglobal vp8_idct_dc_add4_sse2, 3, 3 1018 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
1019 ; load data 1019 ; load data
1020 movd m0, [r1+32*0] ; A 1020 movd m0, [r1+32*0] ; A
1021 movd m1, [r1+32*2] ; C 1021 movd m1, [r1+32*2] ; C
1022 punpcklwd m0, [r1+32*1] ; A B 1022 punpcklwd m0, [r1+32*1] ; A B
1023 punpcklwd m1, [r1+32*3] ; C D 1023 punpcklwd m1, [r1+32*3] ; C D
1040 punpcklbw m1, m1 1040 punpcklbw m1, m1
1041 1041
1042 ; add DC 1042 ; add DC
1043 lea r1, [r0+r2*2] 1043 lea r1, [r0+r2*2]
1044 ADD_DC m0, m1, 0, mova 1044 ADD_DC m0, m1, 0, mova
1045 RET
1046
1047 ;-----------------------------------------------------------------------------
1048 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
1049 ;-----------------------------------------------------------------------------
1050
1051 INIT_MMX
1052 cglobal vp8_idct_dc_add4uv_mmx, 3, 3 ; presumably r0 = dst, r1 = block, r2 = stride (per the prototype above) - confirm against cglobal's definition
1053 ; load the DC coefficient of each of the four blocks (consecutive blocks are 32 bytes apart)
1054 movd m0, [r1+32*0] ; A
1055 movd m1, [r1+32*2] ; C
1056 punpcklwd m0, [r1+32*1] ; A B
1057 punpcklwd m1, [r1+32*3] ; C D
1058 punpckldq m0, m1 ; A B C D
1059 pxor m6, m6 ; m6 = 0: source for the zeroing stores below and for the negation
1060
1061 ; calculate DC: m0 = (dc + rounding constant) >> 3 per word, m6 = -m0; zero the coefficients just read
1062 paddw m0, [pw_4] ; rounding term before the shift (pw_4 is defined elsewhere; presumably packed words of 4)
1063 movd [r1+32*0], m6 ; write 4 zero bytes over the start of block 0 (covers its DC word)
1064 movd [r1+32*1], m6 ; same for block 1
1065 movd [r1+32*2], m6 ; same for block 2
1066 movd [r1+32*3], m6 ; same for block 3
1067 psraw m0, 3 ; arithmetic shift keeps the sign of each DC
1068 psubw m6, m0 ; m6 = 0 - dc, i.e. the negated DC values
1069 packuswb m0, m0 ; unsigned saturation: negative DCs become 0, positive ones are kept
1070 packuswb m6, m6 ; unsigned saturation: keeps only the magnitudes of negative DCs
1071 punpcklbw m0, m0 ; AABBCCDD
1072 punpcklbw m6, m6 ; AABBCCDD (negated values)
1073 movq m1, m0 ; copy so the C/D half can be expanded separately
1074 movq m7, m6 ; same for the negated values
1075 punpcklbw m0, m0 ; AAAABBBB
1076 punpckhbw m1, m1 ; CCCCDDDD
1077 punpcklbw m6, m6 ; AAAABBBB (negated values)
1078 punpckhbw m7, m7 ; CCCCDDDD (negated values)
1079
1080 ; add DC: ADD_DC is defined elsewhere; presumably it adds its 1st operand and subtracts its 2nd, with saturation, on rows addressed via r0/r1 - confirm
1081 lea r1, [r0+r2*2] ; r1 = r0 + 2*stride
1082 ADD_DC m0, m6, 0, mova ; apply the A|B pattern at byte offset 0
1083 lea r0, [r0+r2*4] ; advance r0 by 4 rows
1084 lea r1, [r1+r2*4] ; advance r1 by 4 rows
1085 ADD_DC m1, m7, 0, mova ; apply the C|D pattern at byte offset 0
1045 RET 1086 RET
1046 1087
1047 ;----------------------------------------------------------------------------- 1088 ;-----------------------------------------------------------------------------
1048 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); 1089 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
1049 ;----------------------------------------------------------------------------- 1090 ;-----------------------------------------------------------------------------