Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12241:c7f6ddcc5c01 libavcodec
VP8: optimize DC-only chroma case in the same way as luma.
Add MMX idct_dc_add4uv function for this case.
~40% faster chroma idct.
author | darkshikari |
---|---|
date | Fri, 23 Jul 2010 06:02:52 +0000 |
parents | 13b1ad24a4b1 |
children | 48d6738904a9 |
Comparison legend: equal, deleted, inserted, replaced
12240:e6ade5e849c9 | 12241:c7f6ddcc5c01 |
---|---|
974 pextrd [r1], m2, 2 | 974 pextrd [r1], m2, 2 |
975 pextrd [r1+r2], m2, 3 | 975 pextrd [r1+r2], m2, 3 |
976 RET | 976 RET |
977 | 977 |
978 ;----------------------------------------------------------------------------- | 978 ;----------------------------------------------------------------------------- |
979 ; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); | 979 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
980 ;----------------------------------------------------------------------------- | 980 ;----------------------------------------------------------------------------- |
981 | 981 |
982 INIT_MMX | 982 INIT_MMX |
983 cglobal vp8_idct_dc_add4_mmx, 3, 3 | 983 cglobal vp8_idct_dc_add4y_mmx, 3, 3 |
984 ; load data | 984 ; load data |
985 movd m0, [r1+32*0] ; A | 985 movd m0, [r1+32*0] ; A |
986 movd m1, [r1+32*2] ; C | 986 movd m1, [r1+32*2] ; C |
987 punpcklwd m0, [r1+32*1] ; A B | 987 punpcklwd m0, [r1+32*1] ; A B |
988 punpcklwd m1, [r1+32*3] ; C D | 988 punpcklwd m1, [r1+32*3] ; C D |
1013 ADD_DC m0, m6, 0, mova | 1013 ADD_DC m0, m6, 0, mova |
1014 ADD_DC m1, m7, 8, mova | 1014 ADD_DC m1, m7, 8, mova |
1015 RET | 1015 RET |
1016 | 1016 |
1017 INIT_XMM | 1017 INIT_XMM |
1018 cglobal vp8_idct_dc_add4_sse2, 3, 3 | 1018 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 |
1019 ; load data | 1019 ; load data |
1020 movd m0, [r1+32*0] ; A | 1020 movd m0, [r1+32*0] ; A |
1021 movd m1, [r1+32*2] ; C | 1021 movd m1, [r1+32*2] ; C |
1022 punpcklwd m0, [r1+32*1] ; A B | 1022 punpcklwd m0, [r1+32*1] ; A B |
1023 punpcklwd m1, [r1+32*3] ; C D | 1023 punpcklwd m1, [r1+32*3] ; C D |
1040 punpcklbw m1, m1 | 1040 punpcklbw m1, m1 |
1041 | 1041 |
1042 ; add DC | 1042 ; add DC |
1043 lea r1, [r0+r2*2] | 1043 lea r1, [r0+r2*2] |
1044 ADD_DC m0, m1, 0, mova | 1044 ADD_DC m0, m1, 0, mova |
1045 RET | |
1046 | |
1047 ;----------------------------------------------------------------------------- | |
1048 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); | |
1049 ;----------------------------------------------------------------------------- | |
1050 | |
1051 INIT_MMX | |
1052 cglobal vp8_idct_dc_add4uv_mmx, 3, 3 | |
1053 ; load data | |
1054 movd m0, [r1+32*0] ; A | |
1055 movd m1, [r1+32*2] ; C | |
1056 punpcklwd m0, [r1+32*1] ; A B | |
1057 punpcklwd m1, [r1+32*3] ; C D | |
1058 punpckldq m0, m1 ; A B C D | |
1059 pxor m6, m6 | |
1060 | |
1061 ; calculate DC | |
1062 paddw m0, [pw_4] | |
1063 movd [r1+32*0], m6 | |
1064 movd [r1+32*1], m6 | |
1065 movd [r1+32*2], m6 | |
1066 movd [r1+32*3], m6 | |
1067 psraw m0, 3 | |
1068 psubw m6, m0 | |
1069 packuswb m0, m0 | |
1070 packuswb m6, m6 | |
1071 punpcklbw m0, m0 ; AABBCCDD | |
1072 punpcklbw m6, m6 ; AABBCCDD | |
1073 movq m1, m0 | |
1074 movq m7, m6 | |
1075 punpcklbw m0, m0 ; AAAABBBB | |
1076 punpckhbw m1, m1 ; CCCCDDDD | |
1077 punpcklbw m6, m6 ; AAAABBBB | |
1078 punpckhbw m7, m7 ; CCCCDDDD | |
1079 | |
1080 ; add DC | |
1081 lea r1, [r0+r2*2] | |
1082 ADD_DC m0, m6, 0, mova | |
1083 lea r0, [r0+r2*4] | |
1084 lea r1, [r1+r2*4] | |
1085 ADD_DC m1, m7, 0, mova | |
1045 RET | 1086 RET |
1046 | 1087 |
1047 ;----------------------------------------------------------------------------- | 1088 ;----------------------------------------------------------------------------- |
1048 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | 1089 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
1049 ;----------------------------------------------------------------------------- | 1090 ;----------------------------------------------------------------------------- |