Comparison view of x86/vp8dsp.asm @ changeset 12209:9eef00a43280 (libavcodec)

Make mmx VP8 WHT faster. Avoid pextrw, since it's slow on many older CPUs. Now it doesn't require mmxext either.
author darkshikari
date Wed, 21 Jul 2010 20:51:01 +0000
parents d38e8565ba05
children baf13deed97e
comparison (side-by-side; legend: equal / deleted / inserted / replaced)
parent 12208:5d73c4b4cd37 → this revision 12209:9eef00a43280
1032 1032
1033 ;----------------------------------------------------------------------------- 1033 ;-----------------------------------------------------------------------------
1034 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) 1034 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
1035 ;----------------------------------------------------------------------------- 1035 ;-----------------------------------------------------------------------------
1036 1036
1037 %macro SCATTER_WHT 1 1037 %macro SCATTER_WHT 3
1038 pextrw r1d, m0, %1 1038 movd r1d, m%1
1039 pextrw r2d, m1, %1 1039 movd r2d, m%2
1040 mov [r0+2*16*0], r1w 1040 mov [r0+2*16*(0+%3)], r1w
1041 mov [r0+2*16*1], r2w 1041 mov [r0+2*16*(1+%3)], r2w
1042 pextrw r1d, m2, %1 1042 shr r1d, 16
1043 pextrw r2d, m3, %1 1043 shr r2d, 16
1044 mov [r0+2*16*2], r1w 1044 psrlq m%1, 32
1045 mov [r0+2*16*3], r2w 1045 psrlq m%2, 32
1046 mov [r0+2*16*(4+%3)], r1w
1047 mov [r0+2*16*(5+%3)], r2w
1048 movd r1d, m%1
1049 movd r2d, m%2
1050 mov [r0+2*16*(8+%3)], r1w
1051 mov [r0+2*16*(9+%3)], r2w
1052 shr r1d, 16
1053 shr r2d, 16
1054 mov [r0+2*16*(12+%3)], r1w
1055 mov [r0+2*16*(13+%3)], r2w
1046 %endmacro 1056 %endmacro
1047 1057
1048 %macro HADAMARD4_1D 4 1058 %macro HADAMARD4_1D 4
1049 SUMSUB_BADC m%2, m%1, m%4, m%3 1059 SUMSUB_BADC m%2, m%1, m%4, m%3
1050 SUMSUB_BADC m%4, m%2, m%3, m%1 1060 SUMSUB_BADC m%4, m%2, m%3, m%1
1051 SWAP %1, %4, %3 1061 SWAP %1, %4, %3
1052 %endmacro 1062 %endmacro
1053 1063
1054 INIT_MMX 1064 INIT_MMX
1055 cglobal vp8_luma_dc_wht_mmxext, 2,3 1065 cglobal vp8_luma_dc_wht_mmx, 2,3
1056 movq m0, [r1] 1066 movq m0, [r1]
1057 movq m1, [r1+8] 1067 movq m1, [r1+8]
1058 movq m2, [r1+16] 1068 movq m2, [r1+16]
1059 movq m3, [r1+24] 1069 movq m3, [r1+24]
1060 HADAMARD4_1D 0, 1, 2, 3 1070 HADAMARD4_1D 0, 1, 2, 3
1063 HADAMARD4_1D 0, 1, 2, 3 1073 HADAMARD4_1D 0, 1, 2, 3
1064 psraw m0, 3 1074 psraw m0, 3
1065 psraw m1, 3 1075 psraw m1, 3
1066 psraw m2, 3 1076 psraw m2, 3
1067 psraw m3, 3 1077 psraw m3, 3
1068 SCATTER_WHT 0 1078 SCATTER_WHT 0, 1, 0
1069 add r0, 2*16*4 1079 SCATTER_WHT 2, 3, 2
1070 SCATTER_WHT 1
1071 add r0, 2*16*4
1072 SCATTER_WHT 2
1073 add r0, 2*16*4
1074 SCATTER_WHT 3
1075 RET 1080 RET
1076 1081
1077 ;----------------------------------------------------------------------------- 1082 ;-----------------------------------------------------------------------------
1078 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); 1083 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
1079 ;----------------------------------------------------------------------------- 1084 ;-----------------------------------------------------------------------------