Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12209:9eef00a43280 libavcodec
Make mmx VP8 WHT faster
Avoid pextrw, since it's slow on many older CPUs.
Now it doesn't require mmxext either.
author | darkshikari |
---|---|
date | Wed, 21 Jul 2010 20:51:01 +0000 |
parents | d38e8565ba05 |
children | baf13deed97e |
comparison
equal
deleted
inserted
replaced
12208:5d73c4b4cd37 | 12209:9eef00a43280 |
---|---|
1032 | 1032 |
1033 ;----------------------------------------------------------------------------- | 1033 ;----------------------------------------------------------------------------- |
1034 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) | 1034 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
1035 ;----------------------------------------------------------------------------- | 1035 ;----------------------------------------------------------------------------- |
1036 | 1036 |
1037 %macro SCATTER_WHT 1 | 1037 %macro SCATTER_WHT 3 |
1038 pextrw r1d, m0, %1 | 1038 movd r1d, m%1 |
1039 pextrw r2d, m1, %1 | 1039 movd r2d, m%2 |
1040 mov [r0+2*16*0], r1w | 1040 mov [r0+2*16*(0+%3)], r1w |
1041 mov [r0+2*16*1], r2w | 1041 mov [r0+2*16*(1+%3)], r2w |
1042 pextrw r1d, m2, %1 | 1042 shr r1d, 16 |
1043 pextrw r2d, m3, %1 | 1043 shr r2d, 16 |
1044 mov [r0+2*16*2], r1w | 1044 psrlq m%1, 32 |
1045 mov [r0+2*16*3], r2w | 1045 psrlq m%2, 32 |
| | 1046 mov [r0+2*16*(4+%3)], r1w |
| | 1047 mov [r0+2*16*(5+%3)], r2w |
| | 1048 movd r1d, m%1 |
| | 1049 movd r2d, m%2 |
| | 1050 mov [r0+2*16*(8+%3)], r1w |
| | 1051 mov [r0+2*16*(9+%3)], r2w |
| | 1052 shr r1d, 16 |
| | 1053 shr r2d, 16 |
| | 1054 mov [r0+2*16*(12+%3)], r1w |
| | 1055 mov [r0+2*16*(13+%3)], r2w |
1046 %endmacro | 1056 %endmacro |
1047 | 1057 |
1048 %macro HADAMARD4_1D 4 | 1058 %macro HADAMARD4_1D 4 |
1049 SUMSUB_BADC m%2, m%1, m%4, m%3 | 1059 SUMSUB_BADC m%2, m%1, m%4, m%3 |
1050 SUMSUB_BADC m%4, m%2, m%3, m%1 | 1060 SUMSUB_BADC m%4, m%2, m%3, m%1 |
1051 SWAP %1, %4, %3 | 1061 SWAP %1, %4, %3 |
1052 %endmacro | 1062 %endmacro |
1053 | 1063 |
1054 INIT_MMX | 1064 INIT_MMX |
1055 cglobal vp8_luma_dc_wht_mmxext, 2,3 | 1065 cglobal vp8_luma_dc_wht_mmx, 2,3 |
1056 movq m0, [r1] | 1066 movq m0, [r1] |
1057 movq m1, [r1+8] | 1067 movq m1, [r1+8] |
1058 movq m2, [r1+16] | 1068 movq m2, [r1+16] |
1059 movq m3, [r1+24] | 1069 movq m3, [r1+24] |
1060 HADAMARD4_1D 0, 1, 2, 3 | 1070 HADAMARD4_1D 0, 1, 2, 3 |
1063 HADAMARD4_1D 0, 1, 2, 3 | 1073 HADAMARD4_1D 0, 1, 2, 3 |
1064 psraw m0, 3 | 1074 psraw m0, 3 |
1065 psraw m1, 3 | 1075 psraw m1, 3 |
1066 psraw m2, 3 | 1076 psraw m2, 3 |
1067 psraw m3, 3 | 1077 psraw m3, 3 |
1068 SCATTER_WHT 0 | 1078 SCATTER_WHT 0, 1, 0 |
1069 add r0, 2*16*4 | 1079 SCATTER_WHT 2, 3, 2 |
1070 SCATTER_WHT 1 | |
1071 add r0, 2*16*4 | |
1072 SCATTER_WHT 2 | |
1073 add r0, 2*16*4 | |
1074 SCATTER_WHT 3 | |
1075 RET | 1080 RET |
1076 | 1081 |
1077 ;----------------------------------------------------------------------------- | 1082 ;----------------------------------------------------------------------------- |
1078 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); | 1083 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
1079 ;----------------------------------------------------------------------------- | 1084 ;----------------------------------------------------------------------------- |