Mercurial > libavcodec.hg
comparison x86/dsputil_yasm.asm @ 10430:12c8175d6db5 libavcodec
simd add_hfyu_left_prediction
2.2x faster than C on conroe, 3.6x on penryn.
4-6% faster huffyuv decoding if using left or plane mode and yuv
author | lorenm |
---|---|
date | Sun, 18 Oct 2009 20:10:10 +0000 |
parents | 31138c296ac6 |
children | 546b7ebeaf07 |
comparison
equal
deleted
inserted
replaced
10429:289dd8daf4ee | 10430:12c8175d6db5 |
---|---|
18 ;* License along with FFmpeg; if not, write to the Free Software | 18 ;* License along with FFmpeg; if not, write to the Free Software |
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 ;****************************************************************************** | 20 ;****************************************************************************** |
21 | 21 |
22 %include "x86inc.asm" | 22 %include "x86inc.asm" |
23 | |
; Constant byte vectors used as pshufb control masks by the
; add_hfyu_left_prediction routines. A control byte of -1 has its top bit set,
; which makes pshufb write zero to that lane; any other value used here is the
; index of the source byte copied into that lane.
; (The "z" in the names presumably stands for a zeroed lane, and the hex digit
; for the source byte index.)
24 SECTION_RODATA | |
; 16 x 15: copies byte 15 into every lane of a 16-byte register.
25 pb_f: times 16 db 15 | |
; Only 8 bytes are defined under this label. The sse4 routine loads 16 bytes
; from it, so it relies on pb_7 following immediately: lanes 0-7 are zeroed
; and lanes 8-15 receive byte 7. Do not separate or reorder these two lines.
26 pb_zzzzzzzz77777777: times 8 db -1 | |
; 8 x 7: copies byte 7 into every lane of an 8-byte register.
27 pb_7: times 8 db 7 | |
; Lanes 4-7 receive byte 3 and lanes 12-15 receive byte 11; the rest are zeroed.
28 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 | |
; Lanes 2-3, 6-7, 10-11, 14-15 receive bytes 1, 5, 9, 13; the rest are zeroed.
29 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 | |
23 | 30 |
24 section .text align=16 | 31 section .text align=16 |
25 | 32 |
26 %macro PSWAPD_SSE 2 | 33 %macro PSWAPD_SSE 2 |
27 pshufw %1, %2, 0x4e | 34 pshufw %1, %2, 0x4e |
148 movzx r2d, byte [dstq-1] | 155 movzx r2d, byte [dstq-1] |
149 mov [leftq], r2d | 156 mov [leftq], r2d |
150 movzx r2d, byte [topq-1] | 157 movzx r2d, byte [topq-1] |
151 mov [left_topq], r2d | 158 mov [left_topq], r2d |
152 RET | 159 RET |
160 | |
161 | |
; ADD_HFYU_LEFT_LOOP is_aligned
; Loop body shared by the add_hfyu_left_prediction entry points. It writes to
; dst the running byte-wise sum (mod 256) of src, seeded with the carry in m0.
; Expected on entry:
;   srcq, dstq, wq - source pointer, destination pointer, byte count
;   m0             - carry-in byte in the highest lane of the register
;   m3, m4, m6     - pshufb masks for the 2-, 4- and 8-byte combine steps
;                    (m6 is only read when mmsize == 16)
;   m5             - pshufb mask that broadcasts the highest lane
; %1 selects the store: non-zero uses one mova, zero uses two 8-byte stores.
; Whole mmsize-byte vectors are processed until wq is no longer negative, so
; if w is not a multiple of mmsize the last iteration reads and writes past w.
; On exit the low byte of eax holds the last output byte that lies inside w.
; Modifies m0-m2, eax, srcq, dstq and wq. The macro ends in RET.
; NOTE(review): the prototype comment declares w as int while the loop indexes
; with the full-width wq - confirm the upper bits are valid on 64-bit targets.
162 %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned | |
; Point src/dst at the end of the row and let wq count up from -w towards 0,
; so a single register serves as both index and loop counter.
163 add srcq, wq | |
164 add dstq, wq | |
165 neg wq | |
166 %%.loop: | |
; Load mmsize source bytes.
167 mova m1, [srcq+wq] | |
; Step 1: within each 16-bit word, add the low byte into the high byte
; (the word shift moves byte 2k into the position of byte 2k+1).
168 mova m2, m1 | |
169 psllw m1, 8 | |
170 paddb m1, m2 | |
; Step 2: add each pair's total (bytes 1, 5, 9, 13) into the following pair.
171 mova m2, m1 | |
172 pshufb m1, m3 | |
173 paddb m1, m2 | |
; Broadcast the highest lane of m0 (the carry-in on the first pass, the last
; output byte of the previous vector afterwards) to every lane.
174 pshufb m0, m5 | |
; Step 3: add each 4-byte group's total (bytes 3, 11) into the next 4 bytes.
175 mova m2, m1 | |
176 pshufb m1, m4 | |
177 paddb m1, m2 | |
178 %if mmsize == 16 | |
; Step 4 (16-byte registers only): add the low half's total (byte 7) into
; every byte of the high half.
179 mova m2, m1 | |
180 pshufb m1, m6 | |
181 paddb m1, m2 | |
182 %endif | |
; Add the broadcast carry: m0 now holds the finished outputs for this vector.
183 paddb m0, m1 | |
184 %if %1 | |
185 mova [dstq+wq], m0 | |
186 %else | |
; Split store: low 8 bytes, then high 8 bytes.
187 movq [dstq+wq], m0 | |
188 movhps [dstq+wq+8], m0 | |
189 %endif | |
190 add wq, mmsize | |
191 jl %%.loop | |
; wd is now the number of bytes processed beyond w, so (mmsize-1 - wd) is the
; lane of the last output byte inside w. movd leaves that index in lane 0 of
; m1, pshufb moves the selected byte into lane 0 of m0, and movd returns it
; in the low byte of eax.
192 mov eax, mmsize-1 | |
193 sub eax, wd | |
194 movd m1, eax | |
195 pshufb m0, m1 | |
196 movd eax, m0 | |
197 RET | |
198 %endmacro | |
199 | |
; SSSE3 entry point, assembled under INIT_MMX. It loads the pshufb masks, seeds
; the carry register with `left`, and runs ADD_HFYU_LEFT_LOOP with the
; single-mova store. The macro supplies the RET and leaves the return value
; in eax.
200 ; int ff_add_hfyu_left_prediction(uint8_t *dst, uint8_t *src, int w, int left) | |
201 INIT_MMX | |
202 cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left | |
; The sse4 entry point jumps here when src is not 16-byte aligned. Everything
; the loop needs (m3, m4, m5, m0) is set up after this label.
203 .skip_prologue: | |
; m5 broadcasts byte 7; m4 and m3 are the 4-byte and 2-byte combine masks.
204 mova m5, [pb_7 GLOBAL] | |
205 mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] | |
206 mova m3, [pb_zz11zz55zz99zzdd GLOBAL] | |
; Put the low byte of `left` in the top byte (bits 56-63) of m0; the loop
; broadcasts that lane as the initial carry. The lower bytes are shifted out.
207 movd m0, leftm | |
208 psllq m0, 56 | |
209 ADD_HFYU_LEFT_LOOP 1 | |
210 | |
; SSE4 entry point, assembled under INIT_XMM. Same contract as the ssse3
; version. It picks one of three paths depending on pointer alignment:
;   src not 16-byte aligned -> jump into the ssse3 routine
;   dst not 16-byte aligned -> loop with the split 8+8 byte store
;   both aligned            -> loop with a single mova store
211 INIT_XMM | |
212 cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left | |
; m5 broadcasts byte 15; m6, m4 and m3 are the 8-, 4- and 2-byte combine masks.
213 mova m5, [pb_f GLOBAL] | |
; This 16-byte load spans pb_zzzzzzzz77777777 and the 8 bytes that follow it.
214 mova m6, [pb_zzzzzzzz77777777 GLOBAL] | |
215 mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] | |
216 mova m3, [pb_zz11zz55zz99zzdd GLOBAL] | |
; Put the low byte of `left` in byte 15 of m0; the loop broadcasts that lane
; as the initial carry.
217 movd m0, leftm | |
218 pslldq m0, 15 | |
; The loop loads src with mova, so fall back to the ssse3 code when src is not
; 16-byte aligned. That code redoes its own register setup after the label.
; NOTE(review): the target spells out the ff_ prefix by hand while the label
; is declared through cglobal - confirm this matches the symbol cglobal
; emits, including any platform symbol prefix.
219 test srcq, 15 | |
220 jnz ff_add_hfyu_left_prediction_ssse3 %+ .skip_prologue | |
221 test dstq, 15 | |
222 jnz .unaligned | |
; Aligned path. The macro ends in RET, so it does not fall into .unaligned.
223 ADD_HFYU_LEFT_LOOP 1 | |
224 .unaligned: | |
225 ADD_HFYU_LEFT_LOOP 0 | |
226 |