comparison x86/dsputil_yasm.asm @ 10430:12c8175d6db5 libavcodec

simd add_hfyu_left_prediction 2.2x faster than C on conroe, 3.6x on penryn. 4-6% faster huffyuv decoding if using left or plane mode and yuv
author lorenm
date Sun, 18 Oct 2009 20:10:10 +0000
parents 31138c296ac6
children 546b7ebeaf07
comparison
equal deleted inserted replaced
10429:289dd8daf4ee 10430:12c8175d6db5
18 ;* License along with FFmpeg; if not, write to the Free Software 18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;****************************************************************************** 20 ;******************************************************************************
21 21
22 %include "x86inc.asm" 22 %include "x86inc.asm"
23
; Byte tables loaded into m3-m6 and used as pshufb control masks by the
; add_hfyu_left_prediction code in this file. In the label names each
; character is the source byte index (hex) selected for that lane; 'z' marks
; a lane whose control byte is -1 (pshufb writes zero to a lane whose control
; byte has its top bit set).
; NOTE(review): the XMM code loads these with mova, so this presumably relies
; on SECTION_RODATA providing 16-byte alignment - confirm in x86inc.asm.
24 SECTION_RODATA
25 pb_f: times 16 db 15 ; every lane selects byte 15 (m5 in the XMM entry point)
26 pb_zzzzzzzz77777777: times 8 db -1 ; only the 8 'z' bytes are emitted here; the XMM entry point loads 16 bytes from this label, so pb_7 must stay directly after it to supply the "77777777" half
27 pb_7: times 8 db 7 ; every lane selects byte 7 (m5 in the MMX entry point)
28 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 ; lanes 4-7 <- byte 3, lanes 12-15 <- byte 11, other lanes zero
29 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 ; lanes 2-3 <- byte 1, 6-7 <- byte 5, 10-11 <- byte 9, 14-15 <- byte 13, other lanes zero
23 30
24 section .text align=16 31 section .text align=16
25 32
26 %macro PSWAPD_SSE 2 33 %macro PSWAPD_SSE 2
27 pshufw %1, %2, 0x4e 34 pshufw %1, %2, 0x4e
148 movzx r2d, byte [dstq-1] 155 movzx r2d, byte [dstq-1]
149 mov [leftq], r2d 156 mov [leftq], r2d
150 movzx r2d, byte [topq-1] 157 movzx r2d, byte [topq-1]
151 mov [left_topq], r2d 158 mov [left_topq], r2d
152 RET 159 RET
160
161
; ADD_HFYU_LEFT_LOOP is_aligned
; Emits the loop and epilogue (including RET) of add_hfyu_left_prediction:
; a running byte-wise sum (paddb, so modulo 256) of src is written to dst,
; mmsize bytes per iteration.
; Register state expected on entry (set up by the two entry points below):
;   srcq, dstq = start of the buffers, wq = number of bytes
;   m0 = incoming left value in its highest byte
;   m3 = pb_zz11zz55zz99zzdd, m4 = pb_zzzz3333zzzzbbbb
;   m5 = mask selecting the highest byte for every lane (pb_7 or pb_f)
;   m6 = pb_zzzzzzzz77777777 (only read when mmsize == 16)
; %1 != 0: each result vector is stored with one mova.
; %1 == 0: each result vector is stored as two 8-byte halves (movq + movhps).
; The loop is bottom-tested, so the body always runs at least once, and it
; always loads and stores whole vectors: when w is not a multiple of mmsize
; the last iteration reads and writes bytes past src+w / dst+w.
; Result: low byte of eax = last summed byte (see the epilogue comments).
162 %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
163 add srcq, wq ; srcq = src + w (end of the row)
164 add dstq, wq ; dstq = dst + w
165 neg wq ; wq = -w: negative offset from the end, stepped up towards 0
166 %%.loop:
167 mova m1, [srcq+wq] ; m1 = next mmsize source bytes
168 mova m2, m1
169 psllw m1, 8 ; in each 16-bit word, move the low byte into the high byte (low byte becomes 0)
170 paddb m1, m2 ; every byte = sum of the source bytes up to it within its 2-byte pair
171 mova m2, m1
172 pshufb m1, m3 ; spread the total of each even pair over the pair that follows it, zero elsewhere
173 paddb m1, m2 ; every byte = prefix sum within its 4-byte group
174 pshufb m0, m5 ; broadcast the highest byte of m0: the carry-in (left on the first pass, afterwards the last output byte of the previous vector)
175 mova m2, m1
176 pshufb m1, m4 ; spread the total of each first 4-byte group over the 4 bytes that follow it
177 paddb m1, m2 ; every byte = prefix sum within its 8-byte group
178 %if mmsize == 16
179 mova m2, m1
180 pshufb m1, m6 ; spread the total of the low 8 bytes (byte 7) over bytes 8-15
181 paddb m1, m2 ; every byte = prefix sum over the whole 16-byte vector
182 %endif
183 paddb m0, m1 ; add the carry-in to every lane: m0 = output bytes for this vector
184 %if %1
185 mova [dstq+wq], m0 ; single full-width store
186 %else
187 movq [dstq+wq], m0 ; low 8 bytes
188 movhps [dstq+wq+8], m0 ; high 8 bytes
189 %endif
190 add wq, mmsize
191 jl %%.loop ; keep going while the offset is still negative
192 mov eax, mmsize-1
193 sub eax, wd ; wd = how far the last vector ran past w (>= 0 here), so eax = lane of element w-1 when w > 0
194 movd m1, eax ; control vector: byte 0 = that lane index, remaining bytes = 0
195 pshufb m0, m1 ; byte 0 = selected output byte; the other lanes become copies of m0's byte 0
196 movd eax, m0 ; return the low 32 bits: only the low byte is the final sum, bits 8-31 are those copies (presumably ignored by callers - verify)
197 RET
198 %endmacro
199
200 ; int ff_add_hfyu_left_prediction(uint8_t *dst, uint8_t *src, int w, int left)
; 8-byte (MMX register) variant: INIT_MMX makes mmsize == 8 for the macro
; expansion below, so the 16-byte-only step of the loop is compiled out.
; NOTE(review): the "3,3,7" cglobal arguments and the leftm operand follow
; x86inc conventions (presumably args loaded / GPRs used / vector regs used,
; and the 4th argument's location) - confirm against x86inc.asm.
201 INIT_MMX
202 cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
203 .skip_prologue: ; entry point used by the sse4 version when src is not 16-byte aligned
204 mova m5, [pb_7 GLOBAL] ; m5 = broadcast-byte-7 mask (carry propagation between vectors)
205 mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] ; first 8 bytes only: lanes 4-7 <- byte 3
206 mova m3, [pb_zz11zz55zz99zzdd GLOBAL] ; first 8 bytes only: lanes 2-3 <- byte 1, lanes 6-7 <- byte 5
207 movd m0, leftm ; m0 = left in the low bytes
208 psllq m0, 56 ; move the low byte of left up to byte 7, where the loop expects the carry-in
209 ADD_HFYU_LEFT_LOOP 1 ; expands to the loop, the return-value epilogue and RET
210
; 16-byte (XMM register) variant of add_hfyu_left_prediction, same arguments
; as the ssse3 version: dst, src, w, left.
; Dispatch on pointer alignment:
;   src not 16-byte aligned -> jump into the 8-byte ssse3 version
;   dst not 16-byte aligned -> XMM loop with split 8-byte stores
;   both aligned            -> XMM loop with full-width stores
211 INIT_XMM
212 cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
213 mova m5, [pb_f GLOBAL] ; m5 = broadcast-byte-15 mask (carry propagation between vectors)
214 mova m6, [pb_zzzzzzzz77777777 GLOBAL] ; 16-byte load spanning this label's 8 bytes plus the pb_7 bytes that follow it
215 mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
216 mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
217 movd m0, leftm ; m0 = left in the low bytes
218 pslldq m0, 15 ; move the low byte of left up to byte 15, where the loop expects the carry-in
219 test srcq, 15
220 jnz ff_add_hfyu_left_prediction_ssse3 %+ .skip_prologue ; src unaligned: the loop's mova load cannot be used at 16 bytes, so run the MMX version, which sets up its own m0/m3-m5
221 test dstq, 15
222 jnz .unaligned
223 ADD_HFYU_LEFT_LOOP 1 ; src and dst aligned; the expansion ends in RET, so there is no fall-through into .unaligned
224 .unaligned:
225 ADD_HFYU_LEFT_LOOP 0 ; dst unaligned: store each vector as movq + movhps
226