comparison x86/dsputil_yasm.asm @ 10644:5da7180afadf libavcodec

refactor and optimize scalarproduct

29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to
ssse3 cachesplit avoidance and I haven't written the full gamut of other
cachesplit modes.)
9-123% faster ape decoding on G4.
author lorenm
date Sat, 05 Dec 2009 15:09:10 +0000
parents 66242b8fbd32
children bb14c1586891
comparing 10643:7f6911429cdc with 10644:5da7180afadf
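Not part of the changeset, but as orientation for the diff that follows: the new scalarproduct_and_madd_int16 routine computes the dot product of v1 and v2 while, in the same pass, adding mul*v3 into v1; this is the primitive behind the apply_filter speedup in the APE decoder mentioned above. Below is a minimal scalar C sketch of that behaviour as inferred from the prototype comment and the SIMD loop in the diff; the _ref suffix and the exact wraparound semantics are assumptions, not code from the patch.

#include <stdint.h>

/* Scalar sketch of scalarproduct_and_madd_int16 as implied by the asm:
 * returns sum(v1[i]*v2[i]) over the original v1 contents and, in the same
 * pass, updates v1[i] += mul*v3[i] with 16-bit wraparound (pmullw/paddw).
 * The SIMD versions assume order is a multiple of the unrolled vector width. */
static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, int16_t *v2,
                                                int16_t *v3, int order, int mul)
{
    int32_t sum = 0;
    for (int i = 0; i < order; i++) {
        sum  += v1[i] * v2[i];                    /* pmaddwd + paddd accumulation */
        v1[i] = (int16_t)(v1[i] + mul * v3[i]);   /* pmullw by mul, paddw into v1 */
    }
    return sum;
}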
@@ -98,47 +98,11 @@
 %undef cvtps2pi
 
 
 
 %macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m0, [v2q + orderq]
-    movu m1, [v2q + orderq + mmsize]
-    paddw m0, [v1q + orderq]
-    paddw m1, [v1q + orderq + mmsize]
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m2, [v2q + orderq]
-    movu m3, [v2q + orderq + mmsize]
-    mova m0, [v1q + orderq]
-    mova m1, [v1q + orderq + mmsize]
-    psubw m0, m2
-    psubw m1, m3
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     shl orderq, 1
     add v1q, orderq
     add v2q, orderq
     neg orderq
@@ -163,16 +127,142 @@
     pshufw m0, m2, 0x4e
 %endif
     paddd m2, m0
     movd eax, m2
     RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw m7, m7, 0
+%endif
+    pxor m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu m0, [v2q + orderq]
+    movu m1, [v2q + orderq + mmsize]
+    mova m4, [v1q + orderq]
+    mova m5, [v1q + orderq + mmsize]
+    movu m2, [v3q + orderq]
+    movu m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw m2, m7
+    pmullw m3, m7
+    paddd m6, m0
+    paddd m6, m1
+    paddw m2, m4
+    paddw m3, m5
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    add orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw m0, m6, 0x4e
+%endif
+    paddd m6, m0
+    movd eax, m6
+    RET
 %endmacro
 
 INIT_MMX
 SCALARPRODUCT mmx2
 INIT_XMM
 SCALARPRODUCT sse2
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub orderq, mmsize*2
+%if %1
+    mova m1, m4
+    mova m4, [v2q + orderq]
+    mova m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova m3, m5
+    mova m5, [v3q + orderq]
+    mova m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova m0, [v2q + orderq]
+    mova m1, [v2q + orderq + mmsize]
+    mova m2, [v3q + orderq]
+    mova m3, [v3q + orderq + mmsize]
+%endif
+    pmaddwd m0, [v1q + orderq]
+    pmaddwd m1, [v1q + orderq + mmsize]
+    pmullw m2, m7
+    pmullw m3, m7
+    paddw m2, [v1q + orderq]
+    paddw m3, [v1q + orderq + mmsize]
+    paddd m6, m0
+    paddd m6, m1
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor m6, m6
+    mov r4d, v2d
+    and r4d, 15
+    and v2q, ~15
+    and v3q, ~15
+    mova m4, [v2q + orderq]
+    mova m5, [v3q + orderq]
+; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp r4d, 0
+    je .loop0
+    cmp r4d, 2
+    je .loop2
+    cmp r4d, 4
+    je .loop4
+    cmp r4d, 6
+    je .loop6
+    cmp r4d, 8
+    je .loop8
+    cmp r4d, 10
+    je .loop10
+    cmp r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd m6, m0
+    movd eax, m6
+    RET
 
 
 
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
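A note on the ssse3 routine above, which is where the commit message's cachesplit avoidance comes in: rather than issuing unaligned loads from v2 and v3 (which can split a cache line), it masks both pointers down to 16-byte alignment, dispatches once per call on the misalignment of v2, and rebuilds each misaligned 16-byte block from two aligned loads with palignr. Because palignr's byte shift must be an assemble-time constant, SCALARPRODUCT_LOOP is expanded once per even offset, and the linear cmp/je chain is kept because, as the comment in the code says, the same offset recurs from call to call, so the branches predict well. The C/SSSE3-intrinsics sketch below shows only the load-recombination trick under those assumptions; the macro and function names are illustrative, not FFmpeg API.

#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

/* Rebuild an unaligned 16-byte load from two aligned loads plus palignr.
 * aligned_p must be the original pointer p rounded down to 16 bytes, and OFF
 * must be a compile-time constant equal to ((uintptr_t)p & 15).  Like the asm,
 * this may read up to 15 bytes past the last element it actually uses. */
#define LOADU_VIA_PALIGNR(aligned_p, OFF)                              \
    _mm_alignr_epi8(_mm_load_si128((const __m128i *)(aligned_p) + 1),  \
                    _mm_load_si128((const __m128i *)(aligned_p)),      \
                    (OFF))

/* One instantiation per possible offset, mirroring the per-offset loop copies
 * generated by SCALARPRODUCT_LOOP (offsets are even for int16_t data). */
static inline __m128i loadu_off6(const void *aligned_p)
{
    /* equals an unaligned 16-byte load from (const char *)aligned_p + 6 */
    return LOADU_VIA_PALIGNR(aligned_p, 6);
}

Build with -mssse3; in the asm the offset dispatch is hoisted out of the loop, so each loop copy uses palignr with a fixed immediate, just as each fixed-offset wrapper does here.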