libavcodec.hg: comparison of x86/dsputil_yasm.asm @ 10644:5da7180afadf
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author    lorenm
date      Sat, 05 Dec 2009 15:09:10 +0000
parents   66242b8fbd32
children  bb14c1586891
comparing 10643:7f6911429cdc with 10644:5da7180afadf
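For orientation (this note and the C sketch are not part of the commit): the new scalarproduct_and_madd_int16 fuses the dot product with the in-place coefficient update, which APE's apply_filter previously did as separate scalarproduct_int16 and add_int16/sub_int16 passes (the latter two kernels are removed in the diff below). A reference sketch of what the fused kernel computes, reconstructed from the assembly in this diff:

    #include <stdint.h>

    /* Reconstructed reference, not code from the commit: returns the dot
     * product of v1 and v2, and in the same pass updates v1 in place with
     * v1[i] += mul * v3[i] (wrapping int16 arithmetic, as pmullw/paddw do). */
    static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                    const int16_t *v3, int order, int mul)
    {
        int32_t res = 0;
        int i;
        for (i = 0; i < order; i++) {
            res  += v1[i] * v2[i];                  /* dot product uses the old v1 */
            v1[i] = (int16_t)(v1[i] + mul * v3[i]); /* in-place multiply-add       */
        }
        return res;
    }

Fusing the two operations into one pass saves a full traversal compared with calling scalarproduct_int16 and then add_int16/sub_int16, which is presumably a large part of the apply_filter speedup cited in the commit message.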
@@ -98,47 +98,11 @@
 %undef cvtps2pi
 
 
 
 %macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m0, [v2q + orderq]
-    movu m1, [v2q + orderq + mmsize]
-    paddw m0, [v1q + orderq]
-    paddw m1, [v1q + orderq + mmsize]
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m2, [v2q + orderq]
-    movu m3, [v2q + orderq + mmsize]
-    mova m0, [v1q + orderq]
-    mova m1, [v1q + orderq + mmsize]
-    psubw m0, m2
-    psubw m1, m3
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     shl orderq, 1
     add v1q, orderq
     add v2q, orderq
     neg orderq
@@ -163,16 +127,142 @@
     pshufw m0, m2, 0x4e
 %endif
     paddd m2, m0
     movd eax, m2
     RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw m7, m7, 0
+%endif
+    pxor m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu m0, [v2q + orderq]
+    movu m1, [v2q + orderq + mmsize]
+    mova m4, [v1q + orderq]
+    mova m5, [v1q + orderq + mmsize]
+    movu m2, [v3q + orderq]
+    movu m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw m2, m7
+    pmullw m3, m7
+    paddd m6, m0
+    paddd m6, m1
+    paddw m2, m4
+    paddw m3, m5
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    add orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw m0, m6, 0x4e
+%endif
+    paddd m6, m0
+    movd eax, m6
+    RET
 %endmacro
 
 INIT_MMX
 SCALARPRODUCT mmx2
 INIT_XMM
 SCALARPRODUCT sse2
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub orderq, mmsize*2
+%if %1
+    mova m1, m4
+    mova m4, [v2q + orderq]
+    mova m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova m3, m5
+    mova m5, [v3q + orderq]
+    mova m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova m0, [v2q + orderq]
+    mova m1, [v2q + orderq + mmsize]
+    mova m2, [v3q + orderq]
+    mova m3, [v3q + orderq + mmsize]
+%endif
+    pmaddwd m0, [v1q + orderq]
+    pmaddwd m1, [v1q + orderq + mmsize]
+    pmullw m2, m7
+    pmullw m3, m7
+    paddw m2, [v1q + orderq]
+    paddw m3, [v1q + orderq + mmsize]
+    paddd m6, m0
+    paddd m6, m1
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor m6, m6
+    mov r4d, v2d
+    and r4d, 15
+    and v2q, ~15
+    and v3q, ~15
+    mova m4, [v2q + orderq]
+    mova m5, [v3q + orderq]
+; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp r4d, 0
+    je .loop0
+    cmp r4d, 2
+    je .loop2
+    cmp r4d, 4
+    je .loop4
+    cmp r4d, 6
+    je .loop6
+    cmp r4d, 8
+    je .loop8
+    cmp r4d, 10
+    je .loop10
+    cmp r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd m6, m0
+    movd eax, m6
+    RET
 
 
 
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
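A note on the ssse3 version above (this commentary and sketch are added here, not part of the commit): unaligned movdqu loads that straddle a cache line are the cachesplit penalty the commit message refers to, so the ssse3 path aligns v2 and v3 down to a 16-byte boundary and rebuilds each misaligned vector from two aligned loads with palignr. Because palignr takes its byte count as an immediate, SCALARPRODUCT_LOOP is instantiated once per possible misalignment (0, 2, ..., 14, since the data are int16_t) and the entry code dispatches on v2 & 15 with a linear chain of compares, which predicts well because, as the in-code comment notes, the same misalignment recurs on every call. A minimal C intrinsics sketch of the load trick (hypothetical helper names, assumes an SSSE3-capable build):

    #include <stdint.h>
    #include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */

    /* Hypothetical helper: read 16 bytes starting MIS bytes past the 16-byte
     * aligned pointer 'base' using two aligned loads instead of one movdqu
     * that may split a cache line.  MIS must be a compile-time constant
     * because palignr encodes it as an immediate, which is why the asm above
     * stamps out one copy of the loop per misalignment. */
    #define LOAD_SPLIT(base, MIS)                                     \
        _mm_alignr_epi8(_mm_load_si128((const __m128i *)(base) + 1),  \
                        _mm_load_si128((const __m128i *)(base)),      \
                        (MIS))

    /* Example: load 16 bytes from a pointer that sits 6 bytes past alignment. */
    static __m128i load_misaligned_by_6(const int16_t *p)
    {
        const void *aligned = (const void *)((uintptr_t)p & ~(uintptr_t)15);
        return LOAD_SPLIT(aligned, 6);   /* bytes 6..21 past 'aligned', i.e. p[0..7] */
    }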