Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12054:b8f80fe02861 libavcodec
SSSE3 versions of width4 VP8 6-tap MC functions
Also make some small changes to saturation order of 4-tap SSSE3 MC to fix a
non-bitexactness bug.
Patch mostly by Eli Friedman <eli.friedman AT gmail DOT com>.
field | value |
---|---|
author | darkshikari |
date | Fri, 02 Jul 2010 05:27:41 +0000 |
parents | 1b11083f4bb4 |
children | 8527154f6e81 |
comparison
Legend (row categories): equal, deleted, inserted, replaced
12053:aa0b01031adf | 12054:b8f80fe02861 |
---|---|
42 times 4 dw -16, 3 | 42 times 4 dw -16, 3 |
43 times 4 dw 1, -8 | 43 times 4 dw 1, -8 |
44 times 4 dw 36, 108 | 44 times 4 dw 36, 108 |
45 times 4 dw -11, 2 | 45 times 4 dw -11, 2 |
46 | 46 |
47 fourtap_filter_hb_m: times 8 db -6, -1 | 47 fourtap_filter_hb_m: times 8 db -6, 123 |
48 times 8 db 123, 12 | 48 times 8 db 12, -1 |
49 times 8 db -9, -6 | 49 times 8 db -9, 93 |
50 times 8 db 93, 50 | 50 times 8 db 50, -6 |
51 times 8 db -6, -9 | 51 times 8 db -6, 50 |
52 times 8 db 50, 93 | 52 times 8 db 93, -9 |
53 times 8 db -1, -6 | 53 times 8 db -1, 12 |
54 times 8 db 12, 123 | 54 times 8 db 123, -6 |
55 | 55 |
56 sixtap_filter_hb_m: times 8 db 2, 1 | 56 sixtap_filter_hb_m: times 8 db 2, 1 |
57 times 8 db -11, 108 | 57 times 8 db -11, 108 |
58 times 8 db 36, -8 | 58 times 8 db 36, -8 |
59 times 8 db 3, 3 | 59 times 8 db 3, 3 |
134 %define bilinear_filter_vw bilinear_filter_vw_m | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | 135 %define bilinear_filter_vb bilinear_filter_vb_m |
136 %endif | 136 %endif |
137 | 137 |
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 | 139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
140 | 140 |
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | 142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |
144 | 144 |
156 ; | 156 ; |
157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | 157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |
158 ; uint8_t *src, int srcstride, | 158 ; uint8_t *src, int srcstride, |
159 ; int height, int mx, int my); | 159 ; int height, int mx, int my); |
160 ;----------------------------------------------------------------------------- | 160 ;----------------------------------------------------------------------------- |
161 | |
162 %macro FILTER_SSSE3 3 | |
163 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 | |
164 lea r5d, [r5*3] | |
165 mova m3, [filter_h6_shuf2] | |
166 mova m4, [filter_h6_shuf3] | |
167 %ifdef PIC | |
168 lea r11, [sixtap_filter_hb_m] | |
169 %endif | |
170 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes | |
171 mova m6, [sixtap_filter_hb+r5*8-32] | |
172 mova m7, [sixtap_filter_hb+r5*8-16] | |
173 | |
174 .nextrow | |
175 movu m0, [r2-2] | |
176 mova m1, m0 | |
177 mova m2, m0 | |
178 %ifidn %1, 4 | |
179 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the | |
180 ; shuffle with a memory operand | |
181 punpcklbw m0, [r2+3] | |
182 %else | |
183 pshufb m0, [filter_h6_shuf1] | |
184 %endif | |
185 pshufb m1, m3 | |
186 pshufb m2, m4 | |
187 pmaddubsw m0, m5 | |
188 pmaddubsw m1, m6 | |
189 pmaddubsw m2, m7 | |
190 paddsw m0, m1 | |
191 paddsw m0, m2 | |
192 paddsw m0, [pw_64] | |
193 psraw m0, 7 | |
194 packuswb m0, m0 | |
195 movh [r0], m0 ; store | |
196 | |
197 ; go to next line | |
198 add r0, r1 | |
199 add r2, r3 | |
200 dec r4 ; next row | |
201 jg .nextrow | |
202 REP_RET | |
203 | |
204 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 | |
205 shl r5d, 4 | |
206 mova m2, [pw_64] | |
207 mova m3, [filter_h2_shuf] | |
208 mova m4, [filter_h4_shuf] | |
209 %ifdef PIC | |
210 lea r11, [fourtap_filter_hb_m] | |
211 %endif | |
212 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes | |
213 mova m6, [fourtap_filter_hb+r5] | |
214 | |
215 .nextrow | |
216 movu m0, [r2-1] | |
217 mova m1, m0 | |
218 pshufb m0, m3 | |
219 pshufb m1, m4 | |
220 pmaddubsw m0, m5 | |
221 pmaddubsw m1, m6 | |
222 paddsw m0, m2 | |
223 paddsw m0, m1 | |
224 psraw m0, 7 | |
225 packuswb m0, m0 | |
226 movh [r0], m0 ; store | |
227 | |
228 ; go to next line | |
229 add r0, r1 | |
230 add r2, r3 | |
231 dec r4 ; next row | |
232 jg .nextrow | |
233 REP_RET | |
234 | |
235 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 | |
236 shl r6d, 4 | |
237 %ifdef PIC | |
238 lea r11, [fourtap_filter_hb_m] | |
239 %endif | |
240 mova m5, [fourtap_filter_hb+r6-16] | |
241 mova m6, [fourtap_filter_hb+r6] | |
242 mova m7, [pw_64] | |
243 | |
244 ; read 3 lines | |
245 sub r2, r3 | |
246 movh m0, [r2] | |
247 movh m1, [r2+ r3] | |
248 movh m2, [r2+2*r3] | |
249 add r2, r3 | |
250 | |
251 .nextrow | |
252 movh m3, [r2+2*r3] ; read new row | |
253 mova m4, m0 | |
254 mova m0, m1 | |
255 punpcklbw m4, m1 | |
256 mova m1, m2 | |
257 punpcklbw m2, m3 | |
258 pmaddubsw m4, m5 | |
259 pmaddubsw m2, m6 | |
260 paddsw m4, m2 | |
261 mova m2, m3 | |
262 paddsw m4, m7 | |
263 psraw m4, 7 | |
264 packuswb m4, m4 | |
265 movh [r0], m4 | |
266 | |
267 ; go to next line | |
268 add r0, r1 | |
269 add r2, r3 | |
270 dec r4 ; next row | |
271 jg .nextrow | |
272 REP_RET | |
273 | |
274 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 | |
275 lea r6d, [r6*3] | |
276 %ifdef PIC | |
277 lea r11, [sixtap_filter_hb_m] | |
278 %endif | |
279 lea r6, [sixtap_filter_hb+r6*8] | |
280 | |
281 ; read 5 lines | |
282 sub r2, r3 | |
283 sub r2, r3 | |
284 movh m0, [r2] | |
285 movh m1, [r2+r3] | |
286 movh m2, [r2+r3*2] | |
287 lea r2, [r2+r3*2] | |
288 add r2, r3 | |
289 movh m3, [r2] | |
290 movh m4, [r2+r3] | |
291 | |
292 .nextrow | |
293 movh m5, [r2+2*r3] ; read new row | |
294 mova m6, m0 | |
295 punpcklbw m6, m5 | |
296 mova m0, m1 | |
297 punpcklbw m1, m2 | |
298 mova m7, m3 | |
299 punpcklbw m7, m4 | |
300 pmaddubsw m6, [r6-48] | |
301 pmaddubsw m1, [r6-32] | |
302 pmaddubsw m7, [r6-16] | |
303 paddsw m6, m1 | |
304 paddsw m6, m7 | |
305 mova m1, m2 | |
306 paddsw m6, [pw_64] | |
307 mova m2, m3 | |
308 psraw m6, 7 | |
309 mova m3, m4 | |
310 packuswb m6, m6 | |
311 mova m4, m5 | |
312 movh [r0], m6 | |
313 | |
314 ; go to next line | |
315 add r0, r1 | |
316 add r2, r3 | |
317 dec r4 ; next row | |
318 jg .nextrow | |
319 REP_RET | |
320 %endmacro | |
321 | |
322 INIT_MMX | |
323 FILTER_SSSE3 4, 0, 0 | |
324 INIT_XMM | |
325 FILTER_SSSE3 8, 8, 7 | |
161 | 326 |
162 ; 4x4 block, H-only 4-tap filter | 327 ; 4x4 block, H-only 4-tap filter |
163 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | 328 cglobal put_vp8_epel4_h4_mmxext, 6, 6 |
164 shl r5d, 4 | 329 shl r5d, 4 |
165 %ifdef PIC | 330 %ifdef PIC |
381 add r2, r3 | 546 add r2, r3 |
382 dec r4 ; next row | 547 dec r4 ; next row |
383 jg .nextrow | 548 jg .nextrow |
384 REP_RET | 549 REP_RET |
385 | 550 |
386 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 | |
387 shl r5d, 4 | |
388 mova m2, [pw_64] | |
389 mova m3, [filter_h4_shuf] | |
390 mova m4, [filter_h6_shuf2] | |
391 %ifdef PIC | |
392 lea r11, [fourtap_filter_hb_m] | |
393 %endif | |
394 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes | |
395 mova m6, [fourtap_filter_hb+r5] | |
396 | |
397 .nextrow | |
398 movu m0, [r2-1] | |
399 mova m1, m0 | |
400 pshufb m0, m3 | |
401 pshufb m1, m4 | |
402 pmaddubsw m0, m5 | |
403 pmaddubsw m1, m6 | |
404 paddsw m0, m2 | |
405 paddsw m0, m1 | |
406 psraw m0, 7 | |
407 packuswb m0, m0 | |
408 movh [r0], m0 ; store | |
409 | |
410 ; go to next line | |
411 add r0, r1 | |
412 add r2, r3 | |
413 dec r4 ; next row | |
414 jg .nextrow | |
415 REP_RET | |
416 | |
417 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 | |
418 lea r5d, [r5*3] | |
419 mova m3, [filter_h6_shuf1] | |
420 mova m4, [filter_h6_shuf2] | |
421 %ifdef PIC | |
422 lea r11, [sixtap_filter_hb_m] | |
423 %endif | |
424 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes | |
425 mova m6, [sixtap_filter_hb+r5*8-32] | |
426 mova m7, [sixtap_filter_hb+r5*8-16] | |
427 | |
428 .nextrow | |
429 movu m0, [r2-2] | |
430 mova m1, m0 | |
431 mova m2, m0 | |
432 pshufb m0, m3 | |
433 pshufb m1, m4 | |
434 pshufb m2, [filter_h6_shuf3] | |
435 pmaddubsw m0, m5 | |
436 pmaddubsw m1, m6 | |
437 pmaddubsw m2, m7 | |
438 paddsw m0, m1 | |
439 paddsw m0, m2 | |
440 paddsw m0, [pw_64] | |
441 psraw m0, 7 | |
442 packuswb m0, m0 | |
443 movh [r0], m0 ; store | |
444 | |
445 ; go to next line | |
446 add r0, r1 | |
447 add r2, r3 | |
448 dec r4 ; next row | |
449 jg .nextrow | |
450 REP_RET | |
451 | |
452 %macro FILTER_V 3 | 551 %macro FILTER_V 3 |
453 ; 4x4 block, V-only 4-tap filter | 552 ; 4x4 block, V-only 4-tap filter |
454 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | 553 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 |
455 shl r6d, 5 | 554 shl r6d, 5 |
456 %ifdef PIC | 555 %ifdef PIC |
570 | 669 |
571 INIT_MMX | 670 INIT_MMX |
572 FILTER_V mmxext, 4, 0 | 671 FILTER_V mmxext, 4, 0 |
573 INIT_XMM | 672 INIT_XMM |
574 FILTER_V sse2, 8, 8 | 673 FILTER_V sse2, 8, 8 |
575 | |
576 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8 | |
577 shl r6d, 4 | |
578 %ifdef PIC | |
579 lea r11, [fourtap_filter_hb_m] | |
580 %endif | |
581 mova m5, [fourtap_filter_hb+r6-16] | |
582 mova m6, [fourtap_filter_hb+r6] | |
583 mova m7, [pw_64] | |
584 | |
585 ; read 3 lines | |
586 sub r2, r3 | |
587 movh m0, [r2] | |
588 movh m1, [r2+ r3] | |
589 movh m2, [r2+2*r3] | |
590 add r2, r3 | |
591 | |
592 .nextrow | |
593 movh m3, [r2+2*r3] ; read new row | |
594 mova m4, m0 | |
595 mova m0, m1 | |
596 punpcklbw m4, m3 | |
597 punpcklbw m1, m2 | |
598 pmaddubsw m4, m5 | |
599 pmaddubsw m1, m6 | |
600 paddsw m4, m1 | |
601 mova m1, m2 | |
602 paddsw m4, m7 | |
603 mova m2, m3 | |
604 psraw m4, 7 | |
605 packuswb m4, m4 | |
606 movh [r0], m4 | |
607 | |
608 ; go to next line | |
609 add r0, r1 | |
610 add r2, r3 | |
611 dec r4 ; next row | |
612 jg .nextrow | |
613 REP_RET | |
614 | |
615 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8 | |
616 lea r6d, [r6*3] | |
617 %ifdef PIC | |
618 lea r11, [sixtap_filter_hb_m] | |
619 %endif | |
620 lea r6, [sixtap_filter_hb+r6*8] | |
621 | |
622 ; read 5 lines | |
623 sub r2, r3 | |
624 sub r2, r3 | |
625 movh m0, [r2] | |
626 movh m1, [r2+r3] | |
627 movh m2, [r2+r3*2] | |
628 lea r2, [r2+r3*2] | |
629 add r2, r3 | |
630 movh m3, [r2] | |
631 movh m4, [r2+r3] | |
632 | |
633 .nextrow | |
634 movh m5, [r2+2*r3] ; read new row | |
635 mova m6, m0 | |
636 punpcklbw m6, m5 | |
637 mova m0, m1 | |
638 punpcklbw m1, m2 | |
639 mova m7, m3 | |
640 punpcklbw m7, m4 | |
641 pmaddubsw m6, [r6-48] | |
642 pmaddubsw m1, [r6-32] | |
643 pmaddubsw m7, [r6-16] | |
644 paddsw m6, m1 | |
645 paddsw m6, m7 | |
646 mova m1, m2 | |
647 paddsw m6, [pw_64] | |
648 mova m2, m3 | |
649 psraw m6, 7 | |
650 mova m3, m4 | |
651 packuswb m6, m6 | |
652 mova m4, m5 | |
653 movh [r0], m6 | |
654 | |
655 ; go to next line | |
656 add r0, r1 | |
657 add r2, r3 | |
658 dec r4 ; next row | |
659 jg .nextrow | |
660 REP_RET | |
661 | 674 |
662 %macro FILTER_BILINEAR 3 | 675 %macro FILTER_BILINEAR 3 |
663 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | 676 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 |
664 mov r5d, 8*16 | 677 mov r5d, 8*16 |
665 shl r6d, 4 | 678 shl r6d, 4 |