comparison x86/vp8dsp.asm @ 12054:b8f80fe02861 libavcodec

SSSE3 versions of width4 VP8 6-tap MC functions. Also make some small changes to saturation order of 4-tap SSSE3 MC to fix a non-bitexactness bug. Patch mostly by Eli Friedman <eli.friedman AT gmail DOT com>.
author darkshikari
date Fri, 02 Jul 2010 05:27:41 +0000
parents 1b11083f4bb4
children 8527154f6e81
comparison
equal deleted inserted replaced
12053:aa0b01031adf 12054:b8f80fe02861
42 times 4 dw -16, 3 42 times 4 dw -16, 3
43 times 4 dw 1, -8 43 times 4 dw 1, -8
44 times 4 dw 36, 108 44 times 4 dw 36, 108
45 times 4 dw -11, 2 45 times 4 dw -11, 2
46 46
47 fourtap_filter_hb_m: times 8 db -6, -1 47 fourtap_filter_hb_m: times 8 db -6, 123
48 times 8 db 123, 12 48 times 8 db 12, -1
49 times 8 db -9, -6 49 times 8 db -9, 93
50 times 8 db 93, 50 50 times 8 db 50, -6
51 times 8 db -6, -9 51 times 8 db -6, 50
52 times 8 db 50, 93 52 times 8 db 93, -9
53 times 8 db -1, -6 53 times 8 db -1, 12
54 times 8 db 12, 123 54 times 8 db 123, -6
55 55
56 sixtap_filter_hb_m: times 8 db 2, 1 56 sixtap_filter_hb_m: times 8 db 2, 1
57 times 8 db -11, 108 57 times 8 db -11, 108
58 times 8 db 36, -8 58 times 8 db 36, -8
59 times 8 db 3, 3 59 times 8 db 3, 3
134 %define bilinear_filter_vw bilinear_filter_vw_m 134 %define bilinear_filter_vw bilinear_filter_vw_m
135 %define bilinear_filter_vb bilinear_filter_vb_m 135 %define bilinear_filter_vb bilinear_filter_vb_m
136 %endif 136 %endif
137 137
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
140 140
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
144 144
156 ; 156 ;
157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, 157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
158 ; uint8_t *src, int srcstride, 158 ; uint8_t *src, int srcstride,
159 ; int height, int mx, int my); 159 ; int height, int mx, int my);
160 ;----------------------------------------------------------------------------- 160 ;-----------------------------------------------------------------------------
161
162 %macro FILTER_SSSE3 3
163 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
164 lea r5d, [r5*3]
165 mova m3, [filter_h6_shuf2]
166 mova m4, [filter_h6_shuf3]
167 %ifdef PIC
168 lea r11, [sixtap_filter_hb_m]
169 %endif
170 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
171 mova m6, [sixtap_filter_hb+r5*8-32]
172 mova m7, [sixtap_filter_hb+r5*8-16]
173
174 .nextrow
175 movu m0, [r2-2]
176 mova m1, m0
177 mova m2, m0
178 %ifidn %1, 4
179 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
180 ; shuffle with a memory operand
181 punpcklbw m0, [r2+3]
182 %else
183 pshufb m0, [filter_h6_shuf1]
184 %endif
185 pshufb m1, m3
186 pshufb m2, m4
187 pmaddubsw m0, m5
188 pmaddubsw m1, m6
189 pmaddubsw m2, m7
190 paddsw m0, m1
191 paddsw m0, m2
192 paddsw m0, [pw_64]
193 psraw m0, 7
194 packuswb m0, m0
195 movh [r0], m0 ; store
196
197 ; go to next line
198 add r0, r1
199 add r2, r3
200 dec r4 ; next row
201 jg .nextrow
202 REP_RET
203
204 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
205 shl r5d, 4
206 mova m2, [pw_64]
207 mova m3, [filter_h2_shuf]
208 mova m4, [filter_h4_shuf]
209 %ifdef PIC
210 lea r11, [fourtap_filter_hb_m]
211 %endif
212 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
213 mova m6, [fourtap_filter_hb+r5]
214
215 .nextrow
216 movu m0, [r2-1]
217 mova m1, m0
218 pshufb m0, m3
219 pshufb m1, m4
220 pmaddubsw m0, m5
221 pmaddubsw m1, m6
222 paddsw m0, m2
223 paddsw m0, m1
224 psraw m0, 7
225 packuswb m0, m0
226 movh [r0], m0 ; store
227
228 ; go to next line
229 add r0, r1
230 add r2, r3
231 dec r4 ; next row
232 jg .nextrow
233 REP_RET
234
235 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
236 shl r6d, 4
237 %ifdef PIC
238 lea r11, [fourtap_filter_hb_m]
239 %endif
240 mova m5, [fourtap_filter_hb+r6-16]
241 mova m6, [fourtap_filter_hb+r6]
242 mova m7, [pw_64]
243
244 ; read 3 lines
245 sub r2, r3
246 movh m0, [r2]
247 movh m1, [r2+ r3]
248 movh m2, [r2+2*r3]
249 add r2, r3
250
251 .nextrow
252 movh m3, [r2+2*r3] ; read new row
253 mova m4, m0
254 mova m0, m1
255 punpcklbw m4, m1
256 mova m1, m2
257 punpcklbw m2, m3
258 pmaddubsw m4, m5
259 pmaddubsw m2, m6
260 paddsw m4, m2
261 mova m2, m3
262 paddsw m4, m7
263 psraw m4, 7
264 packuswb m4, m4
265 movh [r0], m4
266
267 ; go to next line
268 add r0, r1
269 add r2, r3
270 dec r4 ; next row
271 jg .nextrow
272 REP_RET
273
274 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
275 lea r6d, [r6*3]
276 %ifdef PIC
277 lea r11, [sixtap_filter_hb_m]
278 %endif
279 lea r6, [sixtap_filter_hb+r6*8]
280
281 ; read 5 lines
282 sub r2, r3
283 sub r2, r3
284 movh m0, [r2]
285 movh m1, [r2+r3]
286 movh m2, [r2+r3*2]
287 lea r2, [r2+r3*2]
288 add r2, r3
289 movh m3, [r2]
290 movh m4, [r2+r3]
291
292 .nextrow
293 movh m5, [r2+2*r3] ; read new row
294 mova m6, m0
295 punpcklbw m6, m5
296 mova m0, m1
297 punpcklbw m1, m2
298 mova m7, m3
299 punpcklbw m7, m4
300 pmaddubsw m6, [r6-48]
301 pmaddubsw m1, [r6-32]
302 pmaddubsw m7, [r6-16]
303 paddsw m6, m1
304 paddsw m6, m7
305 mova m1, m2
306 paddsw m6, [pw_64]
307 mova m2, m3
308 psraw m6, 7
309 mova m3, m4
310 packuswb m6, m6
311 mova m4, m5
312 movh [r0], m6
313
314 ; go to next line
315 add r0, r1
316 add r2, r3
317 dec r4 ; next row
318 jg .nextrow
319 REP_RET
320 %endmacro
321
322 INIT_MMX
323 FILTER_SSSE3 4, 0, 0
324 INIT_XMM
325 FILTER_SSSE3 8, 8, 7
161 326
162 ; 4x4 block, H-only 4-tap filter 327 ; 4x4 block, H-only 4-tap filter
163 cglobal put_vp8_epel4_h4_mmxext, 6, 6 328 cglobal put_vp8_epel4_h4_mmxext, 6, 6
164 shl r5d, 4 329 shl r5d, 4
165 %ifdef PIC 330 %ifdef PIC
381 add r2, r3 546 add r2, r3
382 dec r4 ; next row 547 dec r4 ; next row
383 jg .nextrow 548 jg .nextrow
384 REP_RET 549 REP_RET
385 550
386 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
387 shl r5d, 4
388 mova m2, [pw_64]
389 mova m3, [filter_h4_shuf]
390 mova m4, [filter_h6_shuf2]
391 %ifdef PIC
392 lea r11, [fourtap_filter_hb_m]
393 %endif
394 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
395 mova m6, [fourtap_filter_hb+r5]
396
397 .nextrow
398 movu m0, [r2-1]
399 mova m1, m0
400 pshufb m0, m3
401 pshufb m1, m4
402 pmaddubsw m0, m5
403 pmaddubsw m1, m6
404 paddsw m0, m2
405 paddsw m0, m1
406 psraw m0, 7
407 packuswb m0, m0
408 movh [r0], m0 ; store
409
410 ; go to next line
411 add r0, r1
412 add r2, r3
413 dec r4 ; next row
414 jg .nextrow
415 REP_RET
416
417 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
418 lea r5d, [r5*3]
419 mova m3, [filter_h6_shuf1]
420 mova m4, [filter_h6_shuf2]
421 %ifdef PIC
422 lea r11, [sixtap_filter_hb_m]
423 %endif
424 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
425 mova m6, [sixtap_filter_hb+r5*8-32]
426 mova m7, [sixtap_filter_hb+r5*8-16]
427
428 .nextrow
429 movu m0, [r2-2]
430 mova m1, m0
431 mova m2, m0
432 pshufb m0, m3
433 pshufb m1, m4
434 pshufb m2, [filter_h6_shuf3]
435 pmaddubsw m0, m5
436 pmaddubsw m1, m6
437 pmaddubsw m2, m7
438 paddsw m0, m1
439 paddsw m0, m2
440 paddsw m0, [pw_64]
441 psraw m0, 7
442 packuswb m0, m0
443 movh [r0], m0 ; store
444
445 ; go to next line
446 add r0, r1
447 add r2, r3
448 dec r4 ; next row
449 jg .nextrow
450 REP_RET
451
452 %macro FILTER_V 3 551 %macro FILTER_V 3
453 ; 4x4 block, V-only 4-tap filter 552 ; 4x4 block, V-only 4-tap filter
454 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 553 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
455 shl r6d, 5 554 shl r6d, 5
456 %ifdef PIC 555 %ifdef PIC
570 669
571 INIT_MMX 670 INIT_MMX
572 FILTER_V mmxext, 4, 0 671 FILTER_V mmxext, 4, 0
573 INIT_XMM 672 INIT_XMM
574 FILTER_V sse2, 8, 8 673 FILTER_V sse2, 8, 8
575
576 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
577 shl r6d, 4
578 %ifdef PIC
579 lea r11, [fourtap_filter_hb_m]
580 %endif
581 mova m5, [fourtap_filter_hb+r6-16]
582 mova m6, [fourtap_filter_hb+r6]
583 mova m7, [pw_64]
584
585 ; read 3 lines
586 sub r2, r3
587 movh m0, [r2]
588 movh m1, [r2+ r3]
589 movh m2, [r2+2*r3]
590 add r2, r3
591
592 .nextrow
593 movh m3, [r2+2*r3] ; read new row
594 mova m4, m0
595 mova m0, m1
596 punpcklbw m4, m3
597 punpcklbw m1, m2
598 pmaddubsw m4, m5
599 pmaddubsw m1, m6
600 paddsw m4, m1
601 mova m1, m2
602 paddsw m4, m7
603 mova m2, m3
604 psraw m4, 7
605 packuswb m4, m4
606 movh [r0], m4
607
608 ; go to next line
609 add r0, r1
610 add r2, r3
611 dec r4 ; next row
612 jg .nextrow
613 REP_RET
614
615 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
616 lea r6d, [r6*3]
617 %ifdef PIC
618 lea r11, [sixtap_filter_hb_m]
619 %endif
620 lea r6, [sixtap_filter_hb+r6*8]
621
622 ; read 5 lines
623 sub r2, r3
624 sub r2, r3
625 movh m0, [r2]
626 movh m1, [r2+r3]
627 movh m2, [r2+r3*2]
628 lea r2, [r2+r3*2]
629 add r2, r3
630 movh m3, [r2]
631 movh m4, [r2+r3]
632
633 .nextrow
634 movh m5, [r2+2*r3] ; read new row
635 mova m6, m0
636 punpcklbw m6, m5
637 mova m0, m1
638 punpcklbw m1, m2
639 mova m7, m3
640 punpcklbw m7, m4
641 pmaddubsw m6, [r6-48]
642 pmaddubsw m1, [r6-32]
643 pmaddubsw m7, [r6-16]
644 paddsw m6, m1
645 paddsw m6, m7
646 mova m1, m2
647 paddsw m6, [pw_64]
648 mova m2, m3
649 psraw m6, 7
650 mova m3, m4
651 packuswb m6, m6
652 mova m4, m5
653 movh [r0], m6
654
655 ; go to next line
656 add r0, r1
657 add r2, r3
658 dec r4 ; next row
659 jg .nextrow
660 REP_RET
661 674
662 %macro FILTER_BILINEAR 3 675 %macro FILTER_BILINEAR 3
663 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 676 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
664 mov r5d, 8*16 677 mov r5d, 8*16
665 shl r6d, 4 678 shl r6d, 4