Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 11975:c3afb5be0d9b libavcodec
First shot at VP8 optimizations:
- MMXEXT, SSE2 and SSSE3 MC functions
- MMX and SSE4 IDCT dc_add functions
Patch by Jason Garrett-Glaser <darkshikari gmail com> and myself.
author | rbultje |
---|---|
date | Sun, 27 Jun 2010 02:01:45 +0000 |
parents | |
children | a6d24fc1deb7 |
comparison
equal
deleted
inserted
replaced
11974:356b20a6566d | 11975:c3afb5be0d9b |
---|---|
1 ;****************************************************************************** | |
2 ;* VP8 MMX/MMXEXT/SSE2/SSSE3/SSE4 optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
24 | |
25 SECTION_RODATA | |
26 ; 4-tap filters as signed word pairs for pmaddwd: each filter is a row of (F0,F1) x4 followed by a row of (F2,F3) x4 | |
27 fourtap_filter_hw_m: times 4 dw -6, 123 | |
28 times 4 dw 12, -1 | |
29 times 4 dw -9, 93 | |
30 times 4 dw 50, -6 | |
31 times 4 dw -6, 50 | |
32 times 4 dw 93, -9 | |
33 times 4 dw -1, 12 | |
34 times 4 dw 123, -6 | |
35 ; 6-tap filters as signed word pairs for pmaddwd: each filter is three rows, (F0,F1) x4, (F2,F3) x4, (F4,F5) x4 | |
36 sixtap_filter_hw_m: times 4 dw 2, -11 | |
37 times 4 dw 108, 36 | |
38 times 4 dw -8, 1 | |
39 times 4 dw 3, -16 | |
40 times 4 dw 77, 77 | |
41 times 4 dw -16, 3 | |
42 times 4 dw 1, -8 | |
43 times 4 dw 36, 108 | |
44 times 4 dw -11, 2 | |
45 ; 4-tap filters as signed byte pairs for pmaddubsw: each filter is a row of (F0,F3) x8 followed by a row of (F1,F2) x8 | |
46 fourtap_filter_hb_m: times 8 db -6, -1 | |
47 times 8 db 123, 12 | |
48 times 8 db -9, -6 | |
49 times 8 db 93, 50 | |
50 times 8 db -6, -9 | |
51 times 8 db 50, 93 | |
52 times 8 db -1, -6 | |
53 times 8 db 12, 123 | |
54 ; 6-tap filters as signed byte pairs for pmaddubsw: each filter is three rows, (F0,F5) x8, (F1,F2) x8, (F3,F4) x8 | |
55 sixtap_filter_hb_m: times 8 db 2, 1 | |
56 times 8 db -11, 108 | |
57 times 8 db 36, -8 | |
58 times 8 db 3, 3 | |
59 times 8 db -16, 77 | |
60 times 8 db 77, -16 | |
61 times 8 db 1, 2 | |
62 times 8 db -8, 36 | |
63 times 8 db 108, -11 | |
64 ; 4-tap filters with every tap replicated into 8 words (one 16-byte row per tap, four rows per filter) for pmullw | |
65 fourtap_filter_v_m: times 8 dw -6 | |
66 times 8 dw 123 | |
67 times 8 dw 12 | |
68 times 8 dw -1 | |
69 times 8 dw -9 | |
70 times 8 dw 93 | |
71 times 8 dw 50 | |
72 times 8 dw -6 | |
73 times 8 dw -6 | |
74 times 8 dw 50 | |
75 times 8 dw 93 | |
76 times 8 dw -9 | |
77 times 8 dw -1 | |
78 times 8 dw 12 | |
79 times 8 dw 123 | |
80 times 8 dw -6 | |
81 ; 6-tap filters with every tap replicated into 8 words (one 16-byte row per tap, six rows per filter) for pmullw | |
82 sixtap_filter_v_m: times 8 dw 2 | |
83 times 8 dw -11 | |
84 times 8 dw 108 | |
85 times 8 dw 36 | |
86 times 8 dw -8 | |
87 times 8 dw 1 | |
88 times 8 dw 3 | |
89 times 8 dw -16 | |
90 times 8 dw 77 | |
91 times 8 dw 77 | |
92 times 8 dw -16 | |
93 times 8 dw 3 | |
94 times 8 dw 1 | |
95 times 8 dw -8 | |
96 times 8 dw 36 | |
97 times 8 dw 108 | |
98 times 8 dw -11 | |
99 times 8 dw 2 | |
100 ; table base used by the functions below: r11 (loaded with lea by each function) in PIC builds, the table label itself otherwise | |
101 %ifdef PIC | |
102 %define fourtap_filter_hw r11 | |
103 %define sixtap_filter_hw r11 | |
104 %define fourtap_filter_hb r11 | |
105 %define sixtap_filter_hb r11 | |
106 %define fourtap_filter_v r11 | |
107 %define sixtap_filter_v r11 | |
108 %else | |
109 %define fourtap_filter_hw fourtap_filter_hw_m | |
110 %define sixtap_filter_hw sixtap_filter_hw_m | |
111 %define fourtap_filter_hb fourtap_filter_hb_m | |
112 %define sixtap_filter_hb sixtap_filter_hb_m | |
113 %define fourtap_filter_v fourtap_filter_v_m | |
114 %define sixtap_filter_v sixtap_filter_v_m | |
115 %endif | |
116 ; pshufb masks for the SSSE3 4-tap H filter: shuf1 pairs bytes (i, i+3) for the (F0,F3) row, shuf2 pairs (i+1, i+2) for (F1,F2) | |
117 filter_v4_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 | |
118 filter_v4_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
119 ; pshufb masks for the SSSE3 6-tap H filter: shuf1 pairs (i, i+5), shuf2 pairs (i+1, i+2), shuf3 pairs (i+3, i+4) | |
120 filter_v6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 | |
121 filter_v6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
122 filter_v6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
123 | |
124 cextern pw_4 | |
125 cextern pw_64 | |
126 | |
127 SECTION .text | |
128 | |
129 ;----------------------------------------------------------------------------- | |
130 ; subpel MC functions: | |
131 ; | |
132 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
133 ; uint8_t *src, int srcstride, | |
134 ; int height, int mx, int my); | |
135 ;----------------------------------------------------------------------------- | |
136 | |
137 ; 4-pixel-wide block, H-only 4-tap filter (r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx) | |
138 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
139 shl r5d, 4 ; r5 = mx*16: a filter is 32 bytes, so base+r5-16 is entry (mx-1)/2 for odd mx | |
140 %ifdef PIC | |
141 lea r11, [fourtap_filter_hw_m] | |
142 %endif | |
143 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words: mm4 = F0/F1 pairs | |
144 movq mm5, [fourtap_filter_hw+r5] ; mm5 = F2/F3 pairs | |
145 movq mm7, [pw_64] ; rounding term added before the >>7 | |
146 pxor mm6, mm6 ; zero, used for byte->word unpacking | |
147 | |
148 .nextrow: | |
149 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels, starting one left of the block | |
150 | |
151 ; first set of 2 pixels | |
152 movq mm2, mm1 ; byte ABCD.. | |
153 punpcklbw mm1, mm6 ; byte->word ABCD | |
154 pshufw mm0, mm2, 9 ; byte CDEF.. | |
155 punpcklbw mm0, mm6 ; byte->word CDEF | |
156 pshufw mm3, mm1, 0x94 ; word ABBC | |
157 pshufw mm1, mm0, 0x94 ; word CDDE | |
158 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
159 movq mm0, mm1 ; backup for second set of pixels | |
160 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
161 paddd mm3, mm1 ; finish 1st 2px | |
162 | |
163 ; second set of 2 pixels, use backup of above | |
164 punpckhbw mm2, mm6 ; byte->word EFGH | |
165 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
166 pshufw mm1, mm2, 0x94 ; word EFFG | |
167 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
168 paddd mm0, mm1 ; finish 2nd 2px | |
169 | |
170 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
171 packssdw mm3, mm0 ; merge dword->word (4px) | |
172 paddsw mm3, mm7 ; rounding | |
173 psraw mm3, 7 | |
174 packuswb mm3, mm6 ; clip and word->bytes | |
175 movd [r0], mm3 ; store | |
176 | |
177 ; go to next line | |
178 add r0, r1 | |
179 add r2, r3 | |
180 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
181 jg .nextrow | |
182 REP_RET | |
183 | |
184 ; 4-pixel-wide block, H-only 6-tap filter (r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx) | |
185 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
186 lea r5d, [r5*3] ; r5 = mx*3; scaled by 8 below (a filter is 48 bytes), so -48 lands on entry mx/2-1 for even mx | |
187 %ifdef PIC | |
188 lea r11, [sixtap_filter_hw_m] | |
189 %endif | |
190 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words: mm4 = F0/F1 pairs | |
191 movq mm5, [sixtap_filter_hw+r5*8-32] ; mm5 = F2/F3 pairs | |
192 movq mm6, [sixtap_filter_hw+r5*8-16] ; mm6 = F4/F5 pairs | |
193 movq mm7, [pw_64] ; rounding term added before the >>7 | |
194 pxor mm3, mm3 ; zero for unpacking (mm3 is reused as scratch and re-zeroed inside the loop) | |
195 | |
196 .nextrow: | |
197 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels, starting two left of the block | |
198 | |
199 ; first set of 2 pixels | |
200 movq mm2, mm1 ; byte ABCD.. | |
201 punpcklbw mm1, mm3 ; byte->word ABCD | |
202 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
203 punpckhbw mm2, mm3 ; byte->word EFGH | |
204 punpcklbw mm0, mm3 ; byte->word CDEF | |
205 pshufw mm1, mm1, 0x94 ; word ABBC | |
206 pshufw mm2, mm2, 0x94 ; word EFFG | |
207 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
208 pshufw mm3, mm0, 0x94 ; word CDDE | |
209 movq mm0, mm3 ; backup for second set of pixels | |
210 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
211 paddd mm1, mm3 ; add to 1st 2px cache | |
212 movq mm3, mm2 ; backup for second set of pixels | |
213 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
214 paddd mm1, mm2 ; finish 1st 2px | |
215 | |
216 ; second set of 2 pixels, use backup of above | |
217 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
218 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
219 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
220 paddd mm0, mm3 ; add to 2nd 2px cache | |
221 pxor mm3, mm3 ; restore the zero register | |
222 punpcklbw mm2, mm3 ; byte->word FGHI | |
223 pshufw mm2, mm2, 0xE9 ; word GHHI | |
224 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
225 paddd mm0, mm2 ; finish 2nd 2px | |
226 | |
227 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
228 packssdw mm1, mm0 ; merge dword->word (4px) | |
229 paddsw mm1, mm7 ; rounding | |
230 psraw mm1, 7 | |
231 packuswb mm1, mm3 ; clip and word->bytes | |
232 movd [r0], mm1 ; store | |
233 | |
234 ; go to next line | |
235 add r0, r1 | |
236 add r2, r3 | |
237 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
238 jg .nextrow | |
239 REP_RET | |
240 | |
241 ; 8-pixel-wide block, H-only 4-tap filter (r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx) | |
242 INIT_XMM | |
243 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 | |
244 shl r5d, 4 ; r5 = mx*16: a filter is 32 bytes, so base+r5-16 is entry (mx-1)/2 for odd mx | |
245 %ifdef PIC | |
246 lea r11, [fourtap_filter_hw_m] | |
247 %endif | |
248 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words: m5 = F0/F1 pairs | |
249 mova m6, [fourtap_filter_hw+r5] ; m6 = F2/F3 pairs | |
250 pxor m7, m7 ; zero, used for byte->word unpacking | |
251 | |
252 .nextrow: | |
253 movh m0, [r2-1] ; 8 bytes starting one pixel left of the block | |
254 punpcklbw m0, m7 ; ABCDEFGH | |
255 mova m1, m0 | |
256 mova m2, m0 | |
257 mova m3, m0 | |
258 psrldq m1, 2 ; BCDEFGH | |
259 psrldq m2, 4 ; CDEFGH | |
260 psrldq m3, 6 ; DEFGH | |
261 punpcklwd m0, m1 ; ABBCCDDE | |
262 punpcklwd m2, m3 ; CDDEEFFG | |
263 pmaddwd m0, m5 ; F0/F1 partial sums | |
264 pmaddwd m2, m6 ; F2/F3 partial sums | |
265 paddd m0, m2 ; output pixels 0-3 as dwords | |
266 | |
267 movh m1, [r2+3] ; same scheme, four pixels further right | |
268 punpcklbw m1, m7 ; EFGHIJKL | |
269 mova m2, m1 | |
270 mova m3, m1 | |
271 mova m4, m1 | |
272 psrldq m2, 2 ; FGHIJKL | |
273 psrldq m3, 4 ; GHIJKL | |
274 psrldq m4, 6 ; HIJKL | |
275 punpcklwd m1, m2 ; EFFGGHHI | |
276 punpcklwd m3, m4 ; GHHIIJJK | |
277 pmaddwd m1, m5 ; F0/F1 partial sums | |
278 pmaddwd m3, m6 ; F2/F3 partial sums | |
279 paddd m1, m3 ; output pixels 4-7 as dwords | |
280 | |
281 packssdw m0, m1 ; merge dword->word (8px) | |
282 paddsw m0, [pw_64] ; rounding | |
283 psraw m0, 7 | |
284 packuswb m0, m7 ; clip and word->bytes | |
285 movh [r0], m0 ; store | |
286 | |
287 ; go to next line | |
288 add r0, r1 | |
289 add r2, r3 | |
290 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
291 jg .nextrow | |
292 REP_RET | |
293 | |
294 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 | |
295 lea r5d, [r5*3] ; 8-pixel-wide block, H-only 6-tap filter; r5 = mx*3 | |
296 %ifdef PIC | |
297 lea r11, [sixtap_filter_hw_m] | |
298 %endif | |
299 lea r5, [sixtap_filter_hw+r5*8] ; r5 = table + mx*24; the selected filter's rows are at r5-48, r5-32, r5-16 | |
300 pxor m7, m7 ; zero, used for byte->word unpacking | |
301 | |
302 .nextrow: | |
303 movu m0, [r2-2] ; 16 bytes (A..P) starting two pixels left of the block | |
304 mova m6, m0 ; byte copy kept for the second half | |
305 mova m4, m0 | |
306 punpcklbw m0, m7 ; ABCDEFGH | |
307 mova m1, m0 | |
308 mova m2, m0 | |
309 mova m3, m0 | |
310 psrldq m1, 2 ; BCDEFGH | |
311 psrldq m2, 4 ; CDEFGH | |
312 psrldq m3, 6 ; DEFGH | |
313 psrldq m4, 4 ; bytes E..P | |
314 punpcklbw m4, m7 ; EFGHIJKL | |
315 mova m5, m4 | |
316 psrldq m5, 2 ; FGHIJKL | |
317 punpcklwd m0, m1 ; ABBCCDDE | |
318 punpcklwd m2, m3 ; CDDEEFFG | |
319 punpcklwd m4, m5 ; EFFGGHHI | |
320 pmaddwd m0, [r5-48] ; F0/F1 | |
321 pmaddwd m2, [r5-32] ; F2/F3 | |
322 pmaddwd m4, [r5-16] ; F4/F5 | |
323 paddd m0, m2 | |
324 paddd m0, m4 ; output pixels 0-3 as dwords | |
325 | |
326 psrldq m6, 4 ; bytes E..P | |
327 mova m4, m6 | |
328 punpcklbw m6, m7 ; EFGHIJKL | |
329 mova m1, m6 | |
330 mova m2, m6 | |
331 mova m3, m6 | |
332 psrldq m1, 2 ; FGHIJKL | |
333 psrldq m2, 4 ; GHIJKL | |
334 psrldq m3, 6 ; HIJKL | |
335 psrldq m4, 4 ; bytes I..P | |
336 punpcklbw m4, m7 ; IJKLMNOP | |
337 mova m5, m4 | |
338 psrldq m5, 2 ; JKLMNOP | |
339 punpcklwd m6, m1 ; EFFGGHHI | |
340 punpcklwd m2, m3 ; GHHIIJJK | |
341 punpcklwd m4, m5 ; IJJKKLLM | |
342 pmaddwd m6, [r5-48] ; F0/F1 | |
343 pmaddwd m2, [r5-32] ; F2/F3 | |
344 pmaddwd m4, [r5-16] ; F4/F5 | |
345 paddd m6, m2 | |
346 paddd m6, m4 ; output pixels 4-7 as dwords | |
347 | |
348 packssdw m0, m6 ; merge dword->word (8px) | |
349 paddsw m0, [pw_64] ; rounding | |
350 psraw m0, 7 | |
351 packuswb m0, m7 ; clip and word->bytes | |
352 movh [r0], m0 ; store | |
353 | |
354 ; go to next line | |
355 add r0, r1 | |
356 add r2, r3 | |
357 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
358 jg .nextrow | |
359 REP_RET | |
360 | |
361 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 | |
362 shl r5d, 4 ; 8-pixel-wide block, H-only 4-tap filter; r5 = mx*16 (a filter is 32 bytes) | |
363 mova m2, [pw_64] ; rounding term added before the >>7 | |
364 mova m3, [filter_v4_shuf1] ; byte pairs (i, i+3), matching the F0/F3 row | |
365 mova m4, [filter_v4_shuf2] ; byte pairs (i+1, i+2), matching the F1/F2 row | |
366 %ifdef PIC | |
367 lea r11, [fourtap_filter_hb_m] | |
368 %endif | |
369 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes: m5 = F0/F3 pairs | |
370 mova m6, [fourtap_filter_hb+r5] ; m6 = F1/F2 pairs | |
371 | |
372 .nextrow: | |
373 movu m0, [r2-1] ; 16 bytes starting one pixel left of the block | |
374 mova m1, m0 | |
375 pshufb m0, m3 ; outer pixel pairs | |
376 pshufb m1, m4 ; inner pixel pairs | |
377 pmaddubsw m0, m5 ; F0*p[i] + F3*p[i+3] as words | |
378 pmaddubsw m1, m6 ; F1*p[i+1] + F2*p[i+2] as words | |
379 paddsw m0, m2 ; rounding | |
380 paddsw m0, m1 | |
381 psraw m0, 7 | |
382 packuswb m0, m0 ; clip and word->bytes | |
383 movh [r0], m0 ; store | |
384 | |
385 ; go to next line | |
386 add r0, r1 | |
387 add r2, r3 | |
388 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
389 jg .nextrow | |
390 REP_RET | |
391 | |
392 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 | |
393 lea r5d, [r5*3] ; 8-pixel-wide block, H-only 6-tap filter; r5 = mx*3, scaled by 8 below (a filter is 48 bytes) | |
394 mova m3, [filter_v6_shuf1] ; byte pairs (i, i+5), matching the F0/F5 row | |
395 mova m4, [filter_v6_shuf2] ; byte pairs (i+1, i+2), matching the F1/F2 row | |
396 %ifdef PIC | |
397 lea r11, [sixtap_filter_hb_m] | |
398 %endif | |
399 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes: m5 = F0/F5 pairs | |
400 mova m6, [sixtap_filter_hb+r5*8-32] ; m6 = F1/F2 pairs | |
401 mova m7, [sixtap_filter_hb+r5*8-16] ; m7 = F3/F4 pairs | |
402 | |
403 .nextrow: | |
404 movu m0, [r2-2] ; 16 bytes starting two pixels left of the block | |
405 mova m1, m0 | |
406 mova m2, m0 | |
407 pshufb m0, m3 ; pairs (i, i+5) | |
408 pshufb m1, m4 ; pairs (i+1, i+2) | |
409 pshufb m2, [filter_v6_shuf3] ; pairs (i+3, i+4); all eight registers are in use, so the mask stays in memory | |
410 pmaddubsw m0, m5 | |
411 pmaddubsw m1, m6 | |
412 pmaddubsw m2, m7 | |
413 paddsw m0, m1 | |
414 paddsw m0, m2 | |
415 paddsw m0, [pw_64] ; rounding | |
416 psraw m0, 7 | |
417 packuswb m0, m0 ; clip and word->bytes | |
418 movh [r0], m0 ; store | |
419 | |
420 ; go to next line | |
421 add r0, r1 | |
422 add r2, r3 | |
423 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
424 jg .nextrow | |
425 REP_RET | |
426 | |
427 %macro FILTER_V 3 ; args: cpu suffix, block width in pixels, number of xmm registers used | |
428 ; V-only 4-tap filter (r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r6=my) | |
429 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | |
430 shl r6d, 5 ; r6 = my*32: a filter is four 16-byte rows, so base+r6-32 is entry (my-1)/2 for odd my | |
431 %ifdef PIC | |
432 lea r11, [fourtap_filter_v_m] | |
433 %endif | |
434 lea r6, [fourtap_filter_v+r6-32] ; r6 -> selected filter; taps at r6+0, +16, +32, +48 | |
435 mova m6, [pw_64] ; rounding term added before the >>7 | |
436 pxor m7, m7 ; zero, used for byte->word unpacking | |
437 mova m5, [r6+48] ; last tap (F3) kept in a register | |
438 | |
439 ; read 3 lines | |
440 sub r2, r3 ; start one row above the block | |
441 movh m0, [r2] | |
442 movh m1, [r2+ r3] | |
443 movh m2, [r2+2*r3] | |
444 add r2, r3 ; r2 is back on the current output row | |
445 punpcklbw m0, m7 | |
446 punpcklbw m1, m7 | |
447 punpcklbw m2, m7 | |
448 | |
449 .nextrow: | |
450 ; first calculate negative taps (to prevent losing positive overflows) | |
451 movh m4, [r2+2*r3] ; read new row | |
452 punpcklbw m4, m7 | |
453 mova m3, m4 ; keep the unpacked new row for the next iteration | |
454 pmullw m0, [r6+0] ; row -1 * F0 | |
455 pmullw m4, m5 ; row +2 * F3 | |
456 paddsw m4, m0 | |
457 | |
458 ; then calculate positive taps | |
459 mova m0, m1 ; slide the row window down by one | |
460 pmullw m1, [r6+16] ; row 0 * F1 | |
461 paddsw m4, m1 | |
462 mova m1, m2 | |
463 pmullw m2, [r6+32] ; row +1 * F2 | |
464 paddsw m4, m2 | |
465 mova m2, m3 | |
466 | |
467 ; round/clip/store | |
468 paddsw m4, m6 | |
469 psraw m4, 7 | |
470 packuswb m4, m7 | |
471 movh [r0], m4 | |
472 | |
473 ; go to next line | |
474 add r0, r1 | |
475 add r2, r3 | |
476 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
477 jg .nextrow | |
478 REP_RET | |
479 | |
480 | |
481 ; V-only 6-tap filter (r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r6=my) | |
482 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | |
483 shl r6d, 4 | |
484 lea r6, [r6*3] ; r6 = my*48: a filter is six 16-byte rows, so base+r6-96 is entry my/2-1 for even my | |
485 %ifdef PIC | |
486 lea r11, [sixtap_filter_v_m] | |
487 %endif | |
488 lea r6, [sixtap_filter_v+r6-96] ; r6 -> selected filter; taps at r6+0 through r6+80 | |
489 pxor m7, m7 ; zero, used for byte->word unpacking | |
490 | |
491 ; read 5 lines | |
492 sub r2, r3 | |
493 sub r2, r3 ; start two rows above the block | |
494 movh m0, [r2] | |
495 movh m1, [r2+r3] | |
496 movh m2, [r2+r3*2] | |
497 lea r2, [r2+r3*2] | |
498 add r2, r3 ; r2 is now one row below the current output row | |
499 movh m3, [r2] | |
500 movh m4, [r2+r3] | |
501 punpcklbw m0, m7 | |
502 punpcklbw m1, m7 | |
503 punpcklbw m2, m7 | |
504 punpcklbw m3, m7 | |
505 punpcklbw m4, m7 | |
506 | |
507 .nextrow: | |
508 ; first calculate negative taps (to prevent losing positive overflows) | |
509 mova m5, m1 | |
510 pmullw m5, [r6+16] ; row -1 * F1 | |
511 mova m6, m4 | |
512 pmullw m6, [r6+64] ; row +2 * F4 | |
513 paddsw m6, m5 | |
514 | |
515 ; then calculate positive taps | |
516 movh m5, [r2+2*r3] ; read new row | |
517 punpcklbw m5, m7 | |
518 pmullw m0, [r6+0] ; row -2 * F0 | |
519 paddsw m6, m0 | |
520 mova m0, m1 ; slide the row window down by one | |
521 mova m1, m2 | |
522 pmullw m2, [r6+32] ; row 0 * F2 | |
523 paddsw m6, m2 | |
524 mova m2, m3 | |
525 pmullw m3, [r6+48] ; row +1 * F3 | |
526 paddsw m6, m3 | |
527 mova m3, m4 | |
528 mova m4, m5 | |
529 pmullw m5, [r6+80] ; row +3 * F5 | |
530 paddsw m6, m5 | |
531 | |
532 ; round/clip/store | |
533 paddsw m6, [pw_64] | |
534 psraw m6, 7 | |
535 packuswb m6, m7 | |
536 movh [r0], m6 | |
537 | |
538 ; go to next line | |
539 add r0, r1 | |
540 add r2, r3 | |
541 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
542 jg .nextrow | |
543 REP_RET | |
544 %endmacro | |
545 | |
546 INIT_MMX | |
547 FILTER_V mmxext, 4, 0 | |
548 INIT_XMM | |
549 FILTER_V sse2, 8, 8 | |
550 | |
551 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8 | |
552 shl r6d, 4 ; 8-pixel-wide block, V-only 4-tap filter; r6 = my*16 (a filter is 32 bytes) | |
553 %ifdef PIC | |
554 lea r11, [fourtap_filter_hb_m] | |
555 %endif | |
556 mova m5, [fourtap_filter_hb+r6-16] ; m5 = F0/F3 byte pairs | |
557 mova m6, [fourtap_filter_hb+r6] ; m6 = F1/F2 byte pairs | |
558 mova m7, [pw_64] ; rounding term added before the >>7 | |
559 | |
560 ; read 3 lines | |
561 sub r2, r3 ; start one row above the block | |
562 movh m0, [r2] | |
563 movh m1, [r2+ r3] | |
564 movh m2, [r2+2*r3] | |
565 add r2, r3 ; r2 is back on the current output row | |
566 | |
567 .nextrow: | |
568 movh m3, [r2+2*r3] ; read new row | |
569 mova m4, m0 | |
570 mova m0, m1 ; slide the row window down by one | |
571 punpcklbw m4, m3 ; interleave rows -1 and +2 (outer taps) | |
572 punpcklbw m1, m2 ; interleave rows 0 and +1 (inner taps) | |
573 pmaddubsw m4, m5 ; F0*row(-1) + F3*row(+2) | |
574 pmaddubsw m1, m6 ; F1*row(0) + F2*row(+1) | |
575 paddsw m4, m1 | |
576 mova m1, m2 | |
577 paddsw m4, m7 ; rounding | |
578 mova m2, m3 | |
579 psraw m4, 7 | |
580 packuswb m4, m4 ; clip and word->bytes | |
581 movh [r0], m4 ; store | |
582 | |
583 ; go to next line | |
584 add r0, r1 | |
585 add r2, r3 | |
586 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
587 jg .nextrow | |
588 REP_RET | |
589 | |
590 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8 | |
591 lea r6d, [r6*3] ; 8-pixel-wide block, V-only 6-tap filter; r6 = my*3 | |
592 %ifdef PIC | |
593 lea r11, [sixtap_filter_hb_m] | |
594 %endif | |
595 lea r6, [sixtap_filter_hb+r6*8] ; r6 = table + my*24; the selected filter's rows are at r6-48, r6-32, r6-16 | |
596 | |
597 ; read 5 lines | |
598 sub r2, r3 | |
599 sub r2, r3 ; start two rows above the block | |
600 movh m0, [r2] | |
601 movh m1, [r2+r3] | |
602 movh m2, [r2+r3*2] | |
603 lea r2, [r2+r3*2] | |
604 add r2, r3 ; r2 is now one row below the current output row | |
605 movh m3, [r2] | |
606 movh m4, [r2+r3] | |
607 | |
608 .nextrow: | |
609 movh m5, [r2+2*r3] ; read new row | |
610 mova m6, m0 | |
611 punpcklbw m6, m5 ; interleave rows -2 and +3 | |
612 mova m0, m1 ; slide the row window down by one | |
613 punpcklbw m1, m2 ; interleave rows -1 and 0 | |
614 mova m7, m3 | |
615 punpcklbw m7, m4 ; interleave rows +1 and +2 | |
616 pmaddubsw m6, [r6-48] ; F0/F5 pairs | |
617 pmaddubsw m1, [r6-32] ; F1/F2 pairs | |
618 pmaddubsw m7, [r6-16] ; F3/F4 pairs | |
619 paddsw m6, m1 | |
620 paddsw m6, m7 | |
621 mova m1, m2 | |
622 paddsw m6, [pw_64] ; rounding | |
623 mova m2, m3 | |
624 psraw m6, 7 | |
625 mova m3, m4 | |
626 packuswb m6, m6 ; clip and word->bytes | |
627 mova m4, m5 | |
628 movh [r0], m6 ; store | |
629 | |
630 ; go to next line | |
631 add r0, r1 | |
632 add r2, r3 | |
633 dec r4d ; next row; height is a 32-bit int, so only the low half of r4 is defined on x86-64 | |
634 jg .nextrow | |
635 REP_RET | |
636 | |
637 ;----------------------------------------------------------------------------- | |
638 ; IDCT functions: | |
639 ; | |
640 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
641 ;----------------------------------------------------------------------------- | |
642 | |
643 cglobal vp8_idct_dc_add_mmx, 3, 3 | |
644 ; r0=dst, r1=block, r2=stride: form dc = (block[0] + pw_4) >> 3 in mm0 and -dc in mm1 | |
645 pxor mm1, mm1 | |
646 movd mm0, [r1] | |
647 paddw mm0, [pw_4] | |
648 psraw mm0, 3 | |
649 psubw mm1, mm0 | |
650 | |
651 ; clamp each of dc and -dc to 0..255 and replicate its low byte across four bytes | |
652 packuswb mm0, mm0 | |
653 punpcklbw mm0, mm0 | |
654 punpcklwd mm0, mm0 | |
655 packuswb mm1, mm1 | |
656 punpcklbw mm1, mm1 | |
657 punpcklwd mm1, mm1 | |
658 | |
659 ; load four rows of 4 pixels, add the positive part and subtract the negative part with unsigned saturation, store | |
660 lea r1, [r0+r2*2] ; r1 = third row (block pointer is no longer needed) | |
661 movd mm2, [r0] | |
662 movd mm3, [r0+r2] | |
663 movd mm4, [r1] | |
664 movd mm5, [r1+r2] | |
665 paddusb mm2, mm0 | |
666 psubusb mm2, mm1 | |
667 paddusb mm3, mm0 | |
668 psubusb mm3, mm1 | |
669 paddusb mm4, mm0 | |
670 psubusb mm4, mm1 | |
671 paddusb mm5, mm0 | |
672 psubusb mm5, mm1 | |
673 movd [r0], mm2 | |
674 movd [r0+r2], mm3 | |
675 movd [r1], mm4 | |
676 movd [r1+r2], mm5 | |
677 RET | |
678 | |
679 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | |
680 ; r0=dst, r1=block, r2=stride: form dc = (block[0] + pw_4) >> 3 and replicate it into all 8 words of xmm0 | |
681 movd xmm0, [r1] | |
682 movq xmm2, [pw_4] | |
683 paddw xmm0, xmm2 | |
684 psraw xmm0, 3 | |
685 pshuflw xmm0, xmm0, 0 | |
686 punpcklqdq xmm0, xmm0 | |
687 | |
688 ; gather the four 4-pixel rows, two rows per register | |
689 lea r1, [r0+r2*2] ; r1 = third row (block pointer is no longer needed) | |
690 pxor xmm1, xmm1 ; zero, used for byte->word unpacking | |
691 movd xmm2, [r0] | |
692 movd xmm3, [r0+r2] | |
693 movd xmm4, [r1] | |
694 movd xmm5, [r1+r2] | |
695 punpckldq xmm2, xmm3 | |
696 punpckldq xmm4, xmm5 | |
697 punpcklbw xmm2, xmm1 | |
698 punpcklbw xmm4, xmm1 | |
699 paddw xmm2, xmm0 ; add dc to rows 0-1 as words | |
700 paddw xmm4, xmm0 ; add dc to rows 2-3 as words | |
701 packuswb xmm2, xmm4 ; clip to bytes, all 16 pixels now in xmm2 | |
702 movd [r0], xmm2 | |
703 pextrd [r0+r2], xmm2, 1 | |
704 pextrd [r1], xmm2, 2 | |
705 pextrd [r1+r2], xmm2, 3 | |
706 RET | |
706 RET |