Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12013:2ae70e2c31a4 libavcodec
MMX idct_add for VP8.
author | rbultje |
---|---|
date | Tue, 29 Jun 2010 14:43:11 +0000 |
parents | d584c7373a64 |
children | 6fe72dbf2c7b |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
47 fourtap_filter_hb_m: times 8 db -6, -1 | |
48 times 8 db 123, 12 | |
49 times 8 db -9, -6 | |
50 times 8 db 93, 50 | |
51 times 8 db -6, -9 | |
52 times 8 db 50, 93 | |
53 times 8 db -1, -6 | |
54 times 8 db 12, 123 | |
55 | |
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 | |
11975 | 140 |
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
148 cextern pw_3 |
11975 | 149 cextern pw_4 |
150 cextern pw_64 | |
151 | |
152 SECTION .text | |
153 | |
154 ;----------------------------------------------------------------------------- | |
155 ; subpel MC functions: | |
156 ; | |
157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
158 ; uint8_t *src, int srcstride, | |
159 ; int height, int mx, int my); | |
160 ;----------------------------------------------------------------------------- | |
161 | |
162 ; 4x4 block, H-only 4-tap filter | |
163 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
164 shl r5d, 4 | |
165 %ifdef PIC | |
166 lea r11, [fourtap_filter_hw_m] | |
167 %endif | |
168 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
169 movq mm5, [fourtap_filter_hw+r5] | |
170 movq mm7, [pw_64] | |
171 pxor mm6, mm6 | |
172 | |
173 .nextrow | |
174 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels | |
175 | |
176 ; first set of 2 pixels | |
177 movq mm2, mm1 ; byte ABCD.. | |
178 punpcklbw mm1, mm6 ; byte->word ABCD | |
179 pshufw mm0, mm2, 9 ; byte CDEF.. | |
180 punpcklbw mm0, mm6 ; byte->word CDEF | |
181 pshufw mm3, mm1, 0x94 ; word ABBC | |
182 pshufw mm1, mm0, 0x94 ; word CDDE | |
183 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
184 movq mm0, mm1 ; backup for second set of pixels | |
185 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
186 paddd mm3, mm1 ; finish 1st 2px | |
187 | |
188 ; second set of 2 pixels, use backup of above | |
189 punpckhbw mm2, mm6 ; byte->word EFGH | |
190 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
191 pshufw mm1, mm2, 0x94 ; word EFFG | |
192 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
193 paddd mm0, mm1 ; finish 2nd 2px | |
194 | |
195 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
196 packssdw mm3, mm0 ; merge dword->word (4px) | |
197 paddsw mm3, mm7 ; rounding | |
198 psraw mm3, 7 | |
199 packuswb mm3, mm6 ; clip and word->bytes | |
200 movd [r0], mm3 ; store | |
201 | |
202 ; go to next line | |
203 add r0, r1 | |
204 add r2, r3 | |
205 dec r4 ; next row | |
206 jg .nextrow | |
207 REP_RET | |
208 | |
209 ; 4x4 block, H-only 6-tap filter | |
210 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
211 lea r5d, [r5*3] | |
212 %ifdef PIC | |
213 lea r11, [sixtap_filter_hw_m] | |
214 %endif | |
215 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words | |
216 movq mm5, [sixtap_filter_hw+r5*8-32] | |
217 movq mm6, [sixtap_filter_hw+r5*8-16] | |
218 movq mm7, [pw_64] | |
219 pxor mm3, mm3 | |
220 | |
221 .nextrow | |
222 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels | |
223 | |
224 ; first set of 2 pixels | |
225 movq mm2, mm1 ; byte ABCD.. | |
226 punpcklbw mm1, mm3 ; byte->word ABCD | |
227 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
228 punpckhbw mm2, mm3 ; byte->word EFGH | |
229 punpcklbw mm0, mm3 ; byte->word CDEF | |
230 pshufw mm1, mm1, 0x94 ; word ABBC | |
231 pshufw mm2, mm2, 0x94 ; word EFFG | |
232 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
233 pshufw mm3, mm0, 0x94 ; word CDDE | |
234 movq mm0, mm3 ; backup for second set of pixels | |
235 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
236 paddd mm1, mm3 ; add to 1st 2px cache | |
237 movq mm3, mm2 ; backup for second set of pixels | |
238 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
239 paddd mm1, mm2 ; finish 1st 2px | |
240 | |
241 ; second set of 2 pixels, use backup of above | |
242 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
243 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
244 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
245 paddd mm0, mm3 ; add to 2nd 2px cache | |
246 pxor mm3, mm3 | |
247 punpcklbw mm2, mm3 ; byte->word FGHI | |
248 pshufw mm2, mm2, 0xE9 ; word GHHI | |
249 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
250 paddd mm0, mm2 ; finish 2nd 2px | |
251 | |
252 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
253 packssdw mm1, mm0 ; merge dword->word (4px) | |
254 paddsw mm1, mm7 ; rounding | |
255 psraw mm1, 7 | |
256 packuswb mm1, mm3 ; clip and word->bytes | |
257 movd [r0], mm1 ; store | |
258 | |
259 ; go to next line | |
260 add r0, r1 | |
261 add r2, r3 | |
262 dec r4 ; next row | |
263 jg .nextrow | |
264 REP_RET | |
265 | |
266 ; 8-pixel-wide block, H-only 4-tap filter | |
267 INIT_XMM | |
268 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 | |
269 shl r5d, 4 | |
270 %ifdef PIC | |
271 lea r11, [fourtap_filter_hw_m] | |
272 %endif | |
273 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
274 mova m6, [fourtap_filter_hw+r5] | |
275 pxor m7, m7 | |
276 | |
277 .nextrow | |
278 movh m0, [r2-1] | |
279 punpcklbw m0, m7 ; ABCDEFGH | |
280 mova m1, m0 | |
281 mova m2, m0 | |
282 mova m3, m0 | |
283 psrldq m1, 2 ; BCDEFGH | |
284 psrldq m2, 4 ; CDEFGH | |
285 psrldq m3, 6 ; DEFGH | |
286 punpcklwd m0, m1 ; ABBCCDDE | |
287 punpcklwd m2, m3 ; CDDEEFFG | |
288 pmaddwd m0, m5 | |
289 pmaddwd m2, m6 | |
290 paddd m0, m2 | |
291 | |
292 movh m1, [r2+3] | |
293 punpcklbw m1, m7 ; ABCDEFGH | |
294 mova m2, m1 | |
295 mova m3, m1 | |
296 mova m4, m1 | |
297 psrldq m2, 2 ; BCDEFGH | |
298 psrldq m3, 4 ; CDEFGH | |
299 psrldq m4, 6 ; DEFGH | |
300 punpcklwd m1, m2 ; ABBCCDDE | |
301 punpcklwd m3, m4 ; CDDEEFFG | |
302 pmaddwd m1, m5 | |
303 pmaddwd m3, m6 | |
304 paddd m1, m3 | |
305 | |
306 packssdw m0, m1 | |
307 paddsw m0, [pw_64] | |
308 psraw m0, 7 | |
309 packuswb m0, m7 | |
310 movh [r0], m0 ; store | |
311 | |
312 ; go to next line | |
313 add r0, r1 | |
314 add r2, r3 | |
315 dec r4 ; next row | |
316 jg .nextrow | |
317 REP_RET | |
318 | |
319 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 | |
320 lea r5d, [r5*3] | |
321 %ifdef PIC | |
322 lea r11, [sixtap_filter_hw_m] | |
323 %endif | |
324 lea r5, [sixtap_filter_hw+r5*8] | |
325 pxor m7, m7 | |
326 | |
327 .nextrow | |
328 movu m0, [r2-2] | |
329 mova m6, m0 | |
330 mova m4, m0 | |
331 punpcklbw m0, m7 ; ABCDEFGHI | |
332 mova m1, m0 | |
333 mova m2, m0 | |
334 mova m3, m0 | |
335 psrldq m1, 2 ; BCDEFGH | |
336 psrldq m2, 4 ; CDEFGH | |
337 psrldq m3, 6 ; DEFGH | |
338 psrldq m4, 4 | |
339 punpcklbw m4, m7 ; EFGH | |
340 mova m5, m4 | |
341 psrldq m5, 2 ; FGH | |
342 punpcklwd m0, m1 ; ABBCCDDE | |
343 punpcklwd m2, m3 ; CDDEEFFG | |
344 punpcklwd m4, m5 ; EFFGGHHI | |
345 pmaddwd m0, [r5-48] | |
346 pmaddwd m2, [r5-32] | |
347 pmaddwd m4, [r5-16] | |
348 paddd m0, m2 | |
349 paddd m0, m4 | |
350 | |
351 psrldq m6, 4 | |
352 mova m4, m6 | |
353 punpcklbw m6, m7 ; ABCDEFGHI | |
354 mova m1, m6 | |
355 mova m2, m6 | |
356 mova m3, m6 | |
357 psrldq m1, 2 ; BCDEFGH | |
358 psrldq m2, 4 ; CDEFGH | |
359 psrldq m3, 6 ; DEFGH | |
360 psrldq m4, 4 | |
361 punpcklbw m4, m7 ; EFGH | |
362 mova m5, m4 | |
363 psrldq m5, 2 ; FGH | |
364 punpcklwd m6, m1 ; ABBCCDDE | |
365 punpcklwd m2, m3 ; CDDEEFFG | |
366 punpcklwd m4, m5 ; EFFGGHHI | |
367 pmaddwd m6, [r5-48] | |
368 pmaddwd m2, [r5-32] | |
369 pmaddwd m4, [r5-16] | |
370 paddd m6, m2 | |
371 paddd m6, m4 | |
372 | |
373 packssdw m0, m6 | |
374 paddsw m0, [pw_64] | |
375 psraw m0, 7 | |
376 packuswb m0, m7 | |
377 movh [r0], m0 ; store | |
378 | |
379 ; go to next line | |
380 add r0, r1 | |
381 add r2, r3 | |
382 dec r4 ; next row | |
383 jg .nextrow | |
384 REP_RET | |
385 | |
386 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 | |
387 shl r5d, 4 | |
388 mova m2, [pw_64] | |
11991 | 389 mova m3, [filter_h4_shuf] |
390 mova m4, [filter_h6_shuf2] | |
11975 | 391 %ifdef PIC |
392 lea r11, [fourtap_filter_hb_m] | |
393 %endif | |
394 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes | |
395 mova m6, [fourtap_filter_hb+r5] | |
396 | |
397 .nextrow | |
398 movu m0, [r2-1] | |
399 mova m1, m0 | |
400 pshufb m0, m3 | |
401 pshufb m1, m4 | |
402 pmaddubsw m0, m5 | |
403 pmaddubsw m1, m6 | |
404 paddsw m0, m2 | |
405 paddsw m0, m1 | |
406 psraw m0, 7 | |
407 packuswb m0, m0 | |
408 movh [r0], m0 ; store | |
409 | |
410 ; go to next line | |
411 add r0, r1 | |
412 add r2, r3 | |
413 dec r4 ; next row | |
414 jg .nextrow | |
415 REP_RET | |
416 | |
417 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 | |
418 lea r5d, [r5*3] | |
11991 | 419 mova m3, [filter_h6_shuf1] |
420 mova m4, [filter_h6_shuf2] | |
11975 | 421 %ifdef PIC |
422 lea r11, [sixtap_filter_hb_m] | |
423 %endif | |
424 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes | |
425 mova m6, [sixtap_filter_hb+r5*8-32] | |
426 mova m7, [sixtap_filter_hb+r5*8-16] | |
427 | |
428 .nextrow | |
429 movu m0, [r2-2] | |
430 mova m1, m0 | |
431 mova m2, m0 | |
432 pshufb m0, m3 | |
433 pshufb m1, m4 | |
11991 | 434 pshufb m2, [filter_h6_shuf3] |
11975 | 435 pmaddubsw m0, m5 |
436 pmaddubsw m1, m6 | |
437 pmaddubsw m2, m7 | |
438 paddsw m0, m1 | |
439 paddsw m0, m2 | |
440 paddsw m0, [pw_64] | |
441 psraw m0, 7 | |
442 packuswb m0, m0 | |
443 movh [r0], m0 ; store | |
444 | |
445 ; go to next line | |
446 add r0, r1 | |
447 add r2, r3 | |
448 dec r4 ; next row | |
449 jg .nextrow | |
450 REP_RET | |
451 | |
452 %macro FILTER_V 3 | |
453 ; block of width <size> (macro argument 2: 4 or 8), V-only 4-tap filter | |
454 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | |
455 shl r6d, 5 | |
456 %ifdef PIC | |
457 lea r11, [fourtap_filter_v_m] | |
458 %endif | |
459 lea r6, [fourtap_filter_v+r6-32] | |
460 mova m6, [pw_64] | |
461 pxor m7, m7 | |
462 mova m5, [r6+48] | |
463 | |
464 ; read 3 lines | |
465 sub r2, r3 | |
466 movh m0, [r2] | |
467 movh m1, [r2+ r3] | |
468 movh m2, [r2+2*r3] | |
469 add r2, r3 | |
470 punpcklbw m0, m7 | |
471 punpcklbw m1, m7 | |
472 punpcklbw m2, m7 | |
473 | |
474 .nextrow | |
475 ; first calculate negative taps (to prevent losing positive overflows) | |
476 movh m4, [r2+2*r3] ; read new row | |
477 punpcklbw m4, m7 | |
478 mova m3, m4 | |
479 pmullw m0, [r6+0] | |
480 pmullw m4, m5 | |
481 paddsw m4, m0 | |
482 | |
483 ; then calculate positive taps | |
484 mova m0, m1 | |
485 pmullw m1, [r6+16] | |
486 paddsw m4, m1 | |
487 mova m1, m2 | |
488 pmullw m2, [r6+32] | |
489 paddsw m4, m2 | |
490 mova m2, m3 | |
491 | |
492 ; round/clip/store | |
493 paddsw m4, m6 | |
494 psraw m4, 7 | |
495 packuswb m4, m7 | |
496 movh [r0], m4 | |
497 | |
498 ; go to next line | |
499 add r0, r1 | |
500 add r2, r3 | |
501 dec r4 ; next row | |
502 jg .nextrow | |
503 REP_RET | |
504 | |
505 | |
506 ; block of width <size> (macro argument 2: 4 or 8), V-only 6-tap filter | |
507 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | |
508 shl r6d, 4 | |
509 lea r6, [r6*3] | |
510 %ifdef PIC | |
511 lea r11, [sixtap_filter_v_m] | |
512 %endif | |
513 lea r6, [sixtap_filter_v+r6-96] | |
514 pxor m7, m7 | |
515 | |
516 ; read 5 lines | |
517 sub r2, r3 | |
518 sub r2, r3 | |
519 movh m0, [r2] | |
520 movh m1, [r2+r3] | |
521 movh m2, [r2+r3*2] | |
522 lea r2, [r2+r3*2] | |
523 add r2, r3 | |
524 movh m3, [r2] | |
525 movh m4, [r2+r3] | |
526 punpcklbw m0, m7 | |
527 punpcklbw m1, m7 | |
528 punpcklbw m2, m7 | |
529 punpcklbw m3, m7 | |
530 punpcklbw m4, m7 | |
531 | |
532 .nextrow | |
533 ; first calculate negative taps (to prevent losing positive overflows) | |
534 mova m5, m1 | |
535 pmullw m5, [r6+16] | |
536 mova m6, m4 | |
537 pmullw m6, [r6+64] | |
538 paddsw m6, m5 | |
539 | |
540 ; then calculate positive taps | |
541 movh m5, [r2+2*r3] ; read new row | |
542 punpcklbw m5, m7 | |
543 pmullw m0, [r6+0] | |
544 paddsw m6, m0 | |
545 mova m0, m1 | |
546 mova m1, m2 | |
547 pmullw m2, [r6+32] | |
548 paddsw m6, m2 | |
549 mova m2, m3 | |
550 pmullw m3, [r6+48] | |
551 paddsw m6, m3 | |
552 mova m3, m4 | |
553 mova m4, m5 | |
554 pmullw m5, [r6+80] | |
555 paddsw m6, m5 | |
556 | |
557 ; round/clip/store | |
558 paddsw m6, [pw_64] | |
559 psraw m6, 7 | |
560 packuswb m6, m7 | |
561 movh [r0], m6 | |
562 | |
563 ; go to next line | |
564 add r0, r1 | |
565 add r2, r3 | |
566 dec r4 ; next row | |
567 jg .nextrow | |
568 REP_RET | |
569 %endmacro | |
570 | |
571 INIT_MMX | |
572 FILTER_V mmxext, 4, 0 | |
573 INIT_XMM | |
574 FILTER_V sse2, 8, 8 | |
575 | |
576 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8 | |
577 shl r6d, 4 | |
578 %ifdef PIC | |
579 lea r11, [fourtap_filter_hb_m] | |
580 %endif | |
581 mova m5, [fourtap_filter_hb+r6-16] | |
582 mova m6, [fourtap_filter_hb+r6] | |
583 mova m7, [pw_64] | |
584 | |
585 ; read 3 lines | |
586 sub r2, r3 | |
587 movh m0, [r2] | |
588 movh m1, [r2+ r3] | |
589 movh m2, [r2+2*r3] | |
590 add r2, r3 | |
591 | |
592 .nextrow | |
593 movh m3, [r2+2*r3] ; read new row | |
594 mova m4, m0 | |
595 mova m0, m1 | |
596 punpcklbw m4, m3 | |
597 punpcklbw m1, m2 | |
598 pmaddubsw m4, m5 | |
599 pmaddubsw m1, m6 | |
600 paddsw m4, m1 | |
601 mova m1, m2 | |
602 paddsw m4, m7 | |
603 mova m2, m3 | |
604 psraw m4, 7 | |
605 packuswb m4, m4 | |
606 movh [r0], m4 | |
607 | |
608 ; go to next line | |
609 add r0, r1 | |
610 add r2, r3 | |
611 dec r4 ; next row | |
612 jg .nextrow | |
613 REP_RET | |
614 | |
615 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8 | |
616 lea r6d, [r6*3] | |
617 %ifdef PIC | |
618 lea r11, [sixtap_filter_hb_m] | |
619 %endif | |
620 lea r6, [sixtap_filter_hb+r6*8] | |
621 | |
622 ; read 5 lines | |
623 sub r2, r3 | |
624 sub r2, r3 | |
625 movh m0, [r2] | |
626 movh m1, [r2+r3] | |
627 movh m2, [r2+r3*2] | |
628 lea r2, [r2+r3*2] | |
629 add r2, r3 | |
630 movh m3, [r2] | |
631 movh m4, [r2+r3] | |
632 | |
633 .nextrow | |
634 movh m5, [r2+2*r3] ; read new row | |
635 mova m6, m0 | |
636 punpcklbw m6, m5 | |
637 mova m0, m1 | |
638 punpcklbw m1, m2 | |
639 mova m7, m3 | |
640 punpcklbw m7, m4 | |
641 pmaddubsw m6, [r6-48] | |
642 pmaddubsw m1, [r6-32] | |
643 pmaddubsw m7, [r6-16] | |
644 paddsw m6, m1 | |
645 paddsw m6, m7 | |
646 mova m1, m2 | |
647 paddsw m6, [pw_64] | |
648 mova m2, m3 | |
649 psraw m6, 7 | |
650 mova m3, m4 | |
651 packuswb m6, m6 | |
652 mova m4, m5 | |
653 movh [r0], m6 | |
654 | |
655 ; go to next line | |
656 add r0, r1 | |
657 add r2, r3 | |
658 dec r4 ; next row | |
659 jg .nextrow | |
660 REP_RET | |
661 | |
11991 | 662 %macro FILTER_BILINEAR 3 |
663 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | |
664 mov r5d, 8*16 | |
665 shl r6d, 4 | |
666 sub r5d, r6d | |
667 %ifdef PIC | |
668 lea r11, [bilinear_filter_vw_m] | |
669 %endif | |
670 pxor m6, m6 | |
12000 | 671 mova m4, [bilinear_filter_vw+r5-16] |
672 mova m5, [bilinear_filter_vw+r6-16] | |
11991 | 673 .nextrow |
674 movh m0, [r2+r3*0] | |
675 movh m1, [r2+r3*1] | |
676 movh m3, [r2+r3*2] | |
677 punpcklbw m0, m6 | |
678 punpcklbw m1, m6 | |
679 punpcklbw m3, m6 | |
680 mova m2, m1 | |
681 pmullw m0, m4 | |
682 pmullw m1, m5 | |
683 pmullw m2, m4 | |
684 pmullw m3, m5 | |
685 paddsw m0, m1 | |
686 paddsw m2, m3 | |
687 psraw m0, 2 | |
688 psraw m2, 2 | |
689 pavgw m0, m6 | |
690 pavgw m2, m6 | |
691 %ifidn %1, mmxext | |
692 packuswb m0, m0 | |
693 packuswb m2, m2 | |
694 movh [r0+r1*0], m0 | |
695 movh [r0+r1*1], m2 | |
696 %else | |
697 packuswb m0, m2 | |
698 movh [r0+r1*0], m0 | |
699 movhps [r0+r1*1], m0 | |
700 %endif | |
701 | |
702 lea r0, [r0+r1*2] | |
703 lea r2, [r2+r3*2] | |
704 sub r4, 2 | |
705 jg .nextrow | |
706 REP_RET | |
707 | |
708 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | |
709 mov r6d, 8*16 | |
710 shl r5d, 4 | |
711 sub r6d, r5d | |
712 %ifdef PIC | |
713 lea r11, [bilinear_filter_vw_m] | |
714 %endif | |
715 pxor m6, m6 | |
12000 | 716 mova m4, [bilinear_filter_vw+r6-16] |
717 mova m5, [bilinear_filter_vw+r5-16] | |
11991 | 718 .nextrow |
719 movh m0, [r2+r3*0+0] | |
720 movh m1, [r2+r3*0+1] | |
721 movh m2, [r2+r3*1+0] | |
722 movh m3, [r2+r3*1+1] | |
723 punpcklbw m0, m6 | |
724 punpcklbw m1, m6 | |
725 punpcklbw m2, m6 | |
726 punpcklbw m3, m6 | |
727 pmullw m0, m4 | |
728 pmullw m1, m5 | |
729 pmullw m2, m4 | |
730 pmullw m3, m5 | |
731 paddsw m0, m1 | |
732 paddsw m2, m3 | |
733 psraw m0, 2 | |
734 psraw m2, 2 | |
735 pavgw m0, m6 | |
736 pavgw m2, m6 | |
737 %ifidn %1, mmxext | |
738 packuswb m0, m0 | |
739 packuswb m2, m2 | |
740 movh [r0+r1*0], m0 | |
741 movh [r0+r1*1], m2 | |
742 %else | |
743 packuswb m0, m2 | |
744 movh [r0+r1*0], m0 | |
745 movhps [r0+r1*1], m0 | |
746 %endif | |
747 | |
748 lea r0, [r0+r1*2] | |
749 lea r2, [r2+r3*2] | |
750 sub r4, 2 | |
751 jg .nextrow | |
752 REP_RET | |
753 %endmacro | |
754 | |
755 INIT_MMX | |
756 FILTER_BILINEAR mmxext, 4, 0 | |
757 INIT_XMM | |
758 FILTER_BILINEAR sse2, 8, 7 | |
759 | |
760 cglobal put_vp8_bilinear8_v_ssse3, 7,7,5 | |
761 shl r6d, 4 | |
762 %ifdef PIC | |
763 lea r11, [bilinear_filter_vb_m] | |
764 %endif | |
765 pxor m4, m4 | |
12000 | 766 mova m3, [bilinear_filter_vb+r6-16] |
11991 | 767 .nextrow |
768 movh m0, [r2+r3*0] | |
769 movh m1, [r2+r3*1] | |
770 movh m2, [r2+r3*2] | |
771 punpcklbw m0, m1 | |
772 punpcklbw m1, m2 | |
773 pmaddubsw m0, m3 | |
774 pmaddubsw m1, m3 | |
775 psraw m0, 2 | |
776 psraw m1, 2 | |
777 pavgw m0, m4 | |
778 pavgw m1, m4 | |
779 packuswb m0, m1 | |
780 movh [r0+r1*0], m0 | |
781 movhps [r0+r1*1], m0 | |
782 | |
783 lea r0, [r0+r1*2] | |
784 lea r2, [r2+r3*2] | |
785 sub r4, 2 | |
786 jg .nextrow | |
787 REP_RET | |
788 | |
789 cglobal put_vp8_bilinear8_h_ssse3, 7,7,5 | |
790 shl r5d, 4 | |
791 %ifdef PIC | |
792 lea r11, [bilinear_filter_vb_m] | |
793 %endif | |
794 pxor m4, m4 | |
795 mova m2, [filter_h2_shuf] | |
12000 | 796 mova m3, [bilinear_filter_vb+r5-16] |
11991 | 797 .nextrow |
798 movu m0, [r2+r3*0] | |
799 movu m1, [r2+r3*1] | |
800 pshufb m0, m2 | |
801 pshufb m1, m2 | |
802 pmaddubsw m0, m3 | |
803 pmaddubsw m1, m3 | |
804 psraw m0, 2 | |
805 psraw m1, 2 | |
806 pavgw m0, m4 | |
807 pavgw m1, m4 | |
808 packuswb m0, m1 | |
809 movh [r0+r1*0], m0 | |
810 movhps [r0+r1*1], m0 | |
811 | |
812 lea r0, [r0+r1*2] | |
813 lea r2, [r2+r3*2] | |
814 sub r4, 2 | |
815 jg .nextrow | |
816 REP_RET | |
817 | |
11992 | 818 cglobal put_vp8_pixels8_mmx, 5,5 |
819 .nextrow: | |
820 movq mm0, [r2+r3*0] | |
821 movq mm1, [r2+r3*1] | |
822 lea r2, [r2+r3*2] | |
823 movq [r0+r1*0], mm0 | |
824 movq [r0+r1*1], mm1 | |
825 lea r0, [r0+r1*2] | |
826 sub r4d, 2 | |
827 jg .nextrow | |
828 REP_RET | |
829 | |
830 cglobal put_vp8_pixels16_mmx, 5,5 | |
831 .nextrow: | |
832 movq mm0, [r2+r3*0+0] | |
833 movq mm1, [r2+r3*0+8] | |
834 movq mm2, [r2+r3*1+0] | |
835 movq mm3, [r2+r3*1+8] | |
836 lea r2, [r2+r3*2] | |
837 movq [r0+r1*0+0], mm0 | |
838 movq [r0+r1*0+8], mm1 | |
839 movq [r0+r1*1+0], mm2 | |
840 movq [r0+r1*1+8], mm3 | |
841 lea r0, [r0+r1*2] | |
842 sub r4d, 2 | |
843 jg .nextrow | |
844 REP_RET | |
845 | |
846 cglobal put_vp8_pixels16_sse, 5,5,2 | |
847 .nextrow: | |
848 movups xmm0, [r2+r3*0] | |
849 movups xmm1, [r2+r3*1] | |
850 lea r2, [r2+r3*2] | |
851 movaps [r0+r1*0], xmm0 | |
852 movaps [r0+r1*1], xmm1 | |
853 lea r0, [r0+r1*2] | |
854 sub r4d, 2 | |
855 jg .nextrow | |
856 REP_RET | |
857 | |
11975 | 858 ;----------------------------------------------------------------------------- |
859 ; IDCT functions: | |
860 ; | |
861 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
862 ;----------------------------------------------------------------------------- | |
863 | |
864 cglobal vp8_idct_dc_add_mmx, 3, 3 | |
865 ; load data | |
866 movd mm0, [r1] | |
867 | |
868 ; calculate DC | |
869 paddw mm0, [pw_4] | |
870 pxor mm1, mm1 | |
871 psraw mm0, 3 | |
872 psubw mm1, mm0 | |
873 packuswb mm0, mm0 | |
874 packuswb mm1, mm1 | |
875 punpcklbw mm0, mm0 | |
876 punpcklbw mm1, mm1 | |
877 punpcklwd mm0, mm0 | |
878 punpcklwd mm1, mm1 | |
879 | |
880 ; add DC | |
881 lea r1, [r0+r2*2] | |
882 movd mm2, [r0] | |
883 movd mm3, [r0+r2] | |
884 movd mm4, [r1] | |
885 movd mm5, [r1+r2] | |
886 paddusb mm2, mm0 | |
887 paddusb mm3, mm0 | |
888 paddusb mm4, mm0 | |
889 paddusb mm5, mm0 | |
890 psubusb mm2, mm1 | |
891 psubusb mm3, mm1 | |
892 psubusb mm4, mm1 | |
893 psubusb mm5, mm1 | |
894 movd [r0], mm2 | |
895 movd [r0+r2], mm3 | |
896 movd [r1], mm4 | |
897 movd [r1+r2], mm5 | |
898 RET | |
899 | |
900 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | |
901 ; load data | |
902 movd xmm0, [r1] | |
903 lea r1, [r0+r2*2] | |
904 pxor xmm1, xmm1 | |
905 movq xmm2, [pw_4] | |
906 | |
907 ; calculate DC | |
908 paddw xmm0, xmm2 | |
909 movd xmm2, [r0] | |
910 movd xmm3, [r0+r2] | |
911 movd xmm4, [r1] | |
912 movd xmm5, [r1+r2] | |
913 psraw xmm0, 3 | |
914 pshuflw xmm0, xmm0, 0 | |
915 punpcklqdq xmm0, xmm0 | |
916 punpckldq xmm2, xmm3 | |
917 punpckldq xmm4, xmm5 | |
918 punpcklbw xmm2, xmm1 | |
919 punpcklbw xmm4, xmm1 | |
920 paddw xmm2, xmm0 | |
921 paddw xmm4, xmm0 | |
922 packuswb xmm2, xmm4 | |
923 movd [r0], xmm2 | |
924 pextrd [r0+r2], xmm2, 1 | |
925 pextrd [r1], xmm2, 2 | |
926 pextrd [r1+r2], xmm2, 3 | |
927 RET | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
928 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
929 ;----------------------------------------------------------------------------- |
12013 | 930 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
931 ;----------------------------------------------------------------------------- | |
932 | |
933 ; calculate %1=%2+%1; %2=%2-%1, with %3=temp register | |
934 %macro SUMSUB 3 | |
935 mova %3, %1 | |
936 paddw %1, %2 | |
937 psubw %2, %3 | |
938 %endmacro | |
939 | |
940 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) | |
941 ; this macro assumes that m6/m7 have words for 20091/17734 loaded | |
942 %macro VP8_MULTIPLY_SUMSUB 4 | |
943 mova %3, %1 | |
944 mova %4, %2 | |
945 pmulhw %3, m6 ;20091(1) | |
946 pmulhw %4, m6 ;20091(2) | |
947 paddw %3, %1 | |
948 paddw %4, %2 | |
949 psllw %1, 1 | |
950 psllw %2, 1 | |
951 pmulhw %1, m7 ;35468(1) | |
952 pmulhw %2, m7 ;35468(2) | |
953 psubw %1, %4 | |
954 paddw %2, %3 | |
955 %endmacro | |
956 | |
957 ; calculate x0=%1+%3; x1=%1-%3 | |
958 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) | |
959 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) | |
960 ; %5/%6 are temporary registers | |
961 ; we assume m6/m7 have constant words 20091/17734 loaded in them | |
962 %macro VP8_IDCT_TRANSFORM4x4_1D 6 | |
963 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 | |
964 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 | |
965 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 | |
966 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 | |
967 SWAP %4, %1 | |
968 SWAP %4, %3 | |
969 %endmacro | |
970 | |
971 ; transpose a 4x4 table | |
972 %macro TRANSPOSE4x4 5 ; output in %1/%4/%5/%3 | |
973 mova m%5, m%1 | |
974 punpcklwd m%1, m%2 | |
975 punpckhwd m%5, m%2 | |
976 mova m%2, m%3 | |
977 punpcklwd m%3, m%4 | |
978 punpckhwd m%2, m%4 | |
979 mova m%4, m%1 | |
980 punpckldq m%1, m%3 ;col0 | |
981 punpckhdq m%4, m%3 ;col1 | |
982 mova m%3, m%5 | |
983 punpckldq m%5, m%2 ;col2 | |
984 punpckhdq m%3, m%2 ;col3 | |
985 SWAP %4, %2 | |
986 SWAP %4, %5 | |
987 SWAP %4, %3 | |
988 %endmacro | |
989 | |
990 INIT_MMX | |
991 cglobal vp8_idct_add_mmx, 3, 3 | |
992 ; load block data | |
993 movq m0, [r1] | |
994 movq m1, [r1+8] | |
995 movq m2, [r1+16] | |
996 movq m3, [r1+24] | |
997 movq m6, [pw_20091] | |
998 movq m7, [pw_17734] | |
999 | |
1000 ; actual IDCT | |
1001 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1002 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1003 paddw m0, [pw_4] | |
1004 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1005 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1006 | |
1007 ; store | |
1008 pxor m4, m4 | |
1009 lea r1, [r0+2*r2] | |
1010 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 | |
1011 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | |
1012 | |
1013 RET | |
1014 | |
1015 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1016 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1017 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1018 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1019 %macro SCATTER_WHT 1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1020 pextrw r1d, m0, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1021 pextrw r2d, m1, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1022 mov [r0+2*16*0], r1w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1023 mov [r0+2*16*1], r2w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1024 pextrw r1d, m2, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1025 pextrw r2d, m3, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1026 mov [r0+2*16*2], r1w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1027 mov [r0+2*16*3], r2w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1028 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1029 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1030 %macro HADAMARD4_1D 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1031 SUMSUB_BADC m%2, m%1, m%4, m%3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1032 SUMSUB_BADC m%4, m%2, m%3, m%1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1033 SWAP %1, %4, %3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1034 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1035 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1036 INIT_MMX |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1037 cglobal vp8_luma_dc_wht_mmxext, 2,3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1038 movq m0, [r1] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1039 movq m1, [r1+8] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1040 movq m2, [r1+16] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1041 movq m3, [r1+24] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1042 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1043 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1044 paddw m0, [pw_3] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1045 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1046 psraw m0, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1047 psraw m1, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1048 psraw m2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1049 psraw m3, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1050 SCATTER_WHT 0 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1051 add r0, 2*16*4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1052 SCATTER_WHT 1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1053 add r0, 2*16*4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1054 SCATTER_WHT 2 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1055 add r0, 2*16*4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1056 SCATTER_WHT 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1057 RET |