Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12043:f9a0bd0888a4 libavcodec
mpegaudio: call ff_mpegaudiodec_init_mmx() only from float decoder
The mmx code is floating-point only, and this function does not know
from which decoder it is called. Without this change, the integer
decoder only "works" because the size of the context struct is smaller
in this case, and the mmx init function writes the function pointer
outside the allocated context.
author | mru |
---|---|
date | Thu, 01 Jul 2010 23:21:17 +0000 |
parents | 1b11083f4bb4 |
children | b8f80fe02861 |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
; Four-tap coefficients as words, interleaved in pairs for pmaddwd.
; Each filter occupies two 16-byte rows: (F0,F1) x4, then (F2,F3) x4.
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
; Six-tap coefficients as words, interleaved in pairs for pmaddwd.
; Each filter occupies three 16-byte rows: (F0,F1), (F2,F3), (F4,F5).
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
; Four-tap coefficients as byte pairs for pmaddubsw. Compared with the word
; table above, each filter is stored as (F0,F3) x8, then (F1,F2) x8, i.e. the
; two outer taps share a row and the two inner taps share a row.
47 fourtap_filter_hb_m: times 8 db -6, -1 | |
48 times 8 db 123, 12 | |
49 times 8 db -9, -6 | |
50 times 8 db 93, 50 | |
51 times 8 db -6, -9 | |
52 times 8 db 50, 93 | |
53 times 8 db -1, -6 | |
54 times 8 db 12, 123 | |
55 | |
; Six-tap coefficients as byte pairs for pmaddubsw. Compared with the word
; table above, each filter is stored as (F0,F5), (F1,F2), (F3,F4).
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
; Four-tap coefficients for the vertical pmullw path: one tap per 16-byte
; row, broadcast across 8 words; four rows (F0..F3) per filter.
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
; Six-tap coefficients for the vertical pmullw path: one tap per 16-byte
; row, broadcast across 8 words; six rows (F0..F5) per filter.
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
; Bilinear weights 1..7 as words, one 16-byte row per weight. Row k-1 holds
; weight k, which is why the users index this table with (k*16 - 16).
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
; Bilinear weights as byte pairs (8-k, k) for k = 1..7, one 16-byte row per
; k, so a single pmaddubsw produces a*(8-k) + b*k.
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
; Table aliases used by the functions below.
; With PIC defined, every alias expands to r11: each function first does
; "lea r11, [<table>_m]" and then addresses the table as [r11+offset].
; Without PIC, the alias is the table symbol itself and is addressed directly.
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
; pshufb masks that build the byte pairs expected by the *_hb_m tables.
; h2: adjacent pairs (i, i+1)      - bilinear horizontal
; h4: pairs (i, i+3)               - outer taps of the 4-tap filter
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 | |
11975 | 140 |
; h6 shuf1: pairs (i, i+5); shuf2: pairs (i+1, i+2); shuf3: pairs (i+3, i+4)
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
; Word constants loaded into m6/m7 by vp8_idct_add_mmx for the IDCT multiplies.
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
148 cextern pw_3 |
11975 | 149 cextern pw_4 |
150 cextern pw_64 | |
151 | |
152 SECTION .text | |
153 | |
154 ;----------------------------------------------------------------------------- | |
155 ; subpel MC functions: | |
156 ; | |
157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
158 ; uint8_t *src, int srcstride, | |
159 ; int height, int mx, int my); | |
160 ;----------------------------------------------------------------------------- | |
161 | |
162 ; 4x4 block, H-only 4-tap filter | |
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r5 = filter index
; (scaled by 16 to a byte offset; rows at offset-16 and offset are used).
; Each row produces 4 bytes: for every output pixel the four source bytes
; starting at [r2-1] are multiplied by F0..F3, summed, rounded with +64,
; shifted right by 7 and clipped to 0..255 by packuswb.
; NOTE(review): the prototype above declares height as int, but the loop
; counter is decremented as full-width r4 - confirm the upper half is clean
; on 64-bit targets.
163 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
164 shl r5d, 4 | |
165 %ifdef PIC | |
166 lea r11, [fourtap_filter_hw_m] | |
167 %endif | |
168 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
169 movq mm5, [fourtap_filter_hw+r5] | |
170 movq mm7, [pw_64] | |
171 pxor mm6, mm6 | |
172 | |
173 .nextrow | |
174 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels | |
175 | |
176 ; first set of 2 pixels | |
177 movq mm2, mm1 ; byte ABCD.. | |
178 punpcklbw mm1, mm6 ; byte->word ABCD | |
179 pshufw mm0, mm2, 9 ; byte CDEF.. | |
180 punpcklbw mm0, mm6 ; byte->word CDEF | |
181 pshufw mm3, mm1, 0x94 ; word ABBC | |
182 pshufw mm1, mm0, 0x94 ; word CDDE | |
183 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
184 movq mm0, mm1 ; backup for second set of pixels | |
185 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
186 paddd mm3, mm1 ; finish 1st 2px | |
187 | |
188 ; second set of 2 pixels, use backup of above | |
189 punpckhbw mm2, mm6 ; byte->word EFGH | |
190 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
191 pshufw mm1, mm2, 0x94 ; word EFFG | |
192 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
193 paddd mm0, mm1 ; finish 2nd 2px | |
194 | |
195 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
196 packssdw mm3, mm0 ; merge dword->word (4px) | |
197 paddsw mm3, mm7 ; rounding | |
198 psraw mm3, 7 | |
199 packuswb mm3, mm6 ; clip and word->bytes | |
200 movd [r0], mm3 ; store | |
201 | |
202 ; go to next line | |
203 add r0, r1 | |
204 add r2, r3 | |
205 dec r4 ; next row | |
206 jg .nextrow | |
207 REP_RET | |
208 | |
209 ; 4x4 block, H-only 6-tap filter | |
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r5 = filter index.
; r5 is multiplied by 3 here and by 8 in the addressing, i.e. 24 bytes per
; index step; the three 16-byte coefficient rows sit at -48/-32/-16.
; mm3 doubles as a temporary and as the zero register, so it is re-zeroed
; inside the loop before it is needed for unpacking again.
210 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
211 lea r5d, [r5*3] | |
212 %ifdef PIC | |
213 lea r11, [sixtap_filter_hw_m] | |
214 %endif | |
215 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words | |
216 movq mm5, [sixtap_filter_hw+r5*8-32] | |
217 movq mm6, [sixtap_filter_hw+r5*8-16] | |
218 movq mm7, [pw_64] | |
219 pxor mm3, mm3 | |
220 | |
221 .nextrow | |
222 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels | |
223 | |
224 ; first set of 2 pixels | |
225 movq mm2, mm1 ; byte ABCD.. | |
226 punpcklbw mm1, mm3 ; byte->word ABCD | |
227 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
228 punpckhbw mm2, mm3 ; byte->word EFGH | |
229 punpcklbw mm0, mm3 ; byte->word CDEF | |
230 pshufw mm1, mm1, 0x94 ; word ABBC | |
231 pshufw mm2, mm2, 0x94 ; word EFFG | |
232 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
233 pshufw mm3, mm0, 0x94 ; word CDDE | |
234 movq mm0, mm3 ; backup for second set of pixels | |
235 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
236 paddd mm1, mm3 ; add to 1st 2px cache | |
237 movq mm3, mm2 ; backup for second set of pixels | |
238 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
239 paddd mm1, mm2 ; finish 1st 2px | |
240 | |
241 ; second set of 2 pixels, use backup of above | |
242 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
243 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
244 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
245 paddd mm0, mm3 ; add to 2nd 2px cache | |
246 pxor mm3, mm3 | |
247 punpcklbw mm2, mm3 ; byte->word FGHI | |
248 pshufw mm2, mm2, 0xE9 ; word GHHI | |
249 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
250 paddd mm0, mm2 ; finish 2nd 2px | |
251 | |
252 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
253 packssdw mm1, mm0 ; merge dword->word (4px) | |
254 paddsw mm1, mm7 ; rounding | |
255 psraw mm1, 7 | |
256 packuswb mm1, mm3 ; clip and word->bytes | |
257 movd [r0], mm1 ; store | |
258 | |
259 ; go to next line | |
260 add r0, r1 | |
261 add r2, r3 | |
262 dec r4 ; next row | |
263 jg .nextrow | |
264 REP_RET | |
265 | |
266 ; 8-pixel-wide block, H-only 4-tap filter | |
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r5 = filter index
; (scaled by 16; coefficient rows at offset-16 and offset).
; Each row is computed as two halves of 4 pixels: the first half reads 8
; bytes at [r2-1], the second half reads 8 bytes at [r2+3]. The letter
; comments in the second half are relative to that second load.
267 INIT_XMM | |
268 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 | |
269 shl r5d, 4 | |
270 %ifdef PIC | |
271 lea r11, [fourtap_filter_hw_m] | |
272 %endif | |
273 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
274 mova m6, [fourtap_filter_hw+r5] | |
275 pxor m7, m7 | |
276 | |
277 .nextrow | |
278 movh m0, [r2-1] | |
279 punpcklbw m0, m7 ; ABCDEFGH | |
280 mova m1, m0 | |
281 mova m2, m0 | |
282 mova m3, m0 | |
283 psrldq m1, 2 ; BCDEFGH | |
284 psrldq m2, 4 ; CDEFGH | |
285 psrldq m3, 6 ; DEFGH | |
286 punpcklwd m0, m1 ; ABBCCDDE | |
287 punpcklwd m2, m3 ; CDDEEFFG | |
288 pmaddwd m0, m5 | |
289 pmaddwd m2, m6 | |
290 paddd m0, m2 | |
291 | |
292 movh m1, [r2+3] | |
293 punpcklbw m1, m7 ; ABCDEFGH | |
294 mova m2, m1 | |
295 mova m3, m1 | |
296 mova m4, m1 | |
297 psrldq m2, 2 ; BCDEFGH | |
298 psrldq m3, 4 ; CDEFGH | |
299 psrldq m4, 6 ; DEFGH | |
300 punpcklwd m1, m2 ; ABBCCDDE | |
301 punpcklwd m3, m4 ; CDDEEFFG | |
302 pmaddwd m1, m5 | |
303 pmaddwd m3, m6 | |
304 paddd m1, m3 | |
305 | |
; merge both halves to 8 words, add rounding 64, >>7, clip to bytes
306 packssdw m0, m1 | |
307 paddsw m0, [pw_64] | |
308 psraw m0, 7 | |
309 packuswb m0, m7 | |
310 movh [r0], m0 ; store | |
311 | |
312 ; go to next line | |
313 add r0, r1 | |
314 add r2, r3 | |
315 dec r4 ; next row | |
316 jg .nextrow | |
317 REP_RET | |
318 | |
; 8-pixel-wide block, H-only 6-tap filter (word multiplies via pmaddwd).
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter. r5 enters as the filter
; index and is turned into a pointer (table + index*24); the three coefficient
; rows are then read at [r5-48], [r5-32] and [r5-16].
; One 16-byte unaligned load at [r2-2] feeds both 4-pixel halves; m6 keeps
; the raw bytes so the second half can be derived by shifting them 4 bytes.
319 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 | |
320 lea r5d, [r5*3] | |
321 %ifdef PIC | |
322 lea r11, [sixtap_filter_hw_m] | |
323 %endif | |
324 lea r5, [sixtap_filter_hw+r5*8] | |
325 pxor m7, m7 | |
326 | |
327 .nextrow | |
328 movu m0, [r2-2] | |
329 mova m6, m0 | |
330 mova m4, m0 | |
331 punpcklbw m0, m7 ; ABCDEFGHI | |
332 mova m1, m0 | |
333 mova m2, m0 | |
334 mova m3, m0 | |
335 psrldq m1, 2 ; BCDEFGH | |
336 psrldq m2, 4 ; CDEFGH | |
337 psrldq m3, 6 ; DEFGH | |
338 psrldq m4, 4 | |
339 punpcklbw m4, m7 ; EFGH | |
340 mova m5, m4 | |
341 psrldq m5, 2 ; FGH | |
342 punpcklwd m0, m1 ; ABBCCDDE | |
343 punpcklwd m2, m3 ; CDDEEFFG | |
344 punpcklwd m4, m5 ; EFFGGHHI | |
345 pmaddwd m0, [r5-48] | |
346 pmaddwd m2, [r5-32] | |
347 pmaddwd m4, [r5-16] | |
348 paddd m0, m2 | |
349 paddd m0, m4 | |
350 | |
; second half: same sequence on the source bytes shifted by 4
351 psrldq m6, 4 | |
352 mova m4, m6 | |
353 punpcklbw m6, m7 ; ABCDEFGHI | |
354 mova m1, m6 | |
355 mova m2, m6 | |
356 mova m3, m6 | |
357 psrldq m1, 2 ; BCDEFGH | |
358 psrldq m2, 4 ; CDEFGH | |
359 psrldq m3, 6 ; DEFGH | |
360 psrldq m4, 4 | |
361 punpcklbw m4, m7 ; EFGH | |
362 mova m5, m4 | |
363 psrldq m5, 2 ; FGH | |
364 punpcklwd m6, m1 ; ABBCCDDE | |
365 punpcklwd m2, m3 ; CDDEEFFG | |
366 punpcklwd m4, m5 ; EFFGGHHI | |
367 pmaddwd m6, [r5-48] | |
368 pmaddwd m2, [r5-32] | |
369 pmaddwd m4, [r5-16] | |
370 paddd m6, m2 | |
371 paddd m6, m4 | |
372 | |
; merge both halves to 8 words, add rounding 64, >>7, clip to bytes
373 packssdw m0, m6 | |
374 paddsw m0, [pw_64] | |
375 psraw m0, 7 | |
376 packuswb m0, m7 | |
377 movh [r0], m0 ; store | |
378 | |
379 ; go to next line | |
380 add r0, r1 | |
381 add r2, r3 | |
382 dec r4 ; next row | |
383 jg .nextrow | |
384 REP_RET | |
385 | |
; 8-pixel-wide block, H-only 4-tap filter using pshufb + pmaddubsw.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r5 = filter index
; (scaled by 16; coefficient rows at offset-16 and offset).
; m3 (filter_h4_shuf) pairs bytes (i, i+3) to match the outer-tap row of
; fourtap_filter_hb_m; m4 (filter_h6_shuf2) pairs bytes (i+1, i+2) to match
; the inner-tap row. The h6 mask is reused here because it yields exactly
; those adjacent pairs.
386 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 | |
387 shl r5d, 4 | |
388 mova m2, [pw_64] | |
11991 | 389 mova m3, [filter_h4_shuf] |
390 mova m4, [filter_h6_shuf2] | |
11975 | 391 %ifdef PIC |
392 lea r11, [fourtap_filter_hb_m] | |
393 %endif | |
394 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes | |
395 mova m6, [fourtap_filter_hb+r5] | |
396 | |
397 .nextrow | |
398 movu m0, [r2-1] | |
399 mova m1, m0 | |
400 pshufb m0, m3 | |
401 pshufb m1, m4 | |
402 pmaddubsw m0, m5 | |
403 pmaddubsw m1, m6 | |
; rounding constant is added to the first partial sum before the second
404 paddsw m0, m2 | |
405 paddsw m0, m1 | |
406 psraw m0, 7 | |
407 packuswb m0, m0 | |
408 movh [r0], m0 ; store | |
409 | |
410 ; go to next line | |
411 add r0, r1 | |
412 add r2, r3 | |
413 dec r4 ; next row | |
414 jg .nextrow | |
415 REP_RET | |
416 | |
; 8-pixel-wide block, H-only 6-tap filter using pshufb + pmaddubsw.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r5 = filter index
; (times 3 here, times 8 in the addressing; rows at -48/-32/-16).
; The three shuffle masks pair bytes (i,i+5), (i+1,i+2) and (i+3,i+4), in
; the same order as the three coefficient rows of sixtap_filter_hb_m.
; All eight declared vector registers are occupied, so the third mask and
; pw_64 are used as memory operands inside the loop.
417 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 | |
418 lea r5d, [r5*3] | |
11991 | 419 mova m3, [filter_h6_shuf1] |
420 mova m4, [filter_h6_shuf2] | |
11975 | 421 %ifdef PIC |
422 lea r11, [sixtap_filter_hb_m] | |
423 %endif | |
424 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes | |
425 mova m6, [sixtap_filter_hb+r5*8-32] | |
426 mova m7, [sixtap_filter_hb+r5*8-16] | |
427 | |
428 .nextrow | |
429 movu m0, [r2-2] | |
430 mova m1, m0 | |
431 mova m2, m0 | |
432 pshufb m0, m3 | |
433 pshufb m1, m4 | |
11991 | 434 pshufb m2, [filter_h6_shuf3] |
11975 | 435 pmaddubsw m0, m5 |
436 pmaddubsw m1, m6 | |
437 pmaddubsw m2, m7 | |
438 paddsw m0, m1 | |
439 paddsw m0, m2 | |
440 paddsw m0, [pw_64] | |
441 psraw m0, 7 | |
442 packuswb m0, m0 | |
443 movh [r0], m0 ; store | |
444 | |
445 ; go to next line | |
446 add r0, r1 | |
447 add r2, r3 | |
448 dec r4 ; next row | |
449 jg .nextrow | |
450 REP_RET | |
451 | |
; FILTER_V opt, width, nregs
; Emits the vertical-only 4-tap and 6-tap functions put_vp8_epel<width>_v4_<opt>
; and put_vp8_epel<width>_v6_<opt>. The third argument is passed through as
; the last cglobal argument. Instantiated below for mmxext (width 4) and
; sse2 (width 8).
; Register use in both functions: r0 = store pointer, r1 = store step per
; row, r2 = load pointer, r3 = load step per row, r4 = row counter,
; r6 = filter index, turned into a pointer to the selected filter's first
; 16-byte coefficient row.
; The source rows are kept unpacked to words in registers and rotated down by
; one each iteration, so only one new row is loaded per output row.
452 %macro FILTER_V 3 | |
453 ; 4x4 block, V-only 4-tap filter | |
454 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | |
455 shl r6d, 5 | |
456 %ifdef PIC | |
457 lea r11, [fourtap_filter_v_m] | |
458 %endif | |
459 lea r6, [fourtap_filter_v+r6-32] | |
460 mova m6, [pw_64] | |
461 pxor m7, m7 | |
462 mova m5, [r6+48] | |
463 | |
464 ; read 3 lines | |
465 sub r2, r3 | |
466 movh m0, [r2] | |
467 movh m1, [r2+ r3] | |
468 movh m2, [r2+2*r3] | |
469 add r2, r3 | |
470 punpcklbw m0, m7 | |
471 punpcklbw m1, m7 | |
472 punpcklbw m2, m7 | |
473 | |
474 .nextrow | |
475 ; first calculate negative taps (to prevent losing positive overflows) | |
476 movh m4, [r2+2*r3] ; read new row | |
477 punpcklbw m4, m7 | |
478 mova m3, m4 | |
479 pmullw m0, [r6+0] | |
480 pmullw m4, m5 | |
481 paddsw m4, m0 | |
482 | |
483 ; then calculate positive taps | |
484 mova m0, m1 | |
485 pmullw m1, [r6+16] | |
486 paddsw m4, m1 | |
487 mova m1, m2 | |
488 pmullw m2, [r6+32] | |
489 paddsw m4, m2 | |
490 mova m2, m3 | |
491 | |
492 ; round/clip/store | |
493 paddsw m4, m6 | |
494 psraw m4, 7 | |
495 packuswb m4, m7 | |
496 movh [r0], m4 | |
497 | |
498 ; go to next line | |
499 add r0, r1 | |
500 add r2, r3 | |
501 dec r4 ; next row | |
502 jg .nextrow | |
503 REP_RET | |
504 | |
505 | |
506 ; 4x4 block, V-only 6-tap filter | |
507 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | |
508 shl r6d, 4 | |
509 lea r6, [r6*3] | |
510 %ifdef PIC | |
511 lea r11, [sixtap_filter_v_m] | |
512 %endif | |
513 lea r6, [sixtap_filter_v+r6-96] | |
514 pxor m7, m7 | |
515 | |
516 ; read 5 lines | |
517 sub r2, r3 | |
518 sub r2, r3 | |
519 movh m0, [r2] | |
520 movh m1, [r2+r3] | |
521 movh m2, [r2+r3*2] | |
522 lea r2, [r2+r3*2] | |
523 add r2, r3 | |
524 movh m3, [r2] | |
525 movh m4, [r2+r3] | |
526 punpcklbw m0, m7 | |
527 punpcklbw m1, m7 | |
528 punpcklbw m2, m7 | |
529 punpcklbw m3, m7 | |
530 punpcklbw m4, m7 | |
531 | |
532 .nextrow | |
533 ; first calculate negative taps (to prevent losing positive overflows) | |
534 mova m5, m1 | |
535 pmullw m5, [r6+16] | |
536 mova m6, m4 | |
537 pmullw m6, [r6+64] | |
538 paddsw m6, m5 | |
539 | |
540 ; then calculate positive taps | |
541 movh m5, [r2+2*r3] ; read new row | |
542 punpcklbw m5, m7 | |
543 pmullw m0, [r6+0] | |
544 paddsw m6, m0 | |
545 mova m0, m1 | |
546 mova m1, m2 | |
547 pmullw m2, [r6+32] | |
548 paddsw m6, m2 | |
549 mova m2, m3 | |
550 pmullw m3, [r6+48] | |
551 paddsw m6, m3 | |
552 mova m3, m4 | |
553 mova m4, m5 | |
554 pmullw m5, [r6+80] | |
555 paddsw m6, m5 | |
556 | |
557 ; round/clip/store | |
558 paddsw m6, [pw_64] | |
559 psraw m6, 7 | |
560 packuswb m6, m7 | |
561 movh [r0], m6 | |
562 | |
563 ; go to next line | |
564 add r0, r1 | |
565 add r2, r3 | |
566 dec r4 ; next row | |
567 jg .nextrow | |
568 REP_RET | |
569 %endmacro | |
570 | |
571 INIT_MMX | |
572 FILTER_V mmxext, 4, 0 | |
573 INIT_XMM | |
574 FILTER_V sse2, 8, 8 | |
575 | |
; 8-pixel-wide block, V-only 4-tap filter using pmaddubsw.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r6 = filter index
; (scaled by 16; coefficient rows at offset-16 and offset).
; The horizontal byte-pair table is reused: rows are interleaved with
; punpcklbw as (oldest, newest) for the outer-tap row and as the two middle
; rows for the inner-tap row. m0..m2 hold the three previous source rows as
; bytes and are rotated each iteration.
576 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8 | |
577 shl r6d, 4 | |
578 %ifdef PIC | |
579 lea r11, [fourtap_filter_hb_m] | |
580 %endif | |
581 mova m5, [fourtap_filter_hb+r6-16] | |
582 mova m6, [fourtap_filter_hb+r6] | |
583 mova m7, [pw_64] | |
584 | |
585 ; read 3 lines | |
586 sub r2, r3 | |
587 movh m0, [r2] | |
588 movh m1, [r2+ r3] | |
589 movh m2, [r2+2*r3] | |
590 add r2, r3 | |
591 | |
592 .nextrow | |
593 movh m3, [r2+2*r3] ; read new row | |
594 mova m4, m0 | |
595 mova m0, m1 | |
596 punpcklbw m4, m3 | |
597 punpcklbw m1, m2 | |
598 pmaddubsw m4, m5 | |
599 pmaddubsw m1, m6 | |
600 paddsw m4, m1 | |
601 mova m1, m2 | |
602 paddsw m4, m7 | |
603 mova m2, m3 | |
604 psraw m4, 7 | |
605 packuswb m4, m4 | |
606 movh [r0], m4 | |
607 | |
608 ; go to next line | |
609 add r0, r1 | |
610 add r2, r3 | |
611 dec r4 ; next row | |
612 jg .nextrow | |
613 REP_RET | |
614 | |
; 8-pixel-wide block, V-only 6-tap filter using pmaddubsw.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter. r6 enters as the filter
; index and becomes a pointer (table + index*24); coefficient rows are read
; at [r6-48], [r6-32] and [r6-16].
; m0..m4 hold the five previous source rows as bytes. Each iteration
; interleaves them as (row0,row5), (row1,row2), (row3,row4) to match the
; row order of sixtap_filter_hb_m, then rotates the rows down by one.
615 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8 | |
616 lea r6d, [r6*3] | |
617 %ifdef PIC | |
618 lea r11, [sixtap_filter_hb_m] | |
619 %endif | |
620 lea r6, [sixtap_filter_hb+r6*8] | |
621 | |
622 ; read 5 lines | |
623 sub r2, r3 | |
624 sub r2, r3 | |
625 movh m0, [r2] | |
626 movh m1, [r2+r3] | |
627 movh m2, [r2+r3*2] | |
628 lea r2, [r2+r3*2] | |
629 add r2, r3 | |
630 movh m3, [r2] | |
631 movh m4, [r2+r3] | |
632 | |
633 .nextrow | |
634 movh m5, [r2+2*r3] ; read new row | |
635 mova m6, m0 | |
636 punpcklbw m6, m5 | |
637 mova m0, m1 | |
638 punpcklbw m1, m2 | |
639 mova m7, m3 | |
640 punpcklbw m7, m4 | |
641 pmaddubsw m6, [r6-48] | |
642 pmaddubsw m1, [r6-32] | |
643 pmaddubsw m7, [r6-16] | |
644 paddsw m6, m1 | |
645 paddsw m6, m7 | |
646 mova m1, m2 | |
647 paddsw m6, [pw_64] | |
648 mova m2, m3 | |
649 psraw m6, 7 | |
650 mova m3, m4 | |
651 packuswb m6, m6 | |
652 mova m4, m5 | |
653 movh [r0], m6 | |
654 | |
655 ; go to next line | |
656 add r0, r1 | |
657 add r2, r3 | |
658 dec r4 ; next row | |
659 jg .nextrow | |
660 REP_RET | |
661 | |
; FILTER_BILINEAR opt, width, nregs
; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
; The third argument is passed through as the last cglobal argument.
; Instantiated below for mmxext (width 4) and sse2 (width 8).
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter. The v variant takes its
; fraction f from r6 and uses r5 as scratch; the h variant takes f from r5
; and uses r6 as scratch.
; m4 = weight (8-f) and m5 = weight f, both from bilinear_filter_vw.
; Result per pixel: a*(8-f) + b*f, shifted right by 2, then pavgw against
; zero, which is (x+1)>>1. Two output rows are produced per iteration.
; NOTE(review): the row counter is updated as full-width r4 here, while the
; put_vp8_pixels* functions use r4d - confirm the upper half is clean on
; 64-bit targets.
11991 | 662 %macro FILTER_BILINEAR 3 |
663 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | |
664 mov r5d, 8*16 | |
665 shl r6d, 4 | |
666 sub r5d, r6d | |
667 %ifdef PIC | |
668 lea r11, [bilinear_filter_vw_m] | |
669 %endif | |
670 pxor m6, m6 | |
12000 | 671 mova m4, [bilinear_filter_vw+r5-16] |
672 mova m5, [bilinear_filter_vw+r6-16] | |
11991 | 673 .nextrow |
674 movh m0, [r2+r3*0] | |
675 movh m1, [r2+r3*1] | |
676 movh m3, [r2+r3*2] | |
677 punpcklbw m0, m6 | |
678 punpcklbw m1, m6 | |
679 punpcklbw m3, m6 | |
680 mova m2, m1 | |
681 pmullw m0, m4 | |
682 pmullw m1, m5 | |
683 pmullw m2, m4 | |
684 pmullw m3, m5 | |
685 paddsw m0, m1 | |
686 paddsw m2, m3 | |
687 psraw m0, 2 | |
688 psraw m2, 2 | |
689 pavgw m0, m6 | |
690 pavgw m2, m6 | |
691 %ifidn %1, mmxext | |
692 packuswb m0, m0 | |
693 packuswb m2, m2 | |
694 movh [r0+r1*0], m0 | |
695 movh [r0+r1*1], m2 | |
696 %else | |
697 packuswb m0, m2 | |
698 movh [r0+r1*0], m0 | |
699 movhps [r0+r1*1], m0 | |
700 %endif | |
701 | |
702 lea r0, [r0+r1*2] | |
703 lea r2, [r2+r3*2] | |
704 sub r4, 2 | |
705 jg .nextrow | |
706 REP_RET | |
707 | |
708 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | |
709 mov r6d, 8*16 | |
710 shl r5d, 4 | |
711 sub r6d, r5d | |
712 %ifdef PIC | |
713 lea r11, [bilinear_filter_vw_m] | |
714 %endif | |
715 pxor m6, m6 | |
12000 | 716 mova m4, [bilinear_filter_vw+r6-16] |
717 mova m5, [bilinear_filter_vw+r5-16] | |
11991 | 718 .nextrow |
719 movh m0, [r2+r3*0+0] | |
720 movh m1, [r2+r3*0+1] | |
721 movh m2, [r2+r3*1+0] | |
722 movh m3, [r2+r3*1+1] | |
723 punpcklbw m0, m6 | |
724 punpcklbw m1, m6 | |
725 punpcklbw m2, m6 | |
726 punpcklbw m3, m6 | |
727 pmullw m0, m4 | |
728 pmullw m1, m5 | |
729 pmullw m2, m4 | |
730 pmullw m3, m5 | |
731 paddsw m0, m1 | |
732 paddsw m2, m3 | |
733 psraw m0, 2 | |
734 psraw m2, 2 | |
735 pavgw m0, m6 | |
736 pavgw m2, m6 | |
737 %ifidn %1, mmxext | |
738 packuswb m0, m0 | |
739 packuswb m2, m2 | |
740 movh [r0+r1*0], m0 | |
741 movh [r0+r1*1], m2 | |
742 %else | |
743 packuswb m0, m2 | |
744 movh [r0+r1*0], m0 | |
745 movhps [r0+r1*1], m0 | |
746 %endif | |
747 | |
748 lea r0, [r0+r1*2] | |
749 lea r2, [r2+r3*2] | |
750 sub r4, 2 | |
751 jg .nextrow | |
752 REP_RET | |
753 %endmacro | |
754 | |
755 INIT_MMX | |
756 FILTER_BILINEAR mmxext, 4, 0 | |
757 INIT_XMM | |
758 FILTER_BILINEAR sse2, 8, 7 | |
759 | |
; 8-pixel-wide vertical bilinear filter using pmaddubsw.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r6 = fraction f
; (scaled by 16 to select the (8-f, f) byte-pair row of bilinear_filter_vb).
; Two output rows per iteration: rows 0/1 and rows 1/2 are byte-interleaved
; so one pmaddubsw yields a*(8-f) + b*f; the sum is then shifted right by 2
; and averaged against zero with pavgw, which is (x+1)>>1.
760 cglobal put_vp8_bilinear8_v_ssse3, 7,7,5 | |
761 shl r6d, 4 | |
762 %ifdef PIC | |
763 lea r11, [bilinear_filter_vb_m] | |
764 %endif | |
765 pxor m4, m4 | |
12000 | 766 mova m3, [bilinear_filter_vb+r6-16] |
11991 | 767 .nextrow |
768 movh m0, [r2+r3*0] | |
769 movh m1, [r2+r3*1] | |
770 movh m2, [r2+r3*2] | |
771 punpcklbw m0, m1 | |
772 punpcklbw m1, m2 | |
773 pmaddubsw m0, m3 | |
774 pmaddubsw m1, m3 | |
775 psraw m0, 2 | |
776 psraw m1, 2 | |
777 pavgw m0, m4 | |
778 pavgw m1, m4 | |
779 packuswb m0, m1 | |
780 movh [r0+r1*0], m0 | |
781 movhps [r0+r1*1], m0 | |
782 | |
783 lea r0, [r0+r1*2] | |
784 lea r2, [r2+r3*2] | |
785 sub r4, 2 | |
786 jg .nextrow | |
787 REP_RET | |
788 | |
; 8-pixel-wide horizontal bilinear filter using pshufb + pmaddubsw.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4 = row counter, r5 = fraction f
; (scaled by 16 to select the (8-f, f) byte-pair row of bilinear_filter_vb).
; filter_h2_shuf turns each 16-byte unaligned load into adjacent byte pairs
; (i, i+1). Two output rows are produced per iteration, with the same
; >>2 followed by pavgw-against-zero rounding as the vertical variant.
789 cglobal put_vp8_bilinear8_h_ssse3, 7,7,5 | |
790 shl r5d, 4 | |
791 %ifdef PIC | |
792 lea r11, [bilinear_filter_vb_m] | |
793 %endif | |
794 pxor m4, m4 | |
795 mova m2, [filter_h2_shuf] | |
12000 | 796 mova m3, [bilinear_filter_vb+r5-16] |
11991 | 797 .nextrow |
798 movu m0, [r2+r3*0] | |
799 movu m1, [r2+r3*1] | |
800 pshufb m0, m2 | |
801 pshufb m1, m2 | |
802 pmaddubsw m0, m3 | |
803 pmaddubsw m1, m3 | |
804 psraw m0, 2 | |
805 psraw m1, 2 | |
806 pavgw m0, m4 | |
807 pavgw m1, m4 | |
808 packuswb m0, m1 | |
809 movh [r0+r1*0], m0 | |
810 movhps [r0+r1*1], m0 | |
811 | |
812 lea r0, [r0+r1*2] | |
813 lea r2, [r2+r3*2] | |
814 sub r4, 2 | |
815 jg .nextrow | |
816 REP_RET | |
817 | |
; Plain copy, 8 bytes per row, two rows per iteration.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4d = row counter (decremented by 2, so
; an odd count would copy one extra row).
11992 | 818 cglobal put_vp8_pixels8_mmx, 5,5 |
819 .nextrow: | |
820 movq mm0, [r2+r3*0] | |
821 movq mm1, [r2+r3*1] | |
822 lea r2, [r2+r3*2] | |
823 movq [r0+r1*0], mm0 | |
824 movq [r0+r1*1], mm1 | |
825 lea r0, [r0+r1*2] | |
826 sub r4d, 2 | |
827 jg .nextrow | |
828 REP_RET | |
829 | |
; Plain copy, 16 bytes per row as two 8-byte moves, two rows per iteration.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4d = row counter (decremented by 2).
830 cglobal put_vp8_pixels16_mmx, 5,5 | |
831 .nextrow: | |
832 movq mm0, [r2+r3*0+0] | |
833 movq mm1, [r2+r3*0+8] | |
834 movq mm2, [r2+r3*1+0] | |
835 movq mm3, [r2+r3*1+8] | |
836 lea r2, [r2+r3*2] | |
837 movq [r0+r1*0+0], mm0 | |
838 movq [r0+r1*0+8], mm1 | |
839 movq [r0+r1*1+0], mm2 | |
840 movq [r0+r1*1+8], mm3 | |
841 lea r0, [r0+r1*2] | |
842 sub r4d, 2 | |
843 jg .nextrow | |
844 REP_RET | |
845 | |
; Plain copy, 16 bytes per row, two rows per iteration.
; Register use: r0 = store pointer, r1 = store step per row, r2 = load
; pointer, r3 = load step per row, r4d = row counter (decremented by 2).
; Loads are unaligned (movups) but stores are aligned (movaps), so both r0
; and r0+r1 must be 16-byte aligned.
846 cglobal put_vp8_pixels16_sse, 5,5,2 | |
847 .nextrow: | |
848 movups xmm0, [r2+r3*0] | |
849 movups xmm1, [r2+r3*1] | |
850 lea r2, [r2+r3*2] | |
851 movaps [r0+r1*0], xmm0 | |
852 movaps [r0+r1*1], xmm1 | |
853 lea r0, [r0+r1*2] | |
854 sub r4d, 2 | |
855 jg .nextrow | |
856 REP_RET | |
857 | |
11975 | 858 ;----------------------------------------------------------------------------- |
859 ; IDCT functions: | |
860 ; | |
861 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
862 ;----------------------------------------------------------------------------- | |
863 | |
; DC-only inverse transform added to a 4x4 block of bytes.
; Register use: r0 = pixel pointer (read and written), r1 = coefficient
; pointer (only the first word is used for the result; r1 is reused as the
; pointer to row 2 afterwards), r2 = row step.
; dc = (coeff[0] + 4) >> 3. Because the pixels are unsigned bytes, the signed
; dc is split into two saturated byte values: mm0 = max(dc, 0) and
; mm1 = max(-dc, 0), each broadcast to the low four bytes. Every row is then
; updated with paddusb mm0 followed by psubusb mm1.
864 cglobal vp8_idct_dc_add_mmx, 3, 3 | |
865 ; load data | |
866 movd mm0, [r1] | |
867 | |
868 ; calculate DC | |
869 paddw mm0, [pw_4] | |
870 pxor mm1, mm1 | |
871 psraw mm0, 3 | |
872 psubw mm1, mm0 | |
873 packuswb mm0, mm0 | |
874 packuswb mm1, mm1 | |
875 punpcklbw mm0, mm0 | |
876 punpcklbw mm1, mm1 | |
877 punpcklwd mm0, mm0 | |
878 punpcklwd mm1, mm1 | |
879 | |
880 ; add DC | |
881 lea r1, [r0+r2*2] | |
882 movd mm2, [r0] | |
883 movd mm3, [r0+r2] | |
884 movd mm4, [r1] | |
885 movd mm5, [r1+r2] | |
886 paddusb mm2, mm0 | |
887 paddusb mm3, mm0 | |
888 paddusb mm4, mm0 | |
889 paddusb mm5, mm0 | |
890 psubusb mm2, mm1 | |
891 psubusb mm3, mm1 | |
892 psubusb mm4, mm1 | |
893 psubusb mm5, mm1 | |
894 movd [r0], mm2 | |
895 movd [r0+r2], mm3 | |
896 movd [r1], mm4 | |
897 movd [r1+r2], mm5 | |
898 RET | |
899 | |
; DC-only inverse transform added to a 4x4 block of bytes, SSE4 variant.
; Register use: r0 = pixel pointer (read and written), r1 = coefficient
; pointer (reused as the pointer to row 2 right after the load), r2 = row
; step.
; dc = (coeff[0] + 4) >> 3 is broadcast to all eight words of xmm0. The four
; 4-byte rows are gathered pairwise into xmm2/xmm4 and widened to words; the
; signed dc is added with paddw, and packuswb clips back to 0..255. The rows
; are written back with movd and pextrd.
900 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | |
901 ; load data | |
902 movd xmm0, [r1] | |
903 lea r1, [r0+r2*2] | |
904 pxor xmm1, xmm1 | |
905 movq xmm2, [pw_4] | |
906 | |
907 ; calculate DC | |
908 paddw xmm0, xmm2 | |
909 movd xmm2, [r0] | |
910 movd xmm3, [r0+r2] | |
911 movd xmm4, [r1] | |
912 movd xmm5, [r1+r2] | |
913 psraw xmm0, 3 | |
914 pshuflw xmm0, xmm0, 0 | |
915 punpcklqdq xmm0, xmm0 | |
916 punpckldq xmm2, xmm3 | |
917 punpckldq xmm4, xmm5 | |
918 punpcklbw xmm2, xmm1 | |
919 punpcklbw xmm4, xmm1 | |
920 paddw xmm2, xmm0 | |
921 paddw xmm4, xmm0 | |
922 packuswb xmm2, xmm4 | |
923 movd [r0], xmm2 | |
924 pextrd [r0+r2], xmm2, 1 | |
925 pextrd [r1], xmm2, 2 | |
926 pextrd [r1+r2], xmm2, 3 | |
927 RET | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
928 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
929 ;----------------------------------------------------------------------------- |
12013 | 930 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
931 ;----------------------------------------------------------------------------- | |
932 | |
933 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) | |
934 ; this macro assumes that m6/m7 have words for 20091/17734 loaded | |
; %3/%4 are temporaries and are overwritten.
; mul_20091(x) is formed as pmulhw(x, 20091) + x, i.e. the high half of the
; product plus the operand itself.
; mul_35468(x) is formed as pmulhw(2*x, 17734): the operand is doubled first
; and multiplied by half the constant (17734 = 35468/2), since 35468 does not
; fit in a signed 16-bit word.
935 %macro VP8_MULTIPLY_SUMSUB 4 | |
936 mova %3, %1 | |
937 mova %4, %2 | |
938 pmulhw %3, m6 ;20091(1) | |
939 pmulhw %4, m6 ;20091(2) | |
940 paddw %3, %1 | |
941 paddw %4, %2 | |
12018 | 942 paddw %1, %1 |
943 paddw %2, %2 | |
12013 | 944 pmulhw %1, m7 ;35468(1) |
945 pmulhw %2, m7 ;35468(2) | |
946 psubw %1, %4 | |
947 paddw %2, %3 | |
948 %endmacro | |
949 | |
950 ; calculate x0=%1+%3; x1=%1-%3 | |
951 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) | |
952 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) | |
953 ; %5/%6 are temporary registers | |
954 ; we assume m6/m7 have constant words 20091/17734 loaded in them | |
; Arguments are register numbers, not names. The two trailing SWAPs only
; rename registers so the results end up under the numbers listed above.
; NOTE(review): SUMSUB_BA and SWAP come from the included x86 helper files;
; their exact operand semantics are not visible here.
955 %macro VP8_IDCT_TRANSFORM4x4_1D 6 | |
956 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 | |
957 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 | |
958 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 | |
959 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 | |
960 SWAP %4, %1 | |
961 SWAP %4, %3 | |
962 %endmacro | |
963 | |
; Full 4x4 inverse transform, result added to the pixels.
; Register use: r0 = pixel pointer, r1 = coefficient pointer (16 words read
; as four 8-byte rows; reused afterwards as the pointer to pixel row 2),
; r2 = pixel row step.
; Two 1-D passes with a transpose after each. The rounding constant 4 is
; added to m0 only, between the passes; the final shift by 3 is the literal
; passed to STORE_DIFFx2.
; NOTE(review): TRANSPOSE4x4W and STORE_DIFFx2 come from the included x86
; helper files; presumably STORE_DIFFx2 shifts, adds to the pixels and stores
; with clipping - confirm against x86util.asm.
964 INIT_MMX | |
965 cglobal vp8_idct_add_mmx, 3, 3 | |
966 ; load block data | |
967 movq m0, [r1] | |
968 movq m1, [r1+8] | |
969 movq m2, [r1+16] | |
970 movq m3, [r1+24] | |
971 movq m6, [pw_20091] | |
972 movq m7, [pw_17734] | |
973 | |
974 ; actual IDCT | |
975 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
976 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
977 paddw m0, [pw_4] | |
978 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
979 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
980 | |
981 ; store | |
982 pxor m4, m4 | |
983 lea r1, [r0+2*r2] | |
984 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 | |
985 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | |
986 | |
987 RET | |
988 | |
989 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
990 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
991 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
992 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; SCATTER_WHT n: extract word n from each of m0, m1, m2, m3 and store the
; four 16-bit values at r0 + 0, 32, 64 and 96 bytes (2*16*k for k = 0..3).
; r1 and r2 are used as scratch and are overwritten.
993 %macro SCATTER_WHT 1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
994 pextrw r1d, m0, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
995 pextrw r2d, m1, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
996 mov [r0+2*16*0], r1w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
997 mov [r0+2*16*1], r2w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
998 pextrw r1d, m2, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
999 pextrw r2d, m3, %1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1000 mov [r0+2*16*2], r1w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1001 mov [r0+2*16*3], r2w |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1002 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1003 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; HADAMARD4_1D a, b, c, d: one 1-D pass of a 4-point Hadamard transform over
; registers m<a>..m<d> (arguments are register numbers). Built from two
; SUMSUB_BADC steps followed by a SWAP that renames registers.
; NOTE(review): SUMSUB_BADC and SWAP come from the included x86 helper
; files; the resulting output order is not visible here.
1004 %macro HADAMARD4_1D 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1005 SUMSUB_BADC m%2, m%1, m%4, m%3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1006 SUMSUB_BADC m%4, m%2, m%3, m%1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1007 SWAP %1, %4, %3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1008 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1009 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; Inverse Walsh-Hadamard transform of the 16 luma DC values.
; Register use: r0 = output pointer, r1 = input pointer (16 words read as
; four 8-byte rows), r2 = scratch. r1 and r2 are clobbered by SCATTER_WHT.
; Sequence: 1-D Hadamard, transpose, add the rounding constant 3 to m0 only,
; second 1-D Hadamard, arithmetic shift right by 3.
; Each SCATTER_WHT n writes word n of m0..m3 to four slots 32 bytes apart;
; r0 then advances by 128 bytes (2*16*4) before the next word index.
1010 INIT_MMX |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1011 cglobal vp8_luma_dc_wht_mmxext, 2,3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1012 movq m0, [r1] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1013 movq m1, [r1+8] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1014 movq m2, [r1+16] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1015 movq m3, [r1+24] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1016 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1017 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1018 paddw m0, [pw_3] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1019 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1020 psraw m0, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1021 psraw m1, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1022 psraw m2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1023 psraw m3, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1024 SCATTER_WHT 0 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1025 add r0, 2*16*4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1026 SCATTER_WHT 1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1027 add r0, 2*16*4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1028 SCATTER_WHT 2 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1029 add r0, 2*16*4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1030 SCATTER_WHT 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1031 RET |