comparison x86/vp8dsp.asm @ 11975:c3afb5be0d9b libavcodec

First shot at VP8 optimizations: - MMXEXT, SSE2 and SSSE3 MC functions - MMX and SSE4 IDCT dc_add functions Patch by Jason Garrett-Glaser <darkshikari gmail com> and myself.
author rbultje
date Sun, 27 Jun 2010 02:01:45 +0000
parents
children a6d24fc1deb7
comparison
equal deleted inserted replaced
11974:356b20a6566d 11975:c3afb5be0d9b
1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5 ;*
6 ;* This file is part of FFmpeg.
7 ;*
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
22
23 %include "x86inc.asm"
24
25 SECTION_RODATA
26
; 4-tap filter coefficients as 16-bit words, laid out for pmaddwd.
; Each row is 16 bytes (one word pair repeated four times) and every filter
; occupies two consecutive rows: (F0, F1) pairs followed by (F2, F3) pairs.
; The H-only word filters read the rows at byte offsets mx*16-16 and mx*16.
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1

                     times 4 dw  -9,  93
                     times 4 dw  50,  -6

                     times 4 dw  -6,  50
                     times 4 dw  93,  -9

                     times 4 dw  -1,  12
                     times 4 dw 123,  -6
35
; 6-tap filter coefficients as 16-bit words, laid out for pmaddwd.
; Each row is 16 bytes; every filter occupies three consecutive rows:
; (F0, F1) pairs, (F2, F3) pairs, (F4, F5) pairs.
; The H-only word filters read the rows at byte offsets mx*24-48, -32 and -16.
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1

                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3

                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2
45
; 4-tap filter coefficients as signed bytes, laid out for pmaddubsw.
; Each row is 16 bytes (one byte pair repeated eight times) and every filter
; occupies two consecutive rows: outer taps (F0, F3), then inner taps (F1, F2).
; The SSSE3 4-tap filters read the rows at byte offsets m*16-16 and m*16.
fourtap_filter_hb_m: times 8 db  -6,  -1
                     times 8 db 123,  12

                     times 8 db  -9,  -6
                     times 8 db  93,  50

                     times 8 db  -6,  -9
                     times 8 db  50,  93

                     times 8 db  -1,  -6
                     times 8 db  12, 123
54
; 6-tap filter coefficients as signed bytes, laid out for pmaddubsw.
; Each row is 16 bytes; every filter occupies three consecutive rows:
; (F0, F5), (F1, F2), (F3, F4).
; The SSSE3 6-tap filters read the rows at byte offsets m*24-48, -32 and -16.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8

                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16

                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11
64
; 4-tap filter coefficients for the vertical pmullw filters: one tap per
; 16-byte row (the tap broadcast to eight words), four rows F0..F3 per filter.
; FILTER_V addresses a filter at byte offset my*32-32.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1

                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6

                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9

                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6
81
; 6-tap filter coefficients for the vertical pmullw filters: one tap per
; 16-byte row (the tap broadcast to eight words), six rows F0..F5 per filter.
; FILTER_V addresses a filter at byte offset my*48-96.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1

                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3

                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2
100
; Base operand used when indexing the filter tables.
; Without PIC the table symbol itself is used in the effective address.
; With PIC every name maps to r11; each function loads r11 with the address
; of the one table it needs (lea r11, [<table>_m]) before the first access.
%ifndef PIC
    %define fourtap_filter_hw   fourtap_filter_hw_m
    %define sixtap_filter_hw    sixtap_filter_hw_m
    %define fourtap_filter_hb   fourtap_filter_hb_m
    %define sixtap_filter_hb    sixtap_filter_hb_m
    %define fourtap_filter_v    fourtap_filter_v_m
    %define sixtap_filter_v     sixtap_filter_v_m
%else
    %define fourtap_filter_hw   r11
    %define sixtap_filter_hw    r11
    %define fourtap_filter_hb   r11
    %define sixtap_filter_hb    r11
    %define fourtap_filter_v    r11
    %define sixtap_filter_v     r11
%endif
116
; pshufb control masks used by the SSSE3 horizontal filters. Each mask turns
; 16 loaded source bytes into eight byte pairs, one pair per output pixel,
; matching the pair layout of the *_filter_hb_m coefficient rows.

; 4-tap: pairs (p[i], p[i+3]) and (p[i+1], p[i+2])
filter_v4_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4,  7,  5,  8,  6,  9,  7, 10
filter_v4_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5,  6,  6,  7,  7,  8,  8,  9

; 6-tap: pairs (p[i], p[i+5]), (p[i+1], p[i+2]) and (p[i+3], p[i+4])
filter_v6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4,  9,  5, 10,  6, 11,  7, 12
filter_v6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5,  6,  6,  7,  7,  8,  8,  9
filter_v6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7,  8,  8,  9,  9, 10, 10, 11
123
124 cextern pw_4
125 cextern pw_64
126
127 SECTION .text
128
129 ;-----------------------------------------------------------------------------
130 ; subpel MC functions:
131 ;
132 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
133 ; uint8_t *src, int srcstride,
134 ; int height, int mx, int my);
135 ;-----------------------------------------------------------------------------
136
; 4-pixel-wide block, H-only 4-tap filter (MMXEXT)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx.
; Each output pixel is (F0*p[-1] + F1*p[0] + F2*p[1] + F3*p[2] + 64) >> 7,
; clipped to 0..255.
; NOTE(review): the strides are declared int but r1/r3 are added as full-width
; registers; confirm callers guarantee sign-extended values on x86-64.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                        ; mx*16: byte offset of the (F2,F3) row
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; (F0,F1) word pairs
    movq      mm5, [fourtap_filter_hw+r5]    ; (F2,F3) word pairs
    movq      mm7, [pw_64]                   ; rounding constant
    pxor      mm6, mm6                       ; zero, for byte->word unpacking

.nextrow:
    movq      mm1, [r2-1]                   ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                      ; byte ABCD..
    punpcklbw mm1, mm6                      ; byte->word ABCD
    pshufw    mm0, mm2, 9                   ; byte CDEF..
    punpcklbw mm0, mm6                      ; byte->word CDEF
    pshufw    mm3, mm1, 0x94                ; word ABBC
    pshufw    mm1, mm0, 0x94                ; word CDDE
    pmaddwd   mm3, mm4                      ; multiply 2px with F0/F1
    movq      mm0, mm1                      ; backup for second set of pixels
    pmaddwd   mm1, mm5                      ; multiply 2px with F2/F3
    paddd     mm3, mm1                      ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                      ; byte->word EFGH
    pmaddwd   mm0, mm4                      ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94                ; word EFFG
    pmaddwd   mm1, mm5                      ; multiply 2px with F2/F3
    paddd     mm0, mm1                      ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                      ; merge dword->word (4px)
    paddsw    mm3, mm7                      ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                      ; clip and word->bytes
    movd      [r0], mm3                     ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
183
; 4-pixel-wide block, H-only 6-tap filter (MMXEXT)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx.
; Each output pixel is (F0*p[-2] + ... + F5*p[3] + 64) >> 7, clipped to 0..255.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                   ; mx*3; scaled by 8 below -> 24 bytes per mx step
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words: (F0,F1)
    movq      mm5, [sixtap_filter_hw+r5*8-32] ; (F2,F3)
    movq      mm6, [sixtap_filter_hw+r5*8-16] ; (F4,F5)
    movq      mm7, [pw_64]                    ; rounding constant
    pxor      mm3, mm3                        ; zero; mm3 doubles as scratch and is re-zeroed below

.nextrow:
    movq      mm1, [r2-2]                   ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                      ; byte ABCD..
    punpcklbw mm1, mm3                      ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                 ; byte CDEF..
    punpckhbw mm2, mm3                      ; byte->word EFGH
    punpcklbw mm0, mm3                      ; byte->word CDEF
    pshufw    mm1, mm1, 0x94                ; word ABBC
    pshufw    mm2, mm2, 0x94                ; word EFFG
    pmaddwd   mm1, mm4                      ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94                ; word CDDE
    movq      mm0, mm3                      ; backup for second set of pixels
    pmaddwd   mm3, mm5                      ; multiply 2px with F2/F3
    paddd     mm1, mm3                      ; add to 1st 2px cache
    movq      mm3, mm2                      ; backup for second set of pixels
    pmaddwd   mm2, mm6                      ; multiply 2px with F4/F5
    paddd     mm1, mm2                      ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                   ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                      ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                      ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                      ; add to 2nd 2px cache
    pxor      mm3, mm3                      ; mm3 is zero again from here to the loop top
    punpcklbw mm2, mm3                      ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9                ; word GHHI
    pmaddwd   mm2, mm6                      ; multiply 2px with F4/F5
    paddd     mm0, mm2                      ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                      ; merge dword->word (4px)
    paddsw    mm1, mm7                      ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                      ; clip and word->bytes
    movd      [r0], mm1                     ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
240
; 8-pixel-wide block, H-only 4-tap filter (SSE2)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx.
; Each row is computed as two independent groups of 4 output pixels.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl       r5d, 4                        ; mx*16: byte offset of the (F2,F3) row
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words: (F0,F1)
    mova      m6, [fourtap_filter_hw+r5]    ; (F2,F3)
    pxor      m7, m7                        ; zero, for unpacking and the final pack

.nextrow:
    ; output pixels 0-3
    movh      m0, [r2-1]
    punpcklbw m0, m7                        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                         ; BCDEFGH
    psrldq    m2, 4                         ; CDEFGH
    psrldq    m3, 6                         ; DEFGH
    punpcklwd m0, m1                        ; ABBCCDDE
    punpcklwd m2, m3                        ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; output pixels 4-7: same scheme on the 8 bytes starting 4 pixels later
    movh      m1, [r2+3]
    punpcklbw m1, m7                        ; EFGHIJKL
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2                         ; FGHIJKL
    psrldq    m3, 4                         ; GHIJKL
    psrldq    m4, 6                         ; HIJKL
    punpcklwd m1, m2                        ; EFFGGHHI
    punpcklwd m3, m4                        ; GHHIIJJK
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    ; merge, round, clip, store
    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh      [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
293
; 8-pixel-wide block, H-only 6-tap filter (SSE2)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx.
; One unaligned 16-byte load per row feeds two groups of 4 output pixels.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]   ; r5-48/-32/-16 = (F0,F1)/(F2,F3)/(F4,F5) rows
    pxor      m7, m7                        ; zero, for unpacking and the final pack

.nextrow:
    ; output pixels 0-3
    movu      m0, [r2-2]                    ; 16 source bytes A..P
    mova      m6, m0                        ; kept for the second group
    mova      m4, m0
    punpcklbw m0, m7                        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                         ; BCDEFGH
    psrldq    m2, 4                         ; CDEFGH
    psrldq    m3, 6                         ; DEFGH
    psrldq    m4, 4                         ; bytes E..
    punpcklbw m4, m7                        ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2                         ; FGHIJKL
    punpcklwd m0, m1                        ; ABBCCDDE
    punpcklwd m2, m3                        ; CDDEEFFG
    punpcklwd m4, m5                        ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4

    ; output pixels 4-7: same scheme, source shifted by 4 bytes
    psrldq    m6, 4                         ; bytes E..
    mova      m4, m6
    punpcklbw m6, m7                        ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2                         ; FGHIJKL
    psrldq    m2, 4                         ; GHIJKL
    psrldq    m3, 6                         ; HIJKL
    psrldq    m4, 4                         ; bytes I..
    punpcklbw m4, m7                        ; IJKLMNOP
    mova      m5, m4
    psrldq    m5, 2                         ; JKLMNOP
    punpcklwd m6, m1                        ; EFFGGHHI
    punpcklwd m2, m3                        ; GHHIIJJK
    punpcklwd m4, m5                        ; IJJKKLLM
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    ; merge, round, clip, store
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh      [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
360
; 8-pixel-wide block, H-only 4-tap filter (SSSE3)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx.
;
; The coefficient table stores the inner taps as one byte pair (F1, F2).
; Feeding that pair to a single pmaddubsw can exceed the signed 16-bit range
; (e.g. 255*123 + 255*12 = 34425), and the instruction saturates the pair sum
; before the negative outer taps are added, giving too-dark output on bright
; pixels. The pair is therefore split into (F1, 0) and (0, F2) so every
; pmaddubsw result fits, and the products are accumulated with paddsw
; starting from the (negative) outer-tap sum.
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 8
    shl       r5d, 4                        ; mx*16: byte offset of the (F1,F2) row
    mova      m3, [filter_v4_shuf1]         ; selects pairs (p[-1], p[2])
    mova      m4, [filter_v4_shuf2]         ; selects pairs (p[0],  p[1])
%ifdef PIC
    lea       r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; bytes (F0, F3)
    mova      m6, [fourtap_filter_hb+r5]    ; bytes (F1, F2)
    mova      m7, m6
    psllw     m6, 8
    psrlw     m7, 8
    psrlw     m6, 8                         ; bytes (F1, 0)
    psllw     m7, 8                         ; bytes (0, F2)

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    mova      m2, m1
    pmaddubsw m0, m5                        ; F0*p[-1] + F3*p[2]
    pmaddubsw m1, m6                        ; F1*p[0]
    pmaddubsw m2, m7                        ; F2*p[1]
    paddsw    m0, m1
    paddsw    m0, m2                        ; saturates only if the true sum is > 32767 (clips to 255 anyway)
    paddsw    m0, [pw_64]                   ; rounding
    psraw     m0, 7
    packuswb  m0, m0
    movh      [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
391
; 8-pixel-wide block, H-only 6-tap filter (SSSE3)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx.
; pshufb builds the byte pairs (p[-2],p[3]), (p[-1],p[0]) and (p[1],p[2]),
; which pmaddubsw multiplies with the matching coefficient pairs.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea       r5d, [r5*3]                   ; mx*3; scaled by 8 below -> 24 bytes per mx step
    mova      m3, [filter_v6_shuf1]
    mova      m4, [filter_v6_shuf2]
%ifdef PIC
    lea       r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes: (F0,F5)
    mova      m6, [sixtap_filter_hb+r5*8-32] ; (F1,F2)
    mova      m7, [sixtap_filter_hb+r5*8-16] ; (F3,F4)

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pshufb    m2, [filter_v6_shuf3]         ; no register left to cache this mask
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                   ; rounding
    psraw     m0, 7
    packuswb  m0, m0
    movh      [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
426
; FILTER_V <opt suffix>, <block width in pixels>, <number of XMM registers used>
; Emits the V-only 4-tap and 6-tap filters for one instruction set. The block
; width follows from the register size selected by INIT_MMX / INIT_XMM before
; the macro is invoked (movh moves half a register per row).
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r6 = my (r5 = mx is not used).
%macro FILTER_V 3
; V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl       r6d, 5                        ; my*32
%ifdef PIC
    lea       r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]  ; r6+0/16/32/48 = F0/F1/F2/F3 rows
    mova      m6, [pw_64]                   ; rounding constant
    pxor      m7, m7                        ; zero, for unpacking and the final pack
    mova      m5, [r6+48]                   ; F3

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                 ; read new row
    punpcklbw m4, m7
    mova      m3, m4                        ; keep the new row for the next iteration
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps, rotating the row registers as we go
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh      [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET


; V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl       r6d, 4
    lea       r6, [r6*3]                    ; my*48
%ifdef PIC
    lea       r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]   ; r6+0..r6+80 = F0..F5 rows
    pxor      m7, m7                        ; zero, for unpacking and the final pack

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps, rotating the row registers as we go
    movh      m5, [r2+2*r3]                 ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh      [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
550
; 8-pixel-wide block, V-only 4-tap filter (SSSE3)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r6 = my (r5 = mx is not used).
;
; The coefficient table stores the inner taps as one byte pair (F1, F2).
; Feeding that pair to a single pmaddubsw can exceed the signed 16-bit range
; (e.g. 255*123 + 255*12 = 34425), and the instruction saturates the pair sum
; before the negative outer taps are added. The pair is therefore split into
; (F1, 0) and (0, F2) so every pmaddubsw result fits, and the products are
; accumulated with paddsw starting from the (negative) outer-tap sum.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl       r6d, 4                        ; my*16: byte offset of the (F1,F2) row
%ifdef PIC
    lea       r11, [fourtap_filter_hb_m]
%endif
    lea       r6, [fourtap_filter_hb+r6]    ; [r6-16] = (F0,F3) row, [r6] = (F1,F2) row
    mova      m6, [r6]                      ; bytes (F1, F2)
    mova      m7, m6
    psllw     m6, 8
    psrlw     m7, 8
    psrlw     m6, 8                         ; bytes (F1, 0)
    psllw     m7, 8                         ; bytes (0, F2)

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                 ; read new row
    mova      m4, m0
    mova      m0, m1                        ; rotate: next iteration's first row
    punpcklbw m4, m3                        ; pairs (row[-1], row[2])
    punpcklbw m1, m2                        ; pairs (row[0],  row[1])
    pmaddubsw m4, [r6-16]                   ; F0*row[-1] + F3*row[2]
    mova      m5, m1
    pmaddubsw m1, m6                        ; F1*row[0]
    pmaddubsw m5, m7                        ; F2*row[1]
    paddsw    m4, m1
    mova      m1, m2                        ; rotate
    paddsw    m4, m5                        ; saturates only if the true sum is > 32767 (clips to 255 anyway)
    mova      m2, m3                        ; rotate
    paddsw    m4, [pw_64]                   ; rounding
    psraw     m4, 7
    packuswb  m4, m4
    movh      [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
589
; 8-pixel-wide block, V-only 6-tap filter (SSSE3)
; Per the prototype above: r0 = dst, r1 = deststride, r2 = src,
; r3 = srcstride, r4 = height, r6 = my (r5 = mx is not used).
; Rows are interleaved bytewise into the pairs (row[-2],row[3]),
; (row[-1],row[0]) and (row[1],row[2]) to match the coefficient byte pairs.
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea       r6d, [r6*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]   ; r6-48/-32/-16 = (F0,F5)/(F1,F2)/(F3,F4) rows

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                 ; read new row
    mova      m6, m0
    punpcklbw m6, m5                        ; pairs (row[-2], row[3])
    mova      m0, m1                        ; rotate: next iteration's first row
    punpcklbw m1, m2                        ; pairs (row[-1], row[0])
    mova      m7, m3
    punpcklbw m7, m4                        ; pairs (row[1],  row[2])
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                        ; rotate the remaining rows
    paddsw    m6, [pw_64]                   ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh      [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                           ; height is an int: only the low 32 bits are defined
    jg .nextrow
    REP_RET
636
637 ;-----------------------------------------------------------------------------
638 ; IDCT functions:
639 ;
640 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
641 ;-----------------------------------------------------------------------------
642
; DC-only IDCT + add, MMX version.
; Per the prototype above: r0 = dst, r1 = block, r2 = stride.
; Computes dc = (block[0] + 4) >> 3 and adds it to a 4x4 area of dst with
; unsigned saturation. The signed add is done in the byte domain as
; "add max(dc, 0), then subtract max(-dc, 0)": packuswb clamps whichever of
; dc / -dc is negative to zero.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load the coefficient and derive dc and -dc as words
    pxor      mm1, mm1
    movd      mm0, [r1]
    lea       r1, [r0+r2*2]                 ; block pointer no longer needed: r1 = dst + 2*stride
    paddw     mm0, [pw_4]
    psraw     mm0, 3                        ; word 0 = dc
    psubw     mm1, mm0                      ; word 0 = -dc

    ; broadcast max(dc, 0) to the low 4 bytes of mm0
    packuswb  mm0, mm0
    punpcklbw mm0, mm0
    punpcklwd mm0, mm0

    ; broadcast max(-dc, 0) to the low 4 bytes of mm1
    packuswb  mm1, mm1
    punpcklbw mm1, mm1
    punpcklwd mm1, mm1

    ; load the four destination rows
    movd      mm2, [r0]
    movd      mm3, [r0+r2]
    movd      mm4, [r1]
    movd      mm5, [r1+r2]

    ; apply the DC to each row
    paddusb   mm2, mm0
    psubusb   mm2, mm1
    paddusb   mm3, mm0
    psubusb   mm3, mm1
    paddusb   mm4, mm0
    psubusb   mm4, mm1
    paddusb   mm5, mm0
    psubusb   mm5, mm1

    ; write the rows back
    movd      [r0], mm2
    movd      [r0+r2], mm3
    movd      [r1], mm4
    movd      [r1+r2], mm5
    RET
678
; DC-only IDCT + add, SSE4 version.
; Per the prototype above: r0 = dst, r1 = block, r2 = stride.
; Computes dc = (block[0] + 4) >> 3, widens the 4x4 destination area to
; words, adds dc and packs back to bytes with unsigned saturation.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; dc = (block[0] + 4) >> 3, broadcast to all eight words of xmm0
    movq      xmm2, [pw_4]
    movd      xmm0, [r1]
    lea       r1, [r0+r2*2]                 ; block pointer no longer needed: r1 = dst + 2*stride
    paddw     xmm0, xmm2
    psraw     xmm0, 3
    pshuflw   xmm0, xmm0, 0
    punpcklqdq xmm0, xmm0

    ; gather the four 4-byte rows: rows 0/1 in xmm2, rows 2/3 in xmm4
    movd      xmm2, [r0]
    movd      xmm3, [r0+r2]
    movd      xmm4, [r1]
    movd      xmm5, [r1+r2]
    punpckldq xmm2, xmm3
    punpckldq xmm4, xmm5

    ; widen to words, add dc, clip back to bytes
    pxor      xmm1, xmm1
    punpcklbw xmm2, xmm1
    punpcklbw xmm4, xmm1
    paddw     xmm2, xmm0
    paddw     xmm4, xmm0
    packuswb  xmm2, xmm4                    ; dwords 0..3 = rows 0..3

    ; scatter the rows back to dst
    movd      [r0], xmm2
    pextrd    [r0+r2], xmm2, 1
    pextrd    [r1], xmm2, 2
    pextrd    [r1+r2], xmm2, 3
    RET