11975
|
1 ;******************************************************************************
|
|
2 ;* VP8 MMXEXT optimizations
|
|
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
|
|
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
|
|
5 ;*
|
|
6 ;* This file is part of FFmpeg.
|
|
7 ;*
|
|
8 ;* FFmpeg is free software; you can redistribute it and/or
|
|
9 ;* modify it under the terms of the GNU Lesser General Public
|
|
10 ;* License as published by the Free Software Foundation; either
|
|
11 ;* version 2.1 of the License, or (at your option) any later version.
|
|
12 ;*
|
|
13 ;* FFmpeg is distributed in the hope that it will be useful,
|
|
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
16 ;* Lesser General Public License for more details.
|
|
17 ;*
|
|
18 ;* You should have received a copy of the GNU Lesser General Public
|
|
19 ;* License along with FFmpeg; if not, write to the Free Software
|
|
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
21 ;******************************************************************************
|
|
22
|
|
23 %include "x86inc.asm"
|
|
24
|
|
25 SECTION_RODATA
|
|
26
|
|
; ---------------------------------------------------------------------------
; Subpel filter coefficient tables.  Every row below is exactly 16 bytes.
; ---------------------------------------------------------------------------

; 4-tap coefficients as signed words, interleaved in pairs for pmaddwd.
; Two rows per filter: (tap0, tap1) x4, then (tap2, tap3) x4.
fourtap_filter_hw_m:
    times 4 dw  -6, 123
    times 4 dw  12,  -1

    times 4 dw  -9,  93
    times 4 dw  50,  -6

    times 4 dw  -6,  50
    times 4 dw  93,  -9

    times 4 dw  -1,  12
    times 4 dw 123,  -6

; 6-tap coefficients as signed words, interleaved in pairs for pmaddwd.
; Three rows per filter: (tap0, tap1), (tap2, tap3), (tap4, tap5).
sixtap_filter_hw_m:
    times 4 dw   2, -11
    times 4 dw 108,  36
    times 4 dw  -8,   1

    times 4 dw   3, -16
    times 4 dw  77,  77
    times 4 dw -16,   3

    times 4 dw   1,  -8
    times 4 dw  36, 108
    times 4 dw -11,   2

; 4-tap coefficients as signed bytes, interleaved in pairs for pmaddubsw.
; Two rows per filter: (tap0, tap3) x8, then (tap1, tap2) x8.
fourtap_filter_hb_m:
    times 8 db  -6,  -1
    times 8 db 123,  12

    times 8 db  -9,  -6
    times 8 db  93,  50

    times 8 db  -6,  -9
    times 8 db  50,  93

    times 8 db  -1,  -6
    times 8 db  12, 123

; 6-tap coefficients as signed bytes, interleaved in pairs for pmaddubsw.
; Three rows per filter: (tap0, tap5), (tap1, tap2), (tap3, tap4).
sixtap_filter_hb_m:
    times 8 db   2,   1
    times 8 db -11, 108
    times 8 db  36,  -8

    times 8 db   3,   3
    times 8 db -16,  77
    times 8 db  77, -16

    times 8 db   1,   2
    times 8 db  -8,  36
    times 8 db 108, -11

; 4-tap coefficients for the vertical pmullw filters: each tap is broadcast
; to 8 words, four consecutive rows (tap0..tap3) per filter.
fourtap_filter_v_m:
    times 8 dw  -6
    times 8 dw 123
    times 8 dw  12
    times 8 dw  -1

    times 8 dw  -9
    times 8 dw  93
    times 8 dw  50
    times 8 dw  -6

    times 8 dw  -6
    times 8 dw  50
    times 8 dw  93
    times 8 dw  -9

    times 8 dw  -1
    times 8 dw  12
    times 8 dw 123
    times 8 dw  -6

; 6-tap coefficients for the vertical pmullw filters: each tap is broadcast
; to 8 words, six consecutive rows (tap0..tap5) per filter.
sixtap_filter_v_m:
    times 8 dw   2
    times 8 dw -11
    times 8 dw 108
    times 8 dw  36
    times 8 dw  -8
    times 8 dw   1

    times 8 dw   3
    times 8 dw -16
    times 8 dw  77
    times 8 dw  77
    times 8 dw -16
    times 8 dw   3

    times 8 dw   1
    times 8 dw  -8
    times 8 dw  36
    times 8 dw 108
    times 8 dw -11
    times 8 dw   2
|
|
100
|
|
; Base operand used when indexing the coefficient tables.
; Under PIC each function first loads the address of the table it needs
; into r11 (lea r11, [<table>_m]), so every alias resolves to r11.
; Otherwise the alias is simply the table's own label.
%ifdef PIC
    %define fourtap_filter_hw r11
    %define fourtap_filter_hb r11
    %define fourtap_filter_v  r11
    %define sixtap_filter_hw  r11
    %define sixtap_filter_hb  r11
    %define sixtap_filter_v   r11
%else
    %define fourtap_filter_hw fourtap_filter_hw_m
    %define fourtap_filter_hb fourtap_filter_hb_m
    %define fourtap_filter_v  fourtap_filter_v_m
    %define sixtap_filter_hw  sixtap_filter_hw_m
    %define sixtap_filter_hb  sixtap_filter_hb_m
    %define sixtap_filter_v   sixtap_filter_v_m
%endif
|
|
116
|
|
; pshufb masks that pair up source bytes for pmaddubsw.
; NOTE(review): despite the "_v" in the names, the visible users are the
; horizontal SSSE3 filters (put_vp8_epel8_h4/h6_ssse3).

; 4-tap: byte pairs (x, x+3) and (x+1, x+2) for x = 0..7
filter_v4_shuf1:
    db 0,  3,  1,  4,  2,  5,  3,  6,  4,  7,  5,  8,  6,  9,  7, 10
filter_v4_shuf2:
    db 1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9

; 6-tap: byte pairs (x, x+5), (x+1, x+2) and (x+3, x+4) for x = 0..7
filter_v6_shuf1:
    db 0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7, 12
filter_v6_shuf2:
    db 1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9
filter_v6_shuf3:
    db 3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11
|
|
123
|
|
124 cextern pw_4
|
|
125 cextern pw_64
|
|
126
|
|
127 SECTION .text
|
|
128
|
|
129 ;-----------------------------------------------------------------------------
|
|
130 ; subpel MC functions:
|
|
131 ;
|
|
132 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
|
|
133 ; uint8_t *src, int srcstride,
|
|
134 ; int height, int mx, int my);
|
|
135 ;-----------------------------------------------------------------------------
|
|
136
|
|
; 4-pixel-wide block, horizontal-only 4-tap filter (MMXEXT).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r5 = mx (filter selector).
; Reads 8 bytes per row starting at src-1, writes 4 bytes per row.
; mm4 = (F0,F1) pairs, mm5 = (F2,F3) pairs, mm6 = 0, mm7 = rounding constant.
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                       ; r5 = mx * 16 (table byte offset)
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
|
|
183
|
|
; 4-pixel-wide block, horizontal-only 6-tap filter (MMXEXT).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r5 = mx (filter selector).
; Reads src-2 .. src+6 per row, writes 4 bytes per row.
; mm4 = (F0,F1), mm5 = (F2,F3), mm6 = (F4,F5) pairs, mm7 = rounding constant.
; mm3 doubles as the zero register and as scratch, so it is re-zeroed
; inside the loop before it is needed as zero again.
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                  ; r5 = mx * 3 (scaled by 8 below)
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; mm3 is the zero register again
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
|
|
240
|
|
; 8-pixel-wide block, horizontal-only 4-tap filter (SSE2).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r5 = mx (filter selector).
; Each row is computed as two halves of 4 output pixels; pixel letters
; below name source bytes starting at src-1 (A = src[-1]).
; m5 = (F0,F1) pairs, m6 = (F2,F3) pairs, m7 = 0.
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                        ; r5 = mx * 16 (table byte offset)
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7

.nextrow:
    ; output pixels 0..3
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5        ; * F0/F1
    pmaddwd   m2, m6        ; * F2/F3
    paddd     m0, m2

    ; output pixels 4..7: same computation on the window 4 bytes further on
    movh      m1, [r2+3]
    punpcklbw m1, m7        ; EFGHIJKL
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; FGHIJKL
    psrldq    m3, 4         ; GHIJKL
    psrldq    m4, 6         ; HIJKL
    punpcklwd m1, m2        ; EFFGGHHI
    punpcklwd m3, m4        ; GHHIIJJK
    pmaddwd   m1, m5        ; * F0/F1
    pmaddwd   m3, m6        ; * F2/F3
    paddd     m1, m3

    ; merge both halves, round/clip/store
    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
|
|
293
|
|
; 8-pixel-wide block, horizontal-only 6-tap filter (SSE2).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r5 = mx (filter selector).
; After setup r5 points just past the selected filter's three coefficient
; rows, which are then read at r5-48 (F0/F1), r5-32 (F2/F3), r5-16 (F4/F5).
; One unaligned 16-byte load at src-2 feeds both 4-pixel halves; pixel
; letters below name those bytes (A = src[-2]).
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]                   ; r5 = mx * 3 (scaled by 8 below)
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7

.nextrow:
    ; output pixels 0..3
    movu      m0, [r2-2]
    mova      m6, m0        ; keep the raw bytes for the second half
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4         ; (bytes) drop ABCD
    punpcklbw m4, m7        ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2         ; FGHIJKL
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]   ; * F0/F1
    pmaddwd   m2, [r5-32]   ; * F2/F3
    pmaddwd   m4, [r5-16]   ; * F4/F5
    paddd     m0, m2
    paddd     m0, m4

    ; output pixels 4..7: same computation on the window 4 bytes further on
    psrldq    m6, 4         ; (bytes) EFGHIJKLMNOP
    mova      m4, m6
    punpcklbw m6, m7        ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; FGHIJKL
    psrldq    m2, 4         ; GHIJKL
    psrldq    m3, 6         ; HIJKL
    psrldq    m4, 4         ; (bytes) drop EFGH
    punpcklbw m4, m7        ; IJKLMNOP
    mova      m5, m4
    psrldq    m5, 2         ; JKLMNOP
    punpcklwd m6, m1        ; EFFGGHHI
    punpcklwd m2, m3        ; GHHIIJJK
    punpcklwd m4, m5        ; IJJKKLLM
    pmaddwd   m6, [r5-48]   ; * F0/F1
    pmaddwd   m2, [r5-32]   ; * F2/F3
    pmaddwd   m4, [r5-16]   ; * F4/F5
    paddd     m6, m2
    paddd     m6, m4

    ; merge both halves, round/clip/store
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
|
|
360
|
|
; 8-pixel-wide block, horizontal-only 4-tap filter (SSSE3).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r5 = mx (filter selector).
; m2 = rounding constant, m3/m4 = pshufb masks pairing source bytes,
; m5/m6 = the two byte-coefficient rows selected by mx.
; Performs one unaligned 16-byte load at src-1 per row.
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl      r5d, 4                        ; r5 = mx * 16 (table byte offset)
    mova      m2, [pw_64]
    mova      m3, [filter_v4_shuf1]
    mova      m4, [filter_v4_shuf2]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3        ; byte pairs (x, x+3)
    pshufb    m1, m4        ; byte pairs (x+1, x+2)
    pmaddubsw m0, m5        ; outer taps
    pmaddubsw m1, m6        ; inner taps
    paddsw    m0, m2        ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0        ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
|
|
391
|
|
; 8-pixel-wide block, horizontal-only 6-tap filter (SSSE3).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r5 = mx (filter selector).
; m3/m4 = first two pshufb masks (the third is used from memory),
; m5/m6/m7 = the three byte-coefficient rows selected by mx.
; Performs one unaligned 16-byte load at src-2 per row.
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea      r5d, [r5*3]                   ; r5 = mx * 3 (scaled by 8 below)
    mova      m3, [filter_v6_shuf1]
    mova      m4, [filter_v6_shuf2]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3                       ; byte pairs (x,   x+5)
    pshufb    m1, m4                       ; byte pairs (x+1, x+2)
    pshufb    m2, [filter_v6_shuf3]        ; byte pairs (x+3, x+4)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                  ; rounding
    psraw     m0, 7
    packuswb  m0, m0                       ; clip and word->bytes
    movh    [r0], m0                       ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
|
|
426
|
|
; FILTER_V opt, width, nxmm
;   %1 = instruction-set suffix used in the function names
;   %2 = block width used in the function names
;   %3 = XMM register count handed to cglobal
; Emits put_vp8_epel<%2>_v4_<%1> and put_vp8_epel<%2>_v6_<%1>, the
; vertical-only 4-tap and 6-tap filters.  The code uses the size-agnostic
; m0..m7 / mova / movh names, so the register width comes from the
; INIT_MMX / INIT_XMM in effect at the point of instantiation.
;
; Register use in both functions: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height (rows to produce), r6 = my (filter selector).
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
%macro FILTER_V 3
; %2-pixel-wide block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; r6 = my * 32 (table byte offset)
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 -> tap0 row of selected filter
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]                  ; tap3, kept in a register

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep unscaled copy for next row
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    ; (each mova shifts the row window down by one before the old
    ; register is overwritten by its product)
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET


; %2-pixel-wide block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; r6 = my * 48 (table byte offset)
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; r6 -> tap0 row of selected filter
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    ; (each mova shifts the row window down by one before the old
    ; register is overwritten by its product)
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
|
|
550
|
|
; 8-pixel-wide block, vertical-only 4-tap filter (SSSE3).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r6 = my (filter selector).
; m0/m1/m2 hold the three most recent source rows as bytes; each iteration
; loads one new row into m3.  m5 = (F0,F3) byte pairs, m6 = (F1,F2) byte
; pairs, m7 = rounding constant.
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl      r6d, 4                        ; r6 = my * 16 (table byte offset)
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; shift row window down by one
    punpcklbw m4, m3                       ; interleave oldest and newest row
    punpcklbw m1, m2                       ; interleave the two middle rows
    pmaddubsw m4, m5                       ; outer taps
    pmaddubsw m1, m6                       ; inner taps
    paddsw    m4, m1
    mova      m1, m2
    paddsw    m4, m7                       ; rounding
    mova      m2, m3
    psraw     m4, 7
    packuswb  m4, m4                       ; clip and word->bytes
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
|
|
589
|
|
; 8-pixel-wide block, vertical-only 6-tap filter (SSSE3).
;
; Register use: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;               r4 = height (rows to produce), r6 = my (filter selector).
; After setup r6 points just past the selected filter's three coefficient
; rows, read at r6-48 (F0,F5), r6-32 (F1,F2) and r6-16 (F3,F4).
; m0..m4 hold the five most recent source rows as bytes; each iteration
; loads one new row into m5.
;
; NOTE(review): the strides are declared int in the prototype but r1/r3 are
; used as full-width registers here; confirm callers pass them sign-extended.
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea      r6d, [r6*3]                   ; r6 = my * 3 (scaled by 8 below)
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; interleave oldest and newest row
    mova      m0, m1                       ; shift row window down by one
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6                       ; clip and word->bytes
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a 32-bit int: only the low dword of r4 is defined on x86-64,
    ; so count on r4d rather than the full register
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
|
|
636
|
|
637 ;-----------------------------------------------------------------------------
|
|
638 ; IDCT functions:
|
|
639 ;
|
|
640 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
|
|
641 ;-----------------------------------------------------------------------------
|
|
642
|
|
; DC-only inverse transform, MMX: adds (block[0] + 4) >> 3 to a 4x4 block
; of pixels at dst with unsigned saturation.
; r0 = dst, r1 = block (reused as dst + 2*stride once the DC is loaded),
; r2 = stride.
; The signed DC is split into two unsigned byte vectors: mm0 = max(dc, 0)
; and mm1 = max(-dc, 0).  Adding mm0 and then subtracting mm1, both with
; saturation, applies the signed offset.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; fetch the coefficient and derive the DC term
    pxor       mm1, mm1
    movd       mm0, [r1]
    lea         r1, [r0+r2*2]              ; r1 -> row 2 (block pointer is dead)
    paddw      mm0, [pw_4]
    psraw      mm0, 3
    psubw      mm1, mm0                    ; mm1 = -dc (taken before mm0 is packed)

    ; broadcast +dc to the low four bytes of mm0
    packuswb   mm0, mm0
    punpcklbw  mm0, mm0
    punpcklwd  mm0, mm0

    ; broadcast -dc to the low four bytes of mm1
    packuswb   mm1, mm1
    punpcklbw  mm1, mm1
    punpcklwd  mm1, mm1

    ; load all four rows before any of them is written back
    movd       mm2, [r0]
    movd       mm3, [r0+r2]
    movd       mm4, [r1]
    movd       mm5, [r1+r2]

    ; apply the DC row by row
    paddusb    mm2, mm0
    psubusb    mm2, mm1
    paddusb    mm3, mm0
    psubusb    mm3, mm1
    paddusb    mm4, mm0
    psubusb    mm4, mm1
    paddusb    mm5, mm0
    psubusb    mm5, mm1

    ; write the rows back
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
|
|
678
|
|
; DC-only inverse transform, SSE4: adds (block[0] + 4) >> 3 to a 4x4 block
; of pixels at dst, clamping the result to 0..255.
; r0 = dst, r1 = block (reused as dst + 2*stride once the DC is loaded),
; r2 = stride.
; The 16 pixels are widened to words, the broadcast DC is added, and
; packuswb saturates back to bytes.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; derive the DC term and broadcast it to all eight words of xmm0
    movd       xmm0, [r1]
    movq       xmm2, [pw_4]
    paddw      xmm0, xmm2
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0
    punpcklqdq xmm0, xmm0

    ; gather the four rows: rows 0/1 into xmm2, rows 2/3 into xmm4
    lea         r1, [r0+r2*2]              ; r1 -> row 2 (block pointer is dead)
    pxor       xmm1, xmm1
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    punpckldq  xmm2, xmm3
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    punpckldq  xmm4, xmm5

    ; widen to words, add DC, saturate back to bytes
    punpcklbw  xmm2, xmm1
    paddw      xmm2, xmm0
    punpcklbw  xmm4, xmm1
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4

    ; scatter the four dwords back to their rows
    movd      [r0], xmm2
    pextrd [r0+r2], xmm2, 1
    pextrd    [r1], xmm2, 2
    pextrd [r1+r2], xmm2, 3
    RET
|