11975
|
1 ;******************************************************************************
|
|
2 ;* VP8 MMXEXT optimizations
|
|
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
|
|
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
|
|
5 ;*
|
|
6 ;* This file is part of FFmpeg.
|
|
7 ;*
|
|
8 ;* FFmpeg is free software; you can redistribute it and/or
|
|
9 ;* modify it under the terms of the GNU Lesser General Public
|
|
10 ;* License as published by the Free Software Foundation; either
|
|
11 ;* version 2.1 of the License, or (at your option) any later version.
|
|
12 ;*
|
|
13 ;* FFmpeg is distributed in the hope that it will be useful,
|
|
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
16 ;* Lesser General Public License for more details.
|
|
17 ;*
|
|
18 ;* You should have received a copy of the GNU Lesser General Public
|
|
19 ;* License along with FFmpeg; if not, write to the Free Software
|
|
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
21 ;******************************************************************************
|
|
22
|
|
23 %include "x86inc.asm"
|
|
24
|
|
25 SECTION_RODATA
|
|
26
|
|
; Every row below is 16 bytes, so a filter is addressed as
; table + index * (rows per filter * 16) by the functions in this file.

; 4-tap horizontal coefficients as word pairs for pmaddwd.
; Two rows per filter: (F0,F1) then (F2,F3).
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; 6-tap horizontal coefficients as word pairs for pmaddwd.
; Three rows per filter: (F0,F1), (F2,F3), (F4,F5).
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

; 4-tap coefficients as signed byte pairs for pmaddubsw.
; Two rows per filter: outer taps (F0,F3) then inner taps (F1,F2).
fourtap_filter_hb_m: times 8 db  -6,  -1
                     times 8 db 123,  12
                     times 8 db  -9,  -6
                     times 8 db  93,  50
                     times 8 db  -6,  -9
                     times 8 db  50,  93
                     times 8 db  -1,  -6
                     times 8 db  12, 123

; 6-tap coefficients as signed byte pairs for pmaddubsw.
; Three rows per filter: (F0,F5), (F1,F2), (F3,F4).
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

; 4-tap vertical coefficients, one tap broadcast per row for pmullw.
; Four rows per filter: F0, F1, F2, F3.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap vertical coefficients, one tap broadcast per row for pmullw.
; Six rows per filter: F0 .. F5.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; Bilinear weights as broadcast words: row k-1 holds weight k (k = 1..7).
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weights as byte pairs for pmaddubsw: row k-1 holds (8-k, k).
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
|
|
116
|
11975
|
; Table base used inside indexed addresses such as [table+reg*8-48].
; PIC builds: every alias is r11, which each function loads with the
;             table address (lea r11, [<table>_m]) before using it.
; Otherwise:  the alias is the table label itself.
%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw    fourtap_filter_hw_m
%define sixtap_filter_hw     sixtap_filter_hw_m
%define fourtap_filter_hb    fourtap_filter_hb_m
%define sixtap_filter_hb     sixtap_filter_hb_m
%define fourtap_filter_v     fourtap_filter_v_m
%define sixtap_filter_v      sixtap_filter_v_m
%define bilinear_filter_vw   bilinear_filter_vw_m
%define bilinear_filter_vb   bilinear_filter_vb_m
%endif
|
|
136
|
11991
|
; pshufb control masks: each mask gathers, for 8 output pixels, the pair of
; source bytes that one pmaddubsw coefficient pair is multiplied with.

; bilinear: adjacent bytes (x, x+1)
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
; 4-tap outer taps: bytes (x, x+3)
filter_h4_shuf:  db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5,  8, 6,  9,  7, 10

; 6-tap outer taps: bytes (x, x+5)
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
; bytes (x+1, x+2); also used for the inner taps of the 4-tap filter
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
; bytes (x+3, x+4)
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
|
11975
|
143
|
|
144 cextern pw_4
|
|
145 cextern pw_64
|
|
146
|
|
147 SECTION .text
|
|
148
|
|
149 ;-----------------------------------------------------------------------------
|
|
150 ; subpel MC functions:
|
|
151 ;
|
|
152 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
|
|
153 ; uint8_t *src, int srcstride,
|
|
154 ; int height, int mx, int my);
|
|
155 ;-----------------------------------------------------------------------------
|
|
156
|
|
; 4-pixel-wide block, H-only 4-tap filter
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r5 = mx (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                      ; mx * 16: offset of the (F2,F3) row
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]                ; rounding constant
    pxor      mm6, mm6                    ; zero, for byte->word unpacking

.nextrow:
    movq      mm1, [r2-1]                 ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                    ; byte ABCD..
    punpcklbw mm1, mm6                    ; byte->word ABCD
    pshufw    mm0, mm2, 9                 ; byte CDEF..
    punpcklbw mm0, mm6                    ; byte->word CDEF
    pshufw    mm3, mm1, 0x94              ; word ABBC
    pshufw    mm1, mm0, 0x94              ; word CDDE
    pmaddwd   mm3, mm4                    ; multiply 2px with F0/F1
    movq      mm0, mm1                    ; backup for second set of pixels
    pmaddwd   mm1, mm5                    ; multiply 2px with F2/F3
    paddd     mm3, mm1                    ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                    ; byte->word EFGH
    pmaddwd   mm0, mm4                    ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94              ; word EFFG
    pmaddwd   mm1, mm5                    ; multiply 2px with F2/F3
    paddd     mm0, mm1                    ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                    ; merge dword->word (4px)
    paddsw    mm3, mm7                    ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                    ; clip and word->bytes
    movd     [r0], mm3                    ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                         ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
203
|
|
; 4-pixel-wide block, H-only 6-tap filter
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r5 = mx (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                 ; mx*3, scaled by 8 below = mx*24
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]                ; rounding constant
    pxor      mm3, mm3                    ; zero, for byte->word unpacking

.nextrow:
    movq      mm1, [r2-2]                 ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                    ; byte ABCD..
    punpcklbw mm1, mm3                    ; byte->word ABCD
    pshufw    mm0, mm2, 0x9               ; byte CDEF..
    punpckhbw mm2, mm3                    ; byte->word EFGH
    punpcklbw mm0, mm3                    ; byte->word CDEF
    pshufw    mm1, mm1, 0x94              ; word ABBC
    pshufw    mm2, mm2, 0x94              ; word EFFG
    pmaddwd   mm1, mm4                    ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94              ; word CDDE (mm3 is re-zeroed below)
    movq      mm0, mm3                    ; backup for second set of pixels
    pmaddwd   mm3, mm5                    ; multiply 2px with F2/F3
    paddd     mm1, mm3                    ; add to 1st 2px cache
    movq      mm3, mm2                    ; backup for second set of pixels
    pmaddwd   mm2, mm6                    ; multiply 2px with F4/F5
    paddd     mm1, mm2                    ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                 ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                    ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                    ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                    ; add to 2nd 2px cache
    pxor      mm3, mm3                    ; restore the zero register
    punpcklbw mm2, mm3                    ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9              ; word GHHI
    pmaddwd   mm2, mm6                    ; multiply 2px with F4/F5
    paddd     mm0, mm2                    ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                    ; merge dword->word (4px)
    paddsw    mm1, mm7                    ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                    ; clip and word->bytes
    movd     [r0], mm1                    ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                         ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
260
|
|
; 8-pixel-wide block, H-only 4-tap filter
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r5 = mx (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                       ; mx * 16: offset of the (F2,F3) row
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7                      ; zero, for byte->word unpacking

.nextrow:
    ; output pixels 0-3, from the 8 source bytes starting at src-1
    movh      m0, [r2-1]
    punpcklbw m0, m7                      ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                       ; BCDEFGH
    psrldq    m2, 4                       ; CDEFGH
    psrldq    m3, 6                       ; DEFGH
    punpcklwd m0, m1                      ; ABBCCDDE
    punpcklwd m2, m3                      ; CDDEEFFG
    pmaddwd   m0, m5                      ; F0/F1
    pmaddwd   m2, m6                      ; F2/F3
    paddd     m0, m2

    ; output pixels 4-7, same steps on the 8 source bytes starting at src+3
    movh      m1, [r2+3]
    punpcklbw m1, m7                      ; EFGHIJKL
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2                       ; FGHIJKL
    psrldq    m3, 4                       ; GHIJKL
    psrldq    m4, 6                       ; HIJKL
    punpcklwd m1, m2                      ; EFFGGHHI
    punpcklwd m3, m4                      ; GHHIIJJK
    pmaddwd   m1, m5                      ; F0/F1
    pmaddwd   m3, m6                      ; F2/F3
    paddd     m1, m3

    ; merge, round/clip/store
    packssdw  m0, m1                      ; dword->word (8px)
    paddsw    m0, [pw_64]                 ; rounding
    psraw     m0, 7
    packuswb  m0, m7                      ; clip and word->bytes
    movh    [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
313
|
|
; 8-pixel-wide block, H-only 6-tap filter
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r5 = mx (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]                  ; mx*3, scaled by 8 below = mx*24
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8] ; r5 = end of the selected 3-row filter
    pxor      m7, m7                      ; zero, for byte->word unpacking

.nextrow:
    movu      m0, [r2-2]                  ; 16 source bytes starting at src-2
    mova      m6, m0                      ; kept for the second 4 pixels
    mova      m4, m0

    ; output pixels 0-3
    punpcklbw m0, m7                      ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                       ; BCDEFGH
    psrldq    m2, 4                       ; CDEFGH
    psrldq    m3, 6                       ; DEFGH
    psrldq    m4, 4                       ; drop 4 source bytes
    punpcklbw m4, m7                      ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2                       ; FGHIJKL
    punpcklwd m0, m1                      ; ABBCCDDE
    punpcklwd m2, m3                      ; CDDEEFFG
    punpcklwd m4, m5                      ; EFFGGHHI
    pmaddwd   m0, [r5-48]                 ; F0/F1
    pmaddwd   m2, [r5-32]                 ; F2/F3
    pmaddwd   m4, [r5-16]                 ; F4/F5
    paddd     m0, m2
    paddd     m0, m4

    ; output pixels 4-7: same steps on the source advanced by 4 bytes
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7                      ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2                       ; FGHIJKL
    psrldq    m2, 4                       ; GHIJKL
    psrldq    m3, 6                       ; HIJKL
    psrldq    m4, 4                       ; drop 4 more source bytes
    punpcklbw m4, m7                      ; IJKL + 4 following bytes
    mova      m5, m4
    psrldq    m5, 2
    punpcklwd m6, m1                      ; EFFGGHHI
    punpcklwd m2, m3                      ; GHHIIJJK
    punpcklwd m4, m5                      ; IJJKKLL + next
    pmaddwd   m6, [r5-48]                 ; F0/F1
    pmaddwd   m2, [r5-32]                 ; F2/F3
    pmaddwd   m4, [r5-16]                 ; F4/F5
    paddd     m6, m2
    paddd     m6, m4

    ; merge, round/clip/store
    packssdw  m0, m6                      ; dword->word (8px)
    paddsw    m0, [pw_64]                 ; rounding
    psraw     m0, 7
    packuswb  m0, m7                      ; clip and word->bytes
    movh    [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
380
|
|
; 8-pixel-wide block, H-only 4-tap filter (pshufb + pmaddubsw)
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r5 = mx (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl      r5d, 4                       ; mx * 16: offset of the second row
    mova      m2, [pw_64]                 ; rounding constant
    mova      m3, [filter_h4_shuf]        ; gathers bytes (x, x+3)
    mova      m4, [filter_h6_shuf2]       ; gathers bytes (x+1, x+2)
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]                  ; 16 source bytes starting at src-1
    mova      m1, m0
    pshufb    m0, m3                      ; outer-tap byte pairs
    pshufb    m1, m4                      ; inner-tap byte pairs
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2                      ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0                      ; clip and word->bytes
    movh    [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
411
|
|
; 8-pixel-wide block, H-only 6-tap filter (pshufb + pmaddubsw)
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r5 = mx (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea      r5d, [r5*3]                  ; mx*3, scaled by 8 below = mx*24
    mova      m3, [filter_h6_shuf1]       ; gathers bytes (x,   x+5)
    mova      m4, [filter_h6_shuf2]       ; gathers bytes (x+1, x+2)
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]                  ; 16 source bytes starting at src-2
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pshufb    m2, [filter_h6_shuf3]       ; gathers bytes (x+3, x+4)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                 ; rounding
    psraw     m0, 7
    packuswb  m0, m0                      ; clip and word->bytes
    movh    [r0], m0                      ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
446
|
|
; FILTER_V opt, width, xmm_regs
;   Emits put_vp8_epel<width>_v4_<opt> and put_vp8_epel<width>_v6_<opt>.
;   %1 = name suffix, %2 = block width used in the name, %3 = XMM register
;   count passed to cglobal. The register width comes from the INIT_MMX /
;   INIT_XMM in effect when the macro is expanded.
; Both functions: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = height (rows to produce), r6 = my (selects the coefficient rows).
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
%macro FILTER_V 3
; %2-pixel-wide block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                       ; my * 32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 = first row (F0) of the filter
    mova      m6, [pw_64]                 ; rounding constant
    pxor      m7, m7                      ; zero, for byte->word unpacking
    mova      m5, [r6+48]                 ; F3

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]               ; read new row
    punpcklbw m4, m7
    mova      m3, m4                      ; keep the new row for the next pass
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps, sliding the row window down by one
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET


; %2-pixel-wide block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                  ; my * 48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96] ; r6 = first row (F0) of the filter
    pxor      m7, m7                      ; zero, for byte->word unpacking

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps, sliding the row window down by one
    movh      m5, [r2+2*r3]               ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
|
|
570
|
|
; 8-pixel-wide block, V-only 4-tap filter (pmaddubsw on interleaved rows)
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r6 = my (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl      r6d, 4                       ; my * 16: offset of the second row
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]                 ; rounding constant

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]               ; read new row
    mova      m4, m0
    mova      m0, m1                      ; slide the row window down by one
    punpcklbw m4, m3                      ; oldest row interleaved with newest
    punpcklbw m1, m2                      ; the two middle rows interleaved
    pmaddubsw m4, m5
    pmaddubsw m1, m6
    paddsw    m4, m1
    mova      m1, m2
    paddsw    m4, m7                      ; rounding
    mova      m2, m3
    psraw     m4, 7
    packuswb  m4, m4                      ; clip and word->bytes
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
609
|
|
; 8-pixel-wide block, V-only 6-tap filter (pmaddubsw on interleaved rows)
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height (rows to produce), r6 = my (selects the coefficient rows)
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea      r6d, [r6*3]                  ; my*3, scaled by 8 below = my*24
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8] ; r6 = end of the selected 3-row filter

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]               ; read new row
    mova      m6, m0
    punpcklbw m6, m5                      ; oldest row interleaved with newest
    mova      m0, m1                      ; slide the row window down by one
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                 ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6                      ; clip and word->bytes
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                          ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
656
|
11991
|
; FILTER_BILINEAR opt, width, xmm_regs
;   Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
;   %1 = name suffix (also selects the store sequence), %2 = block width used
;   in the name, %3 = XMM register count passed to cglobal.
; Both functions: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = height, r5 = mx, r6 = my. Two output rows are produced per loop pass.
; The table offsets are built with 32-bit operations (which zero-extend) and
; then used as full-width r5/r6: with PIC the table base is r11, and a 64-bit
; base cannot be combined with a 32-bit index in one effective address.
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                       ; r6 = my * 16
    sub      r5d, r6d                     ; r5 = (8 - my) * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                      ; zero: unpacking and pavgw rounding
    mova      m4, [bilinear_filter_vw+r5-16] ; weight 8-my
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                      ; middle row feeds both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                      ; (x + 1) >> 1
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                      ; row 0 in the low half, row 1 high
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                       ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16                    ; r6 is reused as scratch here
    shl      r5d, 4                       ; r5 = mx * 16
    sub      r6d, r5d                     ; r6 = (8 - mx) * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                      ; zero: unpacking and pavgw rounding
    mova      m4, [bilinear_filter_vw+r6-16] ; weight 8-mx
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                      ; (x + 1) >> 1
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                      ; row 0 in the low half, row 1 high
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                       ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
|
|
754
|
|
; 8-pixel-wide vertical bilinear filter, two output rows per loop pass
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height, r6 = my (selects the weight row)
; The table offset is built with a 32-bit shift (which zero-extends) and then
; used as full-width r6: with PIC the table base is r11, and a 64-bit base
; cannot be combined with a 32-bit index in one effective address.
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4                       ; r6 = my * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                      ; zero, for pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16] ; byte pairs (8-my, my)
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                      ; rows 0/1 interleaved
    punpcklbw m1, m2                      ; rows 1/2 interleaved
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                      ; (x + 1) >> 1
    pavgw     m1, m4
    packuswb  m0, m1                      ; row 0 in the low half, row 1 high
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                       ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
783
|
|
; 8-pixel-wide horizontal bilinear filter, two output rows per loop pass
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = height, r5 = mx (selects the weight row)
; The table offset is built with a 32-bit shift (which zero-extends) and then
; used as full-width r5: with PIC the table base is r11, and a 64-bit base
; cannot be combined with a 32-bit index in one effective address.
; NOTE(review): the int strides are added as full-width registers; this
; assumes they arrive sign-extended - confirm against the callers.
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4                       ; r5 = mx * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                      ; zero, for pavgw rounding
    mova      m2, [filter_h2_shuf]        ; gathers bytes (x, x+1)
    mova      m3, [bilinear_filter_vb+r5-16] ; byte pairs (8-mx, mx)
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                      ; (x + 1) >> 1
    pavgw     m1, m4
    packuswb  m0, m1                      ; row 0 in the low half, row 1 high
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                       ; height is a 32-bit int: only the
                                          ; low half of r4 is meaningful
    jg .nextrow
    REP_RET
|
|
812
|
11975
|
813 ;-----------------------------------------------------------------------------
|
|
814 ; IDCT functions:
|
|
815 ;
|
|
816 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
|
|
817 ;-----------------------------------------------------------------------------
|
|
818
|
|
; Adds the rounded DC term (block[0] + 4) >> 3 to four rows of four pixels.
; r0 = dst, r1 = block (reused as a row pointer once the DC is read),
; r2 = stride
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; fetch the coefficient, then free r1 for addressing rows 2 and 3
    movd      mm0, [r1]
    pxor      mm1, mm1
    lea        r1, [r0+r2*2]

    ; dc = (coef + 4) >> 3; mm1 = -dc
    paddw     mm0, [pw_4]
    psraw     mm0, 3
    psubw     mm1, mm0

    ; unsigned saturation zeroes whichever of +dc / -dc is negative;
    ; replicate the low byte of each across the low 4 bytes
    packuswb  mm0, mm0
    punpcklbw mm0, mm0
    punpcklwd mm0, mm0
    packuswb  mm1, mm1
    punpcklbw mm1, mm1
    punpcklwd mm1, mm1

    ; load all four rows before anything is written back
    movd      mm2, [r0]
    movd      mm3, [r0+r2]
    movd      mm4, [r1]
    movd      mm5, [r1+r2]

    ; saturating add of +dc, then saturating subtract of -dc, row by row
    paddusb   mm2, mm0
    psubusb   mm2, mm1
    paddusb   mm3, mm0
    psubusb   mm3, mm1
    paddusb   mm4, mm0
    psubusb   mm4, mm1
    paddusb   mm5, mm0
    psubusb   mm5, mm1

    ; store
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
|
|
854
|
|
; Adds the rounded DC term (block[0] + 4) >> 3 to four rows of four pixels,
; working on words so that one packuswb performs the final clip.
; r0 = dst, r1 = block (reused as a row pointer once the DC is read),
; r2 = stride
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; dc = (coef + 4) >> 3, broadcast to all 8 words of xmm0
    movd       xmm0, [r1]
    movq       xmm2, [pw_4]
    lea          r1, [r0+r2*2]            ; r1 = address of row 2
    paddw      xmm0, xmm2
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0
    punpcklqdq xmm0, xmm0
    pxor       xmm1, xmm1                 ; zero, for byte->word unpacking

    ; rows 0 and 1: pack into one register, widen, add dc
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    punpckldq  xmm2, xmm3
    punpcklbw  xmm2, xmm1
    paddw      xmm2, xmm0

    ; rows 2 and 3: same
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    punpckldq  xmm4, xmm5
    punpcklbw  xmm4, xmm1
    paddw      xmm4, xmm0

    ; clip to bytes: rows 0..3 end up in dwords 0..3 of xmm2
    packuswb   xmm2, xmm4
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET
|