comparison x86/h264_intrapred.asm @ 11951:afee30fe8c26 libavcodec

16x16 and 8x8c x86 SIMD intra pred functions for VP8 and H.264
author darkshikari
date Fri, 25 Jun 2010 18:25:49 +0000
parents
children 953a0949c789
;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
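; pshufb mask for the ssse3 TrueMotion code below: every even destination
; byte takes source byte 3 and every odd byte is zeroed (bit 7 of 0x80 set),
; broadcasting one pixel into eight zero-extended 16-bit words.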

SECTION .text

cextern pb_3

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
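; Scalar reference, a C sketch for illustration only (not from the original
; source):
;
;   for (int y = 0; y < 16; y++)
;       memcpy(src + y*stride, src - stride, 16);  // replicate row above
;
; The mmx version holds the source row in two 8-byte registers and stores
; two rows per iteration; the sse version holds it in one xmm register and
; stores four.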

cglobal pred16x16_vertical_mmx, 2,3
    sub    r0, r1
    mov    r2, 8
    movq   mm0, [r0+0]
    movq   mm1, [r0+8]
.loop:
    movq   [r0+r1*1+0], mm0
    movq   [r0+r1*1+8], mm1
    movq   [r0+r1*2+0], mm0
    movq   [r0+r1*2+8], mm1
    lea    r0, [r0+r1*2]
    dec    r2
    jg     .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub    r0, r1
    mov    r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea    r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea    r0, [r0+r1*2]
    dec    r2
    jg     .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
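; Scalar reference, a C sketch for illustration only (not from the original
; source):
;
;   for (int y = 0; y < 16; y++)
;       memset(src + y*stride, src[y*stride - 1], 16);  // splat left pixel
;
; The movd loads below start at [r0-4] because movd reads 4 bytes; only
; byte 3 (the pixel at [r0-1]) survives the broadcast that follows.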

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov    r2, 8
%ifidn %1, ssse3
    mova   m2, [pb_3]
%endif
.loop:
    movd   m0, [r0+r1*0-4]
    movd   m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova   [r0+r1*0+8], m0
    mova   [r0+r1*1+8], m1
%endif

    mova   [r0+r1*0], m0
    mova   [r0+r1*1], m1
    lea    r0, [r0+r1*2]
    dec    r2
    jg     .loop
    REP_RET
%endmacro

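; INIT_MMX / INIT_XMM are x86inc.asm macros that remap the abstract register
; names (m0-m7) and mova used inside PRED16x16_H, so the same macro body
; expands to an 8-byte (mmx/mmxext) or 16-byte (ssse3) implementation.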
INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
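; Scalar reference, a C sketch for illustration only (not from the original
; source):
;
;   int dc = 16;                             // rounding
;   for (int i = 0; i < 16; i++)
;       dc += src[-stride + i];              // 16 neighbours above
;   for (int y = 0; y < 16; y++)
;       dc += src[y*stride - 1];             // 16 neighbours to the left
;   dc >>= 5;                                // /32, then fill the 16x16 block
;
; Below, psadbw against a zeroed register sums 8 top bytes at a time; the
; left column is accumulated with scalar movzx/add in two registers. The
; macro's second parameter is the store instruction (movq/movaps/movdqa)
; matching the register width of each instantiation.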

%macro PRED16x16_DC 2
cglobal pred16x16_dc_%1, 2,7
    mov    r4, r0
    sub    r0, r1
    pxor   mm0, mm0
    pxor   mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec    r0
    movzx  r5d, byte [r0+r1*1]
    paddw  mm0, mm1
    movd   r6d, mm0
    lea    r0, [r0+r1*2]
%rep 7
    movzx  r2d, byte [r0+r1*0]
    movzx  r3d, byte [r0+r1*1]
    add    r5d, r2d
    add    r6d, r3d
    lea    r0, [r0+r1*2]
%endrep
    movzx  r2d, byte [r0+r1*0]
    add    r5d, r6d
    lea    r2d, [r2+r5+16]
    shr    r2d, 5
%ifidn %1, mmx
    movd   m0, r2d
    punpcklbw m0, m0
    punpcklwd m0, m0
    punpckldq m0, m0
%elifidn %1, mmxext
    movd   m0, r2d
    punpcklbw m0, m0
    pshufw m0, m0, 0
%elifidn %1, sse
    imul   r2d, 0x01010101
    movd   m0, r2d
    shufps m0, m0, 0
%elifidn %1, sse2
    movd   m0, r2d
    punpcklbw m0, m0
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor   m1, m1
    movd   m0, r2d
    pshufb m0, m1
%endif

%if mmsize==8
    mov    r3d, 8
.loop:
    %2 [r4+r1*0+0], m0
    %2 [r4+r1*0+8], m0
    %2 [r4+r1*1+0], m0
    %2 [r4+r1*1+8], m0
%else
    mov    r3d, 4
.loop:
    %2 [r4+r1*0], m0
    %2 [r4+r1*1], m0
    lea    r4, [r4+r1*2]
    %2 [r4+r1*0], m0
    %2 [r4+r1*1], m0
%endif
    lea    r4, [r4+r1*2]
    dec    r3d
    jg     .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmx, movq
PRED16x16_DC mmxext, movq
INIT_XMM
PRED16x16_DC sse, movaps
PRED16x16_DC sse2, movdqa
PRED16x16_DC ssse3, movdqa

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
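; VP8 TrueMotion prediction. Scalar reference, a C sketch for illustration
; only (not from the original source):
;
;   // top[] = row above, left[] = column to the left, tl = corner pixel
;   for (int y = 0; y < 16; y++)
;       for (int x = 0; x < 16; x++)
;           src[y*stride + x] = clip_uint8(top[x] + left[y] - tl);
;
; The code keeps the top row zero-extended to words, broadcasts the per-row
; difference left[y] - tl, and lets packuswb do the clipping to [0,255].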

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub    r0, r1
    pxor   mm7, mm7
    movq   mm0, [r0+0]
    movq   mm2, [r0+8]
    movq   mm1, mm0
    movq   mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx  r3d, byte [r0-1]
    mov    r4d, 16
.loop:
    movzx  r2d, byte [r0+r1-1]
    sub    r2d, r3d
    movd   mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw mm4, mm4, 0
%endif
    movq   mm5, mm4
    movq   mm6, mm4
    movq   mm7, mm4
    paddw  mm4, mm0
    paddw  mm5, mm1
    paddw  mm6, mm2
    paddw  mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq   [r0+r1+0], mm4
    movq   [r0+r1+8], mm6
    add    r0, r1
    dec    r4d
    jg     .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub    r0, r1
    pxor   xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx  r4d, byte [r0-1]
    mov    r5d, 8
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   xmm2, r2d
    movd   xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw  xmm2, xmm0
    paddw  xmm3, xmm1
    paddw  xmm4, xmm0
    paddw  xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea    r0, [r0+r1*2]
    dec    r5d
    jg     .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
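; 8x8 chroma counterpart of pred16x16_vertical: copy the 8-byte row above
; the block into all 8 rows, with the loop fully unrolled via %rep.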

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq   mm0, [r0]
%rep 3
    movq   [r0+r1*1], mm0
    movq   [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq   [r0+r1*1], mm0
    movq   [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
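; 8x8 counterpart of PRED16x16_H above, using the same per-ISA broadcast
; tricks. Only one 8-byte store per row is needed, so all three variants
; stay in mmx registers (pshufb also has an mmx-register form on SSSE3
; CPUs, hence no INIT_XMM here).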

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov    r2, 4
%ifidn %1, ssse3
    mova   m2, [pb_3]
%endif
.loop:
    movd   m0, [r0+r1*0-4]
    movd   m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova   [r0+r1*0], m0
    mova   [r0+r1*1], m1
    lea    r0, [r0+r1*2]
    dec    r2
    jg     .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
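; Scalar reference, a C sketch for illustration only (not from the original
; source):
;
;   int dc = 8;                              // rounding
;   for (int i = 0; i < 8; i++)
;       dc += src[-stride + i];              // 8 neighbours above
;   for (int y = 0; y < 8; y++)
;       dc += src[y*stride - 1];             // 8 neighbours to the left
;   dc >>= 4;                                // /16, then fill all 64 pixels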

%macro PRED8x8_DC 1
cglobal pred8x8_dc_rv40_%1, 2,7
    mov    r4, r0
    sub    r0, r1
    pxor   mm0, mm0
    psadbw mm0, [r0]
    dec    r0
    movzx  r5d, byte [r0+r1*1]
    movd   r6d, mm0
    lea    r0, [r0+r1*2]
%rep 3
    movzx  r2d, byte [r0+r1*0]
    movzx  r3d, byte [r0+r1*1]
    add    r5d, r2d
    add    r6d, r3d
    lea    r0, [r0+r1*2]
%endrep
    movzx  r2d, byte [r0+r1*0]
    add    r5d, r6d
    lea    r2d, [r2+r5+8]
    shr    r2d, 4
%ifidn %1, mmx
    movd   mm0, r2d
    punpcklbw mm0, mm0
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%else
    movd   mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
%endif
    mov    r3d, 4
.loop:
    movq   [r4+r1*0], mm0
    movq   [r4+r1*1], mm0
    lea    r4, [r4+r1*2]
    dec    r3d
    jg     .loop
    REP_RET
%endmacro

PRED8x8_DC mmx
PRED8x8_DC mmxext

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
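; 8x8 version of the TrueMotion prediction sketched above pred16x16_tm_vp8:
; pred[y][x] = clip(top[x] + left[y] - topleft) over an 8x8 block.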

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub    r0, r1
    pxor   mm7, mm7
    movq   mm0, [r0]
    movq   mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx  r4d, byte [r0-1]
    mov    r5d, 4
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   mm2, r2d
    movd   mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    movq   mm3, mm2
    movq   mm5, mm4
    paddw  mm2, mm0
    paddw  mm3, mm1
    paddw  mm4, mm0
    paddw  mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq   [r0+r1*1], mm2
    movq   [r0+r1*2], mm4
    lea    r0, [r0+r1*2]
    dec    r5d
    jg     .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub    r0, r1
    pxor   xmm1, xmm1
    movq   xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx  r4d, byte [r0-1]
    mov    r5d, 4
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   xmm2, r2d
    movd   xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw  xmm2, xmm0
    paddw  xmm3, xmm0
    packuswb xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea    r0, [r0+r1*2]
    dec    r5d
    jg     .loop
    REP_RET

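; The SSSE3 version below avoids the scalar movzx/sub of the variants above:
; a single pshufb with tm_shuf broadcasts the left-neighbour byte straight
; into zero-extended words, so the top-left pixel can be subtracted with a
; vector psubw instead.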
cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub    r0, r1
    movdqa xmm4, [tm_shuf]
    pxor   xmm1, xmm1
    movq   xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd   xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov    r2d, 4
.loop:
    movd   xmm2, [r0+r1*1-4]
    movd   xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw  xmm2, xmm5
    psubw  xmm3, xmm5
    paddw  xmm2, xmm0
    paddw  xmm3, xmm0
    packuswb xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea    r0, [r0+r1*2]
    dec    r2d
    jg     .loop
    REP_RET