comparison x86/x86util.asm @ 12005:88563eada57f libavcodec

Make x86util.asm LGPL so we can use it in LGPL asm. Strip out most x264-specific stuff (not used anywhere in ffmpeg).
author darkshikari
date Tue, 29 Jun 2010 00:40:12 +0000
parents 980030a3e315
children 2ae70e2c31a4
12004:2002ea7c06f6 12005:88563eada57f
1 ;***************************************************************************** 1 ;*****************************************************************************
2 ;* x86util.asm 2 ;* x86util.asm
3 ;***************************************************************************** 3 ;*****************************************************************************
4 ;* Copyright (C) 2008 x264 project 4 ;* Copyright (C) 2008-2010 x264 project
5 ;* 5 ;*
6 ;* Authors: Holger Lubitz <holger@lubitz.org> 6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Loren Merritt <lorenm@u.washington.edu> 7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* 8 ;*
9 ;* This program is free software; you can redistribute it and/or modify 9 ;* This file is part of FFmpeg.
10 ;* it under the terms of the GNU General Public License as published by 10 ;*
11 ;* the Free Software Foundation; either version 2 of the License, or 11 ;* FFmpeg is free software; you can redistribute it and/or
12 ;* (at your option) any later version. 12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* 13 ;* License as published by the Free Software Foundation; either
14 ;* This program is distributed in the hope that it will be useful, 14 ;* version 2.1 of the License, or (at your option) any later version.
15 ;*
16 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* GNU General Public License for more details. 19 ;* Lesser General Public License for more details.
18 ;* 20 ;*
19 ;* You should have received a copy of the GNU General Public License 21 ;* You should have received a copy of the GNU Lesser General Public
20 ;* along with this program; if not, write to the Free Software 22 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;***************************************************************************** 24 ;******************************************************************************
23
24 %assign FENC_STRIDE 16
25 %assign FDEC_STRIDE 32
26 25
27 %macro SBUTTERFLY 4 26 %macro SBUTTERFLY 4
28 mova m%4, m%2 27 mova m%4, m%2
29 punpckl%1 m%2, m%3 28 punpckl%1 m%2, m%3
30 punpckh%1 m%4, m%3 29 punpckh%1 m%4, m%3
242 psubw %2, %1 241 psubw %2, %1
243 psubw %4, %3 242 psubw %4, %3
244 %endif 243 %endif
245 %endmacro 244 %endmacro
246 245
247 %macro HADAMARD4_V 4+
248 SUMSUB_BADC %1, %2, %3, %4
249 SUMSUB_BADC %1, %3, %2, %4
250 %endmacro
251
252 %macro HADAMARD8_V 8+
253 SUMSUB_BADC %1, %2, %3, %4
254 SUMSUB_BADC %5, %6, %7, %8
255 SUMSUB_BADC %1, %3, %2, %4
256 SUMSUB_BADC %5, %7, %6, %8
257 SUMSUB_BADC %1, %5, %2, %6
258 SUMSUB_BADC %3, %7, %4, %8
259 %endmacro
260
261 %macro TRANS_SSE2 5-6
262 ; TRANSPOSE2x2
263 ; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
264 ; %2: ord/unord (for compat with sse4, unused)
265 ; %3/%4: source regs
266 ; %5/%6: tmp regs
267 %ifidn %1, d
268 %define mask [mask_10]
269 %define shift 16
270 %elifidn %1, q
271 %define mask [mask_1100]
272 %define shift 32
273 %endif
274 %if %0==6 ; less dependency if we have two tmp
275 mova m%5, mask ; ff00
276 mova m%6, m%4 ; x5x4
277 psll%1 m%4, shift ; x4..
278 pand m%6, m%5 ; x5..
279 pandn m%5, m%3 ; ..x0
280 psrl%1 m%3, shift ; ..x1
281 por m%4, m%5 ; x4x0
282 por m%3, m%6 ; x5x1
283 %else ; more dependency, one insn less. sometimes faster, sometimes not
284 mova m%5, m%4 ; x5x4
285 psll%1 m%4, shift ; x4..
286 pxor m%4, m%3 ; (x4^x1)x0
287 pand m%4, mask ; (x4^x1)..
288 pxor m%3, m%4 ; x4x0
289 psrl%1 m%4, shift ; ..(x1^x4)
290 pxor m%5, m%4 ; x5x1
291 SWAP %4, %3, %5
292 %endif
293 %endmacro
294
295 %macro TRANS_SSE4 5-6 ; see above
296 %ifidn %1, d
297 mova m%5, m%3
298 %ifidn %2, ord
299 psrl%1 m%3, 16
300 %endif
301 pblendw m%3, m%4, 10101010b
302 psll%1 m%4, 16
303 %ifidn %2, ord
304 pblendw m%4, m%5, 01010101b
305 %else
306 psrl%1 m%5, 16
307 por m%4, m%5
308 %endif
309 %elifidn %1, q
310 mova m%5, m%3
311 shufps m%3, m%4, 10001000b
312 shufps m%5, m%4, 11011101b
313 SWAP %4, %5
314 %endif
315 %endmacro
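; Usage sketch (hypothetical, not part of this file): the HADAMARD macro below
; reaches these transposes through a generic TRANS alias, so a caller picks the
; variant matching the target CPU and passes register numbers plus one or two
; temporaries. mask_10/mask_1100 are assumed to be provided by the including file.
%define TRANS TRANS_SSE2        ; an SSE4 build would alias TRANS_SSE4 instead
TRANS d, ord, 0, 1, 2, 3        ; transpose 2x2 word blocks between m0 and m1, m2/m3 as temps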
316
317 %macro HADAMARD 5-6
318 ; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
319 ; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
320 ; %3/%4: regs
321 ; %5(%6): tmpregs
322 %if %1!=0 ; have to reorder stuff for horizontal op
323 %ifidn %2, sumsub
324 %define ORDER ord
325 ; sumsub needs order because a-b != b-a unless a=b
326 %else
327 %define ORDER unord
328 ; if we just max, order doesn't matter (allows pblendw+or in sse4)
329 %endif
330 %if %1==1
331 TRANS d, ORDER, %3, %4, %5, %6
332 %elif %1==2
333 %if mmsize==8
334 SBUTTERFLY dq, %3, %4, %5
335 %else
336 TRANS q, ORDER, %3, %4, %5, %6
337 %endif
338 %elif %1==4
339 SBUTTERFLY qdq, %3, %4, %5
340 %endif
341 %endif
342 %ifidn %2, sumsub
343 SUMSUB_BA m%3, m%4, m%5
344 %else
345 %ifidn %2, amax
346 %if %0==6
347 ABS2 m%3, m%4, m%5, m%6
348 %else
349 ABS1 m%3, m%5
350 ABS1 m%4, m%5
351 %endif
352 %endif
353 pmaxsw m%3, m%4
354 %endif
355 %endmacro
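; Usage sketch (hypothetical): a vertical butterfly followed by a horizontal pass
; on one register pair; ABS2 is assumed to be aliased elsewhere (it is not defined
; in this excerpt), and m2/m3 are free temporaries.
HADAMARD 0, sumsub, 0, 1, 2     ; vertical pass: sum and difference of m0/m1, m2 as temp
HADAMARD 4, amax,   0, 1, 2, 3  ; horizontal pass at qword distance, max of absolute values into m0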
356
357
358 %macro HADAMARD2_2D 6-7 sumsub
359 HADAMARD 0, sumsub, %1, %2, %5
360 HADAMARD 0, sumsub, %3, %4, %5
361 SBUTTERFLY %6, %1, %2, %5
362 %ifnum %7
363 HADAMARD 0, amax, %1, %2, %5, %7
364 %else
365 HADAMARD 0, %7, %1, %2, %5
366 %endif
367 SBUTTERFLY %6, %3, %4, %5
368 %ifnum %7
369 HADAMARD 0, amax, %3, %4, %5, %7
370 %else
371 HADAMARD 0, %7, %3, %4, %5
372 %endif
373 %endmacro
374
375 %macro HADAMARD4_2D 5-6 sumsub
376 HADAMARD2_2D %1, %2, %3, %4, %5, wd
377 HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
378 SWAP %2, %3
379 %endmacro
380
381 %macro HADAMARD4_2D_SSE 5-6 sumsub
382 HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
383 HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
384 SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0
385 SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2
386 HADAMARD2_2D %1, %3, %2, %4, %5, dq
387 SBUTTERFLY qdq, %1, %2, %5
388 HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1
389 SBUTTERFLY qdq, %3, %4, %5
390 HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3
391 %endmacro
392
393 %macro HADAMARD8_2D 9-10 sumsub
394 HADAMARD2_2D %1, %2, %3, %4, %9, wd
395 HADAMARD2_2D %5, %6, %7, %8, %9, wd
396 HADAMARD2_2D %1, %3, %2, %4, %9, dq
397 HADAMARD2_2D %5, %7, %6, %8, %9, dq
398 HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
399 HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
400 %ifnidn %10, amax
401 SWAP %2, %5
402 SWAP %4, %7
403 %endif
404 %endmacro
405
406 %macro SUMSUB2_AB 3 246 %macro SUMSUB2_AB 3
407 mova %3, %1 247 mova %3, %1
408 paddw %1, %1 248 paddw %1, %1
409 paddw %1, %2 249 paddw %1, %2
410 psubw %3, %2 250 psubw %3, %2
481 punpcklbw %2, %3 321 punpcklbw %2, %3
482 psubw %1, %2 322 psubw %1, %2
483 %endif 323 %endif
484 %endmacro 324 %endmacro
485 325
486 %macro LOAD_DIFF8x4_SSE2 8
487 LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
488 LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
489 LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
490 LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
491 %endmacro
492
493 %macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
494 movh m%2, [%8+%1*FDEC_STRIDE]
495 movh m%1, [%7+%1*FENC_STRIDE]
496 punpcklbw m%1, m%2
497 movh m%3, [%8+%2*FDEC_STRIDE]
498 movh m%2, [%7+%2*FENC_STRIDE]
499 punpcklbw m%2, m%3
500 movh m%4, [%8+%3*FDEC_STRIDE]
501 movh m%3, [%7+%3*FENC_STRIDE]
502 punpcklbw m%3, m%4
503 movh m%5, [%8+%4*FDEC_STRIDE]
504 movh m%4, [%7+%4*FENC_STRIDE]
505 punpcklbw m%4, m%5
506 pmaddubsw m%1, m%6
507 pmaddubsw m%2, m%6
508 pmaddubsw m%3, m%6
509 pmaddubsw m%4, m%6
510 %endmacro
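; Call sketch (hypothetical): m%6 must hold an interleaved {+1,-1} byte pattern so
; pmaddubsw turns each enc/dec byte pair into an enc-minus-dec word; the constant
; name below is illustrative only and is not defined in this file.
mova m7, [hsub_mul_pattern]                  ; assumed constant: times 8 db 1, -1
LOAD_DIFF8x4_SSSE3 0, 1, 2, 3, 4, 7, r0, r2  ; rows 0-3 into m0-m3, m4 as scratch, r0/r2 = enc/dec base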
511
512 %macro STORE_DCT 6 326 %macro STORE_DCT 6
513 movq [%5+%6+ 0], m%1 327 movq [%5+%6+ 0], m%1
514 movq [%5+%6+ 8], m%2 328 movq [%5+%6+ 8], m%2
515 movq [%5+%6+16], m%3 329 movq [%5+%6+16], m%3
516 movq [%5+%6+24], m%4 330 movq [%5+%6+24], m%4
517 movhps [%5+%6+32], m%1 331 movhps [%5+%6+32], m%1
518 movhps [%5+%6+40], m%2 332 movhps [%5+%6+40], m%2
519 movhps [%5+%6+48], m%3 333 movhps [%5+%6+48], m%3
520 movhps [%5+%6+56], m%4 334 movhps [%5+%6+56], m%4
521 %endmacro
522
523 %macro STORE_IDCT 4
524 movhps [r0-4*FDEC_STRIDE], %1
525 movh [r0-3*FDEC_STRIDE], %1
526 movhps [r0-2*FDEC_STRIDE], %2
527 movh [r0-1*FDEC_STRIDE], %2
528 movhps [r0+0*FDEC_STRIDE], %3
529 movh [r0+1*FDEC_STRIDE], %3
530 movhps [r0+2*FDEC_STRIDE], %4
531 movh [r0+3*FDEC_STRIDE], %4
532 %endmacro 335 %endmacro
533 336
534 %macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? 337 %macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
535 LOAD_DIFF m%1, m%5, m%7, [%8], [%9] 338 LOAD_DIFF m%1, m%5, m%7, [%8], [%9]
536 LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] 339 LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3]