Mercurial > libavcodec.hg
comparison ppc/h264_altivec.c @ 5094:ce57e3f2b2a7 libavcodec
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
author | lu_zero |
---|---|
date | Sun, 03 Jun 2007 13:27:44 +0000 |
parents | 41cabe79ba25 |
children | ad0c45e0008c |
comparison
equal
deleted
inserted
replaced
5093:f0ffa056b77c | 5094:ce57e3f2b2a7 |
---|---|
401 | 401 |
402 | 402 |
403 /**************************************************************************** | 403 /**************************************************************************** |
404 * IDCT transform: | 404 * IDCT transform: |
405 ****************************************************************************/ | 405 ****************************************************************************/ |
406 | |
407 #define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ | |
408 /* One 1-D pass of the 4-point transform butterfly, computed lane-wise: reads vb0..vb3, writes va0..va3. Uses vz0..vz3 from the enclosing scope as scratch. 1st stage: */ \ | |
409 vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \ | |
410 vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ | |
411 vz2 = vec_sra(vb1,vec_splat_u16(1)); /* Y[1] >> 1 (arithmetic shift) */ \ | |
412 vz2 = vec_sub(vz2,vb3); /* temp[2] = (Y[1] >> 1) - Y[3] */ \ | |
413 vz3 = vec_sra(vb3,vec_splat_u16(1)); /* Y[3] >> 1 (arithmetic shift) */ \ | |
414 vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + (Y[3] >> 1) */ \ | |
415 /* 2nd stage: output */ \ | |
416 va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \ | |
417 va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \ | |
418 va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \ | |
419 va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3]; no trailing ';' here, the invoking statement supplies it */ | |
420 | |
421 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) /* Transposes the 4x4 matrix held in the first four elements of a0..a3 (8-element vectors, as used by ff_h264_idct_add_altivec) into the first four elements of b0..b3. a0..a3 are overwritten as scratch; the last four elements of each result are filler copied from a0. */ \ | |
422 b0 = vec_mergeh( a0, a0 ); /* stage 1: interleave the first half of each row with the first half of a0 (a0 only acts as filler) */ \ | |
423 b1 = vec_mergeh( a1, a0 ); \ | |
424 b2 = vec_mergeh( a2, a0 ); \ | |
425 b3 = vec_mergeh( a3, a0 ); \ | |
426 a0 = vec_mergeh( b0, b2 ); /* stage 2: interleave the row pairs (0,2) and (1,3), reusing a0..a3 as temporaries */ \ | |
427 a1 = vec_mergel( b0, b2 ); \ | |
428 a2 = vec_mergeh( b1, b3 ); \ | |
429 a3 = vec_mergel( b1, b3 ); \ | |
430 b0 = vec_mergeh( a0, a2 ); /* stage 3: final interleave; b0..b3 now begin with columns 0..3 of the input */ \ | |
431 b1 = vec_mergel( a0, a2 ); \ | |
432 b2 = vec_mergeh( a1, a3 ); \ | |
433 b3 = vec_mergel( a1, a3 ) /* no trailing ';' here, the invoking statement supplies it */ | |
434 | |
435 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) /* Adds the leading 16-bit elements of va to the pixels at dst and writes four saturated bytes back. Relies on dst, vdst_mask, element, zero_u8v, zero_s16v, vdst_orig, vdst, vdst_ss, va_u8 and va_u32 from the enclosing scope; va itself is modified. */ \ | |
436 vdst_orig = vec_ld(0, dst); /* aligned 16-byte load of the line containing dst (vec_ld ignores the low 4 address bits) */ \ | |
437 vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); /* move the bytes starting at dst to the front of the vector, zero-filling the tail */ \ | |
438 vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); /* zero-extend the first 8 bytes to 16-bit elements */ \ | |
439 va = vec_add(va, vdst_ss); /* add the existing pixel values to va */ \ | |
440 va_u8 = vec_packsu(va, zero_s16v); /* pack to bytes with unsigned saturation (clamps to 0..255) */ \ | |
441 va_u32 = vec_splat((vec_u32_t)va_u8, 0); /* replicate the first four result bytes into every 32-bit element */ \ | |
442 vec_ste(va_u32, element, (uint32_t*)dst); /* store one 32-bit element; NOTE(review): the second argument of vec_ste is a byte offset, so passing element (0..3) presumably relies on dst being 4-byte aligned, in which case the truncated store address is dst itself - confirm */ | |
443 | |
444 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) | |
445 { /* Runs two 1-D transform passes over the 4x4 coefficients in block, shifts the result right by 6 and adds it to four rows of four pixels starting at dst, with rows stride bytes apart. Modifies block[0]. */ | |
446 vec_s16_t va0, va1, va2, va3; | |
447 vec_s16_t vz0, vz1, vz2, vz3; /* scratch assigned inside VEC_1D_DCT */ | |
448 vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3; | |
449 vec_u8_t va_u8; | |
450 vec_u32_t va_u32; | |
451 vec_s16_t vdst_ss; | |
452 const vec_u16_t v6us = vec_splat_u16(6); /* shift count for the final >> 6 */ | |
453 vec_u8_t vdst, vdst_orig; | |
454 vec_u8_t vdst_mask = vec_lvsl(0, dst); /* permute mask derived once from the initial dst; NOTE(review): it is reused after dst += stride, which presumably assumes stride is a multiple of 16 - confirm */ | |
455 int element = ((unsigned long)dst & 0xf) >> 2; /* index of the 32-bit word holding dst within its 16-byte line, also computed once from the initial dst */ | |
456 LOAD_ZERO; /* presumably declares zero_u8v and zero_s16v used by the store macro; defined outside this excerpt - confirm */ | |
457 | |
458 block[0] += 32; /* add 32 to the DC coefficient so the final >> 6 rounds; note that this writes to the caller's block */ | |
459 | |
460 vtmp0 = vec_ld(0,block); /* first 16 bytes of block; assumes block is 16-byte aligned and DCTELEM is 16 bits wide - TODO confirm */ | |
461 vtmp1 = vec_sld(vtmp0, vtmp0, 8); /* rotate by 8 bytes so the second half of vtmp0 comes first */ | |
462 vtmp2 = vec_ld(16,block); /* next 16 bytes of block */ | |
463 vtmp3 = vec_sld(vtmp2, vtmp2, 8); /* rotate by 8 bytes so the second half of vtmp2 comes first */ | |
464 | |
465 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); /* first 1-D pass, lane-wise across the four vectors */ | |
466 VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); /* transpose so the second pass runs along the other dimension */ | |
467 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); /* second 1-D pass */ | |
468 | |
469 va0 = vec_sra(va0,v6us); /* arithmetic >> 6 on each result vector */ | |
470 va1 = vec_sra(va1,v6us); | |
471 va2 = vec_sra(va2,v6us); | |
472 va3 = vec_sra(va3,v6us); | |
473 | |
474 VEC_LOAD_U8_ADD_S16_STORE_U8(va0); /* add va0 to the pixels at dst and store four bytes */ | |
475 dst += stride; /* advance to the next pixel row */ | |
476 VEC_LOAD_U8_ADD_S16_STORE_U8(va1); | |
477 dst += stride; | |
478 VEC_LOAD_U8_ADD_S16_STORE_U8(va2); | |
479 dst += stride; | |
480 VEC_LOAD_U8_ADD_S16_STORE_U8(va3); | |
481 } | |
406 | 482 |
407 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ | 483 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ |
408 /* a0 = SRC(0) + SRC(4); */ \ | 484 /* a0 = SRC(0) + SRC(4); */ \ |
409 vec_s16_t a0v = vec_add(s0, s4); \ | 485 vec_s16_t a0v = vec_add(s0, s4); \ |
410 /* a2 = SRC(0) - SRC(4); */ \ | 486 /* a2 = SRC(0) - SRC(4); */ \ |
529 #ifdef HAVE_ALTIVEC | 605 #ifdef HAVE_ALTIVEC |
530 if (has_altivec()) { | 606 if (has_altivec()) { |
531 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; | 607 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; |
532 c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec; | 608 c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec; |
533 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; | 609 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; |
610 c->h264_idct_add = ff_h264_idct_add_altivec; | |
534 c->h264_idct8_add = ff_h264_idct8_add_altivec; | 611 c->h264_idct8_add = ff_h264_idct8_add_altivec; |
535 | 612 |
536 #define dspfunc(PFX, IDX, NUM) \ | 613 #define dspfunc(PFX, IDX, NUM) \ |
537 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ | 614 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ |
538 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ | 615 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ |