comparison ppc/h264_altivec.c @ 5530:cd266411b11a libavcodec

use shorter types vec_"type" instead of the too long vector "type"
part 1 of h264 luma interpolation 8x8 for altivec contributed by
Mauricio Alvarez % lokifo A gmail P com %
Original thread:
Date: Jun 26, 2007 8:15 PM
Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec
author gpoirier
date Sun, 12 Aug 2007 13:50:06 +0000
parents b59514a8d239
children f644e7c90380
comparison: 5529:af68496af656 vs 5530:cd266411b11a
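
The patch hinges on the short vec_*_t type aliases and the LOAD_ZERO/zero_u8v helpers that the new column uses; they live in the AltiVec support header of libavcodec/ppc, not in this changeset. A minimal sketch of what those definitions are expected to look like (the exact header name and spelling are an assumption here):

/* Sketch of the helpers assumed by the new code below (defined in the
 * ppc AltiVec support header, not in this changeset): */
#define vec_u8_t  vector unsigned char
#define vec_s8_t  vector signed char
#define vec_u16_t vector unsigned short
#define vec_s16_t vector signed short
#define vec_u32_t vector unsigned int
#define vec_s32_t vector signed int

/* one shared zero register, reused through casts */
#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8(0)
#define zero_u8v  (vec_u8_t)  zerov
#define zero_s16v (vec_s16_t) zerov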
184 {((8 - x) * (8 - y)), 184 {((8 - x) * (8 - y)),
185 ((x) * (8 - y)), 185 ((x) * (8 - y)),
186 ((8 - x) * (y)), 186 ((8 - x) * (y)),
187 ((x) * (y))}; 187 ((x) * (y))};
188 register int i; 188 register int i;
189 vector unsigned char fperm; 189 vec_u8_t fperm;
190 const vector signed int vABCD = vec_ld(0, ABCD); 190 const vec_s32_t vABCD = vec_ld(0, ABCD);
191 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); 191 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
192 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); 192 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
193 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); 193 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
194 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); 194 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
195 const vector signed int vzero = vec_splat_s32(0); 195 LOAD_ZERO;
196 const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); 196 const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
197 const vector unsigned short v6us = vec_splat_u16(6); 197 const vec_u16_t v6us = vec_splat_u16(6);
198 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 198 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
199 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 199 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
200 200
201 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; 201 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
202 vector unsigned char vsrc0uc, vsrc1uc; 202 vec_u8_t vsrc0uc, vsrc1uc;
203 vector signed short vsrc0ssH, vsrc1ssH; 203 vec_s16_t vsrc0ssH, vsrc1ssH;
204 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; 204 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
205 vector signed short vsrc2ssH, vsrc3ssH, psum; 205 vec_s16_t vsrc2ssH, vsrc3ssH, psum;
206 vector unsigned char vdst, ppsum, fsum; 206 vec_u8_t vdst, ppsum, fsum;
207 207
208 if (((unsigned long)dst) % 16 == 0) { 208 if (((unsigned long)dst) % 16 == 0) {
209 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 209 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
210 0x14, 0x15, 0x16, 0x17, 210 0x14, 0x15, 0x16, 0x17,
211 0x08, 0x09, 0x0A, 0x0B, 211 0x08, 0x09, 0x0A, 0x0B,
212 0x0C, 0x0D, 0x0E, 0x0F); 212 0x0C, 0x0D, 0x0E, 0x0F);
213 } else { 213 } else {
214 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 214 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
215 0x04, 0x05, 0x06, 0x07, 215 0x04, 0x05, 0x06, 0x07,
216 0x18, 0x19, 0x1A, 0x1B, 216 0x18, 0x19, 0x1A, 0x1B,
217 0x1C, 0x1D, 0x1E, 0x1F); 217 0x1C, 0x1D, 0x1E, 0x1F);
218 } 218 }
219 219
220 vsrcAuc = vec_ld(0, src); 220 vsrcAuc = vec_ld(0, src);
221 221
222 if (loadSecond) 222 if (loadSecond)
228 if (reallyBadAlign) 228 if (reallyBadAlign)
229 vsrc1uc = vsrcBuc; 229 vsrc1uc = vsrcBuc;
230 else 230 else
231 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 231 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
232 232
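
loadSecond and reallyBadAlign handle a source pointer that straddles two aligned 16-byte blocks; the permute vectors vsrcperm0/vsrcperm1 are built in lines elided from this hunk, presumably with vec_lvsl. The reallyBadAlign special case (src % 16 == 15) lets vsrc1uc reuse the second aligned block directly, since src+1 is then 16-byte aligned. For reference, the classic AltiVec misaligned-load idiom being used looks roughly like this (a sketch, not code from the patch; assumes <altivec.h> and the vec_u8_t alias above):

/* Sketch of the standard AltiVec misaligned-load idiom: */
static inline vec_u8_t load_unaligned16(const uint8_t *p)
{
    vec_u8_t lo   = vec_ld(0,  p);   /* aligned block containing p         */
    vec_u8_t hi   = vec_ld(15, p);   /* next aligned block (if p%16 != 0)  */
    vec_u8_t perm = vec_lvsl(0, p);  /* rotate amount derived from p % 16  */
    return vec_perm(lo, hi, perm);   /* shift the wanted 16 bytes in place */
}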
233 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 233 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
234 (vector unsigned char)vsrc0uc); 234 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);
235 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
236 (vector unsigned char)vsrc1uc);
237 235
238 if (!loadSecond) {// -> !reallyBadAlign 236 if (!loadSecond) {// -> !reallyBadAlign
239 for (i = 0 ; i < h ; i++) { 237 for (i = 0 ; i < h ; i++) {
240 238
241 239
242 vsrcCuc = vec_ld(stride + 0, src); 240 vsrcCuc = vec_ld(stride + 0, src);
243 241
244 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 242 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
245 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 243 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
246 244
247 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 245 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
248 (vector unsigned char)vsrc2uc); 246 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
249 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
250 (vector unsigned char)vsrc3uc);
251 247
252 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 248 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
253 psum = vec_mladd(vB, vsrc1ssH, psum); 249 psum = vec_mladd(vB, vsrc1ssH, psum);
254 psum = vec_mladd(vC, vsrc2ssH, psum); 250 psum = vec_mladd(vC, vsrc2ssH, psum);
255 psum = vec_mladd(vD, vsrc3ssH, psum); 251 psum = vec_mladd(vD, vsrc3ssH, psum);
256 psum = vec_add(v28ss, psum); 252 psum = vec_add(v28ss, psum);
257 psum = vec_sra(psum, v6us); 253 psum = vec_sra(psum, v6us);
258 254
259 vdst = vec_ld(0, dst); 255 vdst = vec_ld(0, dst);
260 ppsum = (vector unsigned char)vec_packsu(psum, psum); 256 ppsum = (vec_u8_t)vec_packsu(psum, psum);
261 fsum = vec_perm(vdst, ppsum, fperm); 257 fsum = vec_perm(vdst, ppsum, fperm);
262 258
263 vec_st(fsum, 0, dst); 259 vec_st(fsum, 0, dst);
264 260
265 vsrc0ssH = vsrc2ssH; 261 vsrc0ssH = vsrc2ssH;
267 263
268 dst += stride; 264 dst += stride;
269 src += stride; 265 src += stride;
270 } 266 }
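
Each iteration of the loop above evaluates the bilinear chroma interpolation with the ABCD weights declared at the top of the function ((8-x)(8-y), x(8-y), (8-x)y, xy) for 8 pixels at once; vec_packsu supplies the unsigned clamp and fperm merges the 8 result bytes into the correct half of the destination vector chosen by the dst alignment test. A scalar sketch of one output pixel, illustrative only and not code from the patch (note that v28ss evaluates to (1 << 5) - 4 = 28; assumes <stdint.h>):

static uint8_t chroma_bilin_pixel(const uint8_t *src, int stride, int x, int y)
{
    int A = (8 - x) * (8 - y), B = x * (8 - y);
    int C = (8 - x) * y,       D = x * y;
    int sum = A * src[0] + B * src[1]
            + C * src[stride] + D * src[stride + 1] + 28;  /* v28ss          */
    sum >>= 6;                                             /* vec_sra by 6   */
    return sum < 0 ? 0 : sum > 255 ? 255 : sum;            /* vec_packsu clamp */
}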
271 } else { 267 } else {
272 vector unsigned char vsrcDuc; 268 vec_u8_t vsrcDuc;
273 for (i = 0 ; i < h ; i++) { 269 for (i = 0 ; i < h ; i++) {
274 vsrcCuc = vec_ld(stride + 0, src); 270 vsrcCuc = vec_ld(stride + 0, src);
275 vsrcDuc = vec_ld(stride + 16, src); 271 vsrcDuc = vec_ld(stride + 16, src);
276 272
277 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 273 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
278 if (reallyBadAlign) 274 if (reallyBadAlign)
279 vsrc3uc = vsrcDuc; 275 vsrc3uc = vsrcDuc;
280 else 276 else
281 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 277 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
282 278
283 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 279 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
284 (vector unsigned char)vsrc2uc); 280 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
285 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
286 (vector unsigned char)vsrc3uc);
287 281
288 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 282 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
289 psum = vec_mladd(vB, vsrc1ssH, psum); 283 psum = vec_mladd(vB, vsrc1ssH, psum);
290 psum = vec_mladd(vC, vsrc2ssH, psum); 284 psum = vec_mladd(vC, vsrc2ssH, psum);
291 psum = vec_mladd(vD, vsrc3ssH, psum); 285 psum = vec_mladd(vD, vsrc3ssH, psum);
292 psum = vec_add(v28ss, psum); 286 psum = vec_add(v28ss, psum);
293 psum = vec_sr(psum, v6us); 287 psum = vec_sr(psum, v6us);
294 288
295 vdst = vec_ld(0, dst); 289 vdst = vec_ld(0, dst);
296 ppsum = (vector unsigned char)vec_pack(psum, psum); 290 ppsum = (vec_u8_t)vec_pack(psum, psum);
297 fsum = vec_perm(vdst, ppsum, fperm); 291 fsum = vec_perm(vdst, ppsum, fperm);
298 292
299 vec_st(fsum, 0, dst); 293 vec_st(fsum, 0, dst);
300 294
301 vsrc0ssH = vsrc2ssH; 295 vsrc0ssH = vsrc2ssH;
310 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, 304 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
311 const uint8_t * src2, int dst_stride, 305 const uint8_t * src2, int dst_stride,
312 int src_stride1, int h) 306 int src_stride1, int h)
313 { 307 {
314 int i; 308 int i;
315 vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; 309 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
316 310
317 mask_ = vec_lvsl(0, src2); 311 mask_ = vec_lvsl(0, src2);
318 312
319 for (i = 0; i < h; i++) { 313 for (i = 0; i < h; i++) {
320 314
352 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, 346 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
353 const uint8_t * src2, int dst_stride, 347 const uint8_t * src2, int dst_stride,
354 int src_stride1, int h) 348 int src_stride1, int h)
355 { 349 {
356 int i; 350 int i;
357 vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; 351 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
358 352
359 mask_ = vec_lvsl(0, src2); 353 mask_ = vec_lvsl(0, src2);
360 354
361 for (i = 0; i < h; i++) { 355 for (i = 0; i < h; i++) {
362 356
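
The two l2 helpers above combine a pair of 16-pixel rows per iteration; their loop bodies are elided from these hunks, but they presumably rely on vec_avg, which computes the rounded byte average (a + b + 1) >> 1, with the avg variant blending the result into what is already in dst. A scalar sketch of the put variant (illustrative only; src_stride2 is a hypothetical parameter added here because the hunks do not show how src2 advances):

static void put_pixels16_l2_ref(uint8_t *dst, const uint8_t *src1,
                                const uint8_t *src2, int dst_stride,
                                int src_stride1, int src_stride2, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            dst[j] = (src1[j] + src2[j] + 1) >> 1;  /* what vec_avg does per byte */
        dst  += dst_stride;
        src1 += src_stride1;
        src2 += src_stride2;
    }
}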
565 559
566 const vec_u16_t onev = vec_splat_u16(1); 560 const vec_u16_t onev = vec_splat_u16(1);
567 const vec_u16_t twov = vec_splat_u16(2); 561 const vec_u16_t twov = vec_splat_u16(2);
568 const vec_u16_t sixv = vec_splat_u16(6); 562 const vec_u16_t sixv = vec_splat_u16(6);
569 563
570 const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0, 564 const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
571 -1,-1,-1,-1,-1,-1,-1,-1);
572 LOAD_ZERO; 565 LOAD_ZERO;
573 566
574 dct[0] += 32; // rounding for the >>6 at the end 567 dct[0] += 32; // rounding for the >>6 at the end
575 568
576 s0 = vec_ld(0x00, (int16_t*)dct); 569 s0 = vec_ld(0x00, (int16_t*)dct);
599 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); 592 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
600 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); 593 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
601 } 594 }
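
Only the constants, the sel mask and the final ALTIVEC_STORE_SUM_CLIP calls of the 8x8 IDCT-and-add are visible in this hunk. The dct[0] += 32 line pre-biases the DC coefficient so the >>6 at the end rounds correctly; per row, the store macro (defined outside this hunk) adds the transformed residual to the existing destination pixels and clamps. An illustrative scalar counterpart of that last step, not code from the patch (assumes <stdint.h>; idct_row is a hypothetical name for one row of the transformed block):

static void add_idct_row_clip(uint8_t *dst, const int16_t *idct_row)
{
    int x;
    for (x = 0; x < 8; x++) {
        int v = dst[x] + (idct_row[x] >> 6);     /* rounding came from dct[0] += 32 */
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* clamp to 0..255                 */
    }
}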
602 595
603 #define transpose4x16(r0, r1, r2, r3) { \ 596 #define transpose4x16(r0, r1, r2, r3) { \
604 register vector unsigned char r4; \ 597 register vec_u8_t r4; \
605 register vector unsigned char r5; \ 598 register vec_u8_t r5; \
606 register vector unsigned char r6; \ 599 register vec_u8_t r6; \
607 register vector unsigned char r7; \ 600 register vec_u8_t r7; \
608 \ 601 \
609 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ 602 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
610 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ 603 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
611 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ 604 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
612 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ 605 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
616 r2 = vec_mergeh(r5, r7); /*all set 2*/ \ 609 r2 = vec_mergeh(r5, r7); /*all set 2*/ \
617 r3 = vec_mergel(r5, r7); /*all set 3*/ \ 610 r3 = vec_mergel(r5, r7); /*all set 3*/ \
618 } 611 }
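
The two rounds of vec_mergeh/vec_mergel above perform a 4-way interleave of the rows. Shrunk to 4 elements per register, the data movement looks like this (worked illustration only; the real vectors hold 16 bytes, so the same pattern repeats four times across each register):

/*
 *   in : r0 = a0 a1 a2 a3     round 1: r4 = mergeh(r0,r2) = a0 c0 a1 c1
 *        r1 = b0 b1 b2 b3              r5 = mergel(r0,r2) = a2 c2 a3 c3
 *        r2 = c0 c1 c2 c3              r6 = mergeh(r1,r3) = b0 d0 b1 d1
 *        r3 = d0 d1 d2 d3              r7 = mergel(r1,r3) = b2 d2 b3 d3
 *
 *   round 2: r0 = mergeh(r4,r6) = a0 b0 c0 d0   (column 0)
 *            r1 = mergel(r4,r6) = a1 b1 c1 d1   (column 1)
 *            r2 = mergeh(r5,r7) = a2 b2 c2 d2   (column 2)
 *            r3 = mergel(r5,r7) = a3 b3 c3 d3   (column 3)
 */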
619 612
620 static inline void write16x4(uint8_t *dst, int dst_stride, 613 static inline void write16x4(uint8_t *dst, int dst_stride,
621 register vector unsigned char r0, register vector unsigned char r1, 614 register vec_u8_t r0, register vec_u8_t r1,
622 register vector unsigned char r2, register vector unsigned char r3) { 615 register vec_u8_t r2, register vec_u8_t r3) {
623 DECLARE_ALIGNED_16(unsigned char, result[64]); 616 DECLARE_ALIGNED_16(unsigned char, result[64]);
624 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; 617 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
625 int int_dst_stride = dst_stride/4; 618 int int_dst_stride = dst_stride/4;
626 619
627 vec_st(r0, 0, result); 620 vec_st(r0, 0, result);
649 642
650 /** \brief performs a 6x16 transpose of data in src, and stores it to dst 643 /** \brief performs a 6x16 transpose of data in src, and stores it to dst
651 \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing 644 \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
652 out of unaligned_load() */ 645 out of unaligned_load() */
653 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ 646 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
654 register vector unsigned char r0 = unaligned_load(0, src);\ 647 register vec_u8_t r0 = unaligned_load(0, src); \
655 register vector unsigned char r1 = unaligned_load( src_stride, src);\ 648 register vec_u8_t r1 = unaligned_load( src_stride, src); \
656 register vector unsigned char r2 = unaligned_load(2* src_stride, src);\ 649 register vec_u8_t r2 = unaligned_load(2* src_stride, src); \
657 register vector unsigned char r3 = unaligned_load(3* src_stride, src);\ 650 register vec_u8_t r3 = unaligned_load(3* src_stride, src); \
658 register vector unsigned char r4 = unaligned_load(4* src_stride, src);\ 651 register vec_u8_t r4 = unaligned_load(4* src_stride, src); \
659 register vector unsigned char r5 = unaligned_load(5* src_stride, src);\ 652 register vec_u8_t r5 = unaligned_load(5* src_stride, src); \
660 register vector unsigned char r6 = unaligned_load(6* src_stride, src);\ 653 register vec_u8_t r6 = unaligned_load(6* src_stride, src); \
661 register vector unsigned char r7 = unaligned_load(7* src_stride, src);\ 654 register vec_u8_t r7 = unaligned_load(7* src_stride, src); \
662 register vector unsigned char r14 = unaligned_load(14*src_stride, src);\ 655 register vec_u8_t r14 = unaligned_load(14*src_stride, src); \
663 register vector unsigned char r15 = unaligned_load(15*src_stride, src);\ 656 register vec_u8_t r15 = unaligned_load(15*src_stride, src); \
664 \ 657 \
665 r8 = unaligned_load( 8*src_stride, src); \ 658 r8 = unaligned_load( 8*src_stride, src); \
666 r9 = unaligned_load( 9*src_stride, src); \ 659 r9 = unaligned_load( 9*src_stride, src); \
667 r10 = unaligned_load(10*src_stride, src); \ 660 r10 = unaligned_load(10*src_stride, src); \
668 r11 = unaligned_load(11*src_stride, src); \ 661 r11 = unaligned_load(11*src_stride, src); \
708 /* Don't need to compute 14 and 15*/ \ 701 /* Don't need to compute 14 and 15*/ \
709 \ 702 \
710 } 703 }
711 704
712 // out: o = |x-y| < a 705 // out: o = |x-y| < a
713 static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x, 706 static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
714 register vector unsigned char y, 707 register vec_u8_t y,
715 register vector unsigned char a) { 708 register vec_u8_t a) {
716 709
717 register vector unsigned char diff = vec_subs(x, y); 710 register vec_u8_t diff = vec_subs(x, y);
718 register vector unsigned char diffneg = vec_subs(y, x); 711 register vec_u8_t diffneg = vec_subs(y, x);
719 register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */ 712 register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
720 o = (vector unsigned char)vec_cmplt(o, a); 713 o = (vec_u8_t)vec_cmplt(o, a);
721 return o; 714 return o;
722 } 715 }
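
vec_subs saturates at zero for unsigned bytes, so subs(x,y) is x-y where x > y and 0 elsewhere; OR-ing both orders therefore yields |x-y| per byte without a signed compare. A scalar sketch of what one byte of the returned mask computes (illustrative only; assumes <stdint.h>):

static inline uint8_t diff_lt_byte(uint8_t x, uint8_t y, uint8_t a)
{
    uint8_t d = x > y ? x - y : y - x;   /* |x - y|, what subs+subs+or builds */
    return d < a ? 0xFF : 0x00;          /* per-byte mask, like vec_cmplt     */
}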
723 716
724 static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0, 717 static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
725 register vector unsigned char p1, 718 register vec_u8_t p1,
726 register vector unsigned char q0, 719 register vec_u8_t q0,
727 register vector unsigned char q1, 720 register vec_u8_t q1,
728 register vector unsigned char alpha, 721 register vec_u8_t alpha,
729 register vector unsigned char beta) { 722 register vec_u8_t beta) {
730 723
731 register vector unsigned char mask; 724 register vec_u8_t mask;
732 register vector unsigned char tempmask; 725 register vec_u8_t tempmask;
733 726
734 mask = diff_lt_altivec(p0, q0, alpha); 727 mask = diff_lt_altivec(p0, q0, alpha);
735 tempmask = diff_lt_altivec(p1, p0, beta); 728 tempmask = diff_lt_altivec(p1, p0, beta);
736 mask = vec_and(mask, tempmask); 729 mask = vec_and(mask, tempmask);
737 tempmask = diff_lt_altivec(q1, q0, beta); 730 tempmask = diff_lt_altivec(q1, q0, beta);
739 732
740 return mask; 733 return mask;
741 } 734 }
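
Per pixel this is the standard H.264 filter-on/off decision. A scalar sketch of the same condition (reference only; pixel values taken as plain ints here):

static inline int deblock_mask_ref(int p0, int p1, int q0, int q1,
                                   int alpha, int beta)
{
    int d0 = p0 > q0 ? p0 - q0 : q0 - p0;   /* |p0 - q0| */
    int d1 = p1 > p0 ? p1 - p0 : p0 - p1;   /* |p1 - p0| */
    int d2 = q1 > q0 ? q1 - q0 : q0 - q1;   /* |q1 - q0| */
    return d0 < alpha && d1 < beta && d2 < beta;
}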
742 735
743 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) 736 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
744 static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, 737 static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
745 register vector unsigned char p1, 738 register vec_u8_t p1,
746 register vector unsigned char p2, 739 register vec_u8_t p2,
747 register vector unsigned char q0, 740 register vec_u8_t q0,
748 register vector unsigned char tc0) { 741 register vec_u8_t tc0) {
749 742
750 register vector unsigned char average = vec_avg(p0, q0); 743 register vec_u8_t average = vec_avg(p0, q0);
751 register vector unsigned char temp; 744 register vec_u8_t temp;
752 register vector unsigned char uncliped; 745 register vec_u8_t uncliped;
753 register vector unsigned char ones; 746 register vec_u8_t ones;
754 register vector unsigned char max; 747 register vec_u8_t max;
755 register vector unsigned char min; 748 register vec_u8_t min;
756 register vector unsigned char newp1; 749 register vec_u8_t newp1;
757 750
758 temp = vec_xor(average, p2); 751 temp = vec_xor(average, p2);
759 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ 752 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
760 ones = vec_splat_u8(1); 753 ones = vec_splat_u8(1);
761 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ 754 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */
767 return newp1; 760 return newp1;
768 } 761 }
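
vec_avg rounds upward, so the code above first forms avg(p0,q0) = (p0+q0+1)>>1, then avg(p2, ·), and subtracts the parity term ((p2 ^ avg(p0,q0)) & 1) to turn the second, upward-rounded average back into the floor division used by the formula in the comment; the clip to [p1-tc0, p1+tc0] is then done with saturating add/sub of tc0 followed by min/max in the elided lines. A scalar sketch of the target formula (illustrative only):

static inline int deblock_q1_ref(int p0, int p1, int p2, int q0, int tc0)
{
    int v  = (p2 + ((p0 + q0 + 1) >> 1)) >> 1;   /* formula from the comment */
    int lo = p1 - tc0, hi = p1 + tc0;
    return v < lo ? lo : v > hi ? hi : v;        /* clip to [p1-tc0, p1+tc0] */
}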
769 762
770 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ 763 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
771 \ 764 \
772 const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ 765 const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
773 \ 766 \
774 register vector unsigned char pq0bit = vec_xor(p0,q0); \ 767 register vec_u8_t pq0bit = vec_xor(p0,q0); \
775 register vector unsigned char q1minus; \ 768 register vec_u8_t q1minus; \
776 register vector unsigned char p0minus; \ 769 register vec_u8_t p0minus; \
777 register vector unsigned char stage1; \ 770 register vec_u8_t stage1; \
778 register vector unsigned char stage2; \ 771 register vec_u8_t stage2; \
779 register vector unsigned char vec160; \ 772 register vec_u8_t vec160; \
780 register vector unsigned char delta; \ 773 register vec_u8_t delta; \
781 register vector unsigned char deltaneg; \ 774 register vec_u8_t deltaneg; \
782 \ 775 \
783 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ 776 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
784 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ 777 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
785 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ 778 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
786 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ 779 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \
799 q0 = vec_adds(q0, deltaneg); \ 792 q0 = vec_adds(q0, deltaneg); \
800 } 793 }
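
The macro emulates, in unsigned byte arithmetic with the 160 bias (A0v = 10 << 4 = 0xA0), the signed delta update of the H.264 deblocking filter. In scalar form the update it corresponds to is (reference sketch only, operating on plain ints):

static inline void deblock_p0_q0_ref(int *p0, int p1, int *q0, int q1, int tc)
{
    int d = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;   /* raw delta         */
    if (d < -tc) d = -tc; else if (d > tc) d = tc;       /* clip to [-tc, tc] */
    *p0 += d;  *q0 -= d;
    if (*p0 < 0) *p0 = 0; else if (*p0 > 255) *p0 = 255; /* like vec_adds /   */
    if (*q0 < 0) *q0 = 0; else if (*q0 > 255) *q0 = 255; /* vec_subs clamping */
}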
801 794
802 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ 795 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
803 DECLARE_ALIGNED_16(unsigned char, temp[16]); \ 796 DECLARE_ALIGNED_16(unsigned char, temp[16]); \
804 register vector unsigned char alphavec; \ 797 register vec_u8_t alphavec; \
805 register vector unsigned char betavec; \ 798 register vec_u8_t betavec; \
806 register vector unsigned char mask; \ 799 register vec_u8_t mask; \
807 register vector unsigned char p1mask; \ 800 register vec_u8_t p1mask; \
808 register vector unsigned char q1mask; \ 801 register vec_u8_t q1mask; \
809 register vector signed char tc0vec; \ 802 register vector signed char tc0vec; \
810 register vector unsigned char finaltc0; \ 803 register vec_u8_t finaltc0; \
811 register vector unsigned char tc0masked; \ 804 register vec_u8_t tc0masked; \
812 register vector unsigned char newp1; \ 805 register vec_u8_t newp1; \
813 register vector unsigned char newq1; \ 806 register vec_u8_t newq1; \
814 \ 807 \
815 temp[0] = alpha; \ 808 temp[0] = alpha; \
816 temp[1] = beta; \ 809 temp[1] = beta; \
817 alphavec = vec_ld(0, temp); \ 810 alphavec = vec_ld(0, temp); \
818 betavec = vec_splat(alphavec, 0x1); \ 811 betavec = vec_splat(alphavec, 0x1); \
822 *((int *)temp) = *((int *)tc0); \ 815 *((int *)temp) = *((int *)tc0); \
823 tc0vec = vec_ld(0, (signed char*)temp); \ 816 tc0vec = vec_ld(0, (signed char*)temp); \
824 tc0vec = vec_mergeh(tc0vec, tc0vec); \ 817 tc0vec = vec_mergeh(tc0vec, tc0vec); \
825 tc0vec = vec_mergeh(tc0vec, tc0vec); \ 818 tc0vec = vec_mergeh(tc0vec, tc0vec); \
826 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ 819 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
827 finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0 */ \ 820 finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \
828 \ 821 \
829 p1mask = diff_lt_altivec(p2, p0, betavec); \ 822 p1mask = diff_lt_altivec(p2, p0, betavec); \
830 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ 823 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \
831 tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec); \ 824 tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \
832 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ 825 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
833 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ 826 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
834 /*end if*/ \ 827 /*end if*/ \
835 \ 828 \
836 q1mask = diff_lt_altivec(q2, q0, betavec); \ 829 q1mask = diff_lt_altivec(q2, q0, betavec); \
837 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ 830 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
838 tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec); \ 831 tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \
839 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ 832 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
840 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ 833 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
841 /*end if*/ \ 834 /*end if*/ \
842 \ 835 \
843 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ 836 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
846 } 839 }
847 840
848 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { 841 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
849 842
850 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { 843 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
851 register vector unsigned char p2 = vec_ld(-3*stride, pix); 844 register vec_u8_t p2 = vec_ld(-3*stride, pix);
852 register vector unsigned char p1 = vec_ld(-2*stride, pix); 845 register vec_u8_t p1 = vec_ld(-2*stride, pix);
853 register vector unsigned char p0 = vec_ld(-1*stride, pix); 846 register vec_u8_t p0 = vec_ld(-1*stride, pix);
854 register vector unsigned char q0 = vec_ld(0, pix); 847 register vec_u8_t q0 = vec_ld(0, pix);
855 register vector unsigned char q1 = vec_ld(stride, pix); 848 register vec_u8_t q1 = vec_ld(stride, pix);
856 register vector unsigned char q2 = vec_ld(2*stride, pix); 849 register vec_u8_t q2 = vec_ld(2*stride, pix);
857 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); 850 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
858 vec_st(p1, -2*stride, pix); 851 vec_st(p1, -2*stride, pix);
859 vec_st(p0, -1*stride, pix); 852 vec_st(p0, -1*stride, pix);
860 vec_st(q0, 0, pix); 853 vec_st(q0, 0, pix);
861 vec_st(q1, stride, pix); 854 vec_st(q1, stride, pix);
862 } 855 }
863 } 856 }
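
The entry check used here (and by the horizontal filter below) relies on two's complement: the AND of the four tc0 bytes keeps its sign bit only when every tc0[i] is negative, so the edge is skipped only when all four values say "no filtering". An equivalent, more explicit form (illustrative only, not from the patch):

/* Equivalent early-out: */
if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0)
    return;   /* nothing to filter on this edge */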
864 857
865 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { 858 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
866 859
867 register vector unsigned char line0, line1, line2, line3, line4, line5; 860 register vec_u8_t line0, line1, line2, line3, line4, line5;
868 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) 861 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
869 return; 862 return;
870 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); 863 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
871 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); 864 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
872 transpose4x16(line1, line2, line3, line4); 865 transpose4x16(line1, line2, line3, line4);