comparison ppc/h264_altivec.c @ 5530:cd266411b11a libavcodec
use shorter vec_"type" types instead of the overly long vector "type"

part 1 of h264 luma interpolation 8x8 for AltiVec, contributed by
Mauricio Alvarez % lokifo A gmail P com %

Original thread:
Date: Jun 26, 2007 8:15 PM
Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec
author | gpoirier
---|---
date | Sun, 12 Aug 2007 13:50:06 +0000
parents | b59514a8d239
children | f644e7c90380
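
The vec_"type" shorthands and the LOAD_ZERO / zero_u8v helpers that appear on the new (right-hand) side of this comparison are not defined in this file; they come from the shared AltiVec header (libavcodec/ppc/types_altivec.h). A sketch of what that header provides, reconstructed from how the names are used below (requires GCC's -maltivec):

/* Shorthand vector types -- same machine types, less typing. */
#define vec_u8_t  vector unsigned char
#define vec_s8_t  vector signed char
#define vec_u16_t vector unsigned short
#define vec_s16_t vector signed short
#define vec_u32_t vector unsigned int
#define vec_s32_t vector signed int

/* One zero vector, declared once per function and reused via casts. */
#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8(0)
#define zero_u8v  (vec_u8_t)zerov
#define zero_s16v (vec_s16_t)zerov
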
5529:af68496af656 | 5530:cd266411b11a
---|---
184 {((8 - x) * (8 - y)), | 184 {((8 - x) * (8 - y)), |
185 ((x) * (8 - y)), | 185 ((x) * (8 - y)), |
186 ((8 - x) * (y)), | 186 ((8 - x) * (y)), |
187 ((x) * (y))}; | 187 ((x) * (y))}; |
188 register int i; | 188 register int i; |
189 vector unsigned char fperm; | 189 vec_u8_t fperm; |
190 const vector signed int vABCD = vec_ld(0, ABCD); | 190 const vec_s32_t vABCD = vec_ld(0, ABCD); |
191 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); | 191 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); |
192 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); | 192 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); |
193 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); | 193 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); |
194 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); | 194 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); |
195 const vector signed int vzero = vec_splat_s32(0); | 195 LOAD_ZERO; |
196 const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); | 196 const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); |
197 const vector unsigned short v6us = vec_splat_u16(6); | 197 const vec_u16_t v6us = vec_splat_u16(6); |
198 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; | 198 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; |
199 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; | 199 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; |
200 | 200 |
201 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; | 201 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; |
202 vector unsigned char vsrc0uc, vsrc1uc; | 202 vec_u8_t vsrc0uc, vsrc1uc; |
203 vector signed short vsrc0ssH, vsrc1ssH; | 203 vec_s16_t vsrc0ssH, vsrc1ssH; |
204 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; | 204 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; |
205 vector signed short vsrc2ssH, vsrc3ssH, psum; | 205 vec_s16_t vsrc2ssH, vsrc3ssH, psum; |
206 vector unsigned char vdst, ppsum, fsum; | 206 vec_u8_t vdst, ppsum, fsum; |
207 | 207 |
208 if (((unsigned long)dst) % 16 == 0) { | 208 if (((unsigned long)dst) % 16 == 0) { |
209 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, | 209 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, |
210 0x14, 0x15, 0x16, 0x17, | 210 0x14, 0x15, 0x16, 0x17, |
211 0x08, 0x09, 0x0A, 0x0B, | 211 0x08, 0x09, 0x0A, 0x0B, |
212 0x0C, 0x0D, 0x0E, 0x0F); | 212 0x0C, 0x0D, 0x0E, 0x0F); |
213 } else { | 213 } else { |
214 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, | 214 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, |
215 0x04, 0x05, 0x06, 0x07, | 215 0x04, 0x05, 0x06, 0x07, |
216 0x18, 0x19, 0x1A, 0x1B, | 216 0x18, 0x19, 0x1A, 0x1B, |
217 0x1C, 0x1D, 0x1E, 0x1F); | 217 0x1C, 0x1D, 0x1E, 0x1F); |
218 } | 218 } |
219 | 219 |
220 vsrcAuc = vec_ld(0, src); | 220 vsrcAuc = vec_ld(0, src); |
221 | 221 |
222 if (loadSecond) | 222 if (loadSecond) |
228 if (reallyBadAlign) | 228 if (reallyBadAlign) |
229 vsrc1uc = vsrcBuc; | 229 vsrc1uc = vsrcBuc; |
230 else | 230 else |
231 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); | 231 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); |
232 | 232 |
233 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | 233 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc); |
234 (vector unsigned char)vsrc0uc); | 234 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc); |
235 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | |
236 (vector unsigned char)vsrc1uc); | |
237 | 235 |
238 if (!loadSecond) {// -> !reallyBadAlign | 236 if (!loadSecond) {// -> !reallyBadAlign |
239 for (i = 0 ; i < h ; i++) { | 237 for (i = 0 ; i < h ; i++) { |
240 | 238 |
241 | 239 |
242 vsrcCuc = vec_ld(stride + 0, src); | 240 vsrcCuc = vec_ld(stride + 0, src); |
243 | 241 |
244 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | 242 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); |
245 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | 243 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); |
246 | 244 |
247 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | 245 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); |
248 (vector unsigned char)vsrc2uc); | 246 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); |
249 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | |
250 (vector unsigned char)vsrc3uc); | |
251 | 247 |
252 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 248 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
253 psum = vec_mladd(vB, vsrc1ssH, psum); | 249 psum = vec_mladd(vB, vsrc1ssH, psum); |
254 psum = vec_mladd(vC, vsrc2ssH, psum); | 250 psum = vec_mladd(vC, vsrc2ssH, psum); |
255 psum = vec_mladd(vD, vsrc3ssH, psum); | 251 psum = vec_mladd(vD, vsrc3ssH, psum); |
256 psum = vec_add(v28ss, psum); | 252 psum = vec_add(v28ss, psum); |
257 psum = vec_sra(psum, v6us); | 253 psum = vec_sra(psum, v6us); |
258 | 254 |
259 vdst = vec_ld(0, dst); | 255 vdst = vec_ld(0, dst); |
260 ppsum = (vector unsigned char)vec_packsu(psum, psum); | 256 ppsum = (vec_u8_t)vec_packsu(psum, psum); |
261 fsum = vec_perm(vdst, ppsum, fperm); | 257 fsum = vec_perm(vdst, ppsum, fperm); |
262 | 258 |
263 vec_st(fsum, 0, dst); | 259 vec_st(fsum, 0, dst); |
264 | 260 |
265 vsrc0ssH = vsrc2ssH; | 261 vsrc0ssH = vsrc2ssH; |
267 | 263 |
268 dst += stride; | 264 dst += stride; |
269 src += stride; | 265 src += stride; |
270 } | 266 } |
271 } else { | 267 } else { |
272 vector unsigned char vsrcDuc; | 268 vec_u8_t vsrcDuc; |
273 for (i = 0 ; i < h ; i++) { | 269 for (i = 0 ; i < h ; i++) { |
274 vsrcCuc = vec_ld(stride + 0, src); | 270 vsrcCuc = vec_ld(stride + 0, src); |
275 vsrcDuc = vec_ld(stride + 16, src); | 271 vsrcDuc = vec_ld(stride + 16, src); |
276 | 272 |
277 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | 273 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
278 if (reallyBadAlign) | 274 if (reallyBadAlign) |
279 vsrc3uc = vsrcDuc; | 275 vsrc3uc = vsrcDuc; |
280 else | 276 else |
281 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); | 277 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); |
282 | 278 |
283 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | 279 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); |
284 (vector unsigned char)vsrc2uc); | 280 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); |
285 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | |
286 (vector unsigned char)vsrc3uc); | |
287 | 281 |
288 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 282 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
289 psum = vec_mladd(vB, vsrc1ssH, psum); | 283 psum = vec_mladd(vB, vsrc1ssH, psum); |
290 psum = vec_mladd(vC, vsrc2ssH, psum); | 284 psum = vec_mladd(vC, vsrc2ssH, psum); |
291 psum = vec_mladd(vD, vsrc3ssH, psum); | 285 psum = vec_mladd(vD, vsrc3ssH, psum); |
292 psum = vec_add(v28ss, psum); | 286 psum = vec_add(v28ss, psum); |
293 psum = vec_sr(psum, v6us); | 287 psum = vec_sr(psum, v6us); |
294 | 288 |
295 vdst = vec_ld(0, dst); | 289 vdst = vec_ld(0, dst); |
296 ppsum = (vector unsigned char)vec_pack(psum, psum); | 290 ppsum = (vec_u8_t)vec_pack(psum, psum); |
297 fsum = vec_perm(vdst, ppsum, fperm); | 291 fsum = vec_perm(vdst, ppsum, fperm); |
298 | 292 |
299 vec_st(fsum, 0, dst); | 293 vec_st(fsum, 0, dst); |
300 | 294 |
301 vsrc0ssH = vsrc2ssH; | 295 vsrc0ssH = vsrc2ssH; |
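
The hunk above is a 2x2 bilinear interpolation kernel, vectorized eight pixels per iteration; the two loop bodies differ only in how the misaligned source vectors are gathered (loadSecond / reallyBadAlign decide whether a second aligned load is needed). A scalar sketch of what one row computes, with the weights and the 28 bias (v28ss = (1 << 5) - 4; the spec's plain rounding would use 32) taken from the code above; the function name is illustrative:

#include <stdint.h>

static void bilin_mc8_row_c(uint8_t *dst, const uint8_t *src,
                            int stride, int x, int y)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y),
              C = (8 - x) * y,       D = x * y;
    int i;
    for (i = 0; i < 8; i++) {
        int sum = A * src[i]          + B * src[i + 1] +
                  C * src[stride + i] + D * src[stride + i + 1];
        /* A+B+C+D == 64, so the result already fits in a byte;
         * the vector code leans on vec_packsu() the same way. */
        dst[i] = (uint8_t)((sum + 28) >> 6);
    }
}
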
310 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, | 304 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, |
311 const uint8_t * src2, int dst_stride, | 305 const uint8_t * src2, int dst_stride, |
312 int src_stride1, int h) | 306 int src_stride1, int h) |
313 { | 307 { |
314 int i; | 308 int i; |
315 vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; | 309 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; |
316 | 310 |
317 mask_ = vec_lvsl(0, src2); | 311 mask_ = vec_lvsl(0, src2); |
318 | 312 |
319 for (i = 0; i < h; i++) { | 313 for (i = 0; i < h; i++) { |
320 | 314 |
352 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, | 346 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, |
353 const uint8_t * src2, int dst_stride, | 347 const uint8_t * src2, int dst_stride, |
354 int src_stride1, int h) | 348 int src_stride1, int h) |
355 { | 349 { |
356 int i; | 350 int i; |
357 vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; | 351 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; |
358 | 352 |
359 mask_ = vec_lvsl(0, src2); | 353 mask_ = vec_lvsl(0, src2); |
360 | 354 |
361 for (i = 0; i < h; i++) { | 355 for (i = 0; i < h; i++) { |
362 | 356 |
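
Both *_pixels16_l2 routines blend two 16-byte-wide sources per row; put stores the rounded average, while the avg variant additionally averages that result with the bytes already in dst. A scalar model of the put case, assuming (as the single-stride signature suggests) that src2 is an aligned scratch buffer advancing 16 bytes per row:

#include <stdint.h>

static void put_pixels16_l2_c(uint8_t *dst, const uint8_t *src1,
                              const uint8_t *src2, int dst_stride,
                              int src_stride1, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            dst[j] = (uint8_t)((src1[j] + src2[j] + 1) >> 1); /* vec_avg(): (a+b+1)>>1 */
        dst  += dst_stride;
        src1 += src_stride1;
        src2 += 16; /* assumption: contiguous scratch rows */
    }
}
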
565 | 559 |
566 const vec_u16_t onev = vec_splat_u16(1); | 560 const vec_u16_t onev = vec_splat_u16(1); |
567 const vec_u16_t twov = vec_splat_u16(2); | 561 const vec_u16_t twov = vec_splat_u16(2); |
568 const vec_u16_t sixv = vec_splat_u16(6); | 562 const vec_u16_t sixv = vec_splat_u16(6); |
569 | 563 |
570 const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0, | 564 const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1); |
571 -1,-1,-1,-1,-1,-1,-1,-1); | |
572 LOAD_ZERO; | 565 LOAD_ZERO; |
573 | 566 |
574 dct[0] += 32; // rounding for the >>6 at the end | 567 dct[0] += 32; // rounding for the >>6 at the end |
575 | 568 |
576 s0 = vec_ld(0x00, (int16_t*)dct); | 569 s0 = vec_ld(0x00, (int16_t*)dct); |
599 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); | 592 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); |
600 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); | 593 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); |
601 } | 594 } |
602 | 595 |
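
Two details of the idct8 epilogue above: dct[0] += 32 folds the final rounding into the transform, because the integer IDCT is linear and passes dct[0] with weight 1 into every one of the 64 pre-shift sums, so biasing the DC term once equals adding 32 to each sum before the >>6; and sel (eight 0x00 bytes, then eight 0xFF) presumably lets ALTIVEC_STORE_SUM_CLIP merge the 8 new pixels into the existing 16-byte destination vector. In scalar terms, each stored row amounts to:

#include <stdint.h>

static void store_sum_clip_row_c(uint8_t *dst, const int16_t *idct_row)
{
    int i, v;
    for (i = 0; i < 8; i++) {
        v = dst[i] + (idct_row[i] >> 6); /* the +32 is already inside dct[0] */
        dst[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
}
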
603 #define transpose4x16(r0, r1, r2, r3) { \ | 596 #define transpose4x16(r0, r1, r2, r3) { \ |
604 register vector unsigned char r4; \ | 597 register vec_u8_t r4; \ |
605 register vector unsigned char r5; \ | 598 register vec_u8_t r5; \ |
606 register vector unsigned char r6; \ | 599 register vec_u8_t r6; \ |
607 register vector unsigned char r7; \ | 600 register vec_u8_t r7; \ |
608 \ | 601 \ |
609 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ | 602 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ |
610 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ | 603 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ |
611 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ | 604 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ |
612 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ | 605 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ |
616 r2 = vec_mergeh(r5, r7); /*all set 2*/ \ | 609 r2 = vec_mergeh(r5, r7); /*all set 2*/ \ |
617 r3 = vec_mergel(r5, r7); /*all set 3*/ \ | 610 r3 = vec_mergel(r5, r7); /*all set 3*/ \ |
618 } | 611 } |
619 | 612 |
620 static inline void write16x4(uint8_t *dst, int dst_stride, | 613 static inline void write16x4(uint8_t *dst, int dst_stride, |
621 register vector unsigned char r0, register vector unsigned char r1, | 614 register vec_u8_t r0, register vec_u8_t r1, |
622 register vector unsigned char r2, register vector unsigned char r3) { | 615 register vec_u8_t r2, register vec_u8_t r3) { |
623 DECLARE_ALIGNED_16(unsigned char, result[64]); | 616 DECLARE_ALIGNED_16(unsigned char, result[64]); |
624 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; | 617 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; |
625 int int_dst_stride = dst_stride/4; | 618 int int_dst_stride = dst_stride/4; |
626 | 619 |
627 vec_st(r0, 0, result); | 620 vec_st(r0, 0, result); |
649 | 642 |
650 /** \brief performs a 6x16 transpose of data in src, and stores it to dst | 643 /** \brief performs a 6x16 transpose of data in src, and stores it to dst |
651 \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them | 644 \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them |
652 out of unaligned_load() */ | 645 out of unaligned_load() */ |
653 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ | 646 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ |
654 register vector unsigned char r0 = unaligned_load(0, src);\ | 647 register vec_u8_t r0 = unaligned_load(0, src); \ |
655 register vector unsigned char r1 = unaligned_load( src_stride, src);\ | 648 register vec_u8_t r1 = unaligned_load( src_stride, src); \ |
656 register vector unsigned char r2 = unaligned_load(2* src_stride, src);\ | 649 register vec_u8_t r2 = unaligned_load(2* src_stride, src); \ |
657 register vector unsigned char r3 = unaligned_load(3* src_stride, src);\ | 650 register vec_u8_t r3 = unaligned_load(3* src_stride, src); \ |
658 register vector unsigned char r4 = unaligned_load(4* src_stride, src);\ | 651 register vec_u8_t r4 = unaligned_load(4* src_stride, src); \ |
659 register vector unsigned char r5 = unaligned_load(5* src_stride, src);\ | 652 register vec_u8_t r5 = unaligned_load(5* src_stride, src); \ |
660 register vector unsigned char r6 = unaligned_load(6* src_stride, src);\ | 653 register vec_u8_t r6 = unaligned_load(6* src_stride, src); \ |
661 register vector unsigned char r7 = unaligned_load(7* src_stride, src);\ | 654 register vec_u8_t r7 = unaligned_load(7* src_stride, src); \ |
662 register vector unsigned char r14 = unaligned_load(14*src_stride, src);\ | 655 register vec_u8_t r14 = unaligned_load(14*src_stride, src); \ |
663 register vector unsigned char r15 = unaligned_load(15*src_stride, src);\ | 656 register vec_u8_t r15 = unaligned_load(15*src_stride, src); \ |
664 \ | 657 \ |
665 r8 = unaligned_load( 8*src_stride, src); \ | 658 r8 = unaligned_load( 8*src_stride, src); \ |
666 r9 = unaligned_load( 9*src_stride, src); \ | 659 r9 = unaligned_load( 9*src_stride, src); \ |
667 r10 = unaligned_load(10*src_stride, src); \ | 660 r10 = unaligned_load(10*src_stride, src); \ |
668 r11 = unaligned_load(11*src_stride, src); \ | 661 r11 = unaligned_load(11*src_stride, src); \ |
708 /* Don't need to compute 14 and 15*/ \ | 701 /* Don't need to compute 14 and 15*/ \ |
709 \ | 702 \ |
710 } | 703 } |
711 | 704 |
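
unaligned_load() is defined elsewhere in this file; judging from the \todo above (each call spends a vec_lvsl()), it is presumably the classic AltiVec misaligned-load idiom, along these lines:

/* Presumed shape of unaligned_load(): fetch the two aligned 16-byte
 * blocks straddling src+offset, then shift the wanted bytes into
 * place with the permute vector from vec_lvsl(). */
#define unaligned_load(offset, src) \
    vec_perm(vec_ld((offset),      (src)), \
             vec_ld((offset) + 15, (src)), \
             vec_lvsl((offset),    (src)))
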
712 // out: o = |x-y| < a | 705 // out: o = |x-y| < a |
713 static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x, | 706 static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x, |
714 register vector unsigned char y, | 707 register vec_u8_t y, |
715 register vector unsigned char a) { | 708 register vec_u8_t a) { |
716 | 709 |
717 register vector unsigned char diff = vec_subs(x, y); | 710 register vec_u8_t diff = vec_subs(x, y); |
718 register vector unsigned char diffneg = vec_subs(y, x); | 711 register vec_u8_t diffneg = vec_subs(y, x); |
719 register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */ | 712 register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */ |
720 o = (vector unsigned char)vec_cmplt(o, a); | 713 o = (vec_u8_t)vec_cmplt(o, a); |
721 return o; | 714 return o; |
722 } | 715 } |
723 | 716 |
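
diff_lt_altivec() builds the unsigned |x-y| without a dedicated instruction: vec_subs() saturates at zero, so one of the two differences is always 0 and OR-ing them yields the absolute difference. One lane, in scalar form:

#include <stdint.h>

static inline int diff_lt_c(uint8_t x, uint8_t y, uint8_t a)
{
    uint8_t diff    = x > y ? (uint8_t)(x - y) : 0; /* vec_subs(x, y) */
    uint8_t diffneg = y > x ? (uint8_t)(y - x) : 0; /* vec_subs(y, x) */
    return (uint8_t)(diff | diffneg) < a;           /* vec_or + vec_cmplt */
}
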
724 static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0, | 717 static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0, |
725 register vector unsigned char p1, | 718 register vec_u8_t p1, |
726 register vector unsigned char q0, | 719 register vec_u8_t q0, |
727 register vector unsigned char q1, | 720 register vec_u8_t q1, |
728 register vector unsigned char alpha, | 721 register vec_u8_t alpha, |
729 register vector unsigned char beta) { | 722 register vec_u8_t beta) { |
730 | 723 |
731 register vector unsigned char mask; | 724 register vec_u8_t mask; |
732 register vector unsigned char tempmask; | 725 register vec_u8_t tempmask; |
733 | 726 |
734 mask = diff_lt_altivec(p0, q0, alpha); | 727 mask = diff_lt_altivec(p0, q0, alpha); |
735 tempmask = diff_lt_altivec(p1, p0, beta); | 728 tempmask = diff_lt_altivec(p1, p0, beta); |
736 mask = vec_and(mask, tempmask); | 729 mask = vec_and(mask, tempmask); |
737 tempmask = diff_lt_altivec(q1, q0, beta); | 730 tempmask = diff_lt_altivec(q1, q0, beta); |
739 | 732 |
740 return mask; | 733 return mask; |
741 } | 734 } |
742 | 735 |
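
h264_deblock_mask() combines three such comparisons into the standard H.264 per-pixel filtering condition; a lane is filtered only when the edge step is small enough to look like a coding artifact rather than image content. Reusing diff_lt_c from the sketch above:

#include <stdint.h>

static inline int deblock_mask_c(uint8_t p0, uint8_t p1, uint8_t q0,
                                 uint8_t q1, uint8_t alpha, uint8_t beta)
{
    return diff_lt_c(p0, q0, alpha) &&
           diff_lt_c(p1, p0, beta)  &&
           diff_lt_c(q1, q0, beta);
}
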
743 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) | 736 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) |
744 static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, | 737 static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, |
745 register vector unsigned char p1, | 738 register vec_u8_t p1, |
746 register vector unsigned char p2, | 739 register vec_u8_t p2, |
747 register vector unsigned char q0, | 740 register vec_u8_t q0, |
748 register vector unsigned char tc0) { | 741 register vec_u8_t tc0) { |
749 | 742 |
750 register vector unsigned char average = vec_avg(p0, q0); | 743 register vec_u8_t average = vec_avg(p0, q0); |
751 register vector unsigned char temp; | 744 register vec_u8_t temp; |
752 register vector unsigned char uncliped; | 745 register vec_u8_t uncliped; |
753 register vector unsigned char ones; | 746 register vec_u8_t ones; |
754 register vector unsigned char max; | 747 register vec_u8_t max; |
755 register vector unsigned char min; | 748 register vec_u8_t min; |
756 register vector unsigned char newp1; | 749 register vec_u8_t newp1; |
757 | 750 |
758 temp = vec_xor(average, p2); | 751 temp = vec_xor(average, p2); |
759 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ | 752 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ |
760 ones = vec_splat_u8(1); | 753 ones = vec_splat_u8(1); |
761 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ | 754 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ |
767 return newp1; | 760 return newp1; |
768 } | 761 } |
769 | 762 |
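
The subtlety in h264_deblock_q1() is the rounding: the comment's formula needs a truncating (p2 + avg) >> 1, but vec_avg() rounds up. The (average ^ p2) & 1 term computed above is subtracted (in the lines elided from this hunk) to undo that, since (a + b + 1) >> 1 minus (a ^ b) & 1 is exactly (a + b) >> 1. One lane of the whole helper:

#include <stdint.h>

/* newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1 - tc0, p1 + tc0) */
static inline uint8_t deblock_q1_c(uint8_t p0, uint8_t p1, uint8_t p2,
                                   uint8_t q0, uint8_t tc0)
{
    int avg = (p0 + q0 + 1) >> 1;              /* vec_avg(p0, q0) */
    int v   = ((avg + p2 + 1) >> 1)            /* vec_avg(average, p2), rounds up */
            - ((avg ^ p2) & 1);                /* parity fix-up -> floor */
    int lo  = p1 > tc0 ? p1 - tc0 : 0;         /* vec_subs saturates at 0 */
    int hi  = p1 + tc0 > 255 ? 255 : p1 + tc0; /* vec_adds saturates at 255 */
    return (uint8_t)(v < lo ? lo : v > hi ? hi : v);
}
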
770 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ | 763 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ |
771 \ | 764 \ |
772 const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ | 765 const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ |
773 \ | 766 \ |
774 register vector unsigned char pq0bit = vec_xor(p0,q0); \ | 767 register vec_u8_t pq0bit = vec_xor(p0,q0); \ |
775 register vector unsigned char q1minus; \ | 768 register vec_u8_t q1minus; \ |
776 register vector unsigned char p0minus; \ | 769 register vec_u8_t p0minus; \ |
777 register vector unsigned char stage1; \ | 770 register vec_u8_t stage1; \ |
778 register vector unsigned char stage2; \ | 771 register vec_u8_t stage2; \ |
779 register vector unsigned char vec160; \ | 772 register vec_u8_t vec160; \ |
780 register vector unsigned char delta; \ | 773 register vec_u8_t delta; \ |
781 register vector unsigned char deltaneg; \ | 774 register vec_u8_t deltaneg; \ |
782 \ | 775 \ |
783 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ | 776 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ |
784 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ | 777 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ |
785 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ | 778 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ |
786 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ | 779 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ |
799 q0 = vec_adds(q0, deltaneg); \ | 792 q0 = vec_adds(q0, deltaneg); \ |
800 } | 793 } |
801 | 794 |
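
h264_deblock_p0_q0() evaluates the spec's delta while staying in unsigned bytes, keeping intermediates positive with a constant bias (A0v = 10 << 4 = 160, loaded into vec160). The scalar computation it implements, per the H.264 reference filter:

#include <stdint.h>

/* delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc) */
static inline void deblock_p0_q0_c(uint8_t *p0, uint8_t p1,
                                   uint8_t *q0, uint8_t q1, int tc)
{
    int delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
    int np, nq;
    if (delta >  tc) delta =  tc;
    if (delta < -tc) delta = -tc;
    np = *p0 + delta;                 /* vec_adds / vec_subs do the */
    nq = *q0 - delta;                 /* final clamping to 0..255   */
    *p0 = (uint8_t)(np < 0 ? 0 : np > 255 ? 255 : np);
    *q0 = (uint8_t)(nq < 0 ? 0 : nq > 255 ? 255 : nq);
}
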
802 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ | 795 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ |
803 DECLARE_ALIGNED_16(unsigned char, temp[16]); \ | 796 DECLARE_ALIGNED_16(unsigned char, temp[16]); \ |
804 register vector unsigned char alphavec; \ | 797 register vec_u8_t alphavec; \ |
805 register vector unsigned char betavec; \ | 798 register vec_u8_t betavec; \ |
806 register vector unsigned char mask; \ | 799 register vec_u8_t mask; \ |
807 register vector unsigned char p1mask; \ | 800 register vec_u8_t p1mask; \ |
808 register vector unsigned char q1mask; \ | 801 register vec_u8_t q1mask; \ |
809 register vector signed char tc0vec; \ | 802 register vector signed char tc0vec; \ |
810 register vector unsigned char finaltc0; \ | 803 register vec_u8_t finaltc0; \ |
811 register vector unsigned char tc0masked; \ | 804 register vec_u8_t tc0masked; \ |
812 register vector unsigned char newp1; \ | 805 register vec_u8_t newp1; \ |
813 register vector unsigned char newq1; \ | 806 register vec_u8_t newq1; \ |
814 \ | 807 \ |
815 temp[0] = alpha; \ | 808 temp[0] = alpha; \ |
816 temp[1] = beta; \ | 809 temp[1] = beta; \ |
817 alphavec = vec_ld(0, temp); \ | 810 alphavec = vec_ld(0, temp); \ |
818 betavec = vec_splat(alphavec, 0x1); \ | 811 betavec = vec_splat(alphavec, 0x1); \ |
822 *((int *)temp) = *((int *)tc0); \ | 815 *((int *)temp) = *((int *)tc0); \ |
823 tc0vec = vec_ld(0, (signed char*)temp); \ | 816 tc0vec = vec_ld(0, (signed char*)temp); \ |
824 tc0vec = vec_mergeh(tc0vec, tc0vec); \ | 817 tc0vec = vec_mergeh(tc0vec, tc0vec); \ |
825 tc0vec = vec_mergeh(tc0vec, tc0vec); \ | 818 tc0vec = vec_mergeh(tc0vec, tc0vec); \ |
826 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ | 819 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ |
827 finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0 */ \ | 820 finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \ |
828 \ | 821 \ |
829 p1mask = diff_lt_altivec(p2, p0, betavec); \ | 822 p1mask = diff_lt_altivec(p2, p0, betavec); \ |
830 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ | 823 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ |
831 tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec); \ | 824 tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \ |
832 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ | 825 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ |
833 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ | 826 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ |
834 /*end if*/ \ | 827 /*end if*/ \ |
835 \ | 828 \ |
836 q1mask = diff_lt_altivec(q2, q0, betavec); \ | 829 q1mask = diff_lt_altivec(q2, q0, betavec); \ |
837 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ | 830 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ |
838 tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec); \ | 831 tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \ |
839 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ | 832 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ |
840 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ | 833 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ |
841 /*end if*/ \ | 834 /*end if*/ \ |
842 \ | 835 \ |
843 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ | 836 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ |
846 } | 839 } |
847 | 840 |
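
The tc0 handling in the macro above: the four input bytes are each replicated across four lanes by the two vec_mergeh(tc0vec, tc0vec) rounds, lanes with tc0 < 0 are masked off entirely, and because the p1/q1 masks are 0 or -1, the two vec_sub(finaltc0, ...) lines are the spec's tc++ per corrected side. Per lane:

#include <stdint.h>

static inline int lane_tc_c(const int8_t *tc0, int i,
                            int filters_p1, int filters_q1)
{
    int tc = tc0[i >> 2];   /* two mergeh rounds: each byte covers 4 pixels */
    if (tc < 0)
        return -1;          /* lane masked out; pixel left untouched */
    return tc + filters_p1 + filters_q1; /* finaltc0 = tc0 - p1mask - q1mask */
}
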
848 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { | 841 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { |
849 | 842 |
850 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { | 843 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { |
851 register vector unsigned char p2 = vec_ld(-3*stride, pix); | 844 register vec_u8_t p2 = vec_ld(-3*stride, pix); |
852 register vector unsigned char p1 = vec_ld(-2*stride, pix); | 845 register vec_u8_t p1 = vec_ld(-2*stride, pix); |
853 register vector unsigned char p0 = vec_ld(-1*stride, pix); | 846 register vec_u8_t p0 = vec_ld(-1*stride, pix); |
854 register vector unsigned char q0 = vec_ld(0, pix); | 847 register vec_u8_t q0 = vec_ld(0, pix); |
855 register vector unsigned char q1 = vec_ld(stride, pix); | 848 register vec_u8_t q1 = vec_ld(stride, pix); |
856 register vector unsigned char q2 = vec_ld(2*stride, pix); | 849 register vec_u8_t q2 = vec_ld(2*stride, pix); |
857 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); | 850 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); |
858 vec_st(p1, -2*stride, pix); | 851 vec_st(p1, -2*stride, pix); |
859 vec_st(p0, -1*stride, pix); | 852 vec_st(p0, -1*stride, pix); |
860 vec_st(q0, 0, pix); | 853 vec_st(q0, 0, pix); |
861 vec_st(q1, stride, pix); | 854 vec_st(q1, stride, pix); |
862 } | 855 } |
863 } | 856 } |
864 | 857 |
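
h264_v_loop_filter_luma_altivec() above and its horizontal sibling below share a cheap early-out: the sign bit survives a bitwise AND of the four tc0 bytes only if all four are negative, i.e. the entire 16-pixel edge is unfiltered:

#include <stdint.h>

static inline int edge_fully_disabled(const int8_t *tc0)
{
    /* < 0 iff tc0[0..3] are all negative (all sign bits set) */
    return (tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0;
}
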
865 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { | 858 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { |
866 | 859 |
867 register vector unsigned char line0, line1, line2, line3, line4, line5; | 860 register vec_u8_t line0, line1, line2, line3, line4, line5; |
868 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) | 861 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) |
869 return; | 862 return; |
870 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); | 863 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); |
871 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); | 864 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); |
872 transpose4x16(line1, line2, line3, line4); | 865 transpose4x16(line1, line2, line3, line4); |