libavcodec: comparison of ppc/h264_altivec.c @ 8494:1615d6b75ada

Cleanup _t types in libavcodec/ppc
author lu_zero
date Sat, 27 Dec 2008 11:21:28 +0000
parents 11307ea31e57
children 992e2f8bfba7
comparing 8493:469f3e5bcf13 (old, left) with 8494:1615d6b75ada (new, right)
each line below gives the old line number and text, then the new line number and text
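The hunks below are a purely mechanical rename: every AltiVec type alias spelled with a _t suffix (vec_u8_t, vec_u16_t, vec_u32_t, vec_s16_t, vec_s32_t) becomes the suffix-free form (vec_u8, vec_u16, vec_u32, vec_s16, vec_s32), presumably because POSIX reserves the _t suffix; no intrinsics or control flow change. As a minimal sketch of what the renamed aliases are assumed to expand to (they would live in a shared PPC header such as types_altivec.h, which this comparison does not show, so treat the exact spelling as an assumption):

    /* Hypothetical sketch of the renamed aliases used throughout this file.
     * The real definitions are in a shared ppc header (assumed here to be
     * types_altivec.h), which is not part of this comparison. */
    #include <altivec.h>                    /* AltiVec "vector" keyword types */

    #define vec_u8  vector unsigned char    /* was vec_u8_t  */
    #define vec_u16 vector unsigned short   /* was vec_u16_t */
    #define vec_u32 vector unsigned int     /* was vec_u32_t */
    #define vec_s16 vector signed short     /* was vec_s16_t */
    #define vec_s32 vector signed int       /* was vec_s32_t */

Under that assumption the old and new spellings are interchangeable, which is why every change in this file is a one-token substitution.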
187 {((8 - x) * (8 - y)), 187 {((8 - x) * (8 - y)),
188 ((x) * (8 - y)), 188 ((x) * (8 - y)),
189 ((8 - x) * (y)), 189 ((8 - x) * (y)),
190 ((x) * (y))}; 190 ((x) * (y))};
191 register int i; 191 register int i;
192 vec_u8_t fperm; 192 vec_u8 fperm;
193 const vec_s32_t vABCD = vec_ld(0, ABCD); 193 const vec_s32 vABCD = vec_ld(0, ABCD);
194 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); 194 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
195 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); 195 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
196 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); 196 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
197 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); 197 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
198 LOAD_ZERO; 198 LOAD_ZERO;
199 const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); 199 const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
200 const vec_u16_t v6us = vec_splat_u16(6); 200 const vec_u16 v6us = vec_splat_u16(6);
201 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 201 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
202 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 202 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
203 203
204 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; 204 vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
205 vec_u8_t vsrc0uc, vsrc1uc; 205 vec_u8 vsrc0uc, vsrc1uc;
206 vec_s16_t vsrc0ssH, vsrc1ssH; 206 vec_s16 vsrc0ssH, vsrc1ssH;
207 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; 207 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
208 vec_s16_t vsrc2ssH, vsrc3ssH, psum; 208 vec_s16 vsrc2ssH, vsrc3ssH, psum;
209 vec_u8_t vdst, ppsum, fsum; 209 vec_u8 vdst, ppsum, fsum;
210 210
211 if (((unsigned long)dst) % 16 == 0) { 211 if (((unsigned long)dst) % 16 == 0) {
212 fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13, 212 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
213 0x14, 0x15, 0x16, 0x17, 213 0x14, 0x15, 0x16, 0x17,
214 0x08, 0x09, 0x0A, 0x0B, 214 0x08, 0x09, 0x0A, 0x0B,
215 0x0C, 0x0D, 0x0E, 0x0F}; 215 0x0C, 0x0D, 0x0E, 0x0F};
216 } else { 216 } else {
217 fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03, 217 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
218 0x04, 0x05, 0x06, 0x07, 218 0x04, 0x05, 0x06, 0x07,
219 0x18, 0x19, 0x1A, 0x1B, 219 0x18, 0x19, 0x1A, 0x1B,
220 0x1C, 0x1D, 0x1E, 0x1F}; 220 0x1C, 0x1D, 0x1E, 0x1F};
221 } 221 }
222 222
231 if (reallyBadAlign) 231 if (reallyBadAlign)
232 vsrc1uc = vsrcBuc; 232 vsrc1uc = vsrcBuc;
233 else 233 else
234 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 234 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
235 235
236 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc); 236 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
237 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc); 237 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
238 238
239 if (!loadSecond) {// -> !reallyBadAlign 239 if (!loadSecond) {// -> !reallyBadAlign
240 for (i = 0 ; i < h ; i++) { 240 for (i = 0 ; i < h ; i++) {
241 241
242 242
243 vsrcCuc = vec_ld(stride + 0, src); 243 vsrcCuc = vec_ld(stride + 0, src);
244 244
245 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 245 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
246 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 246 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
247 247
248 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); 248 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
249 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); 249 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
250 250
251 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 251 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
252 psum = vec_mladd(vB, vsrc1ssH, psum); 252 psum = vec_mladd(vB, vsrc1ssH, psum);
253 psum = vec_mladd(vC, vsrc2ssH, psum); 253 psum = vec_mladd(vC, vsrc2ssH, psum);
254 psum = vec_mladd(vD, vsrc3ssH, psum); 254 psum = vec_mladd(vD, vsrc3ssH, psum);
255 psum = vec_add(v28ss, psum); 255 psum = vec_add(v28ss, psum);
256 psum = vec_sra(psum, v6us); 256 psum = vec_sra(psum, v6us);
257 257
258 vdst = vec_ld(0, dst); 258 vdst = vec_ld(0, dst);
259 ppsum = (vec_u8_t)vec_packsu(psum, psum); 259 ppsum = (vec_u8)vec_packsu(psum, psum);
260 fsum = vec_perm(vdst, ppsum, fperm); 260 fsum = vec_perm(vdst, ppsum, fperm);
261 261
262 vec_st(fsum, 0, dst); 262 vec_st(fsum, 0, dst);
263 263
264 vsrc0ssH = vsrc2ssH; 264 vsrc0ssH = vsrc2ssH;
266 266
267 dst += stride; 267 dst += stride;
268 src += stride; 268 src += stride;
269 } 269 }
270 } else { 270 } else {
271 vec_u8_t vsrcDuc; 271 vec_u8 vsrcDuc;
272 for (i = 0 ; i < h ; i++) { 272 for (i = 0 ; i < h ; i++) {
273 vsrcCuc = vec_ld(stride + 0, src); 273 vsrcCuc = vec_ld(stride + 0, src);
274 vsrcDuc = vec_ld(stride + 16, src); 274 vsrcDuc = vec_ld(stride + 16, src);
275 275
276 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 276 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
277 if (reallyBadAlign) 277 if (reallyBadAlign)
278 vsrc3uc = vsrcDuc; 278 vsrc3uc = vsrcDuc;
279 else 279 else
280 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 280 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
281 281
282 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); 282 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
283 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); 283 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
284 284
285 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 285 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
286 psum = vec_mladd(vB, vsrc1ssH, psum); 286 psum = vec_mladd(vB, vsrc1ssH, psum);
287 psum = vec_mladd(vC, vsrc2ssH, psum); 287 psum = vec_mladd(vC, vsrc2ssH, psum);
288 psum = vec_mladd(vD, vsrc3ssH, psum); 288 psum = vec_mladd(vD, vsrc3ssH, psum);
289 psum = vec_add(v28ss, psum); 289 psum = vec_add(v28ss, psum);
290 psum = vec_sr(psum, v6us); 290 psum = vec_sr(psum, v6us);
291 291
292 vdst = vec_ld(0, dst); 292 vdst = vec_ld(0, dst);
293 ppsum = (vec_u8_t)vec_pack(psum, psum); 293 ppsum = (vec_u8)vec_pack(psum, psum);
294 fsum = vec_perm(vdst, ppsum, fperm); 294 fsum = vec_perm(vdst, ppsum, fperm);
295 295
296 vec_st(fsum, 0, dst); 296 vec_st(fsum, 0, dst);
297 297
298 vsrc0ssH = vsrc2ssH; 298 vsrc0ssH = vsrc2ssH;
307 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, 307 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
308 const uint8_t * src2, int dst_stride, 308 const uint8_t * src2, int dst_stride,
309 int src_stride1, int h) 309 int src_stride1, int h)
310 { 310 {
311 int i; 311 int i;
312 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; 312 vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
313 313
314 mask_ = vec_lvsl(0, src2); 314 mask_ = vec_lvsl(0, src2);
315 315
316 for (i = 0; i < h; i++) { 316 for (i = 0; i < h; i++) {
317 317
349 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, 349 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
350 const uint8_t * src2, int dst_stride, 350 const uint8_t * src2, int dst_stride,
351 int src_stride1, int h) 351 int src_stride1, int h)
352 { 352 {
353 int i; 353 int i;
354 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; 354 vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
355 355
356 mask_ = vec_lvsl(0, src2); 356 mask_ = vec_lvsl(0, src2);
357 357
358 for (i = 0; i < h; i++) { 358 for (i = 0; i < h; i++) {
359 359
430 b3 = vec_mergel( a1, a3 ) 430 b3 = vec_mergel( a1, a3 )
431 431
432 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ 432 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
433 vdst_orig = vec_ld(0, dst); \ 433 vdst_orig = vec_ld(0, dst); \
434 vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ 434 vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
435 vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \ 435 vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \
436 va = vec_add(va, vdst_ss); \ 436 va = vec_add(va, vdst_ss); \
437 va_u8 = vec_packsu(va, zero_s16v); \ 437 va_u8 = vec_packsu(va, zero_s16v); \
438 va_u32 = vec_splat((vec_u32_t)va_u8, 0); \ 438 va_u32 = vec_splat((vec_u32)va_u8, 0); \
439 vec_ste(va_u32, element, (uint32_t*)dst); 439 vec_ste(va_u32, element, (uint32_t*)dst);
440 440
441 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) 441 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
442 { 442 {
443 vec_s16_t va0, va1, va2, va3; 443 vec_s16 va0, va1, va2, va3;
444 vec_s16_t vz0, vz1, vz2, vz3; 444 vec_s16 vz0, vz1, vz2, vz3;
445 vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3; 445 vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
446 vec_u8_t va_u8; 446 vec_u8 va_u8;
447 vec_u32_t va_u32; 447 vec_u32 va_u32;
448 vec_s16_t vdst_ss; 448 vec_s16 vdst_ss;
449 const vec_u16_t v6us = vec_splat_u16(6); 449 const vec_u16 v6us = vec_splat_u16(6);
450 vec_u8_t vdst, vdst_orig; 450 vec_u8 vdst, vdst_orig;
451 vec_u8_t vdst_mask = vec_lvsl(0, dst); 451 vec_u8 vdst_mask = vec_lvsl(0, dst);
452 int element = ((unsigned long)dst & 0xf) >> 2; 452 int element = ((unsigned long)dst & 0xf) >> 2;
453 LOAD_ZERO; 453 LOAD_ZERO;
454 454
455 block[0] += 32; /* add 32 as a DC-level for rounding */ 455 block[0] += 32; /* add 32 as a DC-level for rounding */
456 456
477 VEC_LOAD_U8_ADD_S16_STORE_U8(va3); 477 VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
478 } 478 }
479 479
480 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ 480 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
481 /* a0 = SRC(0) + SRC(4); */ \ 481 /* a0 = SRC(0) + SRC(4); */ \
482 vec_s16_t a0v = vec_add(s0, s4); \ 482 vec_s16 a0v = vec_add(s0, s4); \
483 /* a2 = SRC(0) - SRC(4); */ \ 483 /* a2 = SRC(0) - SRC(4); */ \
484 vec_s16_t a2v = vec_sub(s0, s4); \ 484 vec_s16 a2v = vec_sub(s0, s4); \
485 /* a4 = (SRC(2)>>1) - SRC(6); */ \ 485 /* a4 = (SRC(2)>>1) - SRC(6); */ \
486 vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \ 486 vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \
487 /* a6 = (SRC(6)>>1) + SRC(2); */ \ 487 /* a6 = (SRC(6)>>1) + SRC(2); */ \
488 vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \ 488 vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \
489 /* b0 = a0 + a6; */ \ 489 /* b0 = a0 + a6; */ \
490 vec_s16_t b0v = vec_add(a0v, a6v); \ 490 vec_s16 b0v = vec_add(a0v, a6v); \
491 /* b2 = a2 + a4; */ \ 491 /* b2 = a2 + a4; */ \
492 vec_s16_t b2v = vec_add(a2v, a4v); \ 492 vec_s16 b2v = vec_add(a2v, a4v); \
493 /* b4 = a2 - a4; */ \ 493 /* b4 = a2 - a4; */ \
494 vec_s16_t b4v = vec_sub(a2v, a4v); \ 494 vec_s16 b4v = vec_sub(a2v, a4v); \
495 /* b6 = a0 - a6; */ \ 495 /* b6 = a0 - a6; */ \
496 vec_s16_t b6v = vec_sub(a0v, a6v); \ 496 vec_s16 b6v = vec_sub(a0v, a6v); \
497 /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ 497 /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
498 /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ 498 /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
499 vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ 499 vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
500 /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ 500 /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
501 /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ 501 /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
502 vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ 502 vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
503 /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ 503 /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
504 /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ 504 /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
505 vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ 505 vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
506 /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ 506 /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
507 vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ 507 vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
508 /* b1 = (a7>>2) + a1; */ \ 508 /* b1 = (a7>>2) + a1; */ \
509 vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \ 509 vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
510 /* b3 = a3 + (a5>>2); */ \ 510 /* b3 = a3 + (a5>>2); */ \
511 vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \ 511 vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
512 /* b5 = (a3>>2) - a5; */ \ 512 /* b5 = (a3>>2) - a5; */ \
513 vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \ 513 vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
514 /* b7 = a7 - (a1>>2); */ \ 514 /* b7 = a7 - (a1>>2); */ \
515 vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ 515 vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
516 /* DST(0, b0 + b7); */ \ 516 /* DST(0, b0 + b7); */ \
517 d0 = vec_add(b0v, b7v); \ 517 d0 = vec_add(b0v, b7v); \
518 /* DST(1, b2 + b5); */ \ 518 /* DST(1, b2 + b5); */ \
519 d1 = vec_add(b2v, b5v); \ 519 d1 = vec_add(b2v, b5v); \
520 /* DST(2, b4 + b3); */ \ 520 /* DST(2, b4 + b3); */ \
531 d7 = vec_sub(b0v, b7v); \ 531 d7 = vec_sub(b0v, b7v); \
532 } 532 }
533 533
534 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ 534 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
535 /* unaligned load */ \ 535 /* unaligned load */ \
536 vec_u8_t hv = vec_ld( 0, dest ); \ 536 vec_u8 hv = vec_ld( 0, dest ); \
537 vec_u8_t lv = vec_ld( 7, dest ); \ 537 vec_u8 lv = vec_ld( 7, dest ); \
538 vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ 538 vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \
539 vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ 539 vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
540 vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ 540 vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \
541 vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ 541 vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
542 vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ 542 vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
543 vec_u8_t edgehv; \ 543 vec_u8 edgehv; \
544 /* unaligned store */ \ 544 /* unaligned store */ \
545 vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ 545 vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
546 vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ 546 vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
547 lv = vec_sel( lv, bodyv, edgelv ); \ 547 lv = vec_sel( lv, bodyv, edgelv ); \
548 vec_st( lv, 7, dest ); \ 548 vec_st( lv, 7, dest ); \
549 hv = vec_ld( 0, dest ); \ 549 hv = vec_ld( 0, dest ); \
550 edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ 550 edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
551 hv = vec_sel( hv, bodyv, edgehv ); \ 551 hv = vec_sel( hv, bodyv, edgehv ); \
552 vec_st( hv, 0, dest ); \ 552 vec_st( hv, 0, dest ); \
553 } 553 }
554 554
555 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { 555 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
556 vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7; 556 vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
557 vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; 557 vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
558 vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; 558 vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
559 559
560 vec_u8_t perm_ldv = vec_lvsl(0, dst); 560 vec_u8 perm_ldv = vec_lvsl(0, dst);
561 vec_u8_t perm_stv = vec_lvsr(8, dst); 561 vec_u8 perm_stv = vec_lvsr(8, dst);
562 562
563 const vec_u16_t onev = vec_splat_u16(1); 563 const vec_u16 onev = vec_splat_u16(1);
564 const vec_u16_t twov = vec_splat_u16(2); 564 const vec_u16 twov = vec_splat_u16(2);
565 const vec_u16_t sixv = vec_splat_u16(6); 565 const vec_u16 sixv = vec_splat_u16(6);
566 566
567 const vec_u8_t sel = (vec_u8_t) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; 567 const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
568 LOAD_ZERO; 568 LOAD_ZERO;
569 569
570 dct[0] += 32; // rounding for the >>6 at the end 570 dct[0] += 32; // rounding for the >>6 at the end
571 571
572 s0 = vec_ld(0x00, (int16_t*)dct); 572 s0 = vec_ld(0x00, (int16_t*)dct);
619 } 619 }
620 } 620 }
621 } 621 }
622 622
623 #define transpose4x16(r0, r1, r2, r3) { \ 623 #define transpose4x16(r0, r1, r2, r3) { \
624 register vec_u8_t r4; \ 624 register vec_u8 r4; \
625 register vec_u8_t r5; \ 625 register vec_u8 r5; \
626 register vec_u8_t r6; \ 626 register vec_u8 r6; \
627 register vec_u8_t r7; \ 627 register vec_u8 r7; \
628 \ 628 \
629 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ 629 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
630 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ 630 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
631 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ 631 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
632 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ 632 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
636 r2 = vec_mergeh(r5, r7); /*all set 2*/ \ 636 r2 = vec_mergeh(r5, r7); /*all set 2*/ \
637 r3 = vec_mergel(r5, r7); /*all set 3*/ \ 637 r3 = vec_mergel(r5, r7); /*all set 3*/ \
638 } 638 }
639 639
640 static inline void write16x4(uint8_t *dst, int dst_stride, 640 static inline void write16x4(uint8_t *dst, int dst_stride,
641 register vec_u8_t r0, register vec_u8_t r1, 641 register vec_u8 r0, register vec_u8 r1,
642 register vec_u8_t r2, register vec_u8_t r3) { 642 register vec_u8 r2, register vec_u8 r3) {
643 DECLARE_ALIGNED_16(unsigned char, result[64]); 643 DECLARE_ALIGNED_16(unsigned char, result[64]);
644 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; 644 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
645 int int_dst_stride = dst_stride/4; 645 int int_dst_stride = dst_stride/4;
646 646
647 vec_st(r0, 0, result); 647 vec_st(r0, 0, result);
669 669
670 /** \brief performs a 6x16 transpose of data in src, and stores it to dst 670 /** \brief performs a 6x16 transpose of data in src, and stores it to dst
671 \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing 671 \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
672 out of unaligned_load() */ 672 out of unaligned_load() */
673 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ 673 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
674 register vec_u8_t r0 = unaligned_load(0, src); \ 674 register vec_u8 r0 = unaligned_load(0, src); \
675 register vec_u8_t r1 = unaligned_load( src_stride, src); \ 675 register vec_u8 r1 = unaligned_load( src_stride, src); \
676 register vec_u8_t r2 = unaligned_load(2* src_stride, src); \ 676 register vec_u8 r2 = unaligned_load(2* src_stride, src); \
677 register vec_u8_t r3 = unaligned_load(3* src_stride, src); \ 677 register vec_u8 r3 = unaligned_load(3* src_stride, src); \
678 register vec_u8_t r4 = unaligned_load(4* src_stride, src); \ 678 register vec_u8 r4 = unaligned_load(4* src_stride, src); \
679 register vec_u8_t r5 = unaligned_load(5* src_stride, src); \ 679 register vec_u8 r5 = unaligned_load(5* src_stride, src); \
680 register vec_u8_t r6 = unaligned_load(6* src_stride, src); \ 680 register vec_u8 r6 = unaligned_load(6* src_stride, src); \
681 register vec_u8_t r7 = unaligned_load(7* src_stride, src); \ 681 register vec_u8 r7 = unaligned_load(7* src_stride, src); \
682 register vec_u8_t r14 = unaligned_load(14*src_stride, src); \ 682 register vec_u8 r14 = unaligned_load(14*src_stride, src); \
683 register vec_u8_t r15 = unaligned_load(15*src_stride, src); \ 683 register vec_u8 r15 = unaligned_load(15*src_stride, src); \
684 \ 684 \
685 r8 = unaligned_load( 8*src_stride, src); \ 685 r8 = unaligned_load( 8*src_stride, src); \
686 r9 = unaligned_load( 9*src_stride, src); \ 686 r9 = unaligned_load( 9*src_stride, src); \
687 r10 = unaligned_load(10*src_stride, src); \ 687 r10 = unaligned_load(10*src_stride, src); \
688 r11 = unaligned_load(11*src_stride, src); \ 688 r11 = unaligned_load(11*src_stride, src); \
728 /* Don't need to compute 14 and 15*/ \ 728 /* Don't need to compute 14 and 15*/ \
729 \ 729 \
730 } 730 }
731 731
732 // out: o = |x-y| < a 732 // out: o = |x-y| < a
733 static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x, 733 static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
734 register vec_u8_t y, 734 register vec_u8 y,
735 register vec_u8_t a) { 735 register vec_u8 a) {
736 736
737 register vec_u8_t diff = vec_subs(x, y); 737 register vec_u8 diff = vec_subs(x, y);
738 register vec_u8_t diffneg = vec_subs(y, x); 738 register vec_u8 diffneg = vec_subs(y, x);
739 register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */ 739 register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */
740 o = (vec_u8_t)vec_cmplt(o, a); 740 o = (vec_u8)vec_cmplt(o, a);
741 return o; 741 return o;
742 } 742 }
743 743
744 static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0, 744 static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
745 register vec_u8_t p1, 745 register vec_u8 p1,
746 register vec_u8_t q0, 746 register vec_u8 q0,
747 register vec_u8_t q1, 747 register vec_u8 q1,
748 register vec_u8_t alpha, 748 register vec_u8 alpha,
749 register vec_u8_t beta) { 749 register vec_u8 beta) {
750 750
751 register vec_u8_t mask; 751 register vec_u8 mask;
752 register vec_u8_t tempmask; 752 register vec_u8 tempmask;
753 753
754 mask = diff_lt_altivec(p0, q0, alpha); 754 mask = diff_lt_altivec(p0, q0, alpha);
755 tempmask = diff_lt_altivec(p1, p0, beta); 755 tempmask = diff_lt_altivec(p1, p0, beta);
756 mask = vec_and(mask, tempmask); 756 mask = vec_and(mask, tempmask);
757 tempmask = diff_lt_altivec(q1, q0, beta); 757 tempmask = diff_lt_altivec(q1, q0, beta);
759 759
760 return mask; 760 return mask;
761 } 761 }
762 762
763 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) 763 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
764 static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, 764 static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
765 register vec_u8_t p1, 765 register vec_u8 p1,
766 register vec_u8_t p2, 766 register vec_u8 p2,
767 register vec_u8_t q0, 767 register vec_u8 q0,
768 register vec_u8_t tc0) { 768 register vec_u8 tc0) {
769 769
770 register vec_u8_t average = vec_avg(p0, q0); 770 register vec_u8 average = vec_avg(p0, q0);
771 register vec_u8_t temp; 771 register vec_u8 temp;
772 register vec_u8_t uncliped; 772 register vec_u8 uncliped;
773 register vec_u8_t ones; 773 register vec_u8 ones;
774 register vec_u8_t max; 774 register vec_u8 max;
775 register vec_u8_t min; 775 register vec_u8 min;
776 register vec_u8_t newp1; 776 register vec_u8 newp1;
777 777
778 temp = vec_xor(average, p2); 778 temp = vec_xor(average, p2);
779 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ 779 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
780 ones = vec_splat_u8(1); 780 ones = vec_splat_u8(1);
781 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ 781 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */
787 return newp1; 787 return newp1;
788 } 788 }
789 789
790 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ 790 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
791 \ 791 \
792 const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ 792 const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
793 \ 793 \
794 register vec_u8_t pq0bit = vec_xor(p0,q0); \ 794 register vec_u8 pq0bit = vec_xor(p0,q0); \
795 register vec_u8_t q1minus; \ 795 register vec_u8 q1minus; \
796 register vec_u8_t p0minus; \ 796 register vec_u8 p0minus; \
797 register vec_u8_t stage1; \ 797 register vec_u8 stage1; \
798 register vec_u8_t stage2; \ 798 register vec_u8 stage2; \
799 register vec_u8_t vec160; \ 799 register vec_u8 vec160; \
800 register vec_u8_t delta; \ 800 register vec_u8 delta; \
801 register vec_u8_t deltaneg; \ 801 register vec_u8 deltaneg; \
802 \ 802 \
803 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ 803 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
804 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ 804 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
805 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ 805 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
806 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ 806 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \
819 q0 = vec_adds(q0, deltaneg); \ 819 q0 = vec_adds(q0, deltaneg); \
820 } 820 }
821 821
822 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ 822 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
823 DECLARE_ALIGNED_16(unsigned char, temp[16]); \ 823 DECLARE_ALIGNED_16(unsigned char, temp[16]); \
824 register vec_u8_t alphavec; \ 824 register vec_u8 alphavec; \
825 register vec_u8_t betavec; \ 825 register vec_u8 betavec; \
826 register vec_u8_t mask; \ 826 register vec_u8 mask; \
827 register vec_u8_t p1mask; \ 827 register vec_u8 p1mask; \
828 register vec_u8_t q1mask; \ 828 register vec_u8 q1mask; \
829 register vector signed char tc0vec; \ 829 register vector signed char tc0vec; \
830 register vec_u8_t finaltc0; \ 830 register vec_u8 finaltc0; \
831 register vec_u8_t tc0masked; \ 831 register vec_u8 tc0masked; \
832 register vec_u8_t newp1; \ 832 register vec_u8 newp1; \
833 register vec_u8_t newq1; \ 833 register vec_u8 newq1; \
834 \ 834 \
835 temp[0] = alpha; \ 835 temp[0] = alpha; \
836 temp[1] = beta; \ 836 temp[1] = beta; \
837 alphavec = vec_ld(0, temp); \ 837 alphavec = vec_ld(0, temp); \
838 betavec = vec_splat(alphavec, 0x1); \ 838 betavec = vec_splat(alphavec, 0x1); \
842 *((int *)temp) = *((int *)tc0); \ 842 *((int *)temp) = *((int *)tc0); \
843 tc0vec = vec_ld(0, (signed char*)temp); \ 843 tc0vec = vec_ld(0, (signed char*)temp); \
844 tc0vec = vec_mergeh(tc0vec, tc0vec); \ 844 tc0vec = vec_mergeh(tc0vec, tc0vec); \
845 tc0vec = vec_mergeh(tc0vec, tc0vec); \ 845 tc0vec = vec_mergeh(tc0vec, tc0vec); \
846 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ 846 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
847 finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \ 847 finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \
848 \ 848 \
849 p1mask = diff_lt_altivec(p2, p0, betavec); \ 849 p1mask = diff_lt_altivec(p2, p0, betavec); \
850 p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ 850 p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \
851 tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \ 851 tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \
852 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ 852 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
853 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ 853 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
854 /*end if*/ \ 854 /*end if*/ \
855 \ 855 \
856 q1mask = diff_lt_altivec(q2, q0, betavec); \ 856 q1mask = diff_lt_altivec(q2, q0, betavec); \
857 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ 857 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
858 tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \ 858 tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \
859 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ 859 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
860 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ 860 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
861 /*end if*/ \ 861 /*end if*/ \
862 \ 862 \
863 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ 863 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
866 } 866 }
867 867
868 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { 868 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
869 869
870 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { 870 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
871 register vec_u8_t p2 = vec_ld(-3*stride, pix); 871 register vec_u8 p2 = vec_ld(-3*stride, pix);
872 register vec_u8_t p1 = vec_ld(-2*stride, pix); 872 register vec_u8 p1 = vec_ld(-2*stride, pix);
873 register vec_u8_t p0 = vec_ld(-1*stride, pix); 873 register vec_u8 p0 = vec_ld(-1*stride, pix);
874 register vec_u8_t q0 = vec_ld(0, pix); 874 register vec_u8 q0 = vec_ld(0, pix);
875 register vec_u8_t q1 = vec_ld(stride, pix); 875 register vec_u8 q1 = vec_ld(stride, pix);
876 register vec_u8_t q2 = vec_ld(2*stride, pix); 876 register vec_u8 q2 = vec_ld(2*stride, pix);
877 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); 877 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
878 vec_st(p1, -2*stride, pix); 878 vec_st(p1, -2*stride, pix);
879 vec_st(p0, -1*stride, pix); 879 vec_st(p0, -1*stride, pix);
880 vec_st(q0, 0, pix); 880 vec_st(q0, 0, pix);
881 vec_st(q1, stride, pix); 881 vec_st(q1, stride, pix);
882 } 882 }
883 } 883 }
884 884
885 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { 885 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
886 886
887 register vec_u8_t line0, line1, line2, line3, line4, line5; 887 register vec_u8 line0, line1, line2, line3, line4, line5;
888 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) 888 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
889 return; 889 return;
890 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); 890 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
891 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); 891 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
892 transpose4x16(line1, line2, line3, line4); 892 transpose4x16(line1, line2, line3, line4);