comparison ppc/h264_altivec.c @ 8494:1615d6b75ada libavcodec
Cleanup _t types in libavcodec/ppc
author:   lu_zero
date:     Sat, 27 Dec 2008 11:21:28 +0000
parents:  11307ea31e57
children: 992e2f8bfba7
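The changeset drops the `_t` suffix from the AltiVec type aliases used throughout libavcodec/ppc, presumably because names ending in `_t` sit in a namespace reserved by POSIX. A minimal sketch of the accompanying header change (assuming the aliases are plain #defines in types_altivec.h, as the usage below suggests):

    /* before: POSIX-reserved _t names */
    #define vec_u8_t  vector unsigned char
    #define vec_s16_t vector signed short
    #define vec_u16_t vector unsigned short
    #define vec_u32_t vector unsigned int

    /* after: the same aliases without the reserved suffix */
    #define vec_u8  vector unsigned char
    #define vec_s16 vector signed short
    #define vec_u16 vector unsigned short
    #define vec_u32 vector unsigned int

Every hunk below is this rename applied mechanically; the vector code itself is unchanged.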
8493:469f3e5bcf13 | 8494:1615d6b75ada |
---|---|
187 {((8 - x) * (8 - y)), | 187 {((8 - x) * (8 - y)), |
188 ((x) * (8 - y)), | 188 ((x) * (8 - y)), |
189 ((8 - x) * (y)), | 189 ((8 - x) * (y)), |
190 ((x) * (y))}; | 190 ((x) * (y))}; |
191 register int i; | 191 register int i; |
192 vec_u8_t fperm; | 192 vec_u8 fperm; |
193 const vec_s32_t vABCD = vec_ld(0, ABCD); | 193 const vec_s32 vABCD = vec_ld(0, ABCD); |
194 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); | 194 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); |
195 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); | 195 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); |
196 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); | 196 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); |
197 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); | 197 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); |
198 LOAD_ZERO; | 198 LOAD_ZERO; |
199 const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); | 199 const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); |
200 const vec_u16_t v6us = vec_splat_u16(6); | 200 const vec_u16 v6us = vec_splat_u16(6); |
201 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; | 201 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; |
202 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; | 202 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; |
203 | 203 |
204 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; | 204 vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; |
205 vec_u8_t vsrc0uc, vsrc1uc; | 205 vec_u8 vsrc0uc, vsrc1uc; |
206 vec_s16_t vsrc0ssH, vsrc1ssH; | 206 vec_s16 vsrc0ssH, vsrc1ssH; |
207 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; | 207 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; |
208 vec_s16_t vsrc2ssH, vsrc3ssH, psum; | 208 vec_s16 vsrc2ssH, vsrc3ssH, psum; |
209 vec_u8_t vdst, ppsum, fsum; | 209 vec_u8 vdst, ppsum, fsum; |
210 | 210 |
211 if (((unsigned long)dst) % 16 == 0) { | 211 if (((unsigned long)dst) % 16 == 0) { |
212 fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13, | 212 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, |
213 0x14, 0x15, 0x16, 0x17, | 213 0x14, 0x15, 0x16, 0x17, |
214 0x08, 0x09, 0x0A, 0x0B, | 214 0x08, 0x09, 0x0A, 0x0B, |
215 0x0C, 0x0D, 0x0E, 0x0F}; | 215 0x0C, 0x0D, 0x0E, 0x0F}; |
216 } else { | 216 } else { |
217 fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03, | 217 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, |
218 0x04, 0x05, 0x06, 0x07, | 218 0x04, 0x05, 0x06, 0x07, |
219 0x18, 0x19, 0x1A, 0x1B, | 219 0x18, 0x19, 0x1A, 0x1B, |
220 0x1C, 0x1D, 0x1E, 0x1F}; | 220 0x1C, 0x1D, 0x1E, 0x1F}; |
221 } | 221 } |
222 | 222 |
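For reference, the ABCD constants at the top of this hunk are the standard bilinear chroma-interpolation weights, and v28ss is (1 << 5) - 4 = 28, which suggests this is the no-rounding chroma variant. A scalar model of one output pixel (hypothetical helper, not part of this file):

    #include <stdint.h>

    /* One pixel of bilinear chroma MC as vectorized above; x, y are the
     * fractional offsets (0..7), so A + B + C + D == 64 and the result
     * is renormalized by the +28 bias and >>6 seen in the vector code. */
    static inline uint8_t chroma_mc_pixel(const uint8_t *src, int stride,
                                          int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B =      x  * (8 - y);
        const int C = (8 - x) *      y;
        const int D =      x  *      y;
        return (A * src[0]      + B * src[1] +
                C * src[stride] + D * src[stride + 1] + 28) >> 6;
    }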
231 if (reallyBadAlign) | 231 if (reallyBadAlign) |
232 vsrc1uc = vsrcBuc; | 232 vsrc1uc = vsrcBuc; |
233 else | 233 else |
234 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); | 234 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); |
235 | 235 |
236 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc); | 236 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); |
237 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc); | 237 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc); |
238 | 238 |
239 if (!loadSecond) {// -> !reallyBadAlign | 239 if (!loadSecond) {// -> !reallyBadAlign |
240 for (i = 0 ; i < h ; i++) { | 240 for (i = 0 ; i < h ; i++) { |
241 | 241 |
242 | 242 |
243 vsrcCuc = vec_ld(stride + 0, src); | 243 vsrcCuc = vec_ld(stride + 0, src); |
244 | 244 |
245 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | 245 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); |
246 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | 246 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); |
247 | 247 |
248 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); | 248 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc); |
249 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); | 249 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc); |
250 | 250 |
251 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 251 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
252 psum = vec_mladd(vB, vsrc1ssH, psum); | 252 psum = vec_mladd(vB, vsrc1ssH, psum); |
253 psum = vec_mladd(vC, vsrc2ssH, psum); | 253 psum = vec_mladd(vC, vsrc2ssH, psum); |
254 psum = vec_mladd(vD, vsrc3ssH, psum); | 254 psum = vec_mladd(vD, vsrc3ssH, psum); |
255 psum = vec_add(v28ss, psum); | 255 psum = vec_add(v28ss, psum); |
256 psum = vec_sra(psum, v6us); | 256 psum = vec_sra(psum, v6us); |
257 | 257 |
258 vdst = vec_ld(0, dst); | 258 vdst = vec_ld(0, dst); |
259 ppsum = (vec_u8_t)vec_packsu(psum, psum); | 259 ppsum = (vec_u8)vec_packsu(psum, psum); |
260 fsum = vec_perm(vdst, ppsum, fperm); | 260 fsum = vec_perm(vdst, ppsum, fperm); |
261 | 261 |
262 vec_st(fsum, 0, dst); | 262 vec_st(fsum, 0, dst); |
263 | 263 |
264 vsrc0ssH = vsrc2ssH; | 264 vsrc0ssH = vsrc2ssH; |
266 | 266 |
267 dst += stride; | 267 dst += stride; |
268 src += stride; | 268 src += stride; |
269 } | 269 } |
270 } else { | 270 } else { |
271 vec_u8_t vsrcDuc; | 271 vec_u8 vsrcDuc; |
272 for (i = 0 ; i < h ; i++) { | 272 for (i = 0 ; i < h ; i++) { |
273 vsrcCuc = vec_ld(stride + 0, src); | 273 vsrcCuc = vec_ld(stride + 0, src); |
274 vsrcDuc = vec_ld(stride + 16, src); | 274 vsrcDuc = vec_ld(stride + 16, src); |
275 | 275 |
276 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | 276 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
277 if (reallyBadAlign) | 277 if (reallyBadAlign) |
278 vsrc3uc = vsrcDuc; | 278 vsrc3uc = vsrcDuc; |
279 else | 279 else |
280 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); | 280 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); |
281 | 281 |
282 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); | 282 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc); |
283 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); | 283 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc); |
284 | 284 |
285 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 285 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
286 psum = vec_mladd(vB, vsrc1ssH, psum); | 286 psum = vec_mladd(vB, vsrc1ssH, psum); |
287 psum = vec_mladd(vC, vsrc2ssH, psum); | 287 psum = vec_mladd(vC, vsrc2ssH, psum); |
288 psum = vec_mladd(vD, vsrc3ssH, psum); | 288 psum = vec_mladd(vD, vsrc3ssH, psum); |
289 psum = vec_add(v28ss, psum); | 289 psum = vec_add(v28ss, psum); |
290 psum = vec_sr(psum, v6us); | 290 psum = vec_sr(psum, v6us); |
291 | 291 |
292 vdst = vec_ld(0, dst); | 292 vdst = vec_ld(0, dst); |
293 ppsum = (vec_u8_t)vec_pack(psum, psum); | 293 ppsum = (vec_u8)vec_pack(psum, psum); |
294 fsum = vec_perm(vdst, ppsum, fperm); | 294 fsum = vec_perm(vdst, ppsum, fperm); |
295 | 295 |
296 vec_st(fsum, 0, dst); | 296 vec_st(fsum, 0, dst); |
297 | 297 |
298 vsrc0ssH = vsrc2ssH; | 298 vsrc0ssH = vsrc2ssH; |
307 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, | 307 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, |
308 const uint8_t * src2, int dst_stride, | 308 const uint8_t * src2, int dst_stride, |
309 int src_stride1, int h) | 309 int src_stride1, int h) |
310 { | 310 { |
311 int i; | 311 int i; |
312 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; | 312 vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; |
313 | 313 |
314 mask_ = vec_lvsl(0, src2); | 314 mask_ = vec_lvsl(0, src2); |
315 | 315 |
316 for (i = 0; i < h; i++) { | 316 for (i = 0; i < h; i++) { |
317 | 317 |
349 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, | 349 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, |
350 const uint8_t * src2, int dst_stride, | 350 const uint8_t * src2, int dst_stride, |
351 int src_stride1, int h) | 351 int src_stride1, int h) |
352 { | 352 { |
353 int i; | 353 int i; |
354 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; | 354 vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; |
355 | 355 |
356 mask_ = vec_lvsl(0, src2); | 356 mask_ = vec_lvsl(0, src2); |
357 | 357 |
358 for (i = 0; i < h; i++) { | 358 for (i = 0; i < h; i++) { |
359 | 359 |
430 b3 = vec_mergel( a1, a3 ) | 430 b3 = vec_mergel( a1, a3 ) |
431 | 431 |
432 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ | 432 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ |
433 vdst_orig = vec_ld(0, dst); \ | 433 vdst_orig = vec_ld(0, dst); \ |
434 vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ | 434 vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ |
435 vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \ | 435 vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \ |
436 va = vec_add(va, vdst_ss); \ | 436 va = vec_add(va, vdst_ss); \ |
437 va_u8 = vec_packsu(va, zero_s16v); \ | 437 va_u8 = vec_packsu(va, zero_s16v); \ |
438 va_u32 = vec_splat((vec_u32_t)va_u8, 0); \ | 438 va_u32 = vec_splat((vec_u32)va_u8, 0); \ |
439 vec_ste(va_u32, element, (uint32_t*)dst); | 439 vec_ste(va_u32, element, (uint32_t*)dst); |
440 | 440 |
441 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) | 441 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) |
442 { | 442 { |
443 vec_s16_t va0, va1, va2, va3; | 443 vec_s16 va0, va1, va2, va3; |
444 vec_s16_t vz0, vz1, vz2, vz3; | 444 vec_s16 vz0, vz1, vz2, vz3; |
445 vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3; | 445 vec_s16 vtmp0, vtmp1, vtmp2, vtmp3; |
446 vec_u8_t va_u8; | 446 vec_u8 va_u8; |
447 vec_u32_t va_u32; | 447 vec_u32 va_u32; |
448 vec_s16_t vdst_ss; | 448 vec_s16 vdst_ss; |
449 const vec_u16_t v6us = vec_splat_u16(6); | 449 const vec_u16 v6us = vec_splat_u16(6); |
450 vec_u8_t vdst, vdst_orig; | 450 vec_u8 vdst, vdst_orig; |
451 vec_u8_t vdst_mask = vec_lvsl(0, dst); | 451 vec_u8 vdst_mask = vec_lvsl(0, dst); |
452 int element = ((unsigned long)dst & 0xf) >> 2; | 452 int element = ((unsigned long)dst & 0xf) >> 2; |
453 LOAD_ZERO; | 453 LOAD_ZERO; |
454 | 454 |
455 block[0] += 32; /* add 32 as a DC-level for rounding */ | 455 block[0] += 32; /* add 32 as a DC-level for rounding */ |
456 | 456 |
477 VEC_LOAD_U8_ADD_S16_STORE_U8(va3); | 477 VEC_LOAD_U8_ADD_S16_STORE_U8(va3); |
478 } | 478 } |
479 | 479 |
480 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ | 480 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ |
481 /* a0 = SRC(0) + SRC(4); */ \ | 481 /* a0 = SRC(0) + SRC(4); */ \ |
482 vec_s16_t a0v = vec_add(s0, s4); \ | 482 vec_s16 a0v = vec_add(s0, s4); \ |
483 /* a2 = SRC(0) - SRC(4); */ \ | 483 /* a2 = SRC(0) - SRC(4); */ \ |
484 vec_s16_t a2v = vec_sub(s0, s4); \ | 484 vec_s16 a2v = vec_sub(s0, s4); \ |
485 /* a4 = (SRC(2)>>1) - SRC(6); */ \ | 485 /* a4 = (SRC(2)>>1) - SRC(6); */ \ |
486 vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \ | 486 vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \ |
487 /* a6 = (SRC(6)>>1) + SRC(2); */ \ | 487 /* a6 = (SRC(6)>>1) + SRC(2); */ \ |
488 vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \ | 488 vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \ |
489 /* b0 = a0 + a6; */ \ | 489 /* b0 = a0 + a6; */ \ |
490 vec_s16_t b0v = vec_add(a0v, a6v); \ | 490 vec_s16 b0v = vec_add(a0v, a6v); \ |
491 /* b2 = a2 + a4; */ \ | 491 /* b2 = a2 + a4; */ \ |
492 vec_s16_t b2v = vec_add(a2v, a4v); \ | 492 vec_s16 b2v = vec_add(a2v, a4v); \ |
493 /* b4 = a2 - a4; */ \ | 493 /* b4 = a2 - a4; */ \ |
494 vec_s16_t b4v = vec_sub(a2v, a4v); \ | 494 vec_s16 b4v = vec_sub(a2v, a4v); \ |
495 /* b6 = a0 - a6; */ \ | 495 /* b6 = a0 - a6; */ \ |
496 vec_s16_t b6v = vec_sub(a0v, a6v); \ | 496 vec_s16 b6v = vec_sub(a0v, a6v); \ |
497 /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ | 497 /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ |
498 /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ | 498 /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ |
499 vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ | 499 vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ |
500 /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ | 500 /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ |
501 /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ | 501 /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ |
502 vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ | 502 vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ |
503 /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ | 503 /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ |
504 /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ | 504 /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ |
505 vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ | 505 vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ |
506 /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ | 506 /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ |
507 vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ | 507 vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ |
508 /* b1 = (a7>>2) + a1; */ \ | 508 /* b1 = (a7>>2) + a1; */ \ |
509 vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \ | 509 vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \ |
510 /* b3 = a3 + (a5>>2); */ \ | 510 /* b3 = a3 + (a5>>2); */ \ |
511 vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \ | 511 vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \ |
512 /* b5 = (a3>>2) - a5; */ \ | 512 /* b5 = (a3>>2) - a5; */ \ |
513 vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \ | 513 vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \ |
514 /* b7 = a7 - (a1>>2); */ \ | 514 /* b7 = a7 - (a1>>2); */ \ |
515 vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ | 515 vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ |
516 /* DST(0, b0 + b7); */ \ | 516 /* DST(0, b0 + b7); */ \ |
517 d0 = vec_add(b0v, b7v); \ | 517 d0 = vec_add(b0v, b7v); \ |
518 /* DST(1, b2 + b5); */ \ | 518 /* DST(1, b2 + b5); */ \ |
519 d1 = vec_add(b2v, b5v); \ | 519 d1 = vec_add(b2v, b5v); \ |
520 /* DST(2, b4 + b3); */ \ | 520 /* DST(2, b4 + b3); */ \ |
531 d7 = vec_sub(b0v, b7v); \ | 531 d7 = vec_sub(b0v, b7v); \ |
532 } | 532 } |
533 | 533 |
534 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ | 534 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ |
535 /* unaligned load */ \ | 535 /* unaligned load */ \ |
536 vec_u8_t hv = vec_ld( 0, dest ); \ | 536 vec_u8 hv = vec_ld( 0, dest ); \ |
537 vec_u8_t lv = vec_ld( 7, dest ); \ | 537 vec_u8 lv = vec_ld( 7, dest ); \ |
538 vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ | 538 vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \ |
539 vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ | 539 vec_s16 idct_sh6 = vec_sra(idctv, sixv); \ |
540 vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ | 540 vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \ |
541 vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ | 541 vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \ |
542 vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ | 542 vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \ |
543 vec_u8_t edgehv; \ | 543 vec_u8 edgehv; \ |
544 /* unaligned store */ \ | 544 /* unaligned store */ \ |
545 vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ | 545 vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ |
546 vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ | 546 vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ |
547 lv = vec_sel( lv, bodyv, edgelv ); \ | 547 lv = vec_sel( lv, bodyv, edgelv ); \ |
548 vec_st( lv, 7, dest ); \ | 548 vec_st( lv, 7, dest ); \ |
549 hv = vec_ld( 0, dest ); \ | 549 hv = vec_ld( 0, dest ); \ |
550 edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ | 550 edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ |
551 hv = vec_sel( hv, bodyv, edgehv ); \ | 551 hv = vec_sel( hv, bodyv, edgehv ); \ |
552 vec_st( hv, 0, dest ); \ | 552 vec_st( hv, 0, dest ); \ |
553 } | 553 } |
554 | 554 |
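ALTIVEC_STORE_SUM_CLIP works around the lack of unaligned vector stores: it loads the two 16-byte blocks covering the destination, rotates the 8-byte payload into position with the vec_lvsr permute, and vec_sels it in under an edge mask so the surrounding bytes survive. A scalar model of that masked read-modify-write (hypothetical helper):

    #include <stdint.h>

    /* Masked read-modify-write store: only bytes selected by mask are
     * replaced, which is what the vec_sel + vec_st pairs above achieve
     * for the low and high halves of the straddled destination. */
    static inline void store_masked(uint8_t *dst, const uint8_t *body,
                                    const uint8_t *mask, int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] = (uint8_t)((dst[i] & ~mask[i]) | (body[i] & mask[i]));
    }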
555 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { | 555 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { |
556 vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7; | 556 vec_s16 s0, s1, s2, s3, s4, s5, s6, s7; |
557 vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; | 557 vec_s16 d0, d1, d2, d3, d4, d5, d6, d7; |
558 vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; | 558 vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; |
559 | 559 |
560 vec_u8_t perm_ldv = vec_lvsl(0, dst); | 560 vec_u8 perm_ldv = vec_lvsl(0, dst); |
561 vec_u8_t perm_stv = vec_lvsr(8, dst); | 561 vec_u8 perm_stv = vec_lvsr(8, dst); |
562 | 562 |
563 const vec_u16_t onev = vec_splat_u16(1); | 563 const vec_u16 onev = vec_splat_u16(1); |
564 const vec_u16_t twov = vec_splat_u16(2); | 564 const vec_u16 twov = vec_splat_u16(2); |
565 const vec_u16_t sixv = vec_splat_u16(6); | 565 const vec_u16 sixv = vec_splat_u16(6); |
566 | 566 |
567 const vec_u8_t sel = (vec_u8_t) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; | 567 const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; |
568 LOAD_ZERO; | 568 LOAD_ZERO; |
569 | 569 |
570 dct[0] += 32; // rounding for the >>6 at the end | 570 dct[0] += 32; // rounding for the >>6 at the end |
571 | 571 |
572 s0 = vec_ld(0x00, (int16_t*)dct); | 572 s0 = vec_ld(0x00, (int16_t*)dct); |
619 } | 619 } |
620 } | 620 } |
621 } | 621 } |
622 | 622 |
623 #define transpose4x16(r0, r1, r2, r3) { \ | 623 #define transpose4x16(r0, r1, r2, r3) { \ |
624 register vec_u8_t r4; \ | 624 register vec_u8 r4; \ |
625 register vec_u8_t r5; \ | 625 register vec_u8 r5; \ |
626 register vec_u8_t r6; \ | 626 register vec_u8 r6; \ |
627 register vec_u8_t r7; \ | 627 register vec_u8 r7; \ |
628 \ | 628 \ |
629 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ | 629 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ |
630 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ | 630 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ |
631 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ | 631 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ |
632 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ | 632 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ |
636 r2 = vec_mergeh(r5, r7); /*all set 2*/ \ | 636 r2 = vec_mergeh(r5, r7); /*all set 2*/ \ |
637 r3 = vec_mergel(r5, r7); /*all set 3*/ \ | 637 r3 = vec_mergel(r5, r7); /*all set 3*/ \ |
638 } | 638 } |
639 | 639 |
640 static inline void write16x4(uint8_t *dst, int dst_stride, | 640 static inline void write16x4(uint8_t *dst, int dst_stride, |
641 register vec_u8_t r0, register vec_u8_t r1, | 641 register vec_u8 r0, register vec_u8 r1, |
642 register vec_u8_t r2, register vec_u8_t r3) { | 642 register vec_u8 r2, register vec_u8 r3) { |
643 DECLARE_ALIGNED_16(unsigned char, result[64]); | 643 DECLARE_ALIGNED_16(unsigned char, result[64]); |
644 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; | 644 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; |
645 int int_dst_stride = dst_stride/4; | 645 int int_dst_stride = dst_stride/4; |
646 | 646 |
647 vec_st(r0, 0, result); | 647 vec_st(r0, 0, result); |
669 | 669 |
670 /** \brief performs a 6x16 transpose of data in src, and stores it to dst | 670 /** \brief performs a 6x16 transpose of data in src, and stores it to dst |
671 \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them | 671 \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them |
672 out of unaligned_load() */ | 672 out of unaligned_load() */ |
673 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ | 673 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ |
674 register vec_u8_t r0 = unaligned_load(0, src); \ | 674 register vec_u8 r0 = unaligned_load(0, src); \ |
675 register vec_u8_t r1 = unaligned_load( src_stride, src); \ | 675 register vec_u8 r1 = unaligned_load( src_stride, src); \ |
676 register vec_u8_t r2 = unaligned_load(2* src_stride, src); \ | 676 register vec_u8 r2 = unaligned_load(2* src_stride, src); \ |
677 register vec_u8_t r3 = unaligned_load(3* src_stride, src); \ | 677 register vec_u8 r3 = unaligned_load(3* src_stride, src); \ |
678 register vec_u8_t r4 = unaligned_load(4* src_stride, src); \ | 678 register vec_u8 r4 = unaligned_load(4* src_stride, src); \ |
679 register vec_u8_t r5 = unaligned_load(5* src_stride, src); \ | 679 register vec_u8 r5 = unaligned_load(5* src_stride, src); \ |
680 register vec_u8_t r6 = unaligned_load(6* src_stride, src); \ | 680 register vec_u8 r6 = unaligned_load(6* src_stride, src); \ |
681 register vec_u8_t r7 = unaligned_load(7* src_stride, src); \ | 681 register vec_u8 r7 = unaligned_load(7* src_stride, src); \ |
682 register vec_u8_t r14 = unaligned_load(14*src_stride, src); \ | 682 register vec_u8 r14 = unaligned_load(14*src_stride, src); \ |
683 register vec_u8_t r15 = unaligned_load(15*src_stride, src); \ | 683 register vec_u8 r15 = unaligned_load(15*src_stride, src); \ |
684 \ | 684 \ |
685 r8 = unaligned_load( 8*src_stride, src); \ | 685 r8 = unaligned_load( 8*src_stride, src); \ |
686 r9 = unaligned_load( 9*src_stride, src); \ | 686 r9 = unaligned_load( 9*src_stride, src); \ |
687 r10 = unaligned_load(10*src_stride, src); \ | 687 r10 = unaligned_load(10*src_stride, src); \ |
688 r11 = unaligned_load(11*src_stride, src); \ | 688 r11 = unaligned_load(11*src_stride, src); \ |
728 /* Don't need to compute 14 and 15*/ \ | 728 /* Don't need to compute 14 and 15*/ \ |
729 \ | 729 \ |
730 } | 730 } |
731 | 731 |
732 // out: o = |x-y| < a | 732 // out: o = |x-y| < a |
733 static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x, | 733 static inline vec_u8 diff_lt_altivec ( register vec_u8 x, |
734 register vec_u8_t y, | 734 register vec_u8 y, |
735 register vec_u8_t a) { | 735 register vec_u8 a) { |
736 | 736 |
737 register vec_u8_t diff = vec_subs(x, y); | 737 register vec_u8 diff = vec_subs(x, y); |
738 register vec_u8_t diffneg = vec_subs(y, x); | 738 register vec_u8 diffneg = vec_subs(y, x); |
739 register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */ | 739 register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */ |
740 o = (vec_u8_t)vec_cmplt(o, a); | 740 o = (vec_u8)vec_cmplt(o, a); |
741 return o; | 741 return o; |
742 } | 742 } |
743 | 743 |
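diff_lt_altivec obtains |x - y| without any sign handling by using unsigned saturating subtraction: whichever difference goes the "wrong way" clamps to zero, so OR-ing the two yields the absolute difference. The same identity in scalar form (illustrative only):

    #include <stdint.h>

    /* (x -sat y) | (y -sat x) == |x - y| for unsigned bytes, because
     * at most one of the saturating differences is nonzero. */
    static inline int diff_lt_scalar(uint8_t x, uint8_t y, uint8_t a)
    {
        uint8_t diff    = x > y ? (uint8_t)(x - y) : 0; /* vec_subs(x, y) */
        uint8_t diffneg = y > x ? (uint8_t)(y - x) : 0; /* vec_subs(y, x) */
        return (uint8_t)(diff | diffneg) < a;           /* vec_cmplt      */
    }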
744 static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0, | 744 static inline vec_u8 h264_deblock_mask ( register vec_u8 p0, |
745 register vec_u8_t p1, | 745 register vec_u8 p1, |
746 register vec_u8_t q0, | 746 register vec_u8 q0, |
747 register vec_u8_t q1, | 747 register vec_u8 q1, |
748 register vec_u8_t alpha, | 748 register vec_u8 alpha, |
749 register vec_u8_t beta) { | 749 register vec_u8 beta) { |
750 | 750 |
751 register vec_u8_t mask; | 751 register vec_u8 mask; |
752 register vec_u8_t tempmask; | 752 register vec_u8 tempmask; |
753 | 753 |
754 mask = diff_lt_altivec(p0, q0, alpha); | 754 mask = diff_lt_altivec(p0, q0, alpha); |
755 tempmask = diff_lt_altivec(p1, p0, beta); | 755 tempmask = diff_lt_altivec(p1, p0, beta); |
756 mask = vec_and(mask, tempmask); | 756 mask = vec_and(mask, tempmask); |
757 tempmask = diff_lt_altivec(q1, q0, beta); | 757 tempmask = diff_lt_altivec(q1, q0, beta); |
759 | 759 |
760 return mask; | 760 return mask; |
761 } | 761 } |
762 | 762 |
763 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) | 763 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) |
764 static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, | 764 static inline vec_u8 h264_deblock_q1(register vec_u8 p0, |
765 register vec_u8_t p1, | 765 register vec_u8 p1, |
766 register vec_u8_t p2, | 766 register vec_u8 p2, |
767 register vec_u8_t q0, | 767 register vec_u8 q0, |
768 register vec_u8_t tc0) { | 768 register vec_u8 tc0) { |
769 | 769 |
770 register vec_u8_t average = vec_avg(p0, q0); | 770 register vec_u8 average = vec_avg(p0, q0); |
771 register vec_u8_t temp; | 771 register vec_u8 temp; |
772 register vec_u8_t uncliped; | 772 register vec_u8 uncliped; |
773 register vec_u8_t ones; | 773 register vec_u8 ones; |
774 register vec_u8_t max; | 774 register vec_u8 max; |
775 register vec_u8_t min; | 775 register vec_u8 min; |
776 register vec_u8_t newp1; | 776 register vec_u8 newp1; |
777 | 777 |
778 temp = vec_xor(average, p2); | 778 temp = vec_xor(average, p2); |
779 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ | 779 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ |
780 ones = vec_splat_u8(1); | 780 ones = vec_splat_u8(1); |
781 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ | 781 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ |
787 return newp1; | 787 return newp1; |
788 } | 788 } |
789 | 789 |
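The xor/avg sequence in h264_deblock_q1 exists because vec_avg rounds up ((a + b + 1) >> 1) while the formula needs the truncating average; subtracting (a ^ b) & 1, the carry bit lost by truncation, converts one into the other. A scalar model of the correction (the subtraction itself falls in the elided lines):

    #include <stdint.h>

    /* floor((a + b) / 2) from the round-up average: the rounding error
     * of (a + b + 1) >> 1 is exactly (a ^ b) & 1. */
    static inline uint8_t avg_floor(uint8_t a, uint8_t b)
    {
        uint8_t avg_up = (uint8_t)((a + b + 1) >> 1); /* vec_avg          */
        uint8_t err    = (uint8_t)((a ^ b) & 1);      /* vec_and(., ones) */
        return (uint8_t)(avg_up - err);
    }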
790 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ | 790 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ |
791 \ | 791 \ |
792 const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ | 792 const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ |
793 \ | 793 \ |
794 register vec_u8_t pq0bit = vec_xor(p0,q0); \ | 794 register vec_u8 pq0bit = vec_xor(p0,q0); \ |
795 register vec_u8_t q1minus; \ | 795 register vec_u8 q1minus; \ |
796 register vec_u8_t p0minus; \ | 796 register vec_u8 p0minus; \ |
797 register vec_u8_t stage1; \ | 797 register vec_u8 stage1; \ |
798 register vec_u8_t stage2; \ | 798 register vec_u8 stage2; \ |
799 register vec_u8_t vec160; \ | 799 register vec_u8 vec160; \ |
800 register vec_u8_t delta; \ | 800 register vec_u8 delta; \ |
801 register vec_u8_t deltaneg; \ | 801 register vec_u8 deltaneg; \ |
802 \ | 802 \ |
803 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ | 803 q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ |
804 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ | 804 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ |
805 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ | 805 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ |
806 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ | 806 p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ |
819 q0 = vec_adds(q0, deltaneg); \ | 819 q0 = vec_adds(q0, deltaneg); \ |
820 } | 820 } |
821 | 821 |
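The nor/avg idiom at the top of h264_deblock_p0_q0 evaluates a signed difference entirely in unsigned bytes: vec_nor(q1, q1) is 255 - q1, so vec_avg(p1, ~q1) equals (p1 - q1 + 256) >> 1, a half-scaled difference biased by +128 that never leaves the uint8 domain. Scalar form of the identity:

    #include <stdint.h>

    /* (p1 + (255 - q1) + 1) >> 1 == (p1 - q1 + 256) >> 1: a /2-scaled
     * signed difference carried in unsigned bytes with a +128 bias. */
    static inline uint8_t biased_halfdiff(uint8_t p1, uint8_t q1)
    {
        uint8_t q1minus = (uint8_t)~q1;            /* vec_nor(q1, q1) */
        return (uint8_t)((p1 + q1minus + 1) >> 1); /* vec_avg         */
    }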
822 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ | 822 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ |
823 DECLARE_ALIGNED_16(unsigned char, temp[16]); \ | 823 DECLARE_ALIGNED_16(unsigned char, temp[16]); \ |
824 register vec_u8_t alphavec; \ | 824 register vec_u8 alphavec; \ |
825 register vec_u8_t betavec; \ | 825 register vec_u8 betavec; \ |
826 register vec_u8_t mask; \ | 826 register vec_u8 mask; \ |
827 register vec_u8_t p1mask; \ | 827 register vec_u8 p1mask; \ |
828 register vec_u8_t q1mask; \ | 828 register vec_u8 q1mask; \ |
829 register vector signed char tc0vec; \ | 829 register vector signed char tc0vec; \ |
830 register vec_u8_t finaltc0; \ | 830 register vec_u8 finaltc0; \ |
831 register vec_u8_t tc0masked; \ | 831 register vec_u8 tc0masked; \ |
832 register vec_u8_t newp1; \ | 832 register vec_u8 newp1; \ |
833 register vec_u8_t newq1; \ | 833 register vec_u8 newq1; \ |
834 \ | 834 \ |
835 temp[0] = alpha; \ | 835 temp[0] = alpha; \ |
836 temp[1] = beta; \ | 836 temp[1] = beta; \ |
837 alphavec = vec_ld(0, temp); \ | 837 alphavec = vec_ld(0, temp); \ |
838 betavec = vec_splat(alphavec, 0x1); \ | 838 betavec = vec_splat(alphavec, 0x1); \ |
842 *((int *)temp) = *((int *)tc0); \ | 842 *((int *)temp) = *((int *)tc0); \ |
843 tc0vec = vec_ld(0, (signed char*)temp); \ | 843 tc0vec = vec_ld(0, (signed char*)temp); \ |
844 tc0vec = vec_mergeh(tc0vec, tc0vec); \ | 844 tc0vec = vec_mergeh(tc0vec, tc0vec); \ |
845 tc0vec = vec_mergeh(tc0vec, tc0vec); \ | 845 tc0vec = vec_mergeh(tc0vec, tc0vec); \ |
846 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ | 846 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ |
847 finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \ | 847 finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \ |
848 \ | 848 \ |
849 p1mask = diff_lt_altivec(p2, p0, betavec); \ | 849 p1mask = diff_lt_altivec(p2, p0, betavec); \ |
850 p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ | 850 p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ |
851 tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \ | 851 tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \ |
852 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ | 852 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ |
853 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ | 853 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ |
854 /*end if*/ \ | 854 /*end if*/ \ |
855 \ | 855 \ |
856 q1mask = diff_lt_altivec(q2, q0, betavec); \ | 856 q1mask = diff_lt_altivec(q2, q0, betavec); \ |
857 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ | 857 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ |
858 tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \ | 858 tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \ |
859 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ | 859 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ |
860 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ | 860 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ |
861 /*end if*/ \ | 861 /*end if*/ \ |
862 \ | 862 \ |
863 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ | 863 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ |
866 } | 866 } |
867 | 867 |
868 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { | 868 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { |
869 | 869 |
870 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { | 870 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { |
871 register vec_u8_t p2 = vec_ld(-3*stride, pix); | 871 register vec_u8 p2 = vec_ld(-3*stride, pix); |
872 register vec_u8_t p1 = vec_ld(-2*stride, pix); | 872 register vec_u8 p1 = vec_ld(-2*stride, pix); |
873 register vec_u8_t p0 = vec_ld(-1*stride, pix); | 873 register vec_u8 p0 = vec_ld(-1*stride, pix); |
874 register vec_u8_t q0 = vec_ld(0, pix); | 874 register vec_u8 q0 = vec_ld(0, pix); |
875 register vec_u8_t q1 = vec_ld(stride, pix); | 875 register vec_u8 q1 = vec_ld(stride, pix); |
876 register vec_u8_t q2 = vec_ld(2*stride, pix); | 876 register vec_u8 q2 = vec_ld(2*stride, pix); |
877 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); | 877 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); |
878 vec_st(p1, -2*stride, pix); | 878 vec_st(p1, -2*stride, pix); |
879 vec_st(p0, -1*stride, pix); | 879 vec_st(p0, -1*stride, pix); |
880 vec_st(q0, 0, pix); | 880 vec_st(q0, 0, pix); |
881 vec_st(q1, stride, pix); | 881 vec_st(q1, stride, pix); |
882 } | 882 } |
883 } | 883 } |
884 | 884 |
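The guard used by both luma filters (here and in the horizontal variant just below), (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0, is a sign-bit trick: the bitwise AND of signed bytes is negative only when every operand is negative, so the test passes as soon as any of the four edges has tc0[i] >= 0 (tc0[i] == -1 marks an edge to skip). Equivalently:

    #include <stdint.h>

    /* Branch-free "at least one tc0[i] >= 0": ANDing the four values
     * keeps the sign bit set only if all of them are negative. */
    static inline int any_edge_filtered(const int8_t *tc0)
    {
        return (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0;
    }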
885 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { | 885 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { |
886 | 886 |
887 register vec_u8_t line0, line1, line2, line3, line4, line5; | 887 register vec_u8 line0, line1, line2, line3, line4, line5; |
888 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) | 888 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) |
889 return; | 889 return; |
890 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); | 890 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); |
891 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); | 891 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); |
892 transpose4x16(line1, line2, line3, line4); | 892 transpose4x16(line1, line2, line3, line4); |