comparison ppc/h264_template_altivec.c @ 5603:861eb234e6ba libavcodec
remove alignment correction of the destination pointers in luma 16x16
interpolations, since they are always 16-byte aligned in practice.
Add asserts to ease narrowing down potential image corruptions on exotic platforms.
Based on a patch by Mauricio Alvarez % lokifo A gmail P com %
Original thread:
Date: Jun 26, 2007 1:07 PM
Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec
author | gpoirier |
---|---|
date | Sun, 26 Aug 2007 21:47:58 +0000 |
parents | cd266411b11a |
children | 93089aed00cb |
5602:3b21f3268707 | 5603:861eb234e6ba |
---|---|
15 * | 15 * |
16 * You should have received a copy of the GNU Lesser General Public | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with FFmpeg; if not, write to the Free Software | 17 * License along with FFmpeg; if not, write to the Free Software |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 */ | 19 */ |
| 20 |
| 21 //#define DEBUG_ALIGNMENT |
| 22 #ifdef DEBUG_ALIGNMENT |
| 23 #define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F)); |
| 24 #else |
| 25 #define ASSERT_ALIGNED(ptr) ; |
| 26 #endif |
20 | 27 |
21 /* this code assumes that stride % 16 == 0 */ | 28 /* this code assumes that stride % 16 == 0 */ |
22 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { | 29 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { |
23 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); | 30 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); |
24 DECLARE_ALIGNED_16(signed int, ABCD[4]) = | 31 DECLARE_ALIGNED_16(signed int, ABCD[4]) = |
163 const vec_u8_t permP3 = vec_lvsl(+3, src); | 170 const vec_u8_t permP3 = vec_lvsl(+3, src); |
164 const vec_s16_t v5ss = vec_splat_s16(5); | 171 const vec_s16_t v5ss = vec_splat_s16(5); |
165 const vec_u16_t v5us = vec_splat_u16(5); | 172 const vec_u16_t v5us = vec_splat_u16(5); |
166 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 173 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
167 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | 174 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
168 const vec_u8_t dstperm = vec_lvsr(0, dst); | |
169 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1); | |
170 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm); | |
171 | 175 |
172 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | 176 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
173 | 177 |
174 register int align = ((((unsigned long)src) - 2) % 16); | 178 register int align = ((((unsigned long)src) - 2) % 16); |
175 | 179 |
178 srcM1A, srcM1B, srcM2A, srcM2B, | 182 srcM1A, srcM1B, srcM2A, srcM2B, |
179 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | 183 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
180 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | 184 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
181 psumA, psumB, sumA, sumB; | 185 psumA, psumB, sumA, sumB; |
182 | 186 |
183 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2; | 187 vec_u8_t sum, vdst, fsum; |
184 | 188 |
185 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | 189 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
186 | 190 |
187 for (i = 0 ; i < 16 ; i ++) { | 191 for (i = 0 ; i < 16 ; i ++) { |
188 vec_u8_t srcR1 = vec_ld(-2, src); | 192 vec_u8_t srcR1 = vec_ld(-2, src); |
280 sumA = vec_sra(psumA, v5us); | 284 sumA = vec_sra(psumA, v5us); |
281 sumB = vec_sra(psumB, v5us); | 285 sumB = vec_sra(psumB, v5us); |
282 | 286 |
283 sum = vec_packsu(sumA, sumB); | 287 sum = vec_packsu(sumA, sumB); |
284 | 288 |
285 dst1 = vec_ld(0, dst); | 289 ASSERT_ALIGNED(dst); |
286 dst2 = vec_ld(16, dst); | 290 vdst = vec_ld(0, dst); |
287 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); | |
288 | 291 |
289 OP_U8_ALTIVEC(fsum, sum, vdst); | 292 OP_U8_ALTIVEC(fsum, sum, vdst); |
290 | 293 |
291 rsum = vec_perm(fsum, fsum, dstperm); | 294 vec_st(fsum, 0, dst); |
292 fdst1 = vec_sel(dst1, rsum, dstmask); | |
293 fdst2 = vec_sel(rsum, dst2, dstmask); | |
294 | |
295 vec_st(fdst1, 0, dst); | |
296 vec_st(fdst2, 16, dst); | |
297 | 295 |
298 src += srcStride; | 296 src += srcStride; |
299 dst += dstStride; | 297 dst += dstStride; |
300 } | 298 } |
301 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | 299 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
311 const vec_u8_t perm = vec_lvsl(0, src); | 309 const vec_u8_t perm = vec_lvsl(0, src); |
312 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 310 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
313 const vec_u16_t v5us = vec_splat_u16(5); | 311 const vec_u16_t v5us = vec_splat_u16(5); |
314 const vec_s16_t v5ss = vec_splat_s16(5); | 312 const vec_s16_t v5ss = vec_splat_s16(5); |
315 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | 313 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
316 const vec_u8_t dstperm = vec_lvsr(0, dst); | |
317 const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1); | |
318 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm); | |
319 | 314 |
320 uint8_t *srcbis = src - (srcStride * 2); | 315 uint8_t *srcbis = src - (srcStride * 2); |
321 | 316 |
322 const vec_u8_t srcM2a = vec_ld(0, srcbis); | 317 const vec_u8_t srcM2a = vec_ld(0, srcbis); |
323 const vec_u8_t srcM2b = vec_ld(16, srcbis); | 318 const vec_u8_t srcM2b = vec_ld(16, srcbis); |
354 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | 349 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
355 psumA, psumB, sumA, sumB, | 350 psumA, psumB, sumA, sumB, |
356 srcP3ssA, srcP3ssB, | 351 srcP3ssA, srcP3ssB, |
357 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; | 352 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; |
358 | 353 |
359 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3; | 354 vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; |
360 | 355 |
361 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | 356 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
362 | 357 |
363 for (i = 0 ; i < 16 ; i++) { | 358 for (i = 0 ; i < 16 ; i++) { |
364 srcP3a = vec_ld(0, srcbis += srcStride); | 359 srcP3a = vec_ld(0, srcbis += srcStride); |
401 sumA = vec_sra(psumA, v5us); | 396 sumA = vec_sra(psumA, v5us); |
402 sumB = vec_sra(psumB, v5us); | 397 sumB = vec_sra(psumB, v5us); |
403 | 398 |
404 sum = vec_packsu(sumA, sumB); | 399 sum = vec_packsu(sumA, sumB); |
405 | 400 |
406 dst1 = vec_ld(0, dst); | 401 ASSERT_ALIGNED(dst); |
407 dst2 = vec_ld(16, dst); | 402 vdst = vec_ld(0, dst); |
408 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); | |
409 | 403 |
410 OP_U8_ALTIVEC(fsum, sum, vdst); | 404 OP_U8_ALTIVEC(fsum, sum, vdst); |
411 | 405 |
412 rsum = vec_perm(fsum, fsum, dstperm); | 406 vec_st(fsum, 0, dst); |
413 fdst1 = vec_sel(dst1, rsum, dstmask); | |
414 fdst2 = vec_sel(rsum, dst2, dstmask); | |
415 | |
416 vec_st(fdst1, 0, dst); | |
417 vec_st(fdst2, 16, dst); | |
418 | 407 |
419 dst += dstStride; | 408 dst += dstStride; |
420 } | 409 } |
421 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | 410 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
422 } | 411 } |
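A note on why the aligned load of vdst survives the cleanup even though the permute/select store machinery is gone: this template is compiled twice, once for the put_ functions and once for the avg_ functions, and OP_U8_ALTIVEC is expected to reduce to something like the following (a sketch of the macros in the including file, h264_altivec.c, from the same era; not part of this changeset):

```c
/* put_: the computed result simply replaces the destination. */
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
/* avg_: the result is averaged with the current destination contents,
 * which is why vdst must still be loaded before OP_U8_ALTIVEC runs. */
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
```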
439 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); | 428 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); |
440 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); | 429 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); |
441 | 430 |
442 register int align = ((((unsigned long)src) - 2) % 16); | 431 register int align = ((((unsigned long)src) - 2) % 16); |
443 | 432 |
444 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1); | |
445 | |
446 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, | 433 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, |
447 srcP2A, srcP2B, srcP3A, srcP3B, | 434 srcP2A, srcP2B, srcP3A, srcP3B, |
448 srcM1A, srcM1B, srcM2A, srcM2B, | 435 srcM1A, srcM1B, srcM2A, srcM2B, |
449 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | 436 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
450 pp1A, pp1B, pp2A, pp2B, psumA, psumB; | 437 pp1A, pp1B, pp2A, pp2B, psumA, psumB; |
451 | 438 |
452 const vec_u8_t dstperm = vec_lvsr(0, dst); | |
453 | |
454 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm); | |
455 | |
456 const vec_u8_t mperm = (const vec_u8_t) | 439 const vec_u8_t mperm = (const vec_u8_t) |
457 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, | 440 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, |
458 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); | 441 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); |
459 int16_t *tmpbis = tmp; | 442 int16_t *tmpbis = tmp; |
460 | 443 |
464 | 447 |
465 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, | 448 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, |
466 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, | 449 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, |
467 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, | 450 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, |
468 ssumAe, ssumAo, ssumBe, ssumBo; | 451 ssumAe, ssumAo, ssumBe, ssumBo; |
469 vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2; | 452 vec_u8_t fsum, sumv, sum, vdst; |
470 vec_s16_t ssume, ssumo; | 453 vec_s16_t ssume, ssumo; |
471 | 454 |
472 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | 455 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
473 src -= (2 * srcStride); | 456 src -= (2 * srcStride); |
474 for (i = 0 ; i < 21 ; i ++) { | 457 for (i = 0 ; i < 21 ; i ++) { |
648 ssumo = vec_packs(ssumAo, ssumBo); | 631 ssumo = vec_packs(ssumAo, ssumBo); |
649 | 632 |
650 sumv = vec_packsu(ssume, ssumo); | 633 sumv = vec_packsu(ssume, ssumo); |
651 sum = vec_perm(sumv, sumv, mperm); | 634 sum = vec_perm(sumv, sumv, mperm); |
652 | 635 |
653 dst1 = vec_ld(0, dst); | 636 ASSERT_ALIGNED(dst); |
654 dst2 = vec_ld(16, dst); | 637 vdst = vec_ld(0, dst); |
655 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); | |
656 | 638 |
657 OP_U8_ALTIVEC(fsum, sum, vdst); | 639 OP_U8_ALTIVEC(fsum, sum, vdst); |
658 | 640 |
659 rsum = vec_perm(fsum, fsum, dstperm); | 641 vec_st(fsum, 0, dst); |
660 fdst1 = vec_sel(dst1, rsum, dstmask); | |
661 fdst2 = vec_sel(rsum, dst2, dstmask); | |
662 | |
663 vec_st(fdst1, 0, dst); | |
664 vec_st(fdst2, 16, dst); | |
665 | 642 |
666 dst += dstStride; | 643 dst += dstStride; |
667 } | 644 } |
668 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | 645 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
669 } | 646 } |
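The source side, by contrast, keeps all of its misalignment handling (the align = (((unsigned long)src) - 2) % 16 computation and the permM2..permP3 shuffles), because src carries the motion-compensation offset and can point anywhere. vec_ld ignores the low four address bits, so an unaligned 16-byte window is assembled from two aligned loads plus a vec_lvsl permute. A minimal standalone sketch of that idiom (function name hypothetical; the template itself specializes the second load based on the align value):

```c
#include <altivec.h>

/* Load 16 bytes from an arbitrary address p: vec_lvsl(0, p) yields the
 * shuffle {a, a+1, ..., a+15} with a = (unsigned long)p & 15, selecting
 * the wanted window out of the two aligned blocks that contain it. */
static vector unsigned char load_unaligned(const unsigned char *p)
{
    vector unsigned char lo   = vec_ld(0,  p); /* aligned block holding p          */
    vector unsigned char hi   = vec_ld(15, p); /* next block (same one if aligned) */
    vector unsigned char perm = vec_lvsl(0, p);
    return vec_perm(lo, hi, perm);
}
```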