comparison ppc/h264_template_altivec.c @ 5603:861eb234e6ba libavcodec

remove alignment correction of the destination pointers in luma 16x16 interpolations, since they are always 16-byte aligned in practice. Add asserts to ease narrowing down potential image corruptions on exotic platforms. (A before/after sketch of the store pattern follows the changeset header.) Based on a patch by Mauricio Alvarez % lokifo A gmail P com % Original thread: Date: Jun 26, 2007 1:07 PM Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec
author gpoirier
date Sun, 26 Aug 2007 21:47:58 +0000
parents cd266411b11a
children 93089aed00cb
comparison
5602:3b21f3268707 5603:861eb234e6ba
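For context, here is a minimal sketch of the destination store pattern this changeset simplifies. The helper names `store_row_unaligned` and `store_row_aligned`, and the parameter `fsum`, are illustrative only and do not exist in the file; the real lowpass functions also load the destination vector for `OP_U8_ALTIVEC` (put/avg), which is omitted here for brevity. Only the AltiVec intrinsics (`vec_ld`, `vec_st`, `vec_lvsr`, `vec_perm`, `vec_sel`, `vec_splat_*`) are real API.

```c
#include <altivec.h>
#include <assert.h>
#include <stdint.h>

/* Old pattern (removed by this changeset): dst may be unaligned, so the
 * 16 result bytes are rotated into place and merged into the two 16-byte
 * blocks they could straddle. */
static void store_row_unaligned(uint8_t *dst, vector unsigned char fsum)
{
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1    = (vector unsigned char) vec_splat_s8(-1);
    const vector unsigned char zero    = vec_splat_u8(0);
    const vector unsigned char dstmask = vec_perm(zero, neg1, dstperm);
    vector unsigned char dst1 = vec_ld( 0, dst);            /* block holding the low bytes  */
    vector unsigned char dst2 = vec_ld(16, dst);            /* block holding the high bytes */
    vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
    vec_st(vec_sel(dst1, rsum, dstmask),  0, dst);
    vec_st(vec_sel(rsum, dst2, dstmask), 16, dst);
}

/* New pattern: dst is 16-byte aligned in practice, so a single aligned
 * store suffices; the assert is the check ASSERT_ALIGNED is meant to make. */
static void store_row_aligned(uint8_t *dst, vector unsigned char fsum)
{
    assert(!((unsigned long)dst & 0x0F));
    vec_st(fsum, 0, dst);
}
```

The unaligned path costs two loads, three permutes/selects and two stores per row; the aligned path is one store plus an optional debug assert, which is what the diff below switches each of the three qpel16 lowpass loops to.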
15 * 15 *
16 * You should have received a copy of the GNU Lesser General Public 16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software 17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */ 19 */
20
21 //#define DEBUG_ALIGNMENT
22 #ifdef DEBUG_ALIGNMENT
23 #define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
24 #else
25 #define ASSERT_ALIGNED(ptr) ;
26 #endif
20 27
21 /* this code assumes that stride % 16 == 0 */ 28 /* this code assumes that stride % 16 == 0 */
22 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { 29 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
23 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); 30 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
24 DECLARE_ALIGNED_16(signed int, ABCD[4]) = 31 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
163 const vec_u8_t permP3 = vec_lvsl(+3, src); 170 const vec_u8_t permP3 = vec_lvsl(+3, src);
164 const vec_s16_t v5ss = vec_splat_s16(5); 171 const vec_s16_t v5ss = vec_splat_s16(5);
165 const vec_u16_t v5us = vec_splat_u16(5); 172 const vec_u16_t v5us = vec_splat_u16(5);
166 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 173 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
167 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 174 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
168 const vec_u8_t dstperm = vec_lvsr(0, dst);
169 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
170 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
171 175
172 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 176 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
173 177
174 register int align = ((((unsigned long)src) - 2) % 16); 178 register int align = ((((unsigned long)src) - 2) % 16);
175 179
178 srcM1A, srcM1B, srcM2A, srcM2B, 182 srcM1A, srcM1B, srcM2A, srcM2B,
179 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 183 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
180 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 184 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
181 psumA, psumB, sumA, sumB; 185 psumA, psumB, sumA, sumB;
182 186
183 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2; 187 vec_u8_t sum, vdst, fsum;
184 188
185 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 189 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
186 190
187 for (i = 0 ; i < 16 ; i ++) { 191 for (i = 0 ; i < 16 ; i ++) {
188 vec_u8_t srcR1 = vec_ld(-2, src); 192 vec_u8_t srcR1 = vec_ld(-2, src);
280 sumA = vec_sra(psumA, v5us); 284 sumA = vec_sra(psumA, v5us);
281 sumB = vec_sra(psumB, v5us); 285 sumB = vec_sra(psumB, v5us);
282 286
283 sum = vec_packsu(sumA, sumB); 287 sum = vec_packsu(sumA, sumB);
284 288
285 dst1 = vec_ld(0, dst); 289 ASSERT_ALIGNED(dst);
286 dst2 = vec_ld(16, dst); 290 vdst = vec_ld(0, dst);
287 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
288 291
289 OP_U8_ALTIVEC(fsum, sum, vdst); 292 OP_U8_ALTIVEC(fsum, sum, vdst);
290 293
291 rsum = vec_perm(fsum, fsum, dstperm); 294 vec_st(fsum, 0, dst);
292 fdst1 = vec_sel(dst1, rsum, dstmask);
293 fdst2 = vec_sel(rsum, dst2, dstmask);
294
295 vec_st(fdst1, 0, dst);
296 vec_st(fdst2, 16, dst);
297 295
298 src += srcStride; 296 src += srcStride;
299 dst += dstStride; 297 dst += dstStride;
300 } 298 }
301 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 299 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
311 const vec_u8_t perm = vec_lvsl(0, src); 309 const vec_u8_t perm = vec_lvsl(0, src);
312 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 310 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
313 const vec_u16_t v5us = vec_splat_u16(5); 311 const vec_u16_t v5us = vec_splat_u16(5);
314 const vec_s16_t v5ss = vec_splat_s16(5); 312 const vec_s16_t v5ss = vec_splat_s16(5);
315 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 313 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
316 const vec_u8_t dstperm = vec_lvsr(0, dst);
317 const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);
318 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
319 314
320 uint8_t *srcbis = src - (srcStride * 2); 315 uint8_t *srcbis = src - (srcStride * 2);
321 316
322 const vec_u8_t srcM2a = vec_ld(0, srcbis); 317 const vec_u8_t srcM2a = vec_ld(0, srcbis);
323 const vec_u8_t srcM2b = vec_ld(16, srcbis); 318 const vec_u8_t srcM2b = vec_ld(16, srcbis);
354 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 349 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
355 psumA, psumB, sumA, sumB, 350 psumA, psumB, sumA, sumB,
356 srcP3ssA, srcP3ssB, 351 srcP3ssA, srcP3ssB,
357 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; 352 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
358 353
359 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3; 354 vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
360 355
361 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 356 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
362 357
363 for (i = 0 ; i < 16 ; i++) { 358 for (i = 0 ; i < 16 ; i++) {
364 srcP3a = vec_ld(0, srcbis += srcStride); 359 srcP3a = vec_ld(0, srcbis += srcStride);
401 sumA = vec_sra(psumA, v5us); 396 sumA = vec_sra(psumA, v5us);
402 sumB = vec_sra(psumB, v5us); 397 sumB = vec_sra(psumB, v5us);
403 398
404 sum = vec_packsu(sumA, sumB); 399 sum = vec_packsu(sumA, sumB);
405 400
406 dst1 = vec_ld(0, dst); 401 ASSERT_ALIGNED(dst);
407 dst2 = vec_ld(16, dst); 402 vdst = vec_ld(0, dst);
408 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
409 403
410 OP_U8_ALTIVEC(fsum, sum, vdst); 404 OP_U8_ALTIVEC(fsum, sum, vdst);
411 405
412 rsum = vec_perm(fsum, fsum, dstperm); 406 vec_st(fsum, 0, dst);
413 fdst1 = vec_sel(dst1, rsum, dstmask);
414 fdst2 = vec_sel(rsum, dst2, dstmask);
415
416 vec_st(fdst1, 0, dst);
417 vec_st(fdst2, 16, dst);
418 407
419 dst += dstStride; 408 dst += dstStride;
420 } 409 }
421 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 410 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
422 } 411 }
439 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); 428 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
440 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); 429 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
441 430
442 register int align = ((((unsigned long)src) - 2) % 16); 431 register int align = ((((unsigned long)src) - 2) % 16);
443 432
444 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
445
446 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, 433 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
447 srcP2A, srcP2B, srcP3A, srcP3B, 434 srcP2A, srcP2B, srcP3A, srcP3B,
448 srcM1A, srcM1B, srcM2A, srcM2B, 435 srcM1A, srcM1B, srcM2A, srcM2B,
449 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 436 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
450 pp1A, pp1B, pp2A, pp2B, psumA, psumB; 437 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
451 438
452 const vec_u8_t dstperm = vec_lvsr(0, dst);
453
454 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
455
456 const vec_u8_t mperm = (const vec_u8_t) 439 const vec_u8_t mperm = (const vec_u8_t)
457 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 440 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
458 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); 441 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
459 int16_t *tmpbis = tmp; 442 int16_t *tmpbis = tmp;
460 443
464 447
465 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, 448 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
466 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, 449 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
467 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, 450 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
468 ssumAe, ssumAo, ssumBe, ssumBo; 451 ssumAe, ssumAo, ssumBe, ssumBo;
469 vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2; 452 vec_u8_t fsum, sumv, sum, vdst;
470 vec_s16_t ssume, ssumo; 453 vec_s16_t ssume, ssumo;
471 454
472 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 455 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
473 src -= (2 * srcStride); 456 src -= (2 * srcStride);
474 for (i = 0 ; i < 21 ; i ++) { 457 for (i = 0 ; i < 21 ; i ++) {
648 ssumo = vec_packs(ssumAo, ssumBo); 631 ssumo = vec_packs(ssumAo, ssumBo);
649 632
650 sumv = vec_packsu(ssume, ssumo); 633 sumv = vec_packsu(ssume, ssumo);
651 sum = vec_perm(sumv, sumv, mperm); 634 sum = vec_perm(sumv, sumv, mperm);
652 635
653 dst1 = vec_ld(0, dst); 636 ASSERT_ALIGNED(dst);
654 dst2 = vec_ld(16, dst); 637 vdst = vec_ld(0, dst);
655 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
656 638
657 OP_U8_ALTIVEC(fsum, sum, vdst); 639 OP_U8_ALTIVEC(fsum, sum, vdst);
658 640
659 rsum = vec_perm(fsum, fsum, dstperm); 641 vec_st(fsum, 0, dst);
660 fdst1 = vec_sel(dst1, rsum, dstmask);
661 fdst2 = vec_sel(rsum, dst2, dstmask);
662
663 vec_st(fdst1, 0, dst);
664 vec_st(fdst2, 16, dst);
665 642
666 dst += dstStride; 643 dst += dstStride;
667 } 644 }
668 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 645 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
669 } 646 }