libavcodec: comparison of ppc/dsputil_altivec.c @ 1009:3b7cc8e4b83f

AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
author michaelni
date Thu, 16 Jan 2003 21:54:55 +0000
parents edc10966b081
children 35cf2f4a0f8c
--- ppc/dsputil_altivec.c  (1008:fb6cbb8a04a3)
+++ ppc/dsputil_altivec.c  (1009:3b7cc8e4b83f)
@@ -22,10 +22,26 @@
 
 #if CONFIG_DARWIN
 #include <sys/sysctl.h>
 #endif
 
+#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT
+unsigned long long perfdata[altivec_perf_total][altivec_data_total];
+/* list below must match enum in dsputil_altivec.h */
+static unsigned char* perfname[] = {
+    "fft_calc",
+    "gmc1",
+    "dct_unquantize_h263",
+    "idct_add",
+    "idct_put",
+    "put_pixels_clamped",
+    "put_pixels16",
+    "avg_pixels16"
+};
+#include <stdio.h>
+#endif
+
 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 {
     int i;
     int s __attribute__((aligned(16)));
     const vector unsigned char zero = (const vector unsigned char)(0);
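
This hunk only introduces the statistics table and the function-name strings for the performance report; the ALTIVEC_TBL_DECLARE / ALTIVEC_TBL_START_COUNT / ALTIVEC_TBL_STOP_COUNT macros used by the functions further down are defined in dsputil_altivec.h and are not shown in this comparison. As a rough sketch of how such instrumentation can be built on the PowerPC Time Base (hypothetical helpers, not the actual macro definitions), the counter can be read and folded into the perfdata slots like this:

/* Hypothetical sketch only: the real macros live in dsputil_altivec.h.
 * The 64-bit Time Base is read as two 32-bit halves; re-reading the upper
 * half guards against a carry between the two reads. */
static inline unsigned long long read_time_base(void)
{
    unsigned long upper, lower, tmp;
    do {
        asm volatile("mftbu %0" : "=r" (upper)); /* upper 32 bits */
        asm volatile("mftb  %0" : "=r" (lower)); /* lower 32 bits */
        asm volatile("mftbu %0" : "=r" (tmp));   /* detect wrap of the lower half */
    } while (tmp != upper);
    return ((unsigned long long)upper << 32) | lower;
}

/* A timed call could then update the statistics consumed by
 * altivec_display_perf_report() at the end of the file. */
static inline void tbl_account(int func, unsigned long long start, unsigned long long stop)
{
    unsigned long long d = stop - start;
    if (perfdata[func][altivec_data_num] == 0 || d < perfdata[func][altivec_data_min])
        perfdata[func][altivec_data_min] = d;
    if (d > perfdata[func][altivec_data_max])
        perfdata[func][altivec_data_max] = d;
    perfdata[func][altivec_data_sum] += d;
    perfdata[func][altivec_data_num] += 1;
}
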
@@ -592,51 +608,37 @@
 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
     return pix_abs8x8_altivec(a,b,stride);
 }
 
 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
-#if 0
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
     int i;
     for(i=0; i+7<w; i++){
         dst[i+0] += src[i+0];
         dst[i+1] += src[i+1];
         dst[i+2] += src[i+2];
         dst[i+3] += src[i+3];
         dst[i+4] += src[i+4];
         dst[i+5] += src[i+5];
         dst[i+6] += src[i+6];
         dst[i+7] += src[i+7];
     }
     for(; i<w; i++)
         dst[i+0] += src[i+0];
-#else
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
     register int i;
-    register uint8_t *temp_src = src, *temp_dst = dst;
-    register vector unsigned char vdst, vsrc, temp1, temp2;
-    register vector unsigned char perm;
-    register int count = 0;
-
-    for (i = 0; (i < w) && ((unsigned long)temp_dst & 0x0000000F) ; i++)
+    register vector unsigned char vdst, vsrc;
+
+    /* dst and src are 16 bytes-aligned (guaranteed) */
+    for(i = 0 ; (i + 15) < w ; i++)
     {
-        dst[i] = src[i];
-        temp_src ++;
-        temp_dst ++;
-    }
-    /* temp_dst is a properly aligned pointer */
-    /* we still need to deal with ill-aligned src */
-    perm = vec_lvsl(0, temp_src);
-    temp1 = vec_ld(0, temp_src);
-    while ((i + 15) < w)
-    {
-        temp2 = vec_ld(count + 16, temp_src);
-        vdst = vec_ld(count, temp_dst);
-        vsrc = vec_perm(temp1, temp2, perm);
-        temp1 = temp2;
+        vdst = vec_ld(i << 4, (unsigned char*)dst);
+        vsrc = vec_ld(i << 4, (unsigned char*)src);
         vdst = vec_add(vsrc, vdst);
-        vec_st(vdst, count, temp_dst);
-        count += 16;
-    }
+        vec_st(vdst, i << 4, (unsigned char*)dst);
+    }
+    /* if w is not a multiple of 16 */
     for (; (i < w) ; i++)
     {
         dst[i] = src[i];
     }
-#endif
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
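
The rewritten add_bytes_altivec() drops the scalar catch-up loop and the vec_lvsl/vec_perm handling of a misaligned source, and instead relies on the stated guarantee that dst and src are 16-byte aligned, so vec_ld/vec_st can be used directly at byte offset i << 4 (i.e. i * 16), with a scalar loop only for the tail when w is not a multiple of 16. If that guarantee ever needs checking during development, a debug wrapper along these lines could assert the precondition (hypothetical helper, not part of the patch):

/* Hypothetical debug-only wrapper, not part of the patch: it checks the
 * 16-byte alignment that the AltiVec path of add_bytes_altivec() now assumes.
 * uint8_t and the prototype come from the same headers this file already uses. */
#include <assert.h>

static void add_bytes_altivec_checked(uint8_t *dst, uint8_t *src, int w)
{
    assert((((unsigned long)dst) & 0x0000000F) == 0); /* dst must be 16-byte aligned */
    assert((((unsigned long)src) & 0x0000000F) == 0); /* src must be 16-byte aligned */
    add_bytes_altivec(dst, src, w);
}
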
@@ -643,5 +645,169 @@
+}
+
+extern UINT8 cropTbl[];
+void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels,
+                                int line_size)
+{
+    ALTIVEC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+    int i;
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;
+
+    ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
+
+    /* read the pixels */
+    for(i=0;i<8;i++) {
+        pixels[0] = cm[block[0]];
+        pixels[1] = cm[block[1]];
+        pixels[2] = cm[block[2]];
+        pixels[3] = cm[block[3]];
+        pixels[4] = cm[block[4]];
+        pixels[5] = cm[block[5]];
+        pixels[6] = cm[block[6]];
+        pixels[7] = cm[block[7]];
+
+        pixels += line_size;
+        block += 8;
+    }
+
+    ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
+
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+    register const vector short vczero = (const vector short)(0);
+    register vector short
+        blockv0, blockv1, blockv2, blockv3,
+        blockv4, blockv5, blockv6, blockv7;
+    register vector unsigned char
+        pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4,
+        pixelsv0old, pixelsv4old;
+
+    ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
+
+    blockv0 = vec_ld(0, block);
+    blockv1 = vec_ld(16, block);
+    blockv2 = vec_ld(32, block);
+    blockv3 = vec_ld(48, block);
+    blockv4 = vec_ld(64, block);
+    blockv5 = vec_ld(80, block);
+    blockv6 = vec_ld(96, block);
+    blockv7 = vec_ld(112, block);
+    if (((unsigned long)pixels) & 0x0000000F)
+    {
+        pixelsv0old = vec_ld(-8, pixels);
+        pixelsv4old = vec_ld(56, pixels);
+        pixelsv0 = vec_packsu(vczero, blockv0);
+        pixelsv1 = vec_packsu(blockv1, blockv2);
+        pixelsv2 = vec_packsu(blockv3, blockv4);
+        pixelsv3 = vec_packsu(blockv5, blockv6);
+        pixelsv4 = vec_packsu(blockv7, vczero);
+        pixelsv0 = vec_perm(pixelsv0old, pixelsv0, vcprm(0, 1, s2, s3));
+        pixelsv4 = vec_perm(pixelsv4, pixelsv4old, vcprm(0, 1, s2, s3));
+        vec_st(pixelsv0, -8, pixels);
+        vec_st(pixelsv1, 8, pixels);
+        vec_st(pixelsv2, 24, pixels);
+        vec_st(pixelsv3, 40, pixels);
+        vec_st(pixelsv4, 56, pixels);
+    }
+    else
+    {
+        pixelsv0 = vec_packsu(blockv0, blockv1);
+        pixelsv1 = vec_packsu(blockv2, blockv3);
+        pixelsv2 = vec_packsu(blockv4, blockv5);
+        pixelsv3 = vec_packsu(blockv6, blockv7);
+        vec_st(pixelsv0, 0, pixels);
+        vec_st(pixelsv1, 16, pixels);
+        vec_st(pixelsv2, 32, pixels);
+        vec_st(pixelsv3, 48, pixels);
+    }
+
+    ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+}
+
+void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ALTIVEC_TBL_DECLARE(altivec_put_pixels16_num, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+    int i;
+
+    ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
+
+    for(i=0; i<h; i++) {
+        *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
+        *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
+        *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
+        *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
+        pixels+=line_size;
+        block +=line_size;
+    }
+
+    ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
+
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+
+    register vector unsigned char perm = vec_lvsl(0, pixels);
+    register vector unsigned char pixelsv1, pixelsv2;
+    int i;
+
+    ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
+
+    for(i=0; i<h; i++) {
+        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
+        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
+        vec_st(vec_perm(pixelsv1, pixelsv2, perm), 0, (unsigned char*)block);
+        pixels+=line_size;
+        block +=line_size;
+    }
+
+    ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
+
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+}
+
+#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
+void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ALTIVEC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+    int i;
+
+    ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
+
+    for(i=0; i<h; i++) {
+        op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
+        op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
+        op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
+        op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
+        pixels+=line_size;
+        block +=line_size;
+    }
+
+    ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
+
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+
+    register vector unsigned char perm = vec_lvsl(0, pixels);
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+    int i;
+
+    ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
+
+    for(i=0; i<h; i++) {
+        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
+        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
+        blockv = vec_ld(0, block);
+        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
+        blockv = vec_avg(blockv,pixelsv);
+        vec_st(blockv, 0, (unsigned char*)block);
+        pixels+=line_size;
+        block +=line_size;
+    }
+
+    ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
+
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
 int has_altivec(void)
 {
 #if CONFIG_DARWIN
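
put_pixels16_altivec() and avg_pixels16_altivec() read from a source pointer that may not be 16-byte aligned, so they use the standard AltiVec idiom: vec_ld ignores the low four address bits, so two adjacent aligned loads are merged with the permute vector returned by vec_lvsl. In put_pixels_clamped_altivec(), vec_packsu's unsigned saturation performs the same clamp to the 0..255 range that the cropTbl lookup does in the reference branch. The unaligned-load idiom in isolation, as a hypothetical helper mirroring the loops above (not code from the patch, and assuming the same AltiVec headers and compiler flags as this file):

/* Hypothetical illustration of the unaligned-load idiom used above;
 * the helper name is not part of the patch. */
static inline vector unsigned char load16_unaligned(const uint8_t *p)
{
    /* permute pattern derived from the low 4 bits of the address */
    register vector unsigned char perm  = vec_lvsl(0, (unsigned char*)p);
    /* vec_ld rounds the address down to a 16-byte boundary, so these are
     * two aligned loads covering the 16 bytes that start at p */
    register vector unsigned char left  = vec_ld(0,  (unsigned char*)p);
    register vector unsigned char right = vec_ld(16, (unsigned char*)p);
    /* shift/merge the two aligned blocks into the 16 bytes at p */
    return vec_perm(left, right, perm);
}
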
@@ -654,5 +820,24 @@
 
     if (err == 0) return (has_vu != 0);
 #endif
     return 0;
 }
+
+#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT
+void altivec_display_perf_report(void)
+{
+    int i;
+    fprintf(stderr, "AltiVec performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
+    for(i = 0 ; i < altivec_perf_total ; i++)
+    {
+        if (perfdata[i][altivec_data_num] != (unsigned long long)0)
+            fprintf(stderr, " Function \"%s\":\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
+                    perfname[i],
+                    perfdata[i][altivec_data_min],
+                    perfdata[i][altivec_data_max],
+                    (double)perfdata[i][altivec_data_sum] /
+                    (double)perfdata[i][altivec_data_num],
+                    perfdata[i][altivec_data_num]);
+    }
+}
+#endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */
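
As the report header printed by altivec_display_perf_report() says, the values are raw Time Base ticks and each tick represents 4 bus cycles, so turning them into wall-clock time requires the machine's bus frequency, which the code does not know. A hypothetical conversion helper, assuming the bus frequency is supplied by the caller:

/* Hypothetical conversion of a Time Base delta into microseconds.
 * The Time Base advances once every 4 bus cycles, so its frequency is
 * bus_hz / 4 (e.g. a 100 MHz bus gives a 25 MHz Time Base). */
static double tb_ticks_to_us(unsigned long long ticks, double bus_hz)
{
    double tb_hz = bus_hz / 4.0;          /* Time Base frequency        */
    return (double)ticks / tb_hz * 1e6;   /* seconds -> microseconds    */
}
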