comparison ppc/dsputil_altivec.c @ 1015:35cf2f4a0f8c libavcodec

PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
author michaelni
date Sun, 19 Jan 2003 19:00:45 +0000
parents 3b7cc8e4b83f
children 9cc1031e1864
comparison
equal deleted inserted replaced
1014:48349e11c9b2 1015:35cf2f4a0f8c
22 22
23 #if CONFIG_DARWIN 23 #if CONFIG_DARWIN
24 #include <sys/sysctl.h> 24 #include <sys/sysctl.h>
25 #endif 25 #endif
26 26
/*
 * Storage for the optional timing report; compiled only when
 * ALTIVEC_TBL_PERFORMANCE_REPORT is defined. These rows carry a single
 * (old-revision) line number: they exist in revision 1014 only and are
 * dropped from this file by the changeset shown here.
 */
27 #ifdef ALTIVEC_TBL_PERFORMANCE_REPORT
/* One row per instrumented function, one column per statistic. Both
   dimensions are constants declared outside this view. */
28 unsigned long long perfdata[altivec_perf_total][altivec_data_total];
29 /* list below must match enum in dsputil_altivec.h */
/* Display names; the report printer indexes this table with the same
   index it uses for the rows of perfdata. */
30 static unsigned char* perfname[] = {
31 "fft_calc",
32 "gmc1",
33 "dct_unquantize_h263",
34 "idct_add",
35 "idct_put",
36 "put_pixels_clamped",
37 "put_pixels16",
38 "avg_pixels16"
39 };
/* Presumably included here for the fprintf calls of the report printer. */
40 #include <stdio.h>
41 #endif
42
43 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) 27 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
44 { 28 {
45 int i; 29 int i;
46 int s __attribute__((aligned(16))); 30 int s __attribute__((aligned(16)));
47 const vector unsigned char zero = (const vector unsigned char)(0); 31 const vector unsigned char zero = (const vector unsigned char)(0);
646 630
647 extern UINT8 cropTbl[]; 631 extern UINT8 cropTbl[];
648 void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels, 632 void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels,
649 int line_size) 633 int line_size)
650 { 634 {
651 ALTIVEC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1); 635 POWERPC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1);
652 #ifdef ALTIVEC_USE_REFERENCE_C_CODE 636 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
653 int i; 637 int i;
654 UINT8 *cm = cropTbl + MAX_NEG_CROP; 638 UINT8 *cm = cropTbl + MAX_NEG_CROP;
655 639
656 ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); 640 POWERPC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
657 641
658 /* read the pixels */ 642 /* read the pixels */
659 for(i=0;i<8;i++) { 643 for(i=0;i<8;i++) {
660 pixels[0] = cm[block[0]]; 644 pixels[0] = cm[block[0]];
661 pixels[1] = cm[block[1]]; 645 pixels[1] = cm[block[1]];
668 652
669 pixels += line_size; 653 pixels += line_size;
670 block += 8; 654 block += 8;
671 } 655 }
672 656
673 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); 657 POWERPC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
674 658
675 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ 659 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
676 register const vector short vczero = (const vector short)(0); 660 register const vector short vczero = (const vector short)(0);
677 register vector short 661 register vector short
678 blockv0, blockv1, blockv2, blockv3, 662 blockv0, blockv1, blockv2, blockv3,
679 blockv4, blockv5, blockv6, blockv7; 663 blockv4, blockv5, blockv6, blockv7;
680 register vector unsigned char 664 register vector unsigned char
681 pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4, 665 pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4,
682 pixelsv0old, pixelsv4old; 666 pixelsv0old, pixelsv4old;
683 667
684 ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); 668 POWERPC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
685 669
686 blockv0 = vec_ld(0, block); 670 blockv0 = vec_ld(0, block);
687 blockv1 = vec_ld(16, block); 671 blockv1 = vec_ld(16, block);
688 blockv2 = vec_ld(32, block); 672 blockv2 = vec_ld(32, block);
689 blockv3 = vec_ld(48, block); 673 blockv3 = vec_ld(48, block);
718 vec_st(pixelsv1, 16, pixels); 702 vec_st(pixelsv1, 16, pixels);
719 vec_st(pixelsv2, 32, pixels); 703 vec_st(pixelsv2, 32, pixels);
720 vec_st(pixelsv3, 48, pixels); 704 vec_st(pixelsv3, 48, pixels);
721 } 705 }
722 706
723 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); 707 POWERPC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
724 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 708 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
725 } 709 }
726 710
/*
 * put_pixels16_altivec: copy h rows of 16 bytes from pixels to block,
 * advancing both pointers by line_size after every row.
 *
 * Reading this listing: on each row the text after the first number is the
 * old revision and the text after the second number is the new revision.
 * The only differences in this function are the ALTIVEC_TBL_* -> POWERPC_TBL_*
 * macro rename and where the vec_lvsl permute vector is computed.
 *
 * The *_TBL_DECLARE / START_COUNT / STOP_COUNT macros are defined elsewhere;
 * presumably they bracket the timed region for the performance report.
 */
727 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 711 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
728 { 712 {
729 ALTIVEC_TBL_DECLARE(altivec_put_pixels16_num, 1); 713 POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1);
/* Plain C fallback, selected at compile time. */
730 #ifdef ALTIVEC_USE_REFERENCE_C_CODE 714 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
731 int i; 715 int i;
732 716
733 ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1); 717 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
734 718
735 for(i=0; i<h; i++) {  719 for(i=0; i<h; i++) {
/* Four 32-bit copies per row. The source is read through struct
   unaligned_32 (declared elsewhere), the destination through a plain
   uint32_t pointer. */
736 *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); 720 *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
737 *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); 721 *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
738 *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); 722 *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
739 *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); 723 *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
740 pixels+=line_size; 724 pixels+=line_size;
741 block +=line_size; 725 block +=line_size;
742 } 726 }
743 727
744 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); 728 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
745 729
746 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ 730 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
747
/* Old revision only: the permute vector was derived once from the initial
   value of pixels and reused for every row. The new revision drops this
   declaration. */
748 register vector unsigned char perm = vec_lvsl(0, pixels);
749 register vector unsigned char pixelsv1, pixelsv2; 731 register vector unsigned char pixelsv1, pixelsv2;
750 int i; 732 int i;
751 733
752 ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1); 734 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
753 735
754 for(i=0; i<h; i++) { 736 for(i=0; i<h; i++) {
/* Two aligned loads that together cover the 16 source bytes starting at
   pixels (vec_ld ignores the low four address bits). */
755 pixelsv1 = vec_ld(0, (unsigned char*)pixels); 737 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
756 pixelsv2 = vec_ld(16, (unsigned char*)pixels); 738 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
/* vec_perm with the vec_lvsl pattern extracts the unaligned 16 bytes.
   The new revision recomputes the pattern from the current row pointer,
   presumably because the alignment of pixels changes from row to row
   whenever line_size is not a multiple of 16.
   NOTE(review): vec_st also ignores the low four address bits, so block
   is assumed to be 16-byte aligned here -- confirm against callers. */
757 vec_st(vec_perm(pixelsv1, pixelsv2, perm), 0, (unsigned char*)block); 739 vec_st(vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)),
740 0, (unsigned char*)block);
758 pixels+=line_size; 741 pixels+=line_size;
759 block +=line_size; 742 block +=line_size;
760 } 743 }
761 744
762 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); 745 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
763 746
764 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 747 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
765 } 748 }
766 749
/*
 * op_avg(a,b): replace a with the byte-wise average of a and b, rounded up,
 * for four bytes packed in one 32-bit word. (a|b) - ((a^b)>>1) equals
 * ceil((a+b)/2); the 0xFEFEFEFE mask clears the lowest bit of every byte so
 * the shift cannot carry a bit into the neighbouring byte.
 * Note that a is evaluated more than once and is assigned to.
 */
767 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) 750 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/*
 * avg_pixels16_altivec: for each of h rows, overwrite the 16 bytes at block
 * with the rounded-up average of those bytes and the 16 bytes at pixels,
 * then advance both pointers by line_size.
 *
 * Reading this listing: first number + text = old revision, second number +
 * text = new revision. As in put_pixels16_altivec, the new revision renames
 * the timing macros and computes the vec_lvsl pattern inside the loop.
 */
768 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 751 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
769 { 752 {
770 ALTIVEC_TBL_DECLARE(altivec_avg_pixels16_num, 1); 753 POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
/* Plain C fallback, selected at compile time. */
771 #ifdef ALTIVEC_USE_REFERENCE_C_CODE 754 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
772 int i; 755 int i;
773 756
774 ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); 757 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
775 758
776 for(i=0; i<h; i++) { 759 for(i=0; i<h; i++) {
/* Four 32-bit words per row; the destination is both an input and the
   output of op_avg. */
777 op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); 760 op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
778 op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); 761 op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
779 op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); 762 op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
780 op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); 763 op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
781 pixels+=line_size; 764 pixels+=line_size;
782 block +=line_size; 765 block +=line_size;
783 } 766 }
784 767
785 ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); 768 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
786 769
787 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ 770 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
788
/* Old revision only: alignment pattern computed once, before the loop. */
789 register vector unsigned char perm = vec_lvsl(0, pixels);
790 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 771 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
791 int i; 772 int i;
792 773
793 ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); 774 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
794 775
795 for(i=0; i<h; i++) {  776 for(i=0; i<h; i++) {
/* Source: two aligned loads covering the 16 bytes that start at pixels. */
796 pixelsv1 = vec_ld(0, (unsigned char*)pixels); 777 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
797 pixelsv2 = vec_ld(16, (unsigned char*)pixels); 778 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
/* Destination as it currently is. vec_ld/vec_st ignore the low four
   address bits, so block is assumed 16-byte aligned -- confirm against
   callers. */
798 blockv = vec_ld(0, block); 779 blockv = vec_ld(0, block);
/* Realign the source; the new revision derives the pattern from the
   current row pointer instead of the value hoisted before the loop. */
799 pixelsv = vec_perm(pixelsv1, pixelsv2, perm); 780 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
/* vec_avg is the per-byte rounded-up average, matching op_avg above. */
800 blockv = vec_avg(blockv,pixelsv); 781 blockv = vec_avg(blockv,pixelsv);
801 vec_st(blockv, 0, (unsigned char*)block); 782 vec_st(blockv, 0, (unsigned char*)block);
802 pixels+=line_size; 783 pixels+=line_size;
803 block +=line_size; 784 block +=line_size;
804 } 785 }
805 786
806 ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); 787 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
807 788
789 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
790 }
791
/*
 * avg_pixels8_altivec (added by this changeset; rows carry only a
 * new-revision number): for each of h rows, overwrite the 8 bytes at block
 * with the rounded-up byte-wise average of those bytes and the 8 bytes at
 * pixels, then advance both pointers by line_size.
 *
 * Reference C path: the expression (a|b) - (((a^b) & 0xFEFEFEFE) >> 1) is
 * written out in full twice per row, once for each 32-bit word.
 *
 * AltiVec path: works on the whole aligned 16-byte vector that contains the
 * 8-byte destination and arranges for the other 8 bytes to be written back
 * with their existing values.
 */
792 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels,
793 int line_size, int h)
794 {
795 POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1);
796 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
797 int i;
798 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);
799 for (i = 0; i < h; i++) {
/* Bytes 0..3 of the row. */
800 *((uint32_t *) (block)) =
801 (((*((uint32_t *) (block))) |
802 ((((const struct unaligned_32 *) (pixels))->l))) -
803 ((((*((uint32_t *) (block))) ^
804 ((((const struct unaligned_32 *) (pixels))->
805 l))) & 0xFEFEFEFEUL) >> 1));
/* Bytes 4..7 of the row. */
806 *((uint32_t *) (block + 4)) =
807 (((*((uint32_t *) (block + 4))) |
808 ((((const struct unaligned_32 *) (pixels + 4))->l))) -
809 ((((*((uint32_t *) (block + 4))) ^
810 ((((const struct unaligned_32 *) (pixels +
811 4))->
812 l))) & 0xFEFEFEFEUL) >> 1));
813 pixels += line_size;
814 block += line_size;
815 }
816 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);
817
818 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
819 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
820 int i;
821
822 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);
823
824 for (i = 0; i < h; i++) {
825 /*
826 block is 8 bytes-aligned, so we're either in the
827 left block (16 bytes-aligned) or in the right block (not)
828 */
/* Non-zero when block is not 16-byte aligned, i.e. the destination is
   the second half of its aligned 16-byte vector. */
829 int rightside = ((unsigned long)block & 0x0000000F);
830
/* vec_ld ignores the low four address bits, so this fetches the aligned
   vector that contains the 8 destination bytes. */
831 blockv = vec_ld(0, block);
/* Realign the source: 16 bytes starting at pixels, built from the two
   aligned vectors that cover them. Only the first 8 are used below. */
832 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
833 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
834 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
835
/* Build the second operand of the average: the destination half gets the
   first 8 source bytes, the other half gets a copy of blockv. Averaging
   a byte with itself returns that byte, so the untouched half is stored
   back unchanged.
   vcprm is defined elsewhere; presumably a plain index selects a 32-bit
   word of the first vector and sN selects word N of the second --
   TODO confirm against its definition. */
836 if (rightside)
837 {
838 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
839 }
840 else
841 {
842 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
843 }
844
845 blockv = vec_avg(blockv, pixelsv);
846
/* Like vec_ld, vec_st targets the enclosing aligned 16 bytes. */
847 vec_st(blockv, 0, block);
848
849 pixels += line_size;
850 block += line_size;
851 }
852
853 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);
854
855 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
856 }
857
/*
 * put_pixels8_xy2_altivec (added by this changeset): write an 8-byte-wide,
 * h-row block in which every output byte is the rounded mean of a 2x2
 * source neighbourhood:
 *
 *   out[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2
 *
 * with p = pixels for the current row. Both paths keep the horizontal pair
 * sums of the previous source row so each source row is summed only once.
 */
858 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
859 {
860 POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1);
861 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
862 int j;
863 POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
/* Two passes, each producing a 4-byte-wide column of the block. Four bytes
   are summed at once inside a 32-bit word: every byte is split into its
   low 2 bits (l*) and its high 6 bits pre-shifted right by 2 (h*), so no
   per-byte sum can overflow into the next byte. */
864 for (j = 0; j < 2; j++) {
865 int i;
866 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
867 const uint32_t b =
868 (((const struct unaligned_32 *) (pixels + 1))->l);
/* l0/h0: pair sums of the first source row; the rounding constant 2 is
   folded into the low-bit part. */
869 uint32_t l0 =
870 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
871 uint32_t h0 =
872 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
873 uint32_t l1, h1;
874 pixels += line_size;
/* Each iteration emits two output rows, alternating which of (l0,h0) and
   (l1,h1) holds the older source row.
   NOTE(review): because i advances by 2, an odd h would produce h+1
   rows; h is presumably always even -- confirm against callers. */
875 for (i = 0; i < h; i += 2) {
/* These a/b shadow the const a/b declared in the enclosing scope. */
876 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
877 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
878 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
879 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
880 *((uint32_t *) block) =
881 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
882 pixels += line_size;
883 block += line_size;
884 a = (((const struct unaligned_32 *) (pixels))->l);
885 b = (((const struct unaligned_32 *) (pixels + 1))->l);
886 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
887 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
888 *((uint32_t *) block) =
889 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
890 pixels += line_size;
891 block += line_size;
/* After the inner loop: rewind both pointers to the top row and step
   4 bytes to the right for the second column pass. */
892 } pixels += 4 - line_size * (h + 1);
893 block += 4 - line_size * h;
894 }
895
896 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
897
898 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
899 register int i;
900 register vector unsigned char
901 pixelsv1, pixelsv2,
902 pixelsavg;
903 register vector unsigned char
904 blockv, temp1, temp2;
905 register vector unsigned short
906 pixelssum1, pixelssum2, temp3;
907 register const vector unsigned char vczero = (const vector unsigned char)(0);
908 register const vector unsigned short vctwo = (const vector unsigned short)(2);
909
/* Set-up for the first source row: pixelsv1 = bytes starting at pixels,
   pixelsv2 = bytes starting at pixels + 1. */
910 temp1 = vec_ld(0, pixels);
911 temp2 = vec_ld(16, pixels);
912 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
/* When pixels sits on the last byte of an aligned vector, pixels + 1 is
   16-byte aligned: vec_lvsl(1, pixels) would yield a zero shift and the
   permute would return temp1. The wanted bytes are exactly temp2, so it
   is used directly. */
913 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
914 {
915 pixelsv2 = temp2;
916 }
917 else
918 {
919 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
920 }
/* Interleave zero bytes with the first 8 data bytes; reinterpreted as
   unsigned shorts on big-endian PowerPC this zero-extends each byte to a
   16-bit lane. */
921 pixelsv1 = vec_mergeh(vczero, pixelsv1);
922 pixelsv2 = vec_mergeh(vczero, pixelsv2);
/* pixelssum1 = p[x] + p[x+1] + 2 for the first row (rounding included). */
923 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
924 (vector unsigned short)pixelsv2);
925 pixelssum1 = vec_add(pixelssum1, vctwo);
926
/* The counter is started only here, after the first-row set-up above. */
927 POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
928 for (i = 0; i < h ; i++) {
/* Non-zero when block is the second 8-byte half of its aligned vector. */
929 int rightside = ((unsigned long)block & 0x0000000F);
930 blockv = vec_ld(0, block);
931
/* Same extraction as the set-up, applied to the row at
   pixels + line_size. */
932 temp1 = vec_ld(line_size, pixels);
933 temp2 = vec_ld(line_size + 16, pixels);
934 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
935 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
936 {
937 pixelsv2 = temp2;
938 }
939 else
940 {
941 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
942 }
943
944 pixelsv1 = vec_mergeh(vczero, pixelsv1);
945 pixelsv2 = vec_mergeh(vczero, pixelsv2);
946 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
947 (vector unsigned short)pixelsv2);
/* (upper pair sum + 2 + lower pair sum) >> 2. Each lane is at most
   4 * 255 + 2, so the top bit is never set and the arithmetic shift
   behaves like a logical one. */
948 temp3 = vec_add(pixelssum1, pixelssum2);
949 temp3 = vec_sra(temp3, vctwo);
/* The lower row becomes the upper row of the next iteration; add the
   rounding constant now. */
950 pixelssum1 = vec_add(pixelssum2, vctwo);
/* Pack the eight 16-bit results into the first 8 bytes; the remaining 8
   bytes come from the zero vector. */
951 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
952
/* Merge the 8 result bytes into the destination half of blockv and keep
   the other half as loaded. vcprm is defined elsewhere; presumably a
   plain index selects a 32-bit word of the first vector and sN selects
   word N of the second -- TODO confirm. */
953 if (rightside)
954 {
955 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
956 }
957 else
958 {
959 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
960 }
961
962 vec_st(blockv, 0, block);
963
964 block += line_size;
965 pixels += line_size;
966 }
967
968 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
808 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 969 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
809 } 970 }
810 971
811 int has_altivec(void) 972 int has_altivec(void)
812 { 973 {
820 981
821 if (err == 0) return (has_vu != 0); 982 if (err == 0) return (has_vu != 0);
822 #endif 983 #endif
823 return 0; 984 return 0;
824 } 985 }
825
/*
 * altivec_display_perf_report: print the collected timing statistics to
 * stderr. These rows carry only an old-revision number: the function exists
 * in revision 1014 and is dropped from this file by the changeset shown.
 * Compiled only when ALTIVEC_TBL_PERFORMANCE_REPORT is defined.
 */
826 #ifdef ALTIVEC_TBL_PERFORMANCE_REPORT
827 void altivec_display_perf_report(void)
828 {
829 int i;
830 fprintf(stderr, "AltiVec performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
/* One stanza per row of perfdata. */
831 for(i = 0 ; i < altivec_perf_total ; i++)
832 {
/* Rows whose altivec_data_num column is zero are skipped (presumably
   functions that were never timed); the test also keeps the average
   below from dividing by zero. */
833 if (perfdata[i][altivec_data_num] != (unsigned long long)0)
834 fprintf(stderr, " Function \"%s\":\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
835 perfname[i],
836 perfdata[i][altivec_data_min],
837 perfdata[i][altivec_data_max],
/* Average = sum column / num column, computed in double. */
838 (double)perfdata[i][altivec_data_sum] /
839 (double)perfdata[i][altivec_data_num],
840 perfdata[i][altivec_data_num]);
841 }
842 }
843 #endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */