Mercurial > libavcodec.hg
comparison ppc/dsputil_altivec.c @ 1024:9cc1031e1864 libavcodec
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
author | michaelni |
---|---|
date | Mon, 20 Jan 2003 22:50:14 +0000 |
parents | 35cf2f4a0f8c |
children | b4172ff70d27 |
comparison
equal
deleted
inserted
replaced
1023:e61be5796027 | 1024:9cc1031e1864 |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 2002 Brian Foley | 2 * Copyright (c) 2002 Brian Foley |
3 * Copyright (c) 2002 Dieter Shirley | 3 * Copyright (c) 2002 Dieter Shirley |
4 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> | |
4 * | 5 * |
5 * This library is free software; you can redistribute it and/or | 6 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | 7 * modify it under the terms of the GNU Lesser General Public |
7 * License as published by the Free Software Foundation; either | 8 * License as published by the Free Software Foundation; either |
8 * version 2 of the License, or (at your option) any later version. | 9 * version 2 of the License, or (at your option) any later version. |
626 dst[i] = src[i]; | 627 dst[i] = src[i]; |
627 } | 628 } |
628 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | 629 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
629 } | 630 } |
630 | 631 |
631 extern UINT8 cropTbl[]; | 632 /* next one assumes that ((line_size % 16) == 0) */ |
632 void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels, | |
633 int line_size) | |
634 { | |
635 POWERPC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1); | |
636 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
637 int i; | |
638 UINT8 *cm = cropTbl + MAX_NEG_CROP; | |
639 | |
640 POWERPC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); | |
641 | |
642 /* read the pixels */ | |
643 for(i=0;i<8;i++) { | |
644 pixels[0] = cm[block[0]]; | |
645 pixels[1] = cm[block[1]]; | |
646 pixels[2] = cm[block[2]]; | |
647 pixels[3] = cm[block[3]]; | |
648 pixels[4] = cm[block[4]]; | |
649 pixels[5] = cm[block[5]]; | |
650 pixels[6] = cm[block[6]]; | |
651 pixels[7] = cm[block[7]]; | |
652 | |
653 pixels += line_size; | |
654 block += 8; | |
655 } | |
656 | |
657 POWERPC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); | |
658 | |
659 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
660 register const vector short vczero = (const vector short)(0); | |
661 register vector short | |
662 blockv0, blockv1, blockv2, blockv3, | |
663 blockv4, blockv5, blockv6, blockv7; | |
664 register vector unsigned char | |
665 pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4, | |
666 pixelsv0old, pixelsv4old; | |
667 | |
668 POWERPC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); | |
669 | |
670 blockv0 = vec_ld(0, block); | |
671 blockv1 = vec_ld(16, block); | |
672 blockv2 = vec_ld(32, block); | |
673 blockv3 = vec_ld(48, block); | |
674 blockv4 = vec_ld(64, block); | |
675 blockv5 = vec_ld(80, block); | |
676 blockv6 = vec_ld(96, block); | |
677 blockv7 = vec_ld(112, block); | |
678 if (((unsigned long)pixels) & 0x0000000F) | |
679 { | |
680 pixelsv0old = vec_ld(-8, pixels); | |
681 pixelsv4old = vec_ld(56, pixels); | |
682 pixelsv0 = vec_packsu(vczero, blockv0); | |
683 pixelsv1 = vec_packsu(blockv1, blockv2); | |
684 pixelsv2 = vec_packsu(blockv3, blockv4); | |
685 pixelsv3 = vec_packsu(blockv5, blockv6); | |
686 pixelsv4 = vec_packsu(blockv5, vczero); | |
687 pixelsv0 = vec_perm(pixelsv0old, pixelsv0, vcprm(0, 1, s2, s3)); | |
688 pixelsv4 = vec_perm(pixelsv4, pixelsv4old, vcprm(0, 1, s2, s3)); | |
689 vec_st(pixelsv0, -8, pixels); | |
690 vec_st(pixelsv1, 8, pixels); | |
691 vec_st(pixelsv2, 24, pixels); | |
692 vec_st(pixelsv3, 40, pixels); | |
693 vec_st(pixelsv4, 56, pixels); | |
694 } | |
695 else | |
696 { | |
697 pixelsv0 = vec_packsu(blockv0, blockv1); | |
698 pixelsv1 = vec_packsu(blockv2, blockv3); | |
699 pixelsv2 = vec_packsu(blockv4, blockv5); | |
700 pixelsv3 = vec_packsu(blockv6, blockv7); | |
701 vec_st(pixelsv0, 0, pixels); | |
702 vec_st(pixelsv1, 16, pixels); | |
703 vec_st(pixelsv2, 32, pixels); | |
704 vec_st(pixelsv3, 48, pixels); | |
705 } | |
706 | |
707 POWERPC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); | |
708 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
709 } | |
710 | |
711 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 633 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
712 { | 634 { |
713 POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1); | 635 POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1); |
714 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | 636 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
715 int i; | 637 int i; |
727 | 649 |
728 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); | 650 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); |
729 | 651 |
730 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | 652 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
731 register vector unsigned char pixelsv1, pixelsv2; | 653 register vector unsigned char pixelsv1, pixelsv2; |
654 register vector unsigned char perm = vec_lvsl(0, pixels); | |
732 int i; | 655 int i; |
733 | 656 |
734 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); | 657 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); |
735 | 658 |
736 for(i=0; i<h; i++) { | 659 for(i=0; i<h; i++) { |
737 pixelsv1 = vec_ld(0, (unsigned char*)pixels); | 660 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
738 pixelsv2 = vec_ld(16, (unsigned char*)pixels); | 661 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
739 vec_st(vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)), | 662 vec_st(vec_perm(pixelsv1, pixelsv2, perm), |
740 0, (unsigned char*)block); | 663 0, (unsigned char*)block); |
741 pixels+=line_size; | 664 pixels+=line_size; |
742 block +=line_size; | 665 block +=line_size; |
743 } | 666 } |
744 | 667 |
745 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); | 668 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); |
746 | 669 |
747 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | 670 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
748 } | 671 } |
749 | 672 |
673 /* next one assumes that ((line_size % 16) == 0) */ | |
750 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | 674 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) |
751 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 675 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
752 { | 676 { |
753 POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1); | 677 POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1); |
754 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | 678 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
767 | 691 |
768 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); | 692 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); |
769 | 693 |
770 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | 694 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
771 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | 695 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; |
696 register vector unsigned char perm = vec_lvsl(0, pixels); | |
772 int i; | 697 int i; |
773 | 698 |
774 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); | 699 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); |
775 | 700 |
776 for(i=0; i<h; i++) { | 701 for(i=0; i<h; i++) { |
777 pixelsv1 = vec_ld(0, (unsigned char*)pixels); | 702 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
778 pixelsv2 = vec_ld(16, (unsigned char*)pixels); | 703 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
779 blockv = vec_ld(0, block); | 704 blockv = vec_ld(0, block); |
780 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); | 705 pixelsv = vec_perm(pixelsv1, pixelsv2, perm); |
781 blockv = vec_avg(blockv,pixelsv); | 706 blockv = vec_avg(blockv,pixelsv); |
782 vec_st(blockv, 0, (unsigned char*)block); | 707 vec_st(blockv, 0, (unsigned char*)block); |
783 pixels+=line_size; | 708 pixels+=line_size; |
784 block +=line_size; | 709 block +=line_size; |
785 } | 710 } |
787 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); | 712 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); |
788 | 713 |
789 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | 714 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
790 } | 715 } |
791 | 716 |
792 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, | 717 /* next one assumes that ((line_size % 8) == 0) */ |
793 int line_size, int h) | 718 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
794 { | 719 { |
795 POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1); | 720 POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1); |
796 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | 721 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
797 int i; | 722 int i; |
798 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); | 723 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); |
853 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); | 778 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); |
854 | 779 |
855 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | 780 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
856 } | 781 } |
857 | 782 |
783 /* next one assumes that ((line_size % 8) == 0) */ | |
858 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 784 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
859 { | 785 { |
860 POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1); | 786 POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1); |
861 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | 787 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
862 int j; | 788 int j; |
967 | 893 |
968 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); | 894 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); |
969 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | 895 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
970 } | 896 } |
971 | 897 |
898 /* next one assumes that ((line_size % 8) == 0) */ | |
899 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
900 { | |
901 POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
902 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
903 int j; | |
904 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
905 for (j = 0; j < 2; j++) { | |
906 int i; | |
907 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
908 const uint32_t b = | |
909 (((const struct unaligned_32 *) (pixels + 1))->l); | |
910 uint32_t l0 = | |
911 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
912 uint32_t h0 = | |
913 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
914 uint32_t l1, h1; | |
915 pixels += line_size; | |
916 for (i = 0; i < h; i += 2) { | |
917 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
918 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
919 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); | |
920 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
921 *((uint32_t *) block) = | |
922 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
923 pixels += line_size; | |
924 block += line_size; | |
925 a = (((const struct unaligned_32 *) (pixels))->l); | |
926 b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
927 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
928 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
929 *((uint32_t *) block) = | |
930 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
931 pixels += line_size; | |
932 block += line_size; | |
933 } pixels += 4 - line_size * (h + 1); | |
934 block += 4 - line_size * h; | |
935 } | |
936 | |
937 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
938 | |
939 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
940 register int i; | |
941 register vector unsigned char | |
942 pixelsv1, pixelsv2, | |
943 pixelsavg; | |
944 register vector unsigned char | |
945 blockv, temp1, temp2; | |
946 register vector unsigned short | |
947 pixelssum1, pixelssum2, temp3; | |
948 register const vector unsigned char vczero = (const vector unsigned char)(0); | |
949 register const vector unsigned short vcone = (const vector unsigned short)(1); | |
950 register const vector unsigned short vctwo = (const vector unsigned short)(2); | |
951 | |
952 temp1 = vec_ld(0, pixels); | |
953 temp2 = vec_ld(16, pixels); | |
954 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
955 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) | |
956 { | |
957 pixelsv2 = temp2; | |
958 } | |
959 else | |
960 { | |
961 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
962 } | |
963 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
964 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
965 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
966 (vector unsigned short)pixelsv2); | |
967 pixelssum1 = vec_add(pixelssum1, vcone); | |
968 | |
969 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
970 for (i = 0; i < h ; i++) { | |
971 int rightside = ((unsigned long)block & 0x0000000F); | |
972 blockv = vec_ld(0, block); | |
973 | |
974 temp1 = vec_ld(line_size, pixels); | |
975 temp2 = vec_ld(line_size + 16, pixels); | |
976 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
977 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) | |
978 { | |
979 pixelsv2 = temp2; | |
980 } | |
981 else | |
982 { | |
983 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
984 } | |
985 | |
986 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
987 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
988 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
989 (vector unsigned short)pixelsv2); | |
990 temp3 = vec_add(pixelssum1, pixelssum2); | |
991 temp3 = vec_sra(temp3, vctwo); | |
992 pixelssum1 = vec_add(pixelssum2, vcone); | |
993 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); | |
994 | |
995 if (rightside) | |
996 { | |
997 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); | |
998 } | |
999 else | |
1000 { | |
1001 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); | |
1002 } | |
1003 | |
1004 vec_st(blockv, 0, block); | |
1005 | |
1006 block += line_size; | |
1007 pixels += line_size; | |
1008 } | |
1009 | |
1010 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
1011 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1012 } | |
1013 | |
1014 /* next one assumes that ((line_size % 16) == 0) */ | |
1015 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
1016 { | |
1017 POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1); | |
1018 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
1019 int j; | |
1020 POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1021 for (j = 0; j < 4; j++) { | |
1022 int i; | |
1023 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1024 const uint32_t b = | |
1025 (((const struct unaligned_32 *) (pixels + 1))->l); | |
1026 uint32_t l0 = | |
1027 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; | |
1028 uint32_t h0 = | |
1029 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1030 uint32_t l1, h1; | |
1031 pixels += line_size; | |
1032 for (i = 0; i < h; i += 2) { | |
1033 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1034 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1035 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); | |
1036 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1037 *((uint32_t *) block) = | |
1038 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1039 pixels += line_size; | |
1040 block += line_size; | |
1041 a = (((const struct unaligned_32 *) (pixels))->l); | |
1042 b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1043 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; | |
1044 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1045 *((uint32_t *) block) = | |
1046 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1047 pixels += line_size; | |
1048 block += line_size; | |
1049 } pixels += 4 - line_size * (h + 1); | |
1050 block += 4 - line_size * h; | |
1051 } | |
1052 | |
1053 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1054 | |
1055 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1056 register int i; | |
1057 register vector unsigned char | |
1058 pixelsv1, pixelsv2, pixelsv3, pixelsv4; | |
1059 register vector unsigned char | |
1060 blockv, temp1, temp2; | |
1061 register vector unsigned short | |
1062 pixelssum1, pixelssum2, temp3, | |
1063 pixelssum3, pixelssum4, temp4; | |
1064 register const vector unsigned char vczero = (const vector unsigned char)(0); | |
1065 register const vector unsigned short vctwo = (const vector unsigned short)(2); | |
1066 | |
1067 temp1 = vec_ld(0, pixels); | |
1068 temp2 = vec_ld(16, pixels); | |
1069 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
1070 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) | |
1071 { | |
1072 pixelsv2 = temp2; | |
1073 } | |
1074 else | |
1075 { | |
1076 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
1077 } | |
1078 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1079 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1080 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1081 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1082 pixelssum3 = vec_add((vector unsigned short)pixelsv3, | |
1083 (vector unsigned short)pixelsv4); | |
1084 pixelssum3 = vec_add(pixelssum3, vctwo); | |
1085 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
1086 (vector unsigned short)pixelsv2); | |
1087 pixelssum1 = vec_add(pixelssum1, vctwo); | |
1088 | |
1089 POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1090 for (i = 0; i < h ; i++) { | |
1091 blockv = vec_ld(0, block); | |
1092 | |
1093 temp1 = vec_ld(line_size, pixels); | |
1094 temp2 = vec_ld(line_size + 16, pixels); | |
1095 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
1096 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) | |
1097 { | |
1098 pixelsv2 = temp2; | |
1099 } | |
1100 else | |
1101 { | |
1102 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
1103 } | |
1104 | |
1105 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1106 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1107 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1108 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1109 | |
1110 pixelssum4 = vec_add((vector unsigned short)pixelsv3, | |
1111 (vector unsigned short)pixelsv4); | |
1112 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
1113 (vector unsigned short)pixelsv2); | |
1114 temp4 = vec_add(pixelssum3, pixelssum4); | |
1115 temp4 = vec_sra(temp4, vctwo); | |
1116 temp3 = vec_add(pixelssum1, pixelssum2); | |
1117 temp3 = vec_sra(temp3, vctwo); | |
1118 | |
1119 pixelssum3 = vec_add(pixelssum4, vctwo); | |
1120 pixelssum1 = vec_add(pixelssum2, vctwo); | |
1121 | |
1122 blockv = vec_packsu(temp3, temp4); | |
1123 | |
1124 vec_st(blockv, 0, block); | |
1125 | |
1126 block += line_size; | |
1127 pixels += line_size; | |
1128 } | |
1129 | |
1130 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1131 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1132 } | |
1133 | |
1134 /* next one assumes that ((line_size % 16) == 0) */ | |
1135 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
1136 { | |
1137 POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1138 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
1139 int j; | |
1140 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1141 for (j = 0; j < 4; j++) { | |
1142 int i; | |
1143 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1144 const uint32_t b = | |
1145 (((const struct unaligned_32 *) (pixels + 1))->l); | |
1146 uint32_t l0 = | |
1147 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
1148 uint32_t h0 = | |
1149 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1150 uint32_t l1, h1; | |
1151 pixels += line_size; | |
1152 for (i = 0; i < h; i += 2) { | |
1153 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1154 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1155 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); | |
1156 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1157 *((uint32_t *) block) = | |
1158 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1159 pixels += line_size; | |
1160 block += line_size; | |
1161 a = (((const struct unaligned_32 *) (pixels))->l); | |
1162 b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1163 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
1164 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1165 *((uint32_t *) block) = | |
1166 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1167 pixels += line_size; | |
1168 block += line_size; | |
1169 } pixels += 4 - line_size * (h + 1); | |
1170 block += 4 - line_size * h; | |
1171 } | |
1172 | |
1173 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1174 | |
1175 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1176 register int i; | |
1177 register vector unsigned char | |
1178 pixelsv1, pixelsv2, pixelsv3, pixelsv4; | |
1179 register vector unsigned char | |
1180 blockv, temp1, temp2; | |
1181 register vector unsigned short | |
1182 pixelssum1, pixelssum2, temp3, | |
1183 pixelssum3, pixelssum4, temp4; | |
1184 register const vector unsigned char vczero = (const vector unsigned char)(0); | |
1185 register const vector unsigned short vcone = (const vector unsigned short)(1); | |
1186 register const vector unsigned short vctwo = (const vector unsigned short)(2); | |
1187 | |
1188 temp1 = vec_ld(0, pixels); | |
1189 temp2 = vec_ld(16, pixels); | |
1190 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
1191 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) | |
1192 { | |
1193 pixelsv2 = temp2; | |
1194 } | |
1195 else | |
1196 { | |
1197 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
1198 } | |
1199 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1200 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1201 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1202 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1203 pixelssum3 = vec_add((vector unsigned short)pixelsv3, | |
1204 (vector unsigned short)pixelsv4); | |
1205 pixelssum3 = vec_add(pixelssum3, vcone); | |
1206 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
1207 (vector unsigned short)pixelsv2); | |
1208 pixelssum1 = vec_add(pixelssum1, vcone); | |
1209 | |
1210 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1211 for (i = 0; i < h ; i++) { | |
1212 blockv = vec_ld(0, block); | |
1213 | |
1214 temp1 = vec_ld(line_size, pixels); | |
1215 temp2 = vec_ld(line_size + 16, pixels); | |
1216 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
1217 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) | |
1218 { | |
1219 pixelsv2 = temp2; | |
1220 } | |
1221 else | |
1222 { | |
1223 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
1224 } | |
1225 | |
1226 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1227 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1228 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1229 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1230 | |
1231 pixelssum4 = vec_add((vector unsigned short)pixelsv3, | |
1232 (vector unsigned short)pixelsv4); | |
1233 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
1234 (vector unsigned short)pixelsv2); | |
1235 temp4 = vec_add(pixelssum3, pixelssum4); | |
1236 temp4 = vec_sra(temp4, vctwo); | |
1237 temp3 = vec_add(pixelssum1, pixelssum2); | |
1238 temp3 = vec_sra(temp3, vctwo); | |
1239 | |
1240 pixelssum3 = vec_add(pixelssum4, vcone); | |
1241 pixelssum1 = vec_add(pixelssum2, vcone); | |
1242 | |
1243 blockv = vec_packsu(temp3, temp4); | |
1244 | |
1245 vec_st(blockv, 0, block); | |
1246 | |
1247 block += line_size; | |
1248 pixels += line_size; | |
1249 } | |
1250 | |
1251 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1252 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1253 } | |
1254 | |
972 int has_altivec(void) | 1255 int has_altivec(void) |
973 { | 1256 { |
974 #if CONFIG_DARWIN | 1257 #if CONFIG_DARWIN |
975 int sels[2] = {CTL_HW, HW_VECTORUNIT}; | 1258 int sels[2] = {CTL_HW, HW_VECTORUNIT}; |
976 int has_vu = 0; | 1259 int has_vu = 0; |