Mercurial > libavcodec.hg
comparison ppc/dsputil_altivec.c @ 1009:3b7cc8e4b83f libavcodec
AltiVec perf (take 2), plus a couple of AltiVec functions, by Romain Dolbeau <dolbeau at irisa dot fr>
author | michaelni |
---|---|
date | Thu, 16 Jan 2003 21:54:55 +0000 |
parents | edc10966b081 |
children | 35cf2f4a0f8c |
comparison
equal
deleted
inserted
replaced
1008:fb6cbb8a04a3 | 1009:3b7cc8e4b83f |
---|---|
22 | 22 |
23 #if CONFIG_DARWIN | 23 #if CONFIG_DARWIN |
24 #include <sys/sysctl.h> | 24 #include <sys/sysctl.h> |
25 #endif | 25 #endif |
26 | 26 |
27 #ifdef ALTIVEC_TBL_PERFORMANCE_REPORT | |
28 unsigned long long perfdata[altivec_perf_total][altivec_data_total]; | |
29 /* list below must match enum in dsputil_altivec.h */ | |
30 static unsigned char* perfname[] = { | |
31 "fft_calc", | |
32 "gmc1", | |
33 "dct_unquantize_h263", | |
34 "idct_add", | |
35 "idct_put", | |
36 "put_pixels_clamped", | |
37 "put_pixels16", | |
38 "avg_pixels16" | |
39 }; | |
40 #include <stdio.h> | |
41 #endif | |
42 | |
27 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) | 43 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
28 { | 44 { |
29 int i; | 45 int i; |
30 int s __attribute__((aligned(16))); | 46 int s __attribute__((aligned(16))); |
31 const vector unsigned char zero = (const vector unsigned char)(0); | 47 const vector unsigned char zero = (const vector unsigned char)(0); |
592 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { | 608 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { |
593 return pix_abs8x8_altivec(a,b,stride); | 609 return pix_abs8x8_altivec(a,b,stride); |
594 } | 610 } |
595 | 611 |
596 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { | 612 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { |
597 #if 0 | 613 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
598 int i; | 614 int i; |
599 for(i=0; i+7<w; i++){ | 615 for(i=0; i+7<w; i++){ |
600 dst[i+0] += src[i+0]; | 616 dst[i+0] += src[i+0]; |
601 dst[i+1] += src[i+1]; | 617 dst[i+1] += src[i+1]; |
602 dst[i+2] += src[i+2]; | 618 dst[i+2] += src[i+2]; |
606 dst[i+6] += src[i+6]; | 622 dst[i+6] += src[i+6]; |
607 dst[i+7] += src[i+7]; | 623 dst[i+7] += src[i+7]; |
608 } | 624 } |
609 for(; i<w; i++) | 625 for(; i<w; i++) |
610 dst[i+0] += src[i+0]; | 626 dst[i+0] += src[i+0]; |
611 #else | 627 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
612 register int i; | 628 register int i; |
613 register uint8_t *temp_src = src, *temp_dst = dst; | 629 register vector unsigned char vdst, vsrc; |
614 register vector unsigned char vdst, vsrc, temp1, temp2; | 630 |
615 register vector unsigned char perm; | 631 /* dst and src are 16 bytes-aligned (guaranteed) */ |
616 register int count = 0; | 632 for(i = 0 ; (i + 15) < w ; i++) |
617 | |
618 for (i = 0; (i < w) && ((unsigned long)temp_dst & 0x0000000F) ; i++) | |
619 { | 633 { |
620 dst[i] = src[i]; | 634 vdst = vec_ld(i << 4, (unsigned char*)dst); |
621 temp_src ++; | 635 vsrc = vec_ld(i << 4, (unsigned char*)src); |
622 temp_dst ++; | |
623 } | |
624 /* temp_dst is a properly aligned pointer */ | |
625 /* we still need to deal with ill-aligned src */ | |
626 perm = vec_lvsl(0, temp_src); | |
627 temp1 = vec_ld(0, temp_src); | |
628 while ((i + 15) < w) | |
629 { | |
630 temp2 = vec_ld(count + 16, temp_src); | |
631 vdst = vec_ld(count, temp_dst); | |
632 vsrc = vec_perm(temp1, temp2, perm); | |
633 temp1 = temp2; | |
634 vdst = vec_add(vsrc, vdst); | 636 vdst = vec_add(vsrc, vdst); |
635 vec_st(vdst, count, temp_dst); | 637 vec_st(vdst, i << 4, (unsigned char*)dst); |
636 count += 16; | 638 } |
637 } | 639 /* if w is not a multiple of 16 */ |
638 for (; (i < w) ; i++) | 640 for (; (i < w) ; i++) |
639 { | 641 { |
640 dst[i] = src[i]; | 642 dst[i] = src[i]; |
641 } | 643 } |
642 #endif | 644 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
645 } | |
646 | |
647 extern UINT8 cropTbl[]; | |
648 void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels, | |
649 int line_size) | |
650 { | |
651 ALTIVEC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1); | |
652 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
653 int i; | |
654 UINT8 *cm = cropTbl + MAX_NEG_CROP; | |
655 | |
656 ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); | |
657 | |
658 /* read the pixels */ | |
659 for(i=0;i<8;i++) { | |
660 pixels[0] = cm[block[0]]; | |
661 pixels[1] = cm[block[1]]; | |
662 pixels[2] = cm[block[2]]; | |
663 pixels[3] = cm[block[3]]; | |
664 pixels[4] = cm[block[4]]; | |
665 pixels[5] = cm[block[5]]; | |
666 pixels[6] = cm[block[6]]; | |
667 pixels[7] = cm[block[7]]; | |
668 | |
669 pixels += line_size; | |
670 block += 8; | |
671 } | |
672 | |
673 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); | |
674 | |
675 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
676 register const vector short vczero = (const vector short)(0); | |
677 register vector short | |
678 blockv0, blockv1, blockv2, blockv3, | |
679 blockv4, blockv5, blockv6, blockv7; | |
680 register vector unsigned char | |
681 pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4, | |
682 pixelsv0old, pixelsv4old; | |
683 | |
684 ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); | |
685 | |
686 blockv0 = vec_ld(0, block); | |
687 blockv1 = vec_ld(16, block); | |
688 blockv2 = vec_ld(32, block); | |
689 blockv3 = vec_ld(48, block); | |
690 blockv4 = vec_ld(64, block); | |
691 blockv5 = vec_ld(80, block); | |
692 blockv6 = vec_ld(96, block); | |
693 blockv7 = vec_ld(112, block); | |
694 if (((unsigned long)pixels) & 0x0000000F) | |
695 { | |
696 pixelsv0old = vec_ld(-8, pixels); | |
697 pixelsv4old = vec_ld(56, pixels); | |
698 pixelsv0 = vec_packsu(vczero, blockv0); | |
699 pixelsv1 = vec_packsu(blockv1, blockv2); | |
700 pixelsv2 = vec_packsu(blockv3, blockv4); | |
701 pixelsv3 = vec_packsu(blockv5, blockv6); | |
702 pixelsv4 = vec_packsu(blockv5, vczero); | |
703 pixelsv0 = vec_perm(pixelsv0old, pixelsv0, vcprm(0, 1, s2, s3)); | |
704 pixelsv4 = vec_perm(pixelsv4, pixelsv4old, vcprm(0, 1, s2, s3)); | |
705 vec_st(pixelsv0, -8, pixels); | |
706 vec_st(pixelsv1, 8, pixels); | |
707 vec_st(pixelsv2, 24, pixels); | |
708 vec_st(pixelsv3, 40, pixels); | |
709 vec_st(pixelsv4, 56, pixels); | |
710 } | |
711 else | |
712 { | |
713 pixelsv0 = vec_packsu(blockv0, blockv1); | |
714 pixelsv1 = vec_packsu(blockv2, blockv3); | |
715 pixelsv2 = vec_packsu(blockv4, blockv5); | |
716 pixelsv3 = vec_packsu(blockv6, blockv7); | |
717 vec_st(pixelsv0, 0, pixels); | |
718 vec_st(pixelsv1, 16, pixels); | |
719 vec_st(pixelsv2, 32, pixels); | |
720 vec_st(pixelsv3, 48, pixels); | |
721 } | |
722 | |
723 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); | |
724 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
725 } | |
726 | |
727 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
728 { | |
729 ALTIVEC_TBL_DECLARE(altivec_put_pixels16_num, 1); | |
730 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
731 int i; | |
732 | |
733 ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1); | |
734 | |
735 for(i=0; i<h; i++) { | |
736 *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); | |
737 *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); | |
738 *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); | |
739 *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); | |
740 pixels+=line_size; | |
741 block +=line_size; | |
742 } | |
743 | |
744 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); | |
745 | |
746 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
747 | |
748 register vector unsigned char perm = vec_lvsl(0, pixels); | |
749 register vector unsigned char pixelsv1, pixelsv2; | |
750 int i; | |
751 | |
752 ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1); | |
753 | |
754 for(i=0; i<h; i++) { | |
755 pixelsv1 = vec_ld(0, (unsigned char*)pixels); | |
756 pixelsv2 = vec_ld(16, (unsigned char*)pixels); | |
757 vec_st(vec_perm(pixelsv1, pixelsv2, perm), 0, (unsigned char*)block); | |
758 pixels+=line_size; | |
759 block +=line_size; | |
760 } | |
761 | |
762 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); | |
763 | |
764 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
765 } | |
766 | |
767 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | |
768 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
769 { | |
770 ALTIVEC_TBL_DECLARE(altivec_avg_pixels16_num, 1); | |
771 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
772 int i; | |
773 | |
774 ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); | |
775 | |
776 for(i=0; i<h; i++) { | |
777 op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); | |
778 op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); | |
779 op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); | |
780 op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); | |
781 pixels+=line_size; | |
782 block +=line_size; | |
783 } | |
784 | |
785 ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); | |
786 | |
787 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
788 | |
789 register vector unsigned char perm = vec_lvsl(0, pixels); | |
790 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | |
791 int i; | |
792 | |
793 ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); | |
794 | |
795 for(i=0; i<h; i++) { | |
796 pixelsv1 = vec_ld(0, (unsigned char*)pixels); | |
797 pixelsv2 = vec_ld(16, (unsigned char*)pixels); | |
798 blockv = vec_ld(0, block); | |
799 pixelsv = vec_perm(pixelsv1, pixelsv2, perm); | |
800 blockv = vec_avg(blockv,pixelsv); | |
801 vec_st(blockv, 0, (unsigned char*)block); | |
802 pixels+=line_size; | |
803 block +=line_size; | |
804 } | |
805 | |
806 ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); | |
807 | |
808 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
643 } | 809 } |
644 | 810 |
645 int has_altivec(void) | 811 int has_altivec(void) |
646 { | 812 { |
647 #if CONFIG_DARWIN | 813 #if CONFIG_DARWIN |
654 | 820 |
655 if (err == 0) return (has_vu != 0); | 821 if (err == 0) return (has_vu != 0); |
656 #endif | 822 #endif |
657 return 0; | 823 return 0; |
658 } | 824 } |
825 | |
826 #ifdef ALTIVEC_TBL_PERFORMANCE_REPORT | |
827 void altivec_display_perf_report(void) | |
828 { | |
829 int i; | |
830 fprintf(stderr, "AltiVec performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n"); | |
831 for(i = 0 ; i < altivec_perf_total ; i++) | |
832 { | |
833 if (perfdata[i][altivec_data_num] != (unsigned long long)0) | |
834 fprintf(stderr, " Function \"%s\":\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", | |
835 perfname[i], | |
836 perfdata[i][altivec_data_min], | |
837 perfdata[i][altivec_data_max], | |
838 (double)perfdata[i][altivec_data_sum] / | |
839 (double)perfdata[i][altivec_data_num], | |
840 perfdata[i][altivec_data_num]); | |
841 } | |
842 } | |
843 #endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */ |