comparison ppc/h264_altivec.c @ 5165:c99fa49eaa80 libavcodec

Part 2/2 of fixing the AltiVec-accelerated H.264 luma in-loop filter. In h264_deblock_q1, the result of the deblock needs to be kept so that it can be used in later deblocks, so return this value now. Also change the sign of the tc0 vector: it is really a signed value, so treat it as such until after the >= 0 check; from that point on, after being masked, it can be treated as unsigned. Patch by Graham Booker % gbooker A tamu P edu%
author gpoirier
date Sun, 17 Jun 2007 09:37:13 +0000
parents 830b9dd36fef
children 5be6f723abb6
comparison
equal deleted inserted replaced
5164:830b9dd36fef 5165:c99fa49eaa80
738 mask = vec_and(mask, tempmask); 738 mask = vec_and(mask, tempmask);
739 739
740 return mask; 740 return mask;
741 } 741 }
742 742
743 // out: p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) 743 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
744 static inline void h264_deblock_q1(register vector unsigned char p0, 744 static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
745 register vector unsigned char p1, 745 register vector unsigned char p1,
746 register vector unsigned char p2, 746 register vector unsigned char p2,
747 register vector unsigned char q0, 747 register vector unsigned char q0,
748 register vector unsigned char tc0) { 748 register vector unsigned char tc0) {
749 749
751 register vector unsigned char temp; 751 register vector unsigned char temp;
752 register vector unsigned char uncliped; 752 register vector unsigned char uncliped;
753 register vector unsigned char ones; 753 register vector unsigned char ones;
754 register vector unsigned char max; 754 register vector unsigned char max;
755 register vector unsigned char min; 755 register vector unsigned char min;
756 register vector unsigned char newp1;
756 757
757 temp = vec_xor(average, p2); 758 temp = vec_xor(average, p2);
758 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ 759 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
759 ones = vec_splat_u8(1); 760 ones = vec_splat_u8(1);
760 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ 761 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */
761 uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ 762 uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
762 max = vec_adds(p1, tc0); 763 max = vec_adds(p1, tc0);
763 min = vec_subs(p1, tc0); 764 min = vec_subs(p1, tc0);
764 p1 = vec_max(min, uncliped); 765 newp1 = vec_max(min, uncliped);
765 p1 = vec_min(max, p1); 766 newp1 = vec_min(max, newp1);
767 return newp1;
766 } 768 }
767 769
768 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ 770 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
769 \ 771 \
770 const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ 772 const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
802 register vector unsigned char alphavec; \ 804 register vector unsigned char alphavec; \
803 register vector unsigned char betavec; \ 805 register vector unsigned char betavec; \
804 register vector unsigned char mask; \ 806 register vector unsigned char mask; \
805 register vector unsigned char p1mask; \ 807 register vector unsigned char p1mask; \
806 register vector unsigned char q1mask; \ 808 register vector unsigned char q1mask; \
807 register vector unsigned char tc0vec; \ 809 register vector char tc0vec; \
808 register vector unsigned char finaltc0; \ 810 register vector unsigned char finaltc0; \
809 register vector unsigned char tc0masked; \ 811 register vector unsigned char tc0masked; \
812 register vector unsigned char newp1; \
813 register vector unsigned char newq1; \
810 \ 814 \
811 temp[0] = alpha; \ 815 temp[0] = alpha; \
812 temp[1] = beta; \ 816 temp[1] = beta; \
813 alphavec = vec_ld(0, temp); \ 817 alphavec = vec_ld(0, temp); \
814 betavec = vec_splat(alphavec, 0x1); \ 818 betavec = vec_splat(alphavec, 0x1); \
817 \ 821 \
818 *((int *)temp) = *((int *)tc0); \ 822 *((int *)temp) = *((int *)tc0); \
819 tc0vec = vec_ld(0, temp); \ 823 tc0vec = vec_ld(0, temp); \
820 tc0vec = vec_mergeh(tc0vec, tc0vec); \ 824 tc0vec = vec_mergeh(tc0vec, tc0vec); \
821 tc0vec = vec_mergeh(tc0vec, tc0vec); \ 825 tc0vec = vec_mergeh(tc0vec, tc0vec); \
822 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_u8(-1))); /* if tc0[i] >= 0 */ \ 826 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
823 finaltc0 = vec_and(tc0vec, mask); /*tc = tc0[i]*/ \ 827 finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0 */ \
824 \ 828 \
825 p1mask = diff_lt_altivec(p2, p0, betavec); \ 829 p1mask = diff_lt_altivec(p2, p0, betavec); \
826 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ 830 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \
827 tc0masked = vec_and(p1mask, tc0vec); \ 831 tc0masked = vec_and(p1mask, tc0vec); \
828 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ 832 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
829 h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ 833 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
830 /*end if*/ \ 834 /*end if*/ \
831 \ 835 \
832 q1mask = diff_lt_altivec(q2, q0, betavec); \ 836 q1mask = diff_lt_altivec(q2, q0, betavec); \
833 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ 837 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
834 tc0masked = vec_and(q1mask, tc0vec); \ 838 tc0masked = vec_and(q1mask, tc0vec); \
835 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ 839 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
836 h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ 840 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
837 /*end if*/ \ 841 /*end if*/ \
838 \ 842 \
839 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ 843 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
844 p1 = newp1; \
845 q1 = newq1; \
840 } 846 }
841 847
842 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { 848 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
843 849
844 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { 850 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {