Mercurial > libavcodec.hg
comparison ppc/h264_altivec.c @ 5165:c99fa49eaa80 libavcodec
part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
In h264_deblock_q1, the result of the deblock needs to be kept to
be used in future deblocks, so return this value now.
Also change the sign of the tc0 vector: it is really a signed value, so
treat it as such until after the >= 0 check; at that point, once it has
been masked, it can be treated as unsigned.
Patch by Graham Booker % gbooker A tamu P edu%
author | gpoirier |
---|---|
date | Sun, 17 Jun 2007 09:37:13 +0000 |
parents | 830b9dd36fef |
children | 5be6f723abb6 |
comparison
equal
deleted
inserted
replaced
5164:830b9dd36fef | 5165:c99fa49eaa80 |
---|---|
738 mask = vec_and(mask, tempmask); | 738 mask = vec_and(mask, tempmask); |
739 | 739 |
740 return mask; | 740 return mask; |
741 } | 741 } |
742 | 742 |
743 // out: p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) | 743 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) |
744 static inline void h264_deblock_q1(register vector unsigned char p0, | 744 static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, |
745 register vector unsigned char p1, | 745 register vector unsigned char p1, |
746 register vector unsigned char p2, | 746 register vector unsigned char p2, |
747 register vector unsigned char q0, | 747 register vector unsigned char q0, |
748 register vector unsigned char tc0) { | 748 register vector unsigned char tc0) { |
749 | 749 |
751 register vector unsigned char temp; | 751 register vector unsigned char temp; |
752 register vector unsigned char uncliped; | 752 register vector unsigned char uncliped; |
753 register vector unsigned char ones; | 753 register vector unsigned char ones; |
754 register vector unsigned char max; | 754 register vector unsigned char max; |
755 register vector unsigned char min; | 755 register vector unsigned char min; |
756 register vector unsigned char newp1; | |
756 | 757 |
757 temp = vec_xor(average, p2); | 758 temp = vec_xor(average, p2); |
758 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ | 759 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ |
759 ones = vec_splat_u8(1); | 760 ones = vec_splat_u8(1); |
760 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ | 761 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ |
761 uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ | 762 uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ |
762 max = vec_adds(p1, tc0); | 763 max = vec_adds(p1, tc0); |
763 min = vec_subs(p1, tc0); | 764 min = vec_subs(p1, tc0); |
764 p1 = vec_max(min, uncliped); | 765 newp1 = vec_max(min, uncliped); |
765 p1 = vec_min(max, p1); | 766 newp1 = vec_min(max, newp1); |
767 return newp1; | |
766 } | 768 } |
767 | 769 |
768 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ | 770 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ |
769 \ | 771 \ |
770 const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ | 772 const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ |
802 register vector unsigned char alphavec; \ | 804 register vector unsigned char alphavec; \ |
803 register vector unsigned char betavec; \ | 805 register vector unsigned char betavec; \ |
804 register vector unsigned char mask; \ | 806 register vector unsigned char mask; \ |
805 register vector unsigned char p1mask; \ | 807 register vector unsigned char p1mask; \ |
806 register vector unsigned char q1mask; \ | 808 register vector unsigned char q1mask; \ |
807 register vector unsigned char tc0vec; \ | 809 register vector char tc0vec; \ |
808 register vector unsigned char finaltc0; \ | 810 register vector unsigned char finaltc0; \ |
809 register vector unsigned char tc0masked; \ | 811 register vector unsigned char tc0masked; \ |
812 register vector unsigned char newp1; \ | |
813 register vector unsigned char newq1; \ | |
810 \ | 814 \ |
811 temp[0] = alpha; \ | 815 temp[0] = alpha; \ |
812 temp[1] = beta; \ | 816 temp[1] = beta; \ |
813 alphavec = vec_ld(0, temp); \ | 817 alphavec = vec_ld(0, temp); \ |
814 betavec = vec_splat(alphavec, 0x1); \ | 818 betavec = vec_splat(alphavec, 0x1); \ |
817 \ | 821 \ |
818 *((int *)temp) = *((int *)tc0); \ | 822 *((int *)temp) = *((int *)tc0); \ |
819 tc0vec = vec_ld(0, temp); \ | 823 tc0vec = vec_ld(0, temp); \ |
820 tc0vec = vec_mergeh(tc0vec, tc0vec); \ | 824 tc0vec = vec_mergeh(tc0vec, tc0vec); \ |
821 tc0vec = vec_mergeh(tc0vec, tc0vec); \ | 825 tc0vec = vec_mergeh(tc0vec, tc0vec); \ |
822 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_u8(-1))); /* if tc0[i] >= 0 */ \ | 826 mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ |
823 finaltc0 = vec_and(tc0vec, mask); /*tc = tc0[i]*/ \ | 827 finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0 */ \ |
824 \ | 828 \ |
825 p1mask = diff_lt_altivec(p2, p0, betavec); \ | 829 p1mask = diff_lt_altivec(p2, p0, betavec); \ |
826 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ | 830 p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ |
827 tc0masked = vec_and(p1mask, tc0vec); \ | 831 tc0masked = vec_and(p1mask, tc0vec); \ |
828 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ | 832 finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ |
829 h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ | 833 newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ |
830 /*end if*/ \ | 834 /*end if*/ \ |
831 \ | 835 \ |
832 q1mask = diff_lt_altivec(q2, q0, betavec); \ | 836 q1mask = diff_lt_altivec(q2, q0, betavec); \ |
833 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ | 837 q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ |
834 tc0masked = vec_and(q1mask, tc0vec); \ | 838 tc0masked = vec_and(q1mask, tc0vec); \ |
835 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ | 839 finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ |
836 h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ | 840 newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ |
837 /*end if*/ \ | 841 /*end if*/ \ |
838 \ | 842 \ |
839 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ | 843 h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ |
844 p1 = newp1; \ | |
845 q1 = newq1; \ | |
840 } | 846 } |
841 | 847 |
842 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { | 848 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { |
843 | 849 |
844 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { | 850 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { |