libavcodec.hg: comparison of i386/dsputil_mmx.c @ 6403:9a736918fd90
split encoding part of dsputil_mmx into its own file
| author | aurel |
|---|---|
| date | Mon, 25 Feb 2008 23:14:22 +0000 |
| parents | 3dc36ec2dcad |
| children | 5154ab444372 |
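
What the split amounts to is easiest to see in the final hunk of this comparison: every `#ifdef CONFIG_ENCODERS` block (get_pixels, diff_pixels, pix_sum, the SSE/SAD/Hadamard compare functions, the QNS helpers, and the encoder entries in dsputil_init_mmx) is removed from i386/dsputil_mmx.c, and the init path now calls `dsputilenc_init_mmx(c, avctx)` behind a plain `if (ENABLE_ENCODERS)` instead of the old preprocessor-guarded `dsputil_init_pix_mmx()` call. Below is a minimal, self-contained sketch of that wiring; only `dsputilenc_init_mmx`, `ENABLE_ENCODERS`, and the old `CONFIG_ENCODERS` guard are taken from the diff, while everything else (the reduced struct contents, `get_pixels_stub`, the `_sketch` suffix, `main`) is a hypothetical stand-in.

```c
/* Hedged sketch of the init wiring after the split; not the FFmpeg sources.
 * DSPContext/AVCodecContext are reduced to stand-ins with one field each. */
#include <stdio.h>

typedef struct DSPContext {
    void (*get_pixels)(void);   /* stands in for the encoder-only function pointers */
} DSPContext;

typedef struct AVCodecContext {
    int flags;
} AVCodecContext;

/* FFmpeg's configure is expected to define this to 0 or 1, so it can sit in a
 * plain C if(); CONFIG_ENCODERS was only usable with #ifdef. */
#define ENABLE_ENCODERS 1

static void get_pixels_stub(void)
{
    puts("encoder-only dsp routine");
}

/* After the commit this function lives in the new encoder-only source file;
 * the body here is a stand-in for the real MMX setup. */
static void dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    (void)avctx;
    c->get_pixels = get_pixels_stub;
}

/* dsputil_mmx.c keeps only this call. With ENABLE_ENCODERS == 0 the condition
 * is a compile-time constant, so the branch can be optimized away and no
 * encoder code is linked in. */
static void dsputil_init_mmx_sketch(DSPContext *c, AVCodecContext *avctx)
{
    if (ENABLE_ENCODERS)
        dsputilenc_init_mmx(c, avctx);
}

int main(void)
{
    DSPContext     c     = { 0 };
    AVCodecContext avctx = { 0 };

    dsputil_init_mmx_sketch(&c, &avctx);
    if (c.get_pixels)
        c.get_pixels();
    return 0;
}
```

When encoders are configured out, `ENABLE_ENCODERS` becomes 0 and the call is dead-code-eliminated, which is what lets the new encoder-only file be dropped from the build without sprinkling #ifdefs through this one.
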
6402:3164768539be | 6403:9a736918fd90 |
---|---|
72 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; | 72 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; |
73 | 73 |
74 #define JUMPALIGN() asm volatile (ASMALIGN(3)::) | 74 #define JUMPALIGN() asm volatile (ASMALIGN(3)::) |
75 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::) | 75 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::) |
76 | 76 |
77 #define MOVQ_WONE(regd) \ | |
78 asm volatile ( \ | |
79 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
80 "psrlw $15, %%" #regd ::) | |
81 | |
82 #define MOVQ_BFE(regd) \ | 77 #define MOVQ_BFE(regd) \ |
83 asm volatile ( \ | 78 asm volatile ( \ |
84 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | 79 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ |
85 "paddb %%" #regd ", %%" #regd " \n\t" ::) | 80 "paddb %%" #regd ", %%" #regd " \n\t" ::) |
86 | 81 |
217 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx | 212 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx |
218 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx | 213 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx |
219 | 214 |
220 /***********************************/ | 215 /***********************************/ |
221 /* standard MMX */ | 216 /* standard MMX */ |
222 | |
223 #ifdef CONFIG_ENCODERS | |
224 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) | |
225 { | |
226 asm volatile( | |
227 "mov $-128, %%"REG_a" \n\t" | |
228 "pxor %%mm7, %%mm7 \n\t" | |
229 ASMALIGN(4) | |
230 "1: \n\t" | |
231 "movq (%0), %%mm0 \n\t" | |
232 "movq (%0, %2), %%mm2 \n\t" | |
233 "movq %%mm0, %%mm1 \n\t" | |
234 "movq %%mm2, %%mm3 \n\t" | |
235 "punpcklbw %%mm7, %%mm0 \n\t" | |
236 "punpckhbw %%mm7, %%mm1 \n\t" | |
237 "punpcklbw %%mm7, %%mm2 \n\t" | |
238 "punpckhbw %%mm7, %%mm3 \n\t" | |
239 "movq %%mm0, (%1, %%"REG_a") \n\t" | |
240 "movq %%mm1, 8(%1, %%"REG_a") \n\t" | |
241 "movq %%mm2, 16(%1, %%"REG_a") \n\t" | |
242 "movq %%mm3, 24(%1, %%"REG_a") \n\t" | |
243 "add %3, %0 \n\t" | |
244 "add $32, %%"REG_a" \n\t" | |
245 "js 1b \n\t" | |
246 : "+r" (pixels) | |
247 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) | |
248 : "%"REG_a | |
249 ); | |
250 } | |
251 | |
252 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) | |
253 { | |
254 asm volatile( | |
255 "pxor %%mm7, %%mm7 \n\t" | |
256 "mov $-128, %%"REG_a" \n\t" | |
257 ASMALIGN(4) | |
258 "1: \n\t" | |
259 "movq (%0), %%mm0 \n\t" | |
260 "movq (%1), %%mm2 \n\t" | |
261 "movq %%mm0, %%mm1 \n\t" | |
262 "movq %%mm2, %%mm3 \n\t" | |
263 "punpcklbw %%mm7, %%mm0 \n\t" | |
264 "punpckhbw %%mm7, %%mm1 \n\t" | |
265 "punpcklbw %%mm7, %%mm2 \n\t" | |
266 "punpckhbw %%mm7, %%mm3 \n\t" | |
267 "psubw %%mm2, %%mm0 \n\t" | |
268 "psubw %%mm3, %%mm1 \n\t" | |
269 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
270 "movq %%mm1, 8(%2, %%"REG_a") \n\t" | |
271 "add %3, %0 \n\t" | |
272 "add %3, %1 \n\t" | |
273 "add $16, %%"REG_a" \n\t" | |
274 "jnz 1b \n\t" | |
275 : "+r" (s1), "+r" (s2) | |
276 : "r" (block+64), "r" ((long)stride) | |
277 : "%"REG_a | |
278 ); | |
279 } | |
280 #endif //CONFIG_ENCODERS | |
281 | 217 |
282 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) | 218 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
283 { | 219 { |
284 const DCTELEM *p; | 220 const DCTELEM *p; |
285 uint8_t *pix; | 221 uint8_t *pix; |
541 " js 1b \n\t" | 477 " js 1b \n\t" |
542 : : "r" (((uint8_t *)blocks)+128*6) | 478 : : "r" (((uint8_t *)blocks)+128*6) |
543 : "%"REG_a | 479 : "%"REG_a |
544 ); | 480 ); |
545 } | 481 } |
546 | |
547 #ifdef CONFIG_ENCODERS | |
548 static int pix_sum16_mmx(uint8_t * pix, int line_size){ | |
549 const int h=16; | |
550 int sum; | |
551 long index= -line_size*h; | |
552 | |
553 asm volatile( | |
554 "pxor %%mm7, %%mm7 \n\t" | |
555 "pxor %%mm6, %%mm6 \n\t" | |
556 "1: \n\t" | |
557 "movq (%2, %1), %%mm0 \n\t" | |
558 "movq (%2, %1), %%mm1 \n\t" | |
559 "movq 8(%2, %1), %%mm2 \n\t" | |
560 "movq 8(%2, %1), %%mm3 \n\t" | |
561 "punpcklbw %%mm7, %%mm0 \n\t" | |
562 "punpckhbw %%mm7, %%mm1 \n\t" | |
563 "punpcklbw %%mm7, %%mm2 \n\t" | |
564 "punpckhbw %%mm7, %%mm3 \n\t" | |
565 "paddw %%mm0, %%mm1 \n\t" | |
566 "paddw %%mm2, %%mm3 \n\t" | |
567 "paddw %%mm1, %%mm3 \n\t" | |
568 "paddw %%mm3, %%mm6 \n\t" | |
569 "add %3, %1 \n\t" | |
570 " js 1b \n\t" | |
571 "movq %%mm6, %%mm5 \n\t" | |
572 "psrlq $32, %%mm6 \n\t" | |
573 "paddw %%mm5, %%mm6 \n\t" | |
574 "movq %%mm6, %%mm5 \n\t" | |
575 "psrlq $16, %%mm6 \n\t" | |
576 "paddw %%mm5, %%mm6 \n\t" | |
577 "movd %%mm6, %0 \n\t" | |
578 "andl $0xFFFF, %0 \n\t" | |
579 : "=&r" (sum), "+r" (index) | |
580 : "r" (pix - index), "r" ((long)line_size) | |
581 ); | |
582 | |
583 return sum; | |
584 } | |
585 #endif //CONFIG_ENCODERS | |
586 | 482 |
587 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ | 483 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ |
588 long i=0; | 484 long i=0; |
589 asm volatile( | 485 asm volatile( |
590 "1: \n\t" | 486 "1: \n\t" |
798 "r" ((long)(3*stride)) | 694 "r" ((long)(3*stride)) |
799 ); | 695 ); |
800 } | 696 } |
801 } | 697 } |
802 | 698 |
803 #ifdef CONFIG_ENCODERS | |
804 static int pix_norm1_mmx(uint8_t *pix, int line_size) { | |
805 int tmp; | |
806 asm volatile ( | |
807 "movl $16,%%ecx\n" | |
808 "pxor %%mm0,%%mm0\n" | |
809 "pxor %%mm7,%%mm7\n" | |
810 "1:\n" | |
811 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ | |
812 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ | |
813 | |
814 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ | |
815 | |
816 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ | |
817 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ | |
818 | |
819 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ | |
820 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ | |
821 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ | |
822 | |
823 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ | |
824 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ | |
825 | |
826 "pmaddwd %%mm3,%%mm3\n" | |
827 "pmaddwd %%mm4,%%mm4\n" | |
828 | |
829 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, | |
830 pix2^2+pix3^2+pix6^2+pix7^2) */ | |
831 "paddd %%mm3,%%mm4\n" | |
832 "paddd %%mm2,%%mm7\n" | |
833 | |
834 "add %2, %0\n" | |
835 "paddd %%mm4,%%mm7\n" | |
836 "dec %%ecx\n" | |
837 "jnz 1b\n" | |
838 | |
839 "movq %%mm7,%%mm1\n" | |
840 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ | |
841 "paddd %%mm7,%%mm1\n" | |
842 "movd %%mm1,%1\n" | |
843 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" ); | |
844 return tmp; | |
845 } | |
846 | |
847 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | |
848 int tmp; | |
849 asm volatile ( | |
850 "movl %4,%%ecx\n" | |
851 "shr $1,%%ecx\n" | |
852 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ | |
853 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ | |
854 "1:\n" | |
855 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ | |
856 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ | |
857 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ | |
858 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ | |
859 | |
860 /* todo: mm1-mm2, mm3-mm4 */ | |
861 /* algo: subtract mm1 from mm2 with saturation and vice versa */ | |
862 /* OR the results to get absolute difference */ | |
863 "movq %%mm1,%%mm5\n" | |
864 "movq %%mm3,%%mm6\n" | |
865 "psubusb %%mm2,%%mm1\n" | |
866 "psubusb %%mm4,%%mm3\n" | |
867 "psubusb %%mm5,%%mm2\n" | |
868 "psubusb %%mm6,%%mm4\n" | |
869 | |
870 "por %%mm1,%%mm2\n" | |
871 "por %%mm3,%%mm4\n" | |
872 | |
873 /* now convert to 16-bit vectors so we can square them */ | |
874 "movq %%mm2,%%mm1\n" | |
875 "movq %%mm4,%%mm3\n" | |
876 | |
877 "punpckhbw %%mm0,%%mm2\n" | |
878 "punpckhbw %%mm0,%%mm4\n" | |
879 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ | |
880 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ | |
881 | |
882 "pmaddwd %%mm2,%%mm2\n" | |
883 "pmaddwd %%mm4,%%mm4\n" | |
884 "pmaddwd %%mm1,%%mm1\n" | |
885 "pmaddwd %%mm3,%%mm3\n" | |
886 | |
887 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ | |
888 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ | |
889 | |
890 "paddd %%mm2,%%mm1\n" | |
891 "paddd %%mm4,%%mm3\n" | |
892 "paddd %%mm1,%%mm7\n" | |
893 "paddd %%mm3,%%mm7\n" | |
894 | |
895 "decl %%ecx\n" | |
896 "jnz 1b\n" | |
897 | |
898 "movq %%mm7,%%mm1\n" | |
899 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ | |
900 "paddd %%mm7,%%mm1\n" | |
901 "movd %%mm1,%2\n" | |
902 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | |
903 : "r" ((long)line_size) , "m" (h) | |
904 : "%ecx"); | |
905 return tmp; | |
906 } | |
907 | |
908 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | |
909 int tmp; | |
910 asm volatile ( | |
911 "movl %4,%%ecx\n" | |
912 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ | |
913 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ | |
914 "1:\n" | |
915 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ | |
916 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ | |
917 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ | |
918 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ | |
919 | |
920 /* todo: mm1-mm2, mm3-mm4 */ | |
921 /* algo: subtract mm1 from mm2 with saturation and vice versa */ | |
922 /* OR the results to get absolute difference */ | |
923 "movq %%mm1,%%mm5\n" | |
924 "movq %%mm3,%%mm6\n" | |
925 "psubusb %%mm2,%%mm1\n" | |
926 "psubusb %%mm4,%%mm3\n" | |
927 "psubusb %%mm5,%%mm2\n" | |
928 "psubusb %%mm6,%%mm4\n" | |
929 | |
930 "por %%mm1,%%mm2\n" | |
931 "por %%mm3,%%mm4\n" | |
932 | |
933 /* now convert to 16-bit vectors so we can square them */ | |
934 "movq %%mm2,%%mm1\n" | |
935 "movq %%mm4,%%mm3\n" | |
936 | |
937 "punpckhbw %%mm0,%%mm2\n" | |
938 "punpckhbw %%mm0,%%mm4\n" | |
939 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ | |
940 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ | |
941 | |
942 "pmaddwd %%mm2,%%mm2\n" | |
943 "pmaddwd %%mm4,%%mm4\n" | |
944 "pmaddwd %%mm1,%%mm1\n" | |
945 "pmaddwd %%mm3,%%mm3\n" | |
946 | |
947 "add %3,%0\n" | |
948 "add %3,%1\n" | |
949 | |
950 "paddd %%mm2,%%mm1\n" | |
951 "paddd %%mm4,%%mm3\n" | |
952 "paddd %%mm1,%%mm7\n" | |
953 "paddd %%mm3,%%mm7\n" | |
954 | |
955 "decl %%ecx\n" | |
956 "jnz 1b\n" | |
957 | |
958 "movq %%mm7,%%mm1\n" | |
959 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ | |
960 "paddd %%mm7,%%mm1\n" | |
961 "movd %%mm1,%2\n" | |
962 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | |
963 : "r" ((long)line_size) , "m" (h) | |
964 : "%ecx"); | |
965 return tmp; | |
966 } | |
967 | |
968 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | |
969 int tmp; | |
970 asm volatile ( | |
971 "shr $1,%2\n" | |
972 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ | |
973 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ | |
974 "1:\n" | |
975 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ | |
976 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */ | |
977 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ | |
978 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ | |
979 | |
980 /* todo: mm1-mm2, mm3-mm4 */ | |
981 /* algo: subtract mm1 from mm2 with saturation and vice versa */ | |
982 /* OR the results to get absolute difference */ | |
983 "movdqa %%xmm1,%%xmm5\n" | |
984 "movdqa %%xmm3,%%xmm6\n" | |
985 "psubusb %%xmm2,%%xmm1\n" | |
986 "psubusb %%xmm4,%%xmm3\n" | |
987 "psubusb %%xmm5,%%xmm2\n" | |
988 "psubusb %%xmm6,%%xmm4\n" | |
989 | |
990 "por %%xmm1,%%xmm2\n" | |
991 "por %%xmm3,%%xmm4\n" | |
992 | |
993 /* now convert to 16-bit vectors so we can square them */ | |
994 "movdqa %%xmm2,%%xmm1\n" | |
995 "movdqa %%xmm4,%%xmm3\n" | |
996 | |
997 "punpckhbw %%xmm0,%%xmm2\n" | |
998 "punpckhbw %%xmm0,%%xmm4\n" | |
999 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ | |
1000 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ | |
1001 | |
1002 "pmaddwd %%xmm2,%%xmm2\n" | |
1003 "pmaddwd %%xmm4,%%xmm4\n" | |
1004 "pmaddwd %%xmm1,%%xmm1\n" | |
1005 "pmaddwd %%xmm3,%%xmm3\n" | |
1006 | |
1007 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ | |
1008 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ | |
1009 | |
1010 "paddd %%xmm2,%%xmm1\n" | |
1011 "paddd %%xmm4,%%xmm3\n" | |
1012 "paddd %%xmm1,%%xmm7\n" | |
1013 "paddd %%xmm3,%%xmm7\n" | |
1014 | |
1015 "decl %2\n" | |
1016 "jnz 1b\n" | |
1017 | |
1018 "movdqa %%xmm7,%%xmm1\n" | |
1019 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ | |
1020 "paddd %%xmm1,%%xmm7\n" | |
1021 "movdqa %%xmm7,%%xmm1\n" | |
1022 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ | |
1023 "paddd %%xmm1,%%xmm7\n" | |
1024 "movd %%xmm7,%3\n" | |
1025 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) | |
1026 : "r" ((long)line_size)); | |
1027 return tmp; | |
1028 } | |
1029 | |
1030 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { | |
1031 int tmp; | |
1032 asm volatile ( | |
1033 "movl %3,%%ecx\n" | |
1034 "pxor %%mm7,%%mm7\n" | |
1035 "pxor %%mm6,%%mm6\n" | |
1036 | |
1037 "movq (%0),%%mm0\n" | |
1038 "movq %%mm0, %%mm1\n" | |
1039 "psllq $8, %%mm0\n" | |
1040 "psrlq $8, %%mm1\n" | |
1041 "psrlq $8, %%mm0\n" | |
1042 "movq %%mm0, %%mm2\n" | |
1043 "movq %%mm1, %%mm3\n" | |
1044 "punpcklbw %%mm7,%%mm0\n" | |
1045 "punpcklbw %%mm7,%%mm1\n" | |
1046 "punpckhbw %%mm7,%%mm2\n" | |
1047 "punpckhbw %%mm7,%%mm3\n" | |
1048 "psubw %%mm1, %%mm0\n" | |
1049 "psubw %%mm3, %%mm2\n" | |
1050 | |
1051 "add %2,%0\n" | |
1052 | |
1053 "movq (%0),%%mm4\n" | |
1054 "movq %%mm4, %%mm1\n" | |
1055 "psllq $8, %%mm4\n" | |
1056 "psrlq $8, %%mm1\n" | |
1057 "psrlq $8, %%mm4\n" | |
1058 "movq %%mm4, %%mm5\n" | |
1059 "movq %%mm1, %%mm3\n" | |
1060 "punpcklbw %%mm7,%%mm4\n" | |
1061 "punpcklbw %%mm7,%%mm1\n" | |
1062 "punpckhbw %%mm7,%%mm5\n" | |
1063 "punpckhbw %%mm7,%%mm3\n" | |
1064 "psubw %%mm1, %%mm4\n" | |
1065 "psubw %%mm3, %%mm5\n" | |
1066 "psubw %%mm4, %%mm0\n" | |
1067 "psubw %%mm5, %%mm2\n" | |
1068 "pxor %%mm3, %%mm3\n" | |
1069 "pxor %%mm1, %%mm1\n" | |
1070 "pcmpgtw %%mm0, %%mm3\n\t" | |
1071 "pcmpgtw %%mm2, %%mm1\n\t" | |
1072 "pxor %%mm3, %%mm0\n" | |
1073 "pxor %%mm1, %%mm2\n" | |
1074 "psubw %%mm3, %%mm0\n" | |
1075 "psubw %%mm1, %%mm2\n" | |
1076 "paddw %%mm0, %%mm2\n" | |
1077 "paddw %%mm2, %%mm6\n" | |
1078 | |
1079 "add %2,%0\n" | |
1080 "1:\n" | |
1081 | |
1082 "movq (%0),%%mm0\n" | |
1083 "movq %%mm0, %%mm1\n" | |
1084 "psllq $8, %%mm0\n" | |
1085 "psrlq $8, %%mm1\n" | |
1086 "psrlq $8, %%mm0\n" | |
1087 "movq %%mm0, %%mm2\n" | |
1088 "movq %%mm1, %%mm3\n" | |
1089 "punpcklbw %%mm7,%%mm0\n" | |
1090 "punpcklbw %%mm7,%%mm1\n" | |
1091 "punpckhbw %%mm7,%%mm2\n" | |
1092 "punpckhbw %%mm7,%%mm3\n" | |
1093 "psubw %%mm1, %%mm0\n" | |
1094 "psubw %%mm3, %%mm2\n" | |
1095 "psubw %%mm0, %%mm4\n" | |
1096 "psubw %%mm2, %%mm5\n" | |
1097 "pxor %%mm3, %%mm3\n" | |
1098 "pxor %%mm1, %%mm1\n" | |
1099 "pcmpgtw %%mm4, %%mm3\n\t" | |
1100 "pcmpgtw %%mm5, %%mm1\n\t" | |
1101 "pxor %%mm3, %%mm4\n" | |
1102 "pxor %%mm1, %%mm5\n" | |
1103 "psubw %%mm3, %%mm4\n" | |
1104 "psubw %%mm1, %%mm5\n" | |
1105 "paddw %%mm4, %%mm5\n" | |
1106 "paddw %%mm5, %%mm6\n" | |
1107 | |
1108 "add %2,%0\n" | |
1109 | |
1110 "movq (%0),%%mm4\n" | |
1111 "movq %%mm4, %%mm1\n" | |
1112 "psllq $8, %%mm4\n" | |
1113 "psrlq $8, %%mm1\n" | |
1114 "psrlq $8, %%mm4\n" | |
1115 "movq %%mm4, %%mm5\n" | |
1116 "movq %%mm1, %%mm3\n" | |
1117 "punpcklbw %%mm7,%%mm4\n" | |
1118 "punpcklbw %%mm7,%%mm1\n" | |
1119 "punpckhbw %%mm7,%%mm5\n" | |
1120 "punpckhbw %%mm7,%%mm3\n" | |
1121 "psubw %%mm1, %%mm4\n" | |
1122 "psubw %%mm3, %%mm5\n" | |
1123 "psubw %%mm4, %%mm0\n" | |
1124 "psubw %%mm5, %%mm2\n" | |
1125 "pxor %%mm3, %%mm3\n" | |
1126 "pxor %%mm1, %%mm1\n" | |
1127 "pcmpgtw %%mm0, %%mm3\n\t" | |
1128 "pcmpgtw %%mm2, %%mm1\n\t" | |
1129 "pxor %%mm3, %%mm0\n" | |
1130 "pxor %%mm1, %%mm2\n" | |
1131 "psubw %%mm3, %%mm0\n" | |
1132 "psubw %%mm1, %%mm2\n" | |
1133 "paddw %%mm0, %%mm2\n" | |
1134 "paddw %%mm2, %%mm6\n" | |
1135 | |
1136 "add %2,%0\n" | |
1137 "subl $2, %%ecx\n" | |
1138 " jnz 1b\n" | |
1139 | |
1140 "movq %%mm6, %%mm0\n" | |
1141 "punpcklwd %%mm7,%%mm0\n" | |
1142 "punpckhwd %%mm7,%%mm6\n" | |
1143 "paddd %%mm0, %%mm6\n" | |
1144 | |
1145 "movq %%mm6,%%mm0\n" | |
1146 "psrlq $32, %%mm6\n" | |
1147 "paddd %%mm6,%%mm0\n" | |
1148 "movd %%mm0,%1\n" | |
1149 : "+r" (pix1), "=r"(tmp) | |
1150 : "r" ((long)line_size) , "g" (h-2) | |
1151 : "%ecx"); | |
1152 return tmp; | |
1153 } | |
1154 | |
1155 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { | |
1156 int tmp; | |
1157 uint8_t * pix= pix1; | |
1158 asm volatile ( | |
1159 "movl %3,%%ecx\n" | |
1160 "pxor %%mm7,%%mm7\n" | |
1161 "pxor %%mm6,%%mm6\n" | |
1162 | |
1163 "movq (%0),%%mm0\n" | |
1164 "movq 1(%0),%%mm1\n" | |
1165 "movq %%mm0, %%mm2\n" | |
1166 "movq %%mm1, %%mm3\n" | |
1167 "punpcklbw %%mm7,%%mm0\n" | |
1168 "punpcklbw %%mm7,%%mm1\n" | |
1169 "punpckhbw %%mm7,%%mm2\n" | |
1170 "punpckhbw %%mm7,%%mm3\n" | |
1171 "psubw %%mm1, %%mm0\n" | |
1172 "psubw %%mm3, %%mm2\n" | |
1173 | |
1174 "add %2,%0\n" | |
1175 | |
1176 "movq (%0),%%mm4\n" | |
1177 "movq 1(%0),%%mm1\n" | |
1178 "movq %%mm4, %%mm5\n" | |
1179 "movq %%mm1, %%mm3\n" | |
1180 "punpcklbw %%mm7,%%mm4\n" | |
1181 "punpcklbw %%mm7,%%mm1\n" | |
1182 "punpckhbw %%mm7,%%mm5\n" | |
1183 "punpckhbw %%mm7,%%mm3\n" | |
1184 "psubw %%mm1, %%mm4\n" | |
1185 "psubw %%mm3, %%mm5\n" | |
1186 "psubw %%mm4, %%mm0\n" | |
1187 "psubw %%mm5, %%mm2\n" | |
1188 "pxor %%mm3, %%mm3\n" | |
1189 "pxor %%mm1, %%mm1\n" | |
1190 "pcmpgtw %%mm0, %%mm3\n\t" | |
1191 "pcmpgtw %%mm2, %%mm1\n\t" | |
1192 "pxor %%mm3, %%mm0\n" | |
1193 "pxor %%mm1, %%mm2\n" | |
1194 "psubw %%mm3, %%mm0\n" | |
1195 "psubw %%mm1, %%mm2\n" | |
1196 "paddw %%mm0, %%mm2\n" | |
1197 "paddw %%mm2, %%mm6\n" | |
1198 | |
1199 "add %2,%0\n" | |
1200 "1:\n" | |
1201 | |
1202 "movq (%0),%%mm0\n" | |
1203 "movq 1(%0),%%mm1\n" | |
1204 "movq %%mm0, %%mm2\n" | |
1205 "movq %%mm1, %%mm3\n" | |
1206 "punpcklbw %%mm7,%%mm0\n" | |
1207 "punpcklbw %%mm7,%%mm1\n" | |
1208 "punpckhbw %%mm7,%%mm2\n" | |
1209 "punpckhbw %%mm7,%%mm3\n" | |
1210 "psubw %%mm1, %%mm0\n" | |
1211 "psubw %%mm3, %%mm2\n" | |
1212 "psubw %%mm0, %%mm4\n" | |
1213 "psubw %%mm2, %%mm5\n" | |
1214 "pxor %%mm3, %%mm3\n" | |
1215 "pxor %%mm1, %%mm1\n" | |
1216 "pcmpgtw %%mm4, %%mm3\n\t" | |
1217 "pcmpgtw %%mm5, %%mm1\n\t" | |
1218 "pxor %%mm3, %%mm4\n" | |
1219 "pxor %%mm1, %%mm5\n" | |
1220 "psubw %%mm3, %%mm4\n" | |
1221 "psubw %%mm1, %%mm5\n" | |
1222 "paddw %%mm4, %%mm5\n" | |
1223 "paddw %%mm5, %%mm6\n" | |
1224 | |
1225 "add %2,%0\n" | |
1226 | |
1227 "movq (%0),%%mm4\n" | |
1228 "movq 1(%0),%%mm1\n" | |
1229 "movq %%mm4, %%mm5\n" | |
1230 "movq %%mm1, %%mm3\n" | |
1231 "punpcklbw %%mm7,%%mm4\n" | |
1232 "punpcklbw %%mm7,%%mm1\n" | |
1233 "punpckhbw %%mm7,%%mm5\n" | |
1234 "punpckhbw %%mm7,%%mm3\n" | |
1235 "psubw %%mm1, %%mm4\n" | |
1236 "psubw %%mm3, %%mm5\n" | |
1237 "psubw %%mm4, %%mm0\n" | |
1238 "psubw %%mm5, %%mm2\n" | |
1239 "pxor %%mm3, %%mm3\n" | |
1240 "pxor %%mm1, %%mm1\n" | |
1241 "pcmpgtw %%mm0, %%mm3\n\t" | |
1242 "pcmpgtw %%mm2, %%mm1\n\t" | |
1243 "pxor %%mm3, %%mm0\n" | |
1244 "pxor %%mm1, %%mm2\n" | |
1245 "psubw %%mm3, %%mm0\n" | |
1246 "psubw %%mm1, %%mm2\n" | |
1247 "paddw %%mm0, %%mm2\n" | |
1248 "paddw %%mm2, %%mm6\n" | |
1249 | |
1250 "add %2,%0\n" | |
1251 "subl $2, %%ecx\n" | |
1252 " jnz 1b\n" | |
1253 | |
1254 "movq %%mm6, %%mm0\n" | |
1255 "punpcklwd %%mm7,%%mm0\n" | |
1256 "punpckhwd %%mm7,%%mm6\n" | |
1257 "paddd %%mm0, %%mm6\n" | |
1258 | |
1259 "movq %%mm6,%%mm0\n" | |
1260 "psrlq $32, %%mm6\n" | |
1261 "paddd %%mm6,%%mm0\n" | |
1262 "movd %%mm0,%1\n" | |
1263 : "+r" (pix1), "=r"(tmp) | |
1264 : "r" ((long)line_size) , "g" (h-2) | |
1265 : "%ecx"); | |
1266 return tmp + hf_noise8_mmx(pix+8, line_size, h); | |
1267 } | |
1268 | |
1269 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | |
1270 MpegEncContext *c = p; | |
1271 int score1, score2; | |
1272 | |
1273 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); | |
1274 else score1 = sse16_mmx(c, pix1, pix2, line_size, h); | |
1275 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); | |
1276 | |
1277 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; | |
1278 else return score1 + FFABS(score2)*8; | |
1279 } | |
1280 | |
1281 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | |
1282 MpegEncContext *c = p; | |
1283 int score1= sse8_mmx(c, pix1, pix2, line_size, h); | |
1284 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); | |
1285 | |
1286 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; | |
1287 else return score1 + FFABS(score2)*8; | |
1288 } | |
1289 | |
1290 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { | |
1291 int tmp; | |
1292 | |
1293 assert( (((int)pix) & 7) == 0); | |
1294 assert((line_size &7) ==0); | |
1295 | |
1296 #define SUM(in0, in1, out0, out1) \ | |
1297 "movq (%0), %%mm2\n"\ | |
1298 "movq 8(%0), %%mm3\n"\ | |
1299 "add %2,%0\n"\ | |
1300 "movq %%mm2, " #out0 "\n"\ | |
1301 "movq %%mm3, " #out1 "\n"\ | |
1302 "psubusb " #in0 ", %%mm2\n"\ | |
1303 "psubusb " #in1 ", %%mm3\n"\ | |
1304 "psubusb " #out0 ", " #in0 "\n"\ | |
1305 "psubusb " #out1 ", " #in1 "\n"\ | |
1306 "por %%mm2, " #in0 "\n"\ | |
1307 "por %%mm3, " #in1 "\n"\ | |
1308 "movq " #in0 ", %%mm2\n"\ | |
1309 "movq " #in1 ", %%mm3\n"\ | |
1310 "punpcklbw %%mm7, " #in0 "\n"\ | |
1311 "punpcklbw %%mm7, " #in1 "\n"\ | |
1312 "punpckhbw %%mm7, %%mm2\n"\ | |
1313 "punpckhbw %%mm7, %%mm3\n"\ | |
1314 "paddw " #in1 ", " #in0 "\n"\ | |
1315 "paddw %%mm3, %%mm2\n"\ | |
1316 "paddw %%mm2, " #in0 "\n"\ | |
1317 "paddw " #in0 ", %%mm6\n" | |
1318 | |
1319 | |
1320 asm volatile ( | |
1321 "movl %3,%%ecx\n" | |
1322 "pxor %%mm6,%%mm6\n" | |
1323 "pxor %%mm7,%%mm7\n" | |
1324 "movq (%0),%%mm0\n" | |
1325 "movq 8(%0),%%mm1\n" | |
1326 "add %2,%0\n" | |
1327 "subl $2, %%ecx\n" | |
1328 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1329 "1:\n" | |
1330 | |
1331 SUM(%%mm4, %%mm5, %%mm0, %%mm1) | |
1332 | |
1333 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1334 | |
1335 "subl $2, %%ecx\n" | |
1336 "jnz 1b\n" | |
1337 | |
1338 "movq %%mm6,%%mm0\n" | |
1339 "psrlq $32, %%mm6\n" | |
1340 "paddw %%mm6,%%mm0\n" | |
1341 "movq %%mm0,%%mm6\n" | |
1342 "psrlq $16, %%mm0\n" | |
1343 "paddw %%mm6,%%mm0\n" | |
1344 "movd %%mm0,%1\n" | |
1345 : "+r" (pix), "=r"(tmp) | |
1346 : "r" ((long)line_size) , "m" (h) | |
1347 : "%ecx"); | |
1348 return tmp & 0xFFFF; | |
1349 } | |
1350 #undef SUM | |
1351 | |
1352 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { | |
1353 int tmp; | |
1354 | |
1355 assert( (((int)pix) & 7) == 0); | |
1356 assert((line_size &7) ==0); | |
1357 | |
1358 #define SUM(in0, in1, out0, out1) \ | |
1359 "movq (%0), " #out0 "\n"\ | |
1360 "movq 8(%0), " #out1 "\n"\ | |
1361 "add %2,%0\n"\ | |
1362 "psadbw " #out0 ", " #in0 "\n"\ | |
1363 "psadbw " #out1 ", " #in1 "\n"\ | |
1364 "paddw " #in1 ", " #in0 "\n"\ | |
1365 "paddw " #in0 ", %%mm6\n" | |
1366 | |
1367 asm volatile ( | |
1368 "movl %3,%%ecx\n" | |
1369 "pxor %%mm6,%%mm6\n" | |
1370 "pxor %%mm7,%%mm7\n" | |
1371 "movq (%0),%%mm0\n" | |
1372 "movq 8(%0),%%mm1\n" | |
1373 "add %2,%0\n" | |
1374 "subl $2, %%ecx\n" | |
1375 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1376 "1:\n" | |
1377 | |
1378 SUM(%%mm4, %%mm5, %%mm0, %%mm1) | |
1379 | |
1380 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1381 | |
1382 "subl $2, %%ecx\n" | |
1383 "jnz 1b\n" | |
1384 | |
1385 "movd %%mm6,%1\n" | |
1386 : "+r" (pix), "=r"(tmp) | |
1387 : "r" ((long)line_size) , "m" (h) | |
1388 : "%ecx"); | |
1389 return tmp; | |
1390 } | |
1391 #undef SUM | |
1392 | |
1393 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | |
1394 int tmp; | |
1395 | |
1396 assert( (((int)pix1) & 7) == 0); | |
1397 assert( (((int)pix2) & 7) == 0); | |
1398 assert((line_size &7) ==0); | |
1399 | |
1400 #define SUM(in0, in1, out0, out1) \ | |
1401 "movq (%0),%%mm2\n"\ | |
1402 "movq (%1)," #out0 "\n"\ | |
1403 "movq 8(%0),%%mm3\n"\ | |
1404 "movq 8(%1)," #out1 "\n"\ | |
1405 "add %3,%0\n"\ | |
1406 "add %3,%1\n"\ | |
1407 "psubb " #out0 ", %%mm2\n"\ | |
1408 "psubb " #out1 ", %%mm3\n"\ | |
1409 "pxor %%mm7, %%mm2\n"\ | |
1410 "pxor %%mm7, %%mm3\n"\ | |
1411 "movq %%mm2, " #out0 "\n"\ | |
1412 "movq %%mm3, " #out1 "\n"\ | |
1413 "psubusb " #in0 ", %%mm2\n"\ | |
1414 "psubusb " #in1 ", %%mm3\n"\ | |
1415 "psubusb " #out0 ", " #in0 "\n"\ | |
1416 "psubusb " #out1 ", " #in1 "\n"\ | |
1417 "por %%mm2, " #in0 "\n"\ | |
1418 "por %%mm3, " #in1 "\n"\ | |
1419 "movq " #in0 ", %%mm2\n"\ | |
1420 "movq " #in1 ", %%mm3\n"\ | |
1421 "punpcklbw %%mm7, " #in0 "\n"\ | |
1422 "punpcklbw %%mm7, " #in1 "\n"\ | |
1423 "punpckhbw %%mm7, %%mm2\n"\ | |
1424 "punpckhbw %%mm7, %%mm3\n"\ | |
1425 "paddw " #in1 ", " #in0 "\n"\ | |
1426 "paddw %%mm3, %%mm2\n"\ | |
1427 "paddw %%mm2, " #in0 "\n"\ | |
1428 "paddw " #in0 ", %%mm6\n" | |
1429 | |
1430 | |
1431 asm volatile ( | |
1432 "movl %4,%%ecx\n" | |
1433 "pxor %%mm6,%%mm6\n" | |
1434 "pcmpeqw %%mm7,%%mm7\n" | |
1435 "psllw $15, %%mm7\n" | |
1436 "packsswb %%mm7, %%mm7\n" | |
1437 "movq (%0),%%mm0\n" | |
1438 "movq (%1),%%mm2\n" | |
1439 "movq 8(%0),%%mm1\n" | |
1440 "movq 8(%1),%%mm3\n" | |
1441 "add %3,%0\n" | |
1442 "add %3,%1\n" | |
1443 "subl $2, %%ecx\n" | |
1444 "psubb %%mm2, %%mm0\n" | |
1445 "psubb %%mm3, %%mm1\n" | |
1446 "pxor %%mm7, %%mm0\n" | |
1447 "pxor %%mm7, %%mm1\n" | |
1448 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1449 "1:\n" | |
1450 | |
1451 SUM(%%mm4, %%mm5, %%mm0, %%mm1) | |
1452 | |
1453 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1454 | |
1455 "subl $2, %%ecx\n" | |
1456 "jnz 1b\n" | |
1457 | |
1458 "movq %%mm6,%%mm0\n" | |
1459 "psrlq $32, %%mm6\n" | |
1460 "paddw %%mm6,%%mm0\n" | |
1461 "movq %%mm0,%%mm6\n" | |
1462 "psrlq $16, %%mm0\n" | |
1463 "paddw %%mm6,%%mm0\n" | |
1464 "movd %%mm0,%2\n" | |
1465 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | |
1466 : "r" ((long)line_size) , "m" (h) | |
1467 : "%ecx"); | |
1468 return tmp & 0x7FFF; | |
1469 } | |
1470 #undef SUM | |
1471 | |
1472 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | |
1473 int tmp; | |
1474 | |
1475 assert( (((int)pix1) & 7) == 0); | |
1476 assert( (((int)pix2) & 7) == 0); | |
1477 assert((line_size &7) ==0); | |
1478 | |
1479 #define SUM(in0, in1, out0, out1) \ | |
1480 "movq (%0)," #out0 "\n"\ | |
1481 "movq (%1),%%mm2\n"\ | |
1482 "movq 8(%0)," #out1 "\n"\ | |
1483 "movq 8(%1),%%mm3\n"\ | |
1484 "add %3,%0\n"\ | |
1485 "add %3,%1\n"\ | |
1486 "psubb %%mm2, " #out0 "\n"\ | |
1487 "psubb %%mm3, " #out1 "\n"\ | |
1488 "pxor %%mm7, " #out0 "\n"\ | |
1489 "pxor %%mm7, " #out1 "\n"\ | |
1490 "psadbw " #out0 ", " #in0 "\n"\ | |
1491 "psadbw " #out1 ", " #in1 "\n"\ | |
1492 "paddw " #in1 ", " #in0 "\n"\ | |
1493 "paddw " #in0 ", %%mm6\n" | |
1494 | |
1495 asm volatile ( | |
1496 "movl %4,%%ecx\n" | |
1497 "pxor %%mm6,%%mm6\n" | |
1498 "pcmpeqw %%mm7,%%mm7\n" | |
1499 "psllw $15, %%mm7\n" | |
1500 "packsswb %%mm7, %%mm7\n" | |
1501 "movq (%0),%%mm0\n" | |
1502 "movq (%1),%%mm2\n" | |
1503 "movq 8(%0),%%mm1\n" | |
1504 "movq 8(%1),%%mm3\n" | |
1505 "add %3,%0\n" | |
1506 "add %3,%1\n" | |
1507 "subl $2, %%ecx\n" | |
1508 "psubb %%mm2, %%mm0\n" | |
1509 "psubb %%mm3, %%mm1\n" | |
1510 "pxor %%mm7, %%mm0\n" | |
1511 "pxor %%mm7, %%mm1\n" | |
1512 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1513 "1:\n" | |
1514 | |
1515 SUM(%%mm4, %%mm5, %%mm0, %%mm1) | |
1516 | |
1517 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
1518 | |
1519 "subl $2, %%ecx\n" | |
1520 "jnz 1b\n" | |
1521 | |
1522 "movd %%mm6,%2\n" | |
1523 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | |
1524 : "r" ((long)line_size) , "m" (h) | |
1525 : "%ecx"); | |
1526 return tmp; | |
1527 } | |
1528 #undef SUM | |
1529 | |
1530 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ | |
1531 long i=0; | |
1532 asm volatile( | |
1533 "1: \n\t" | |
1534 "movq (%2, %0), %%mm0 \n\t" | |
1535 "movq (%1, %0), %%mm1 \n\t" | |
1536 "psubb %%mm0, %%mm1 \n\t" | |
1537 "movq %%mm1, (%3, %0) \n\t" | |
1538 "movq 8(%2, %0), %%mm0 \n\t" | |
1539 "movq 8(%1, %0), %%mm1 \n\t" | |
1540 "psubb %%mm0, %%mm1 \n\t" | |
1541 "movq %%mm1, 8(%3, %0) \n\t" | |
1542 "add $16, %0 \n\t" | |
1543 "cmp %4, %0 \n\t" | |
1544 " jb 1b \n\t" | |
1545 : "+r" (i) | |
1546 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15) | |
1547 ); | |
1548 for(; i<w; i++) | |
1549 dst[i+0] = src1[i+0]-src2[i+0]; | |
1550 } | |
1551 | |
1552 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ | |
1553 long i=0; | |
1554 uint8_t l, lt; | |
1555 | |
1556 asm volatile( | |
1557 "1: \n\t" | |
1558 "movq -1(%1, %0), %%mm0 \n\t" // LT | |
1559 "movq (%1, %0), %%mm1 \n\t" // T | |
1560 "movq -1(%2, %0), %%mm2 \n\t" // L | |
1561 "movq (%2, %0), %%mm3 \n\t" // X | |
1562 "movq %%mm2, %%mm4 \n\t" // L | |
1563 "psubb %%mm0, %%mm2 \n\t" | |
1564 "paddb %%mm1, %%mm2 \n\t" // L + T - LT | |
1565 "movq %%mm4, %%mm5 \n\t" // L | |
1566 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) | |
1567 "pminub %%mm5, %%mm1 \n\t" // min(T, L) | |
1568 "pminub %%mm2, %%mm4 \n\t" | |
1569 "pmaxub %%mm1, %%mm4 \n\t" | |
1570 "psubb %%mm4, %%mm3 \n\t" // dst - pred | |
1571 "movq %%mm3, (%3, %0) \n\t" | |
1572 "add $8, %0 \n\t" | |
1573 "cmp %4, %0 \n\t" | |
1574 " jb 1b \n\t" | |
1575 : "+r" (i) | |
1576 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w) | |
1577 ); | |
1578 | |
1579 l= *left; | |
1580 lt= *left_top; | |
1581 | |
1582 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); | |
1583 | |
1584 *left_top= src1[w-1]; | |
1585 *left = src2[w-1]; | |
1586 } | |
1587 | |
1588 #define PAETH(cpu, abs3)\ | 699 #define PAETH(cpu, abs3)\ |
1589 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ | 700 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ |
1590 {\ | 701 {\ |
1591 long i = -bpp;\ | 702 long i = -bpp;\ |
1592 long end = w-3;\ | 703 long end = w-3;\ |
1656 | 767 |
1657 PAETH(mmx2, ABS3_MMX2) | 768 PAETH(mmx2, ABS3_MMX2) |
1658 #ifdef HAVE_SSSE3 | 769 #ifdef HAVE_SSSE3 |
1659 PAETH(ssse3, ABS3_SSSE3) | 770 PAETH(ssse3, ABS3_SSSE3) |
1660 #endif | 771 #endif |
1661 | |
1662 #define DIFF_PIXELS_1(m,a,t,p1,p2)\ | |
1663 "mov"#m" "#p1", "#a" \n\t"\ | |
1664 "mov"#m" "#p2", "#t" \n\t"\ | |
1665 "punpcklbw "#a", "#t" \n\t"\ | |
1666 "punpcklbw "#a", "#a" \n\t"\ | |
1667 "psubw "#t", "#a" \n\t"\ | |
1668 | |
1669 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\ | |
1670 uint8_t *p1b=p1, *p2b=p2;\ | |
1671 asm volatile(\ | |
1672 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\ | |
1673 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\ | |
1674 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\ | |
1675 "add %4, %1 \n\t"\ | |
1676 "add %4, %2 \n\t"\ | |
1677 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\ | |
1678 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\ | |
1679 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\ | |
1680 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\ | |
1681 "mov"#m1" "#mm"0, %0 \n\t"\ | |
1682 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\ | |
1683 "mov"#m1" %0, "#mm"0 \n\t"\ | |
1684 : "+m"(temp), "+r"(p1b), "+r"(p2b)\ | |
1685 : "r"((long)stride), "r"((long)stride*3)\ | |
1686 );\ | |
1687 } | |
1688 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp) | |
1689 | |
1690 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp) | |
1691 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp) | |
1692 | |
1693 #define LBUTTERFLY2(a1,b1,a2,b2)\ | |
1694 "paddw " #b1 ", " #a1 " \n\t"\ | |
1695 "paddw " #b2 ", " #a2 " \n\t"\ | |
1696 "paddw " #b1 ", " #b1 " \n\t"\ | |
1697 "paddw " #b2 ", " #b2 " \n\t"\ | |
1698 "psubw " #a1 ", " #b1 " \n\t"\ | |
1699 "psubw " #a2 ", " #b2 " \n\t" | |
1700 | |
1701 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\ | |
1702 LBUTTERFLY2(m0, m1, m2, m3)\ | |
1703 LBUTTERFLY2(m4, m5, m6, m7)\ | |
1704 LBUTTERFLY2(m0, m2, m1, m3)\ | |
1705 LBUTTERFLY2(m4, m6, m5, m7)\ | |
1706 LBUTTERFLY2(m0, m4, m1, m5)\ | |
1707 LBUTTERFLY2(m2, m6, m3, m7)\ | |
1708 | |
1709 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7) | |
1710 | |
1711 #define MMABS_MMX(a,z)\ | |
1712 "pxor " #z ", " #z " \n\t"\ | |
1713 "pcmpgtw " #a ", " #z " \n\t"\ | |
1714 "pxor " #z ", " #a " \n\t"\ | |
1715 "psubw " #z ", " #a " \n\t" | |
1716 | |
1717 #define MMABS_MMX2(a,z)\ | |
1718 "pxor " #z ", " #z " \n\t"\ | |
1719 "psubw " #a ", " #z " \n\t"\ | |
1720 "pmaxsw " #z ", " #a " \n\t" | |
1721 | |
1722 #define MMABS_SSSE3(a,z)\ | |
1723 "pabsw " #a ", " #a " \n\t" | |
1724 | |
1725 #define MMABS_SUM(a,z, sum)\ | |
1726 MMABS(a,z)\ | |
1727 "paddusw " #a ", " #sum " \n\t" | |
1728 | |
1729 #define MMABS_SUM_8x8_NOSPILL\ | |
1730 MMABS(%%xmm0, %%xmm8)\ | |
1731 MMABS(%%xmm1, %%xmm9)\ | |
1732 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\ | |
1733 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\ | |
1734 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\ | |
1735 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\ | |
1736 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\ | |
1737 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\ | |
1738 "paddusw %%xmm1, %%xmm0 \n\t" | |
1739 | |
1740 #ifdef ARCH_X86_64 | |
1741 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL | |
1742 #else | |
1743 #define MMABS_SUM_8x8_SSE2\ | |
1744 "movdqa %%xmm7, (%1) \n\t"\ | |
1745 MMABS(%%xmm0, %%xmm7)\ | |
1746 MMABS(%%xmm1, %%xmm7)\ | |
1747 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\ | |
1748 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\ | |
1749 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\ | |
1750 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\ | |
1751 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\ | |
1752 "movdqa (%1), %%xmm2 \n\t"\ | |
1753 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ | |
1754 "paddusw %%xmm1, %%xmm0 \n\t" | |
1755 #endif | |
1756 | |
1757 #define LOAD4(o, a, b, c, d)\ | |
1758 "movq "#o"(%1), "#a" \n\t"\ | |
1759 "movq "#o"+8(%1), "#b" \n\t"\ | |
1760 "movq "#o"+16(%1), "#c" \n\t"\ | |
1761 "movq "#o"+24(%1), "#d" \n\t"\ | |
1762 | |
1763 #define STORE4(o, a, b, c, d)\ | |
1764 "movq "#a", "#o"(%1) \n\t"\ | |
1765 "movq "#b", "#o"+8(%1) \n\t"\ | |
1766 "movq "#c", "#o"+16(%1) \n\t"\ | |
1767 "movq "#d", "#o"+24(%1) \n\t"\ | |
1768 | |
1769 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to | |
1770 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, | |
1771 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ | |
1772 #define HSUM_MMX(a, t, dst)\ | |
1773 "movq "#a", "#t" \n\t"\ | |
1774 "psrlq $32, "#a" \n\t"\ | |
1775 "paddusw "#t", "#a" \n\t"\ | |
1776 "movq "#a", "#t" \n\t"\ | |
1777 "psrlq $16, "#a" \n\t"\ | |
1778 "paddusw "#t", "#a" \n\t"\ | |
1779 "movd "#a", "#dst" \n\t"\ | |
1780 | |
1781 #define HSUM_MMX2(a, t, dst)\ | |
1782 "pshufw $0x0E, "#a", "#t" \n\t"\ | |
1783 "paddusw "#t", "#a" \n\t"\ | |
1784 "pshufw $0x01, "#a", "#t" \n\t"\ | |
1785 "paddusw "#t", "#a" \n\t"\ | |
1786 "movd "#a", "#dst" \n\t"\ | |
1787 | |
1788 #define HSUM_SSE2(a, t, dst)\ | |
1789 "movhlps "#a", "#t" \n\t"\ | |
1790 "paddusw "#t", "#a" \n\t"\ | |
1791 "pshuflw $0x0E, "#a", "#t" \n\t"\ | |
1792 "paddusw "#t", "#a" \n\t"\ | |
1793 "pshuflw $0x01, "#a", "#t" \n\t"\ | |
1794 "paddusw "#t", "#a" \n\t"\ | |
1795 "movd "#a", "#dst" \n\t"\ | |
1796 | |
1797 #define HADAMARD8_DIFF_MMX(cpu) \ | |
1798 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ | |
1799 DECLARE_ALIGNED_8(uint64_t, temp[13]);\ | |
1800 int sum;\ | |
1801 \ | |
1802 assert(h==8);\ | |
1803 \ | |
1804 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ | |
1805 \ | |
1806 asm volatile(\ | |
1807 HADAMARD48\ | |
1808 \ | |
1809 "movq %%mm7, 96(%1) \n\t"\ | |
1810 \ | |
1811 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ | |
1812 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\ | |
1813 \ | |
1814 "movq 96(%1), %%mm7 \n\t"\ | |
1815 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ | |
1816 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\ | |
1817 \ | |
1818 : "=r" (sum)\ | |
1819 : "r"(temp)\ | |
1820 );\ | |
1821 \ | |
1822 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ | |
1823 \ | |
1824 asm volatile(\ | |
1825 HADAMARD48\ | |
1826 \ | |
1827 "movq %%mm7, 96(%1) \n\t"\ | |
1828 \ | |
1829 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ | |
1830 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\ | |
1831 \ | |
1832 "movq 96(%1), %%mm7 \n\t"\ | |
1833 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ | |
1834 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ | |
1835 "movq %%mm6, %%mm7 \n\t"\ | |
1836 "movq %%mm0, %%mm6 \n\t"\ | |
1837 \ | |
1838 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\ | |
1839 \ | |
1840 HADAMARD48\ | |
1841 "movq %%mm7, 64(%1) \n\t"\ | |
1842 MMABS(%%mm0, %%mm7)\ | |
1843 MMABS(%%mm1, %%mm7)\ | |
1844 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ | |
1845 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ | |
1846 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ | |
1847 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ | |
1848 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ | |
1849 "movq 64(%1), %%mm2 \n\t"\ | |
1850 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ | |
1851 "paddusw %%mm1, %%mm0 \n\t"\ | |
1852 "movq %%mm0, 64(%1) \n\t"\ | |
1853 \ | |
1854 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\ | |
1855 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\ | |
1856 \ | |
1857 HADAMARD48\ | |
1858 "movq %%mm7, (%1) \n\t"\ | |
1859 MMABS(%%mm0, %%mm7)\ | |
1860 MMABS(%%mm1, %%mm7)\ | |
1861 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ | |
1862 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ | |
1863 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ | |
1864 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ | |
1865 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ | |
1866 "movq (%1), %%mm2 \n\t"\ | |
1867 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ | |
1868 "paddusw 64(%1), %%mm0 \n\t"\ | |
1869 "paddusw %%mm1, %%mm0 \n\t"\ | |
1870 \ | |
1871 HSUM(%%mm0, %%mm1, %0)\ | |
1872 \ | |
1873 : "=r" (sum)\ | |
1874 : "r"(temp)\ | |
1875 );\ | |
1876 return sum&0xFFFF;\ | |
1877 }\ | |
1878 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) | |
1879 | |
1880 #define HADAMARD8_DIFF_SSE2(cpu) \ | |
1881 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ | |
1882 DECLARE_ALIGNED_16(uint64_t, temp[4]);\ | |
1883 int sum;\ | |
1884 \ | |
1885 assert(h==8);\ | |
1886 \ | |
1887 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ | |
1888 \ | |
1889 asm volatile(\ | |
1890 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ | |
1891 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ | |
1892 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ | |
1893 MMABS_SUM_8x8\ | |
1894 HSUM_SSE2(%%xmm0, %%xmm1, %0)\ | |
1895 : "=r" (sum)\ | |
1896 : "r"(temp)\ | |
1897 );\ | |
1898 return sum&0xFFFF;\ | |
1899 }\ | |
1900 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) | |
1901 | |
1902 #define MMABS(a,z) MMABS_MMX(a,z) | |
1903 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) | |
1904 HADAMARD8_DIFF_MMX(mmx) | |
1905 #undef MMABS | |
1906 #undef HSUM | |
1907 | |
1908 #define MMABS(a,z) MMABS_MMX2(a,z) | |
1909 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 | |
1910 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) | |
1911 HADAMARD8_DIFF_MMX(mmx2) | |
1912 HADAMARD8_DIFF_SSE2(sse2) | |
1913 #undef MMABS | |
1914 #undef MMABS_SUM_8x8 | |
1915 #undef HSUM | |
1916 | |
1917 #ifdef HAVE_SSSE3 | |
1918 #define MMABS(a,z) MMABS_SSSE3(a,z) | |
1919 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL | |
1920 HADAMARD8_DIFF_SSE2(ssse3) | |
1921 #undef MMABS | |
1922 #undef MMABS_SUM_8x8 | |
1923 #endif | |
1924 | |
1925 #define DCT_SAD4(m,mm,o)\ | |
1926 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ | |
1927 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ | |
1928 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ | |
1929 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ | |
1930 MMABS_SUM(mm##2, mm##6, mm##0)\ | |
1931 MMABS_SUM(mm##3, mm##7, mm##1)\ | |
1932 MMABS_SUM(mm##4, mm##6, mm##0)\ | |
1933 MMABS_SUM(mm##5, mm##7, mm##1)\ | |
1934 | |
1935 #define DCT_SAD_MMX\ | |
1936 "pxor %%mm0, %%mm0 \n\t"\ | |
1937 "pxor %%mm1, %%mm1 \n\t"\ | |
1938 DCT_SAD4(q, %%mm, 0)\ | |
1939 DCT_SAD4(q, %%mm, 8)\ | |
1940 DCT_SAD4(q, %%mm, 64)\ | |
1941 DCT_SAD4(q, %%mm, 72)\ | |
1942 "paddusw %%mm1, %%mm0 \n\t"\ | |
1943 HSUM(%%mm0, %%mm1, %0) | |
1944 | |
1945 #define DCT_SAD_SSE2\ | |
1946 "pxor %%xmm0, %%xmm0 \n\t"\ | |
1947 "pxor %%xmm1, %%xmm1 \n\t"\ | |
1948 DCT_SAD4(dqa, %%xmm, 0)\ | |
1949 DCT_SAD4(dqa, %%xmm, 64)\ | |
1950 "paddusw %%xmm1, %%xmm0 \n\t"\ | |
1951 HSUM(%%xmm0, %%xmm1, %0) | |
1952 | |
1953 #define DCT_SAD_FUNC(cpu) \ | |
1954 static int sum_abs_dctelem_##cpu(DCTELEM *block){\ | |
1955 int sum;\ | |
1956 asm volatile(\ | |
1957 DCT_SAD\ | |
1958 :"=r"(sum)\ | |
1959 :"r"(block)\ | |
1960 );\ | |
1961 return sum&0xFFFF;\ | |
1962 } | |
1963 | |
1964 #define DCT_SAD DCT_SAD_MMX | |
1965 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) | |
1966 #define MMABS(a,z) MMABS_MMX(a,z) | |
1967 DCT_SAD_FUNC(mmx) | |
1968 #undef MMABS | |
1969 #undef HSUM | |
1970 | |
1971 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) | |
1972 #define MMABS(a,z) MMABS_MMX2(a,z) | |
1973 DCT_SAD_FUNC(mmx2) | |
1974 #undef HSUM | |
1975 #undef DCT_SAD | |
1976 | |
1977 #define DCT_SAD DCT_SAD_SSE2 | |
1978 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) | |
1979 DCT_SAD_FUNC(sse2) | |
1980 #undef MMABS | |
1981 | |
1982 #ifdef HAVE_SSSE3 | |
1983 #define MMABS(a,z) MMABS_SSSE3(a,z) | |
1984 DCT_SAD_FUNC(ssse3) | |
1985 #undef MMABS | |
1986 #endif | |
1987 #undef HSUM | |
1988 #undef DCT_SAD | |
1989 | |
1990 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ | |
1991 int sum; | |
1992 long i=size; | |
1993 asm volatile( | |
1994 "pxor %%mm4, %%mm4 \n" | |
1995 "1: \n" | |
1996 "sub $8, %0 \n" | |
1997 "movq (%2,%0), %%mm2 \n" | |
1998 "movq (%3,%0,2), %%mm0 \n" | |
1999 "movq 8(%3,%0,2), %%mm1 \n" | |
2000 "punpckhbw %%mm2, %%mm3 \n" | |
2001 "punpcklbw %%mm2, %%mm2 \n" | |
2002 "psraw $8, %%mm3 \n" | |
2003 "psraw $8, %%mm2 \n" | |
2004 "psubw %%mm3, %%mm1 \n" | |
2005 "psubw %%mm2, %%mm0 \n" | |
2006 "pmaddwd %%mm1, %%mm1 \n" | |
2007 "pmaddwd %%mm0, %%mm0 \n" | |
2008 "paddd %%mm1, %%mm4 \n" | |
2009 "paddd %%mm0, %%mm4 \n" | |
2010 "jg 1b \n" | |
2011 "movq %%mm4, %%mm3 \n" | |
2012 "psrlq $32, %%mm3 \n" | |
2013 "paddd %%mm3, %%mm4 \n" | |
2014 "movd %%mm4, %1 \n" | |
2015 :"+r"(i), "=r"(sum) | |
2016 :"r"(pix1), "r"(pix2) | |
2017 ); | |
2018 return sum; | |
2019 } | |
2020 | |
2021 #endif //CONFIG_ENCODERS | |
2022 | 772 |
2023 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ | 773 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
2024 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ | 774 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
2025 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ | 775 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ |
2026 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ | 776 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
2856 } | 1606 } |
2857 src += 4-h*stride; | 1607 src += 4-h*stride; |
2858 } | 1608 } |
2859 } | 1609 } |
2860 | 1610 |
2861 #ifdef CONFIG_ENCODERS | |
2862 | |
2863 #define PHADDD(a, t)\ | |
2864 "movq "#a", "#t" \n\t"\ | |
2865 "psrlq $32, "#a" \n\t"\ | |
2866 "paddd "#t", "#a" \n\t" | |
2867 /* | |
2868 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] | |
2869 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] | |
2870 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] | |
2871 */ | |
2872 #define PMULHRW(x, y, s, o)\ | |
2873 "pmulhw " #s ", "#x " \n\t"\ | |
2874 "pmulhw " #s ", "#y " \n\t"\ | |
2875 "paddw " #o ", "#x " \n\t"\ | |
2876 "paddw " #o ", "#y " \n\t"\ | |
2877 "psraw $1, "#x " \n\t"\ | |
2878 "psraw $1, "#y " \n\t" | |
2879 #define DEF(x) x ## _mmx | |
2880 #define SET_RND MOVQ_WONE | |
2881 #define SCALE_OFFSET 1 | |
2882 | |
2883 #include "dsputil_mmx_qns.h" | |
2884 | |
2885 #undef DEF | |
2886 #undef SET_RND | |
2887 #undef SCALE_OFFSET | |
2888 #undef PMULHRW | |
2889 | |
2890 #define DEF(x) x ## _3dnow | |
2891 #define SET_RND(x) | |
2892 #define SCALE_OFFSET 0 | |
2893 #define PMULHRW(x, y, s, o)\ | |
2894 "pmulhrw " #s ", "#x " \n\t"\ | |
2895 "pmulhrw " #s ", "#y " \n\t" | |
2896 | |
2897 #include "dsputil_mmx_qns.h" | |
2898 | |
2899 #undef DEF | |
2900 #undef SET_RND | |
2901 #undef SCALE_OFFSET | |
2902 #undef PMULHRW | |
2903 | |
2904 #ifdef HAVE_SSSE3 | |
2905 #undef PHADDD | |
2906 #define DEF(x) x ## _ssse3 | |
2907 #define SET_RND(x) | |
2908 #define SCALE_OFFSET -1 | |
2909 #define PHADDD(a, t)\ | |
2910 "pshufw $0x0E, "#a", "#t" \n\t"\ | |
2911 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ | |
2912 #define PMULHRW(x, y, s, o)\ | |
2913 "pmulhrsw " #s ", "#x " \n\t"\ | |
2914 "pmulhrsw " #s ", "#y " \n\t" | |
2915 | |
2916 #include "dsputil_mmx_qns.h" | |
2917 | |
2918 #undef DEF | |
2919 #undef SET_RND | |
2920 #undef SCALE_OFFSET | |
2921 #undef PMULHRW | |
2922 #undef PHADDD | |
2923 #endif //HAVE_SSSE3 | |
2924 | |
2925 #endif /* CONFIG_ENCODERS */ | |
2926 | |
2927 #define PREFETCH(name, op) \ | 1611 #define PREFETCH(name, op) \ |
2928 static void name(void *mem, int stride, int h){\ | 1612 static void name(void *mem, int stride, int h){\ |
2929 const uint8_t *p= mem;\ | 1613 const uint8_t *p= mem;\ |
2930 do{\ | 1614 do{\ |
2931 asm volatile(#op" %0" :: "m"(*p));\ | 1615 asm volatile(#op" %0" :: "m"(*p));\ |
2951 put_pixels16_mmx(dst, src, stride, 16); | 1635 put_pixels16_mmx(dst, src, stride, 16); |
2952 } | 1636 } |
2953 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | 1637 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { |
2954 avg_pixels16_mmx(dst, src, stride, 16); | 1638 avg_pixels16_mmx(dst, src, stride, 16); |
2955 } | 1639 } |
2956 | |
2957 /* FLAC specific */ | |
2958 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, | |
2959 double *autoc); | |
2960 | 1640 |
2961 /* VC1 specific */ | 1641 /* VC1 specific */ |
2962 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); | 1642 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); |
2963 | 1643 |
2964 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { | 1644 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { |
3318 #endif | 1998 #endif |
3319 | 1999 |
3320 if (mm_flags & MM_MMX) { | 2000 if (mm_flags & MM_MMX) { |
3321 const int idct_algo= avctx->idct_algo; | 2001 const int idct_algo= avctx->idct_algo; |
3322 | 2002 |
3323 #ifdef CONFIG_ENCODERS | |
3324 const int dct_algo = avctx->dct_algo; | |
3325 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ | |
3326 if(mm_flags & MM_SSE2){ | |
3327 c->fdct = ff_fdct_sse2; | |
3328 }else if(mm_flags & MM_MMXEXT){ | |
3329 c->fdct = ff_fdct_mmx2; | |
3330 }else{ | |
3331 c->fdct = ff_fdct_mmx; | |
3332 } | |
3333 } | |
3334 #endif //CONFIG_ENCODERS | |
3335 if(avctx->lowres==0){ | 2003 if(avctx->lowres==0){ |
3336 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ | 2004 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ |
3337 c->idct_put= ff_simple_idct_put_mmx; | 2005 c->idct_put= ff_simple_idct_put_mmx; |
3338 c->idct_add= ff_simple_idct_add_mmx; | 2006 c->idct_add= ff_simple_idct_add_mmx; |
3339 c->idct = ff_simple_idct_mmx; | 2007 c->idct = ff_simple_idct_mmx; |
3380 c->idct = ff_idct_xvid_mmx; | 2048 c->idct = ff_idct_xvid_mmx; |
3381 } | 2049 } |
3382 } | 2050 } |
3383 } | 2051 } |
3384 | 2052 |
3385 #ifdef CONFIG_ENCODERS | |
3386 c->get_pixels = get_pixels_mmx; | |
3387 c->diff_pixels = diff_pixels_mmx; | |
3388 #endif //CONFIG_ENCODERS | |
3389 c->put_pixels_clamped = put_pixels_clamped_mmx; | 2053 c->put_pixels_clamped = put_pixels_clamped_mmx; |
3390 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; | 2054 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; |
3391 c->add_pixels_clamped = add_pixels_clamped_mmx; | 2055 c->add_pixels_clamped = add_pixels_clamped_mmx; |
3392 c->clear_blocks = clear_blocks_mmx; | 2056 c->clear_blocks = clear_blocks_mmx; |
3393 #ifdef CONFIG_ENCODERS | |
3394 c->pix_sum = pix_sum16_mmx; | |
3395 #endif //CONFIG_ENCODERS | |
3396 | 2057 |
3397 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | 2058 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ |
3398 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ | 2059 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ |
3399 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ | 2060 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ |
3400 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ | 2061 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ |
3411 | 2072 |
3412 c->gmc= gmc_mmx; | 2073 c->gmc= gmc_mmx; |
3413 | 2074 |
3414 c->add_bytes= add_bytes_mmx; | 2075 c->add_bytes= add_bytes_mmx; |
3415 c->add_bytes_l2= add_bytes_l2_mmx; | 2076 c->add_bytes_l2= add_bytes_l2_mmx; |
3416 #ifdef CONFIG_ENCODERS | |
3417 c->diff_bytes= diff_bytes_mmx; | |
3418 c->sum_abs_dctelem= sum_abs_dctelem_mmx; | |
3419 | |
3420 c->hadamard8_diff[0]= hadamard8_diff16_mmx; | |
3421 c->hadamard8_diff[1]= hadamard8_diff_mmx; | |
3422 | |
3423 c->pix_norm1 = pix_norm1_mmx; | |
3424 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx; | |
3425 c->sse[1] = sse8_mmx; | |
3426 c->vsad[4]= vsad_intra16_mmx; | |
3427 | |
3428 c->nsse[0] = nsse16_mmx; | |
3429 c->nsse[1] = nsse8_mmx; | |
3430 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
3431 c->vsad[0] = vsad16_mmx; | |
3432 } | |
3433 | |
3434 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
3435 c->try_8x8basis= try_8x8basis_mmx; | |
3436 } | |
3437 c->add_8x8basis= add_8x8basis_mmx; | |
3438 | |
3439 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; | |
3440 | |
3441 #endif //CONFIG_ENCODERS | |
3442 | 2077 |
3443 if (ENABLE_ANY_H263) { | 2078 if (ENABLE_ANY_H263) { |
3444 c->h263_v_loop_filter= h263_v_loop_filter_mmx; | 2079 c->h263_v_loop_filter= h263_v_loop_filter_mmx; |
3445 c->h263_h_loop_filter= h263_h_loop_filter_mmx; | 2080 c->h263_h_loop_filter= h263_h_loop_filter_mmx; |
3446 } | 2081 } |
3470 | 2105 |
3471 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; | 2106 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; |
3472 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; | 2107 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; |
3473 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; | 2108 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; |
3474 | 2109 |
3475 #ifdef CONFIG_ENCODERS | |
3476 c->sum_abs_dctelem= sum_abs_dctelem_mmx2; | |
3477 c->hadamard8_diff[0]= hadamard8_diff16_mmx2; | |
3478 c->hadamard8_diff[1]= hadamard8_diff_mmx2; | |
3479 c->vsad[4]= vsad_intra16_mmx2; | |
3480 #endif //CONFIG_ENCODERS | |
3481 | |
3482 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; | 2110 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; |
3483 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; | 2111 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; |
3484 | 2112 |
3485 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | 2113 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
3486 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | 2114 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; |
3487 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | 2115 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; |
3488 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | 2116 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; |
3489 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | 2117 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; |
3490 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | 2118 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; |
3491 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | 2119 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; |
3492 #ifdef CONFIG_ENCODERS | |
3493 c->vsad[0] = vsad16_mmx2; | |
3494 #endif //CONFIG_ENCODERS | |
3495 } | 2120 } |
3496 | 2121 |
3497 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | 2122 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ |
3498 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ | 2123 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ |
3499 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ | 2124 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ |
3566 | 2191 |
3567 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) | 2192 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) |
3568 ff_vc1dsp_init_mmx(c, avctx); | 2193 ff_vc1dsp_init_mmx(c, avctx); |
3569 | 2194 |
3570 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; | 2195 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; |
3571 #ifdef CONFIG_ENCODERS | |
3572 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; | |
3573 #endif //CONFIG_ENCODERS | |
3574 } else if (mm_flags & MM_3DNOW) { | 2196 } else if (mm_flags & MM_3DNOW) { |
3575 c->prefetch = prefetch_3dnow; | 2197 c->prefetch = prefetch_3dnow; |
3576 | 2198 |
3577 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; | 2199 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
3578 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; | 2200 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; |
3664 H264_QPEL_FUNCS(3, 3, ssse3); | 2286 H264_QPEL_FUNCS(3, 3, ssse3); |
3665 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; | 2287 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; |
3666 } | 2288 } |
3667 #endif | 2289 #endif |
3668 | 2290 |
3669 #ifdef CONFIG_ENCODERS | |
3670 if(mm_flags & MM_SSE2){ | |
3671 c->sum_abs_dctelem= sum_abs_dctelem_sse2; | |
3672 c->hadamard8_diff[0]= hadamard8_diff16_sse2; | |
3673 c->hadamard8_diff[1]= hadamard8_diff_sse2; | |
3674 if (ENABLE_FLAC_ENCODER) | |
3675 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2; | |
3676 } | |
3677 | |
3678 #ifdef HAVE_SSSE3 | |
3679 if(mm_flags & MM_SSSE3){ | |
3680 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
3681 c->try_8x8basis= try_8x8basis_ssse3; | |
3682 } | |
3683 c->add_8x8basis= add_8x8basis_ssse3; | |
3684 c->sum_abs_dctelem= sum_abs_dctelem_ssse3; | |
3685 c->hadamard8_diff[0]= hadamard8_diff16_ssse3; | |
3686 c->hadamard8_diff[1]= hadamard8_diff_ssse3; | |
3687 } | |
3688 #endif | |
3689 #endif | |
3690 | |
3691 #ifdef CONFIG_SNOW_DECODER | 2291 #ifdef CONFIG_SNOW_DECODER |
3692 if(mm_flags & MM_SSE2 & 0){ | 2292 if(mm_flags & MM_SSE2 & 0){ |
3693 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; | 2293 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; |
3694 #ifdef HAVE_7REGS | 2294 #ifdef HAVE_7REGS |
3695 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; | 2295 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; |
3706 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; | 2306 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; |
3707 } | 2307 } |
3708 #endif | 2308 #endif |
3709 | 2309 |
3710 if(mm_flags & MM_3DNOW){ | 2310 if(mm_flags & MM_3DNOW){ |
3711 #ifdef CONFIG_ENCODERS | |
3712 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
3713 c->try_8x8basis= try_8x8basis_3dnow; | |
3714 } | |
3715 c->add_8x8basis= add_8x8basis_3dnow; | |
3716 #endif //CONFIG_ENCODERS | |
3717 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; | 2311 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; |
3718 c->vector_fmul = vector_fmul_3dnow; | 2312 c->vector_fmul = vector_fmul_3dnow; |
3719 if(!(avctx->flags & CODEC_FLAG_BITEXACT)) | 2313 if(!(avctx->flags & CODEC_FLAG_BITEXACT)) |
3720 c->float_to_int16 = float_to_int16_3dnow; | 2314 c->float_to_int16 = float_to_int16_3dnow; |
3721 } | 2315 } |
3730 } | 2324 } |
3731 if(mm_flags & MM_3DNOW) | 2325 if(mm_flags & MM_3DNOW) |
3732 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse | 2326 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse |
3733 } | 2327 } |
3734 | 2328 |
3735 #ifdef CONFIG_ENCODERS | 2329 if (ENABLE_ENCODERS) |
3736 dsputil_init_pix_mmx(c, avctx); | 2330 dsputilenc_init_mmx(c, avctx); |
3737 #endif //CONFIG_ENCODERS | 2331 |
3738 #if 0 | 2332 #if 0 |
3739 // for speed testing | 2333 // for speed testing |
3740 get_pixels = just_return; | 2334 get_pixels = just_return; |
3741 put_pixels_clamped = just_return; | 2335 put_pixels_clamped = just_return; |
3742 add_pixels_clamped = just_return; | 2336 add_pixels_clamped = just_return; |