comparison i386/dsputil_mmx.c @ 6403:9a736918fd90 libavcodec

split encoding part of dsputil_mmx into its own file
author aurel
date Mon, 25 Feb 2008 23:14:22 +0000
parents 3dc36ec2dcad
children 5154ab444372
comparison: 6402:3164768539be vs 6403:9a736918fd90
72 72 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
73 73
74 74 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
75 75 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
76 76
77 #define MOVQ_WONE(regd) \
78 asm volatile ( \
79 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
80 "psrlw $15, %%" #regd ::)
81
82 77 #define MOVQ_BFE(regd) \
83 78 asm volatile ( \
84 79 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
85 80 "paddb %%" #regd ", %%" #regd " \n\t" ::)
86 81
217 212 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
218 213 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
219 214
220 215 /***********************************/
221 216 /* standard MMX */
222
223 #ifdef CONFIG_ENCODERS
224 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
225 {
226 asm volatile(
227 "mov $-128, %%"REG_a" \n\t"
228 "pxor %%mm7, %%mm7 \n\t"
229 ASMALIGN(4)
230 "1: \n\t"
231 "movq (%0), %%mm0 \n\t"
232 "movq (%0, %2), %%mm2 \n\t"
233 "movq %%mm0, %%mm1 \n\t"
234 "movq %%mm2, %%mm3 \n\t"
235 "punpcklbw %%mm7, %%mm0 \n\t"
236 "punpckhbw %%mm7, %%mm1 \n\t"
237 "punpcklbw %%mm7, %%mm2 \n\t"
238 "punpckhbw %%mm7, %%mm3 \n\t"
239 "movq %%mm0, (%1, %%"REG_a") \n\t"
240 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
241 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
242 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
243 "add %3, %0 \n\t"
244 "add $32, %%"REG_a" \n\t"
245 "js 1b \n\t"
246 : "+r" (pixels)
247 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
248 : "%"REG_a
249 );
250 }
251
252 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
253 {
254 asm volatile(
255 "pxor %%mm7, %%mm7 \n\t"
256 "mov $-128, %%"REG_a" \n\t"
257 ASMALIGN(4)
258 "1: \n\t"
259 "movq (%0), %%mm0 \n\t"
260 "movq (%1), %%mm2 \n\t"
261 "movq %%mm0, %%mm1 \n\t"
262 "movq %%mm2, %%mm3 \n\t"
263 "punpcklbw %%mm7, %%mm0 \n\t"
264 "punpckhbw %%mm7, %%mm1 \n\t"
265 "punpcklbw %%mm7, %%mm2 \n\t"
266 "punpckhbw %%mm7, %%mm3 \n\t"
267 "psubw %%mm2, %%mm0 \n\t"
268 "psubw %%mm3, %%mm1 \n\t"
269 "movq %%mm0, (%2, %%"REG_a") \n\t"
270 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
271 "add %3, %0 \n\t"
272 "add %3, %1 \n\t"
273 "add $16, %%"REG_a" \n\t"
274 "jnz 1b \n\t"
275 : "+r" (s1), "+r" (s2)
276 : "r" (block+64), "r" ((long)stride)
277 : "%"REG_a
278 );
279 }
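
As a rough guide to what the two MMX routines above compute, here is a scalar sketch (illustrative only; the *_sketch names are not part of the file): get_pixels widens an 8x8 block of bytes into 16-bit DCT coefficients, and diff_pixels stores the element-wise difference of two 8x8 blocks.

static void get_pixels_sketch(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i*8 + j] = pixels[j];      /* zero-extend byte to int16 */
        pixels += line_size;
    }
}

static void diff_pixels_sketch(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i*8 + j] = s1[j] - s2[j];  /* signed 16-bit difference */
        s1 += stride;
        s2 += stride;
    }
}
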
280 #endif //CONFIG_ENCODERS
281 217
282 218 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
283 219 {
284 220 const DCTELEM *p;
285 221 uint8_t *pix;
541 477 " js 1b \n\t"
542 478 : : "r" (((uint8_t *)blocks)+128*6)
543 479 : "%"REG_a
544 480 );
545 481 }
546
547 #ifdef CONFIG_ENCODERS
548 static int pix_sum16_mmx(uint8_t * pix, int line_size){
549 const int h=16;
550 int sum;
551 long index= -line_size*h;
552
553 asm volatile(
554 "pxor %%mm7, %%mm7 \n\t"
555 "pxor %%mm6, %%mm6 \n\t"
556 "1: \n\t"
557 "movq (%2, %1), %%mm0 \n\t"
558 "movq (%2, %1), %%mm1 \n\t"
559 "movq 8(%2, %1), %%mm2 \n\t"
560 "movq 8(%2, %1), %%mm3 \n\t"
561 "punpcklbw %%mm7, %%mm0 \n\t"
562 "punpckhbw %%mm7, %%mm1 \n\t"
563 "punpcklbw %%mm7, %%mm2 \n\t"
564 "punpckhbw %%mm7, %%mm3 \n\t"
565 "paddw %%mm0, %%mm1 \n\t"
566 "paddw %%mm2, %%mm3 \n\t"
567 "paddw %%mm1, %%mm3 \n\t"
568 "paddw %%mm3, %%mm6 \n\t"
569 "add %3, %1 \n\t"
570 " js 1b \n\t"
571 "movq %%mm6, %%mm5 \n\t"
572 "psrlq $32, %%mm6 \n\t"
573 "paddw %%mm5, %%mm6 \n\t"
574 "movq %%mm6, %%mm5 \n\t"
575 "psrlq $16, %%mm6 \n\t"
576 "paddw %%mm5, %%mm6 \n\t"
577 "movd %%mm6, %0 \n\t"
578 "andl $0xFFFF, %0 \n\t"
579 : "=&r" (sum), "+r" (index)
580 : "r" (pix - index), "r" ((long)line_size)
581 );
582
583 return sum;
584 }
585 #endif //CONFIG_ENCODERS
586 482
587 483 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
588 484 long i=0;
589 485 asm volatile(
590 486 "1: \n\t"
798 694 "r" ((long)(3*stride))
799 695 );
800 696 }
801 697 }
802 698
803 #ifdef CONFIG_ENCODERS
804 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
805 int tmp;
806 asm volatile (
807 "movl $16,%%ecx\n"
808 "pxor %%mm0,%%mm0\n"
809 "pxor %%mm7,%%mm7\n"
810 "1:\n"
811 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
812 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
813
814 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
815
816 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
817 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
818
819 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
820 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
821 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
822
823 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
824 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
825
826 "pmaddwd %%mm3,%%mm3\n"
827 "pmaddwd %%mm4,%%mm4\n"
828
829 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
830 pix2^2+pix3^2+pix6^2+pix7^2) */
831 "paddd %%mm3,%%mm4\n"
832 "paddd %%mm2,%%mm7\n"
833
834 "add %2, %0\n"
835 "paddd %%mm4,%%mm7\n"
836 "dec %%ecx\n"
837 "jnz 1b\n"
838
839 "movq %%mm7,%%mm1\n"
840 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
841 "paddd %%mm7,%%mm1\n"
842 "movd %%mm1,%1\n"
843 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
844 return tmp;
845 }
846
847 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
848 int tmp;
849 asm volatile (
850 "movl %4,%%ecx\n"
851 "shr $1,%%ecx\n"
852 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
853 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
854 "1:\n"
855 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
856 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
857 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
858 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
859
860 /* todo: mm1-mm2, mm3-mm4 */
861 /* algo: subtract mm1 from mm2 with saturation and vice versa */
862 /* OR the results to get absolute difference */
863 "movq %%mm1,%%mm5\n"
864 "movq %%mm3,%%mm6\n"
865 "psubusb %%mm2,%%mm1\n"
866 "psubusb %%mm4,%%mm3\n"
867 "psubusb %%mm5,%%mm2\n"
868 "psubusb %%mm6,%%mm4\n"
869
870 "por %%mm1,%%mm2\n"
871 "por %%mm3,%%mm4\n"
872
873 /* now convert to 16-bit vectors so we can square them */
874 "movq %%mm2,%%mm1\n"
875 "movq %%mm4,%%mm3\n"
876
877 "punpckhbw %%mm0,%%mm2\n"
878 "punpckhbw %%mm0,%%mm4\n"
879 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
880 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
881
882 "pmaddwd %%mm2,%%mm2\n"
883 "pmaddwd %%mm4,%%mm4\n"
884 "pmaddwd %%mm1,%%mm1\n"
885 "pmaddwd %%mm3,%%mm3\n"
886
887 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
888 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
889
890 "paddd %%mm2,%%mm1\n"
891 "paddd %%mm4,%%mm3\n"
892 "paddd %%mm1,%%mm7\n"
893 "paddd %%mm3,%%mm7\n"
894
895 "decl %%ecx\n"
896 "jnz 1b\n"
897
898 "movq %%mm7,%%mm1\n"
899 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
900 "paddd %%mm7,%%mm1\n"
901 "movd %%mm1,%2\n"
902 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
903 : "r" ((long)line_size) , "m" (h)
904 : "%ecx");
905 return tmp;
906 }
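
The psubusb/psubusb/por sequence above is the standard MMX idiom for |a-b| on unsigned bytes (MMX has no signed abs instruction), and pmaddwd then sums pairs of squared 16-bit differences. A scalar sketch of the value sse8_mmx returns (sse8_sketch is an illustrative name, not part of the file):

static int sse8_sketch(const uint8_t *pix1, const uint8_t *pix2, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;                  /* sum of squared differences */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
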
907
908 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
909 int tmp;
910 asm volatile (
911 "movl %4,%%ecx\n"
912 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
913 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
914 "1:\n"
915 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
916 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
917 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
918 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
919
920 /* todo: mm1-mm2, mm3-mm4 */
921 /* algo: subtract mm1 from mm2 with saturation and vice versa */
922 /* OR the results to get absolute difference */
923 "movq %%mm1,%%mm5\n"
924 "movq %%mm3,%%mm6\n"
925 "psubusb %%mm2,%%mm1\n"
926 "psubusb %%mm4,%%mm3\n"
927 "psubusb %%mm5,%%mm2\n"
928 "psubusb %%mm6,%%mm4\n"
929
930 "por %%mm1,%%mm2\n"
931 "por %%mm3,%%mm4\n"
932
933 /* now convert to 16-bit vectors so we can square them */
934 "movq %%mm2,%%mm1\n"
935 "movq %%mm4,%%mm3\n"
936
937 "punpckhbw %%mm0,%%mm2\n"
938 "punpckhbw %%mm0,%%mm4\n"
939 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
940 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
941
942 "pmaddwd %%mm2,%%mm2\n"
943 "pmaddwd %%mm4,%%mm4\n"
944 "pmaddwd %%mm1,%%mm1\n"
945 "pmaddwd %%mm3,%%mm3\n"
946
947 "add %3,%0\n"
948 "add %3,%1\n"
949
950 "paddd %%mm2,%%mm1\n"
951 "paddd %%mm4,%%mm3\n"
952 "paddd %%mm1,%%mm7\n"
953 "paddd %%mm3,%%mm7\n"
954
955 "decl %%ecx\n"
956 "jnz 1b\n"
957
958 "movq %%mm7,%%mm1\n"
959 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
960 "paddd %%mm7,%%mm1\n"
961 "movd %%mm1,%2\n"
962 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
963 : "r" ((long)line_size) , "m" (h)
964 : "%ecx");
965 return tmp;
966 }
967
968 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
969 int tmp;
970 asm volatile (
971 "shr $1,%2\n"
972 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
973 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
974 "1:\n"
975 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
976 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
977 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
978 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
979
980 /* todo: mm1-mm2, mm3-mm4 */
981 /* algo: subtract mm1 from mm2 with saturation and vice versa */
982 /* OR the results to get absolute difference */
983 "movdqa %%xmm1,%%xmm5\n"
984 "movdqa %%xmm3,%%xmm6\n"
985 "psubusb %%xmm2,%%xmm1\n"
986 "psubusb %%xmm4,%%xmm3\n"
987 "psubusb %%xmm5,%%xmm2\n"
988 "psubusb %%xmm6,%%xmm4\n"
989
990 "por %%xmm1,%%xmm2\n"
991 "por %%xmm3,%%xmm4\n"
992
993 /* now convert to 16-bit vectors so we can square them */
994 "movdqa %%xmm2,%%xmm1\n"
995 "movdqa %%xmm4,%%xmm3\n"
996
997 "punpckhbw %%xmm0,%%xmm2\n"
998 "punpckhbw %%xmm0,%%xmm4\n"
999 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
1000 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
1001
1002 "pmaddwd %%xmm2,%%xmm2\n"
1003 "pmaddwd %%xmm4,%%xmm4\n"
1004 "pmaddwd %%xmm1,%%xmm1\n"
1005 "pmaddwd %%xmm3,%%xmm3\n"
1006
1007 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
1008 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
1009
1010 "paddd %%xmm2,%%xmm1\n"
1011 "paddd %%xmm4,%%xmm3\n"
1012 "paddd %%xmm1,%%xmm7\n"
1013 "paddd %%xmm3,%%xmm7\n"
1014
1015 "decl %2\n"
1016 "jnz 1b\n"
1017
1018 "movdqa %%xmm7,%%xmm1\n"
1019 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
1020 "paddd %%xmm1,%%xmm7\n"
1021 "movdqa %%xmm7,%%xmm1\n"
1022 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
1023 "paddd %%xmm1,%%xmm7\n"
1024 "movd %%xmm7,%3\n"
1025 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
1026 : "r" ((long)line_size));
1027 return tmp;
1028 }
1029
1030 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
1031 int tmp;
1032 asm volatile (
1033 "movl %3,%%ecx\n"
1034 "pxor %%mm7,%%mm7\n"
1035 "pxor %%mm6,%%mm6\n"
1036
1037 "movq (%0),%%mm0\n"
1038 "movq %%mm0, %%mm1\n"
1039 "psllq $8, %%mm0\n"
1040 "psrlq $8, %%mm1\n"
1041 "psrlq $8, %%mm0\n"
1042 "movq %%mm0, %%mm2\n"
1043 "movq %%mm1, %%mm3\n"
1044 "punpcklbw %%mm7,%%mm0\n"
1045 "punpcklbw %%mm7,%%mm1\n"
1046 "punpckhbw %%mm7,%%mm2\n"
1047 "punpckhbw %%mm7,%%mm3\n"
1048 "psubw %%mm1, %%mm0\n"
1049 "psubw %%mm3, %%mm2\n"
1050
1051 "add %2,%0\n"
1052
1053 "movq (%0),%%mm4\n"
1054 "movq %%mm4, %%mm1\n"
1055 "psllq $8, %%mm4\n"
1056 "psrlq $8, %%mm1\n"
1057 "psrlq $8, %%mm4\n"
1058 "movq %%mm4, %%mm5\n"
1059 "movq %%mm1, %%mm3\n"
1060 "punpcklbw %%mm7,%%mm4\n"
1061 "punpcklbw %%mm7,%%mm1\n"
1062 "punpckhbw %%mm7,%%mm5\n"
1063 "punpckhbw %%mm7,%%mm3\n"
1064 "psubw %%mm1, %%mm4\n"
1065 "psubw %%mm3, %%mm5\n"
1066 "psubw %%mm4, %%mm0\n"
1067 "psubw %%mm5, %%mm2\n"
1068 "pxor %%mm3, %%mm3\n"
1069 "pxor %%mm1, %%mm1\n"
1070 "pcmpgtw %%mm0, %%mm3\n\t"
1071 "pcmpgtw %%mm2, %%mm1\n\t"
1072 "pxor %%mm3, %%mm0\n"
1073 "pxor %%mm1, %%mm2\n"
1074 "psubw %%mm3, %%mm0\n"
1075 "psubw %%mm1, %%mm2\n"
1076 "paddw %%mm0, %%mm2\n"
1077 "paddw %%mm2, %%mm6\n"
1078
1079 "add %2,%0\n"
1080 "1:\n"
1081
1082 "movq (%0),%%mm0\n"
1083 "movq %%mm0, %%mm1\n"
1084 "psllq $8, %%mm0\n"
1085 "psrlq $8, %%mm1\n"
1086 "psrlq $8, %%mm0\n"
1087 "movq %%mm0, %%mm2\n"
1088 "movq %%mm1, %%mm3\n"
1089 "punpcklbw %%mm7,%%mm0\n"
1090 "punpcklbw %%mm7,%%mm1\n"
1091 "punpckhbw %%mm7,%%mm2\n"
1092 "punpckhbw %%mm7,%%mm3\n"
1093 "psubw %%mm1, %%mm0\n"
1094 "psubw %%mm3, %%mm2\n"
1095 "psubw %%mm0, %%mm4\n"
1096 "psubw %%mm2, %%mm5\n"
1097 "pxor %%mm3, %%mm3\n"
1098 "pxor %%mm1, %%mm1\n"
1099 "pcmpgtw %%mm4, %%mm3\n\t"
1100 "pcmpgtw %%mm5, %%mm1\n\t"
1101 "pxor %%mm3, %%mm4\n"
1102 "pxor %%mm1, %%mm5\n"
1103 "psubw %%mm3, %%mm4\n"
1104 "psubw %%mm1, %%mm5\n"
1105 "paddw %%mm4, %%mm5\n"
1106 "paddw %%mm5, %%mm6\n"
1107
1108 "add %2,%0\n"
1109
1110 "movq (%0),%%mm4\n"
1111 "movq %%mm4, %%mm1\n"
1112 "psllq $8, %%mm4\n"
1113 "psrlq $8, %%mm1\n"
1114 "psrlq $8, %%mm4\n"
1115 "movq %%mm4, %%mm5\n"
1116 "movq %%mm1, %%mm3\n"
1117 "punpcklbw %%mm7,%%mm4\n"
1118 "punpcklbw %%mm7,%%mm1\n"
1119 "punpckhbw %%mm7,%%mm5\n"
1120 "punpckhbw %%mm7,%%mm3\n"
1121 "psubw %%mm1, %%mm4\n"
1122 "psubw %%mm3, %%mm5\n"
1123 "psubw %%mm4, %%mm0\n"
1124 "psubw %%mm5, %%mm2\n"
1125 "pxor %%mm3, %%mm3\n"
1126 "pxor %%mm1, %%mm1\n"
1127 "pcmpgtw %%mm0, %%mm3\n\t"
1128 "pcmpgtw %%mm2, %%mm1\n\t"
1129 "pxor %%mm3, %%mm0\n"
1130 "pxor %%mm1, %%mm2\n"
1131 "psubw %%mm3, %%mm0\n"
1132 "psubw %%mm1, %%mm2\n"
1133 "paddw %%mm0, %%mm2\n"
1134 "paddw %%mm2, %%mm6\n"
1135
1136 "add %2,%0\n"
1137 "subl $2, %%ecx\n"
1138 " jnz 1b\n"
1139
1140 "movq %%mm6, %%mm0\n"
1141 "punpcklwd %%mm7,%%mm0\n"
1142 "punpckhwd %%mm7,%%mm6\n"
1143 "paddd %%mm0, %%mm6\n"
1144
1145 "movq %%mm6,%%mm0\n"
1146 "psrlq $32, %%mm6\n"
1147 "paddd %%mm6,%%mm0\n"
1148 "movd %%mm0,%1\n"
1149 : "+r" (pix1), "=r"(tmp)
1150 : "r" ((long)line_size) , "g" (h-2)
1151 : "%ecx");
1152 return tmp;
1153 }
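
Reading the shifts above: each row is differenced horizontally (pix[x] - pix[x+1], seven valid lanes), then the difference vectors of adjacent rows are subtracted and their absolute values accumulated. A scalar sketch of that sum as read from the asm (hf_noise8_sketch is an illustrative name; treat the exact loop bounds as approximate):

static int hf_noise8_sketch(const uint8_t *pix, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 7; x++) {
            int d0 = pix[x]             - pix[x + 1];              /* horizontal diff, row y   */
            int d1 = pix[x + line_size] - pix[x + 1 + line_size];  /* horizontal diff, row y+1 */
            sum += FFABS(d0 - d1);                                 /* vertical change of the diff */
        }
        pix += line_size;
    }
    return sum;
}
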
1154
1155 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1156 int tmp;
1157 uint8_t * pix= pix1;
1158 asm volatile (
1159 "movl %3,%%ecx\n"
1160 "pxor %%mm7,%%mm7\n"
1161 "pxor %%mm6,%%mm6\n"
1162
1163 "movq (%0),%%mm0\n"
1164 "movq 1(%0),%%mm1\n"
1165 "movq %%mm0, %%mm2\n"
1166 "movq %%mm1, %%mm3\n"
1167 "punpcklbw %%mm7,%%mm0\n"
1168 "punpcklbw %%mm7,%%mm1\n"
1169 "punpckhbw %%mm7,%%mm2\n"
1170 "punpckhbw %%mm7,%%mm3\n"
1171 "psubw %%mm1, %%mm0\n"
1172 "psubw %%mm3, %%mm2\n"
1173
1174 "add %2,%0\n"
1175
1176 "movq (%0),%%mm4\n"
1177 "movq 1(%0),%%mm1\n"
1178 "movq %%mm4, %%mm5\n"
1179 "movq %%mm1, %%mm3\n"
1180 "punpcklbw %%mm7,%%mm4\n"
1181 "punpcklbw %%mm7,%%mm1\n"
1182 "punpckhbw %%mm7,%%mm5\n"
1183 "punpckhbw %%mm7,%%mm3\n"
1184 "psubw %%mm1, %%mm4\n"
1185 "psubw %%mm3, %%mm5\n"
1186 "psubw %%mm4, %%mm0\n"
1187 "psubw %%mm5, %%mm2\n"
1188 "pxor %%mm3, %%mm3\n"
1189 "pxor %%mm1, %%mm1\n"
1190 "pcmpgtw %%mm0, %%mm3\n\t"
1191 "pcmpgtw %%mm2, %%mm1\n\t"
1192 "pxor %%mm3, %%mm0\n"
1193 "pxor %%mm1, %%mm2\n"
1194 "psubw %%mm3, %%mm0\n"
1195 "psubw %%mm1, %%mm2\n"
1196 "paddw %%mm0, %%mm2\n"
1197 "paddw %%mm2, %%mm6\n"
1198
1199 "add %2,%0\n"
1200 "1:\n"
1201
1202 "movq (%0),%%mm0\n"
1203 "movq 1(%0),%%mm1\n"
1204 "movq %%mm0, %%mm2\n"
1205 "movq %%mm1, %%mm3\n"
1206 "punpcklbw %%mm7,%%mm0\n"
1207 "punpcklbw %%mm7,%%mm1\n"
1208 "punpckhbw %%mm7,%%mm2\n"
1209 "punpckhbw %%mm7,%%mm3\n"
1210 "psubw %%mm1, %%mm0\n"
1211 "psubw %%mm3, %%mm2\n"
1212 "psubw %%mm0, %%mm4\n"
1213 "psubw %%mm2, %%mm5\n"
1214 "pxor %%mm3, %%mm3\n"
1215 "pxor %%mm1, %%mm1\n"
1216 "pcmpgtw %%mm4, %%mm3\n\t"
1217 "pcmpgtw %%mm5, %%mm1\n\t"
1218 "pxor %%mm3, %%mm4\n"
1219 "pxor %%mm1, %%mm5\n"
1220 "psubw %%mm3, %%mm4\n"
1221 "psubw %%mm1, %%mm5\n"
1222 "paddw %%mm4, %%mm5\n"
1223 "paddw %%mm5, %%mm6\n"
1224
1225 "add %2,%0\n"
1226
1227 "movq (%0),%%mm4\n"
1228 "movq 1(%0),%%mm1\n"
1229 "movq %%mm4, %%mm5\n"
1230 "movq %%mm1, %%mm3\n"
1231 "punpcklbw %%mm7,%%mm4\n"
1232 "punpcklbw %%mm7,%%mm1\n"
1233 "punpckhbw %%mm7,%%mm5\n"
1234 "punpckhbw %%mm7,%%mm3\n"
1235 "psubw %%mm1, %%mm4\n"
1236 "psubw %%mm3, %%mm5\n"
1237 "psubw %%mm4, %%mm0\n"
1238 "psubw %%mm5, %%mm2\n"
1239 "pxor %%mm3, %%mm3\n"
1240 "pxor %%mm1, %%mm1\n"
1241 "pcmpgtw %%mm0, %%mm3\n\t"
1242 "pcmpgtw %%mm2, %%mm1\n\t"
1243 "pxor %%mm3, %%mm0\n"
1244 "pxor %%mm1, %%mm2\n"
1245 "psubw %%mm3, %%mm0\n"
1246 "psubw %%mm1, %%mm2\n"
1247 "paddw %%mm0, %%mm2\n"
1248 "paddw %%mm2, %%mm6\n"
1249
1250 "add %2,%0\n"
1251 "subl $2, %%ecx\n"
1252 " jnz 1b\n"
1253
1254 "movq %%mm6, %%mm0\n"
1255 "punpcklwd %%mm7,%%mm0\n"
1256 "punpckhwd %%mm7,%%mm6\n"
1257 "paddd %%mm0, %%mm6\n"
1258
1259 "movq %%mm6,%%mm0\n"
1260 "psrlq $32, %%mm6\n"
1261 "paddd %%mm6,%%mm0\n"
1262 "movd %%mm0,%1\n"
1263 : "+r" (pix1), "=r"(tmp)
1264 : "r" ((long)line_size) , "g" (h-2)
1265 : "%ecx");
1266 return tmp + hf_noise8_mmx(pix+8, line_size, h);
1267 }
1268
1269 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1270 MpegEncContext *c = p;
1271 int score1, score2;
1272
1273 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1274 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1275 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1276
1277 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1278 else return score1 + FFABS(score2)*8;
1279 }
1280
1281 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1282 MpegEncContext *c = p;
1283 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1284 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1285
1286 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1287 else return score1 + FFABS(score2)*8;
1288 }
1289
1290 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1291 int tmp;
1292
1293 assert( (((int)pix) & 7) == 0);
1294 assert((line_size &7) ==0);
1295
1296 #define SUM(in0, in1, out0, out1) \
1297 "movq (%0), %%mm2\n"\
1298 "movq 8(%0), %%mm3\n"\
1299 "add %2,%0\n"\
1300 "movq %%mm2, " #out0 "\n"\
1301 "movq %%mm3, " #out1 "\n"\
1302 "psubusb " #in0 ", %%mm2\n"\
1303 "psubusb " #in1 ", %%mm3\n"\
1304 "psubusb " #out0 ", " #in0 "\n"\
1305 "psubusb " #out1 ", " #in1 "\n"\
1306 "por %%mm2, " #in0 "\n"\
1307 "por %%mm3, " #in1 "\n"\
1308 "movq " #in0 ", %%mm2\n"\
1309 "movq " #in1 ", %%mm3\n"\
1310 "punpcklbw %%mm7, " #in0 "\n"\
1311 "punpcklbw %%mm7, " #in1 "\n"\
1312 "punpckhbw %%mm7, %%mm2\n"\
1313 "punpckhbw %%mm7, %%mm3\n"\
1314 "paddw " #in1 ", " #in0 "\n"\
1315 "paddw %%mm3, %%mm2\n"\
1316 "paddw %%mm2, " #in0 "\n"\
1317 "paddw " #in0 ", %%mm6\n"
1318
1319
1320 asm volatile (
1321 "movl %3,%%ecx\n"
1322 "pxor %%mm6,%%mm6\n"
1323 "pxor %%mm7,%%mm7\n"
1324 "movq (%0),%%mm0\n"
1325 "movq 8(%0),%%mm1\n"
1326 "add %2,%0\n"
1327 "subl $2, %%ecx\n"
1328 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1329 "1:\n"
1330
1331 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1332
1333 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1334
1335 "subl $2, %%ecx\n"
1336 "jnz 1b\n"
1337
1338 "movq %%mm6,%%mm0\n"
1339 "psrlq $32, %%mm6\n"
1340 "paddw %%mm6,%%mm0\n"
1341 "movq %%mm0,%%mm6\n"
1342 "psrlq $16, %%mm0\n"
1343 "paddw %%mm6,%%mm0\n"
1344 "movd %%mm0,%1\n"
1345 : "+r" (pix), "=r"(tmp)
1346 : "r" ((long)line_size) , "m" (h)
1347 : "%ecx");
1348 return tmp & 0xFFFF;
1349 }
1350 #undef SUM
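
vsad_intra16 measures vertical activity of a 16-wide block: the SAD between each row and the row above it (the mmx2 variant below replaces the unpack/add chain with psadbw). A scalar sketch (vsad_intra16_sketch is an illustrative name, not part of the file):

static int vsad_intra16_sketch(const uint8_t *pix, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 1; y < h; y++)
        for (x = 0; x < 16; x++)
            sum += FFABS(pix[y*line_size + x] - pix[(y-1)*line_size + x]);
    return sum;
}
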
1351
1352 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1353 int tmp;
1354
1355 assert( (((int)pix) & 7) == 0);
1356 assert((line_size &7) ==0);
1357
1358 #define SUM(in0, in1, out0, out1) \
1359 "movq (%0), " #out0 "\n"\
1360 "movq 8(%0), " #out1 "\n"\
1361 "add %2,%0\n"\
1362 "psadbw " #out0 ", " #in0 "\n"\
1363 "psadbw " #out1 ", " #in1 "\n"\
1364 "paddw " #in1 ", " #in0 "\n"\
1365 "paddw " #in0 ", %%mm6\n"
1366
1367 asm volatile (
1368 "movl %3,%%ecx\n"
1369 "pxor %%mm6,%%mm6\n"
1370 "pxor %%mm7,%%mm7\n"
1371 "movq (%0),%%mm0\n"
1372 "movq 8(%0),%%mm1\n"
1373 "add %2,%0\n"
1374 "subl $2, %%ecx\n"
1375 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1376 "1:\n"
1377
1378 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1379
1380 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1381
1382 "subl $2, %%ecx\n"
1383 "jnz 1b\n"
1384
1385 "movd %%mm6,%1\n"
1386 : "+r" (pix), "=r"(tmp)
1387 : "r" ((long)line_size) , "m" (h)
1388 : "%ecx");
1389 return tmp;
1390 }
1391 #undef SUM
1392
1393 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1394 int tmp;
1395
1396 assert( (((int)pix1) & 7) == 0);
1397 assert( (((int)pix2) & 7) == 0);
1398 assert((line_size &7) ==0);
1399
1400 #define SUM(in0, in1, out0, out1) \
1401 "movq (%0),%%mm2\n"\
1402 "movq (%1)," #out0 "\n"\
1403 "movq 8(%0),%%mm3\n"\
1404 "movq 8(%1)," #out1 "\n"\
1405 "add %3,%0\n"\
1406 "add %3,%1\n"\
1407 "psubb " #out0 ", %%mm2\n"\
1408 "psubb " #out1 ", %%mm3\n"\
1409 "pxor %%mm7, %%mm2\n"\
1410 "pxor %%mm7, %%mm3\n"\
1411 "movq %%mm2, " #out0 "\n"\
1412 "movq %%mm3, " #out1 "\n"\
1413 "psubusb " #in0 ", %%mm2\n"\
1414 "psubusb " #in1 ", %%mm3\n"\
1415 "psubusb " #out0 ", " #in0 "\n"\
1416 "psubusb " #out1 ", " #in1 "\n"\
1417 "por %%mm2, " #in0 "\n"\
1418 "por %%mm3, " #in1 "\n"\
1419 "movq " #in0 ", %%mm2\n"\
1420 "movq " #in1 ", %%mm3\n"\
1421 "punpcklbw %%mm7, " #in0 "\n"\
1422 "punpcklbw %%mm7, " #in1 "\n"\
1423 "punpckhbw %%mm7, %%mm2\n"\
1424 "punpckhbw %%mm7, %%mm3\n"\
1425 "paddw " #in1 ", " #in0 "\n"\
1426 "paddw %%mm3, %%mm2\n"\
1427 "paddw %%mm2, " #in0 "\n"\
1428 "paddw " #in0 ", %%mm6\n"
1429
1430
1431 asm volatile (
1432 "movl %4,%%ecx\n"
1433 "pxor %%mm6,%%mm6\n"
1434 "pcmpeqw %%mm7,%%mm7\n"
1435 "psllw $15, %%mm7\n"
1436 "packsswb %%mm7, %%mm7\n"
1437 "movq (%0),%%mm0\n"
1438 "movq (%1),%%mm2\n"
1439 "movq 8(%0),%%mm1\n"
1440 "movq 8(%1),%%mm3\n"
1441 "add %3,%0\n"
1442 "add %3,%1\n"
1443 "subl $2, %%ecx\n"
1444 "psubb %%mm2, %%mm0\n"
1445 "psubb %%mm3, %%mm1\n"
1446 "pxor %%mm7, %%mm0\n"
1447 "pxor %%mm7, %%mm1\n"
1448 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1449 "1:\n"
1450
1451 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1452
1453 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1454
1455 "subl $2, %%ecx\n"
1456 "jnz 1b\n"
1457
1458 "movq %%mm6,%%mm0\n"
1459 "psrlq $32, %%mm6\n"
1460 "paddw %%mm6,%%mm0\n"
1461 "movq %%mm0,%%mm6\n"
1462 "psrlq $16, %%mm0\n"
1463 "paddw %%mm6,%%mm0\n"
1464 "movd %%mm0,%2\n"
1465 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1466 : "r" ((long)line_size) , "m" (h)
1467 : "%ecx");
1468 return tmp & 0x7FFF;
1469 }
1470 #undef SUM
1471
1472 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1473 int tmp;
1474
1475 assert( (((int)pix1) & 7) == 0);
1476 assert( (((int)pix2) & 7) == 0);
1477 assert((line_size &7) ==0);
1478
1479 #define SUM(in0, in1, out0, out1) \
1480 "movq (%0)," #out0 "\n"\
1481 "movq (%1),%%mm2\n"\
1482 "movq 8(%0)," #out1 "\n"\
1483 "movq 8(%1),%%mm3\n"\
1484 "add %3,%0\n"\
1485 "add %3,%1\n"\
1486 "psubb %%mm2, " #out0 "\n"\
1487 "psubb %%mm3, " #out1 "\n"\
1488 "pxor %%mm7, " #out0 "\n"\
1489 "pxor %%mm7, " #out1 "\n"\
1490 "psadbw " #out0 ", " #in0 "\n"\
1491 "psadbw " #out1 ", " #in1 "\n"\
1492 "paddw " #in1 ", " #in0 "\n"\
1493 "paddw " #in0 ", %%mm6\n"
1494
1495 asm volatile (
1496 "movl %4,%%ecx\n"
1497 "pxor %%mm6,%%mm6\n"
1498 "pcmpeqw %%mm7,%%mm7\n"
1499 "psllw $15, %%mm7\n"
1500 "packsswb %%mm7, %%mm7\n"
1501 "movq (%0),%%mm0\n"
1502 "movq (%1),%%mm2\n"
1503 "movq 8(%0),%%mm1\n"
1504 "movq 8(%1),%%mm3\n"
1505 "add %3,%0\n"
1506 "add %3,%1\n"
1507 "subl $2, %%ecx\n"
1508 "psubb %%mm2, %%mm0\n"
1509 "psubb %%mm3, %%mm1\n"
1510 "pxor %%mm7, %%mm0\n"
1511 "pxor %%mm7, %%mm1\n"
1512 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1513 "1:\n"
1514
1515 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1516
1517 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1518
1519 "subl $2, %%ecx\n"
1520 "jnz 1b\n"
1521
1522 "movd %%mm6,%2\n"
1523 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1524 : "r" ((long)line_size) , "m" (h)
1525 : "%ecx");
1526 return tmp;
1527 }
1528 #undef SUM
1529
1530 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1531 long i=0;
1532 asm volatile(
1533 "1: \n\t"
1534 "movq (%2, %0), %%mm0 \n\t"
1535 "movq (%1, %0), %%mm1 \n\t"
1536 "psubb %%mm0, %%mm1 \n\t"
1537 "movq %%mm1, (%3, %0) \n\t"
1538 "movq 8(%2, %0), %%mm0 \n\t"
1539 "movq 8(%1, %0), %%mm1 \n\t"
1540 "psubb %%mm0, %%mm1 \n\t"
1541 "movq %%mm1, 8(%3, %0) \n\t"
1542 "add $16, %0 \n\t"
1543 "cmp %4, %0 \n\t"
1544 " jb 1b \n\t"
1545 : "+r" (i)
1546 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1547 );
1548 for(; i<w; i++)
1549 dst[i+0] = src1[i+0]-src2[i+0];
1550 }
1551
1552 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1553 long i=0;
1554 uint8_t l, lt;
1555
1556 asm volatile(
1557 "1: \n\t"
1558 "movq -1(%1, %0), %%mm0 \n\t" // LT
1559 "movq (%1, %0), %%mm1 \n\t" // T
1560 "movq -1(%2, %0), %%mm2 \n\t" // L
1561 "movq (%2, %0), %%mm3 \n\t" // X
1562 "movq %%mm2, %%mm4 \n\t" // L
1563 "psubb %%mm0, %%mm2 \n\t"
1564 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1565 "movq %%mm4, %%mm5 \n\t" // L
1566 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1567 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1568 "pminub %%mm2, %%mm4 \n\t"
1569 "pmaxub %%mm1, %%mm4 \n\t"
1570 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1571 "movq %%mm3, (%3, %0) \n\t"
1572 "add $8, %0 \n\t"
1573 "cmp %4, %0 \n\t"
1574 " jb 1b \n\t"
1575 : "+r" (i)
1576 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1577 );
1578
1579 l= *left;
1580 lt= *left_top;
1581
1582 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1583
1584 *left_top= src1[w-1];
1585 *left = src2[w-1];
1586 }
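
The pmaxub/pminub ladder above clamps L + T - LT between min(L,T) and max(L,T), which is the HuffYUV median predictor. A scalar sketch of the per-sample operation (ignoring the 8-bit wrap of L + T - LT that the scalar tail applies; sub_hfyu_median_sketch is an illustrative name):

static void sub_hfyu_median_sketch(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
{
    int i;
    for (i = 1; i < w; i++) {
        int L  = src2[i - 1];  /* sample to the left           */
        int T  = src1[i];      /* sample directly above        */
        int LT = src1[i - 1];  /* sample above and to the left */
        dst[i] = src2[i] - mid_pred(L, T, L + T - LT);
    }
}
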
1587
1588 699 #define PAETH(cpu, abs3)\
1589 700 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
1590 701 {\
1591 702 long i = -bpp;\
1592 703 long end = w-3;\
1656 767
1657 768 PAETH(mmx2, ABS3_MMX2)
1658 769 #ifdef HAVE_SSSE3
1659 770 PAETH(ssse3, ABS3_SSSE3)
1660 771 #endif
1661
1662 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1663 "mov"#m" "#p1", "#a" \n\t"\
1664 "mov"#m" "#p2", "#t" \n\t"\
1665 "punpcklbw "#a", "#t" \n\t"\
1666 "punpcklbw "#a", "#a" \n\t"\
1667 "psubw "#t", "#a" \n\t"\
1668
1669 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1670 uint8_t *p1b=p1, *p2b=p2;\
1671 asm volatile(\
1672 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1673 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1674 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1675 "add %4, %1 \n\t"\
1676 "add %4, %2 \n\t"\
1677 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1678 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1679 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1680 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1681 "mov"#m1" "#mm"0, %0 \n\t"\
1682 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1683 "mov"#m1" %0, "#mm"0 \n\t"\
1684 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
1685 : "r"((long)stride), "r"((long)stride*3)\
1686 );\
1687 }
1688 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
1689
1690 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1691 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1692
1693 #define LBUTTERFLY2(a1,b1,a2,b2)\
1694 "paddw " #b1 ", " #a1 " \n\t"\
1695 "paddw " #b2 ", " #a2 " \n\t"\
1696 "paddw " #b1 ", " #b1 " \n\t"\
1697 "paddw " #b2 ", " #b2 " \n\t"\
1698 "psubw " #a1 ", " #b1 " \n\t"\
1699 "psubw " #a2 ", " #b2 " \n\t"
1700
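
Per 16-bit lane, each LBUTTERFLY2 step above is the elementary Hadamard butterfly: it leaves the sum in the a register and the difference in the b register (b += b; b -= a gives 2b - (a+b) = b - a). A one-lane scalar sketch (butterfly_sketch is an illustrative name):

static inline void butterfly_sketch(int16_t *a, int16_t *b)
{
    int16_t s = *a + *b;  /* "paddw b,a"                  */
    int16_t d = *b - *a;  /* "paddw b,b" then "psubw a,b" */
    *a = s;
    *b = d;
}
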
1701 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1702 LBUTTERFLY2(m0, m1, m2, m3)\
1703 LBUTTERFLY2(m4, m5, m6, m7)\
1704 LBUTTERFLY2(m0, m2, m1, m3)\
1705 LBUTTERFLY2(m4, m6, m5, m7)\
1706 LBUTTERFLY2(m0, m4, m1, m5)\
1707 LBUTTERFLY2(m2, m6, m3, m7)\
1708
1709 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
1710
1711 #define MMABS_MMX(a,z)\
1712 "pxor " #z ", " #z " \n\t"\
1713 "pcmpgtw " #a ", " #z " \n\t"\
1714 "pxor " #z ", " #a " \n\t"\
1715 "psubw " #z ", " #a " \n\t"
1716
1717 #define MMABS_MMX2(a,z)\
1718 "pxor " #z ", " #z " \n\t"\
1719 "psubw " #a ", " #z " \n\t"\
1720 "pmaxsw " #z ", " #a " \n\t"
1721
1722 #define MMABS_SSSE3(a,z)\
1723 "pabsw " #a ", " #a " \n\t"
1724
1725 #define MMABS_SUM(a,z, sum)\
1726 MMABS(a,z)\
1727 "paddusw " #a ", " #sum " \n\t"
1728
1729 #define MMABS_SUM_8x8_NOSPILL\
1730 MMABS(%%xmm0, %%xmm8)\
1731 MMABS(%%xmm1, %%xmm9)\
1732 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1733 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1734 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1735 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1736 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1737 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1738 "paddusw %%xmm1, %%xmm0 \n\t"
1739
1740 #ifdef ARCH_X86_64
1741 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1742 #else
1743 #define MMABS_SUM_8x8_SSE2\
1744 "movdqa %%xmm7, (%1) \n\t"\
1745 MMABS(%%xmm0, %%xmm7)\
1746 MMABS(%%xmm1, %%xmm7)\
1747 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1748 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1749 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1750 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1751 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1752 "movdqa (%1), %%xmm2 \n\t"\
1753 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1754 "paddusw %%xmm1, %%xmm0 \n\t"
1755 #endif
1756
1757 #define LOAD4(o, a, b, c, d)\
1758 "movq "#o"(%1), "#a" \n\t"\
1759 "movq "#o"+8(%1), "#b" \n\t"\
1760 "movq "#o"+16(%1), "#c" \n\t"\
1761 "movq "#o"+24(%1), "#d" \n\t"\
1762
1763 #define STORE4(o, a, b, c, d)\
1764 "movq "#a", "#o"(%1) \n\t"\
1765 "movq "#b", "#o"+8(%1) \n\t"\
1766 "movq "#c", "#o"+16(%1) \n\t"\
1767 "movq "#d", "#o"+24(%1) \n\t"\
1768
1769 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1770 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1771 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1772 #define HSUM_MMX(a, t, dst)\
1773 "movq "#a", "#t" \n\t"\
1774 "psrlq $32, "#a" \n\t"\
1775 "paddusw "#t", "#a" \n\t"\
1776 "movq "#a", "#t" \n\t"\
1777 "psrlq $16, "#a" \n\t"\
1778 "paddusw "#t", "#a" \n\t"\
1779 "movd "#a", "#dst" \n\t"\
1780
1781 #define HSUM_MMX2(a, t, dst)\
1782 "pshufw $0x0E, "#a", "#t" \n\t"\
1783 "paddusw "#t", "#a" \n\t"\
1784 "pshufw $0x01, "#a", "#t" \n\t"\
1785 "paddusw "#t", "#a" \n\t"\
1786 "movd "#a", "#dst" \n\t"\
1787
1788 #define HSUM_SSE2(a, t, dst)\
1789 "movhlps "#a", "#t" \n\t"\
1790 "paddusw "#t", "#a" \n\t"\
1791 "pshuflw $0x0E, "#a", "#t" \n\t"\
1792 "paddusw "#t", "#a" \n\t"\
1793 "pshuflw $0x01, "#a", "#t" \n\t"\
1794 "paddusw "#t", "#a" \n\t"\
1795 "movd "#a", "#dst" \n\t"\
1796
1797 #define HADAMARD8_DIFF_MMX(cpu) \
1798 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1799 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1800 int sum;\
1801 \
1802 assert(h==8);\
1803 \
1804 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1805 \
1806 asm volatile(\
1807 HADAMARD48\
1808 \
1809 "movq %%mm7, 96(%1) \n\t"\
1810 \
1811 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1812 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1813 \
1814 "movq 96(%1), %%mm7 \n\t"\
1815 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1816 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1817 \
1818 : "=r" (sum)\
1819 : "r"(temp)\
1820 );\
1821 \
1822 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1823 \
1824 asm volatile(\
1825 HADAMARD48\
1826 \
1827 "movq %%mm7, 96(%1) \n\t"\
1828 \
1829 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1830 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1831 \
1832 "movq 96(%1), %%mm7 \n\t"\
1833 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1834 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1835 "movq %%mm6, %%mm7 \n\t"\
1836 "movq %%mm0, %%mm6 \n\t"\
1837 \
1838 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1839 \
1840 HADAMARD48\
1841 "movq %%mm7, 64(%1) \n\t"\
1842 MMABS(%%mm0, %%mm7)\
1843 MMABS(%%mm1, %%mm7)\
1844 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1845 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1846 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1847 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1848 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1849 "movq 64(%1), %%mm2 \n\t"\
1850 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1851 "paddusw %%mm1, %%mm0 \n\t"\
1852 "movq %%mm0, 64(%1) \n\t"\
1853 \
1854 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1855 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1856 \
1857 HADAMARD48\
1858 "movq %%mm7, (%1) \n\t"\
1859 MMABS(%%mm0, %%mm7)\
1860 MMABS(%%mm1, %%mm7)\
1861 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1862 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1863 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1864 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1865 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1866 "movq (%1), %%mm2 \n\t"\
1867 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1868 "paddusw 64(%1), %%mm0 \n\t"\
1869 "paddusw %%mm1, %%mm0 \n\t"\
1870 \
1871 HSUM(%%mm0, %%mm1, %0)\
1872 \
1873 : "=r" (sum)\
1874 : "r"(temp)\
1875 );\
1876 return sum&0xFFFF;\
1877 }\
1878 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1879
1880 #define HADAMARD8_DIFF_SSE2(cpu) \
1881 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1882 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1883 int sum;\
1884 \
1885 assert(h==8);\
1886 \
1887 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1888 \
1889 asm volatile(\
1890 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1891 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1892 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1893 MMABS_SUM_8x8\
1894 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1895 : "=r" (sum)\
1896 : "r"(temp)\
1897 );\
1898 return sum&0xFFFF;\
1899 }\
1900 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1901
1902 #define MMABS(a,z) MMABS_MMX(a,z)
1903 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1904 HADAMARD8_DIFF_MMX(mmx)
1905 #undef MMABS
1906 #undef HSUM
1907
1908 #define MMABS(a,z) MMABS_MMX2(a,z)
1909 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1910 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1911 HADAMARD8_DIFF_MMX(mmx2)
1912 HADAMARD8_DIFF_SSE2(sse2)
1913 #undef MMABS
1914 #undef MMABS_SUM_8x8
1915 #undef HSUM
1916
1917 #ifdef HAVE_SSSE3
1918 #define MMABS(a,z) MMABS_SSSE3(a,z)
1919 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1920 HADAMARD8_DIFF_SSE2(ssse3)
1921 #undef MMABS
1922 #undef MMABS_SUM_8x8
1923 #endif
1924
1925 #define DCT_SAD4(m,mm,o)\
1926 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1927 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1928 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1929 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1930 MMABS_SUM(mm##2, mm##6, mm##0)\
1931 MMABS_SUM(mm##3, mm##7, mm##1)\
1932 MMABS_SUM(mm##4, mm##6, mm##0)\
1933 MMABS_SUM(mm##5, mm##7, mm##1)\
1934
1935 #define DCT_SAD_MMX\
1936 "pxor %%mm0, %%mm0 \n\t"\
1937 "pxor %%mm1, %%mm1 \n\t"\
1938 DCT_SAD4(q, %%mm, 0)\
1939 DCT_SAD4(q, %%mm, 8)\
1940 DCT_SAD4(q, %%mm, 64)\
1941 DCT_SAD4(q, %%mm, 72)\
1942 "paddusw %%mm1, %%mm0 \n\t"\
1943 HSUM(%%mm0, %%mm1, %0)
1944
1945 #define DCT_SAD_SSE2\
1946 "pxor %%xmm0, %%xmm0 \n\t"\
1947 "pxor %%xmm1, %%xmm1 \n\t"\
1948 DCT_SAD4(dqa, %%xmm, 0)\
1949 DCT_SAD4(dqa, %%xmm, 64)\
1950 "paddusw %%xmm1, %%xmm0 \n\t"\
1951 HSUM(%%xmm0, %%xmm1, %0)
1952
1953 #define DCT_SAD_FUNC(cpu) \
1954 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1955 int sum;\
1956 asm volatile(\
1957 DCT_SAD\
1958 :"=r"(sum)\
1959 :"r"(block)\
1960 );\
1961 return sum&0xFFFF;\
1962 }
1963
1964 #define DCT_SAD DCT_SAD_MMX
1965 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1966 #define MMABS(a,z) MMABS_MMX(a,z)
1967 DCT_SAD_FUNC(mmx)
1968 #undef MMABS
1969 #undef HSUM
1970
1971 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1972 #define MMABS(a,z) MMABS_MMX2(a,z)
1973 DCT_SAD_FUNC(mmx2)
1974 #undef HSUM
1975 #undef DCT_SAD
1976
1977 #define DCT_SAD DCT_SAD_SSE2
1978 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1979 DCT_SAD_FUNC(sse2)
1980 #undef MMABS
1981
1982 #ifdef HAVE_SSSE3
1983 #define MMABS(a,z) MMABS_SSSE3(a,z)
1984 DCT_SAD_FUNC(ssse3)
1985 #undef MMABS
1986 #endif
1987 #undef HSUM
1988 #undef DCT_SAD
1989
1990 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1991 int sum;
1992 long i=size;
1993 asm volatile(
1994 "pxor %%mm4, %%mm4 \n"
1995 "1: \n"
1996 "sub $8, %0 \n"
1997 "movq (%2,%0), %%mm2 \n"
1998 "movq (%3,%0,2), %%mm0 \n"
1999 "movq 8(%3,%0,2), %%mm1 \n"
2000 "punpckhbw %%mm2, %%mm3 \n"
2001 "punpcklbw %%mm2, %%mm2 \n"
2002 "psraw $8, %%mm3 \n"
2003 "psraw $8, %%mm2 \n"
2004 "psubw %%mm3, %%mm1 \n"
2005 "psubw %%mm2, %%mm0 \n"
2006 "pmaddwd %%mm1, %%mm1 \n"
2007 "pmaddwd %%mm0, %%mm0 \n"
2008 "paddd %%mm1, %%mm4 \n"
2009 "paddd %%mm0, %%mm4 \n"
2010 "jg 1b \n"
2011 "movq %%mm4, %%mm3 \n"
2012 "psrlq $32, %%mm3 \n"
2013 "paddd %%mm3, %%mm4 \n"
2014 "movd %%mm4, %1 \n"
2015 :"+r"(i), "=r"(sum)
2016 :"r"(pix1), "r"(pix2)
2017 );
2018 return sum;
2019 }
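
ssd_int8_vs_int16 is again a sum of squared differences, just with mixed operand widths: the punpck/psraw $8 pairs sign-extend the int8 side before the 16-bit subtraction. A scalar sketch (ssd_sketch is an illustrative name, not part of the file):

static int ssd_sketch(const int8_t *pix1, const int16_t *pix2, int size)
{
    int sum = 0, i;
    for (i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
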
2020
2021 #endif //CONFIG_ENCODERS
2022 772
2023 773 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2024 774 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
2025 775 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
2026 776 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
2856 1606 }
2857 1607 src += 4-h*stride;
2858 1608 }
2859 1609 }
2860 1610
2861 #ifdef CONFIG_ENCODERS
2862
2863 #define PHADDD(a, t)\
2864 "movq "#a", "#t" \n\t"\
2865 "psrlq $32, "#a" \n\t"\
2866 "paddd "#t", "#a" \n\t"
2867 /*
2868 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
2869 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
2870 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
2871 */
2872 #define PMULHRW(x, y, s, o)\
2873 "pmulhw " #s ", "#x " \n\t"\
2874 "pmulhw " #s ", "#y " \n\t"\
2875 "paddw " #o ", "#x " \n\t"\
2876 "paddw " #o ", "#y " \n\t"\
2877 "psraw $1, "#x " \n\t"\
2878 "psraw $1, "#y " \n\t"
2879 #define DEF(x) x ## _mmx
2880 #define SET_RND MOVQ_WONE
2881 #define SCALE_OFFSET 1
2882
2883 #include "dsputil_mmx_qns.h"
2884
2885 #undef DEF
2886 #undef SET_RND
2887 #undef SCALE_OFFSET
2888 #undef PMULHRW
2889
2890 #define DEF(x) x ## _3dnow
2891 #define SET_RND(x)
2892 #define SCALE_OFFSET 0
2893 #define PMULHRW(x, y, s, o)\
2894 "pmulhrw " #s ", "#x " \n\t"\
2895 "pmulhrw " #s ", "#y " \n\t"
2896
2897 #include "dsputil_mmx_qns.h"
2898
2899 #undef DEF
2900 #undef SET_RND
2901 #undef SCALE_OFFSET
2902 #undef PMULHRW
2903
2904 #ifdef HAVE_SSSE3
2905 #undef PHADDD
2906 #define DEF(x) x ## _ssse3
2907 #define SET_RND(x)
2908 #define SCALE_OFFSET -1
2909 #define PHADDD(a, t)\
2910 "pshufw $0x0E, "#a", "#t" \n\t"\
2911 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
2912 #define PMULHRW(x, y, s, o)\
2913 "pmulhrsw " #s ", "#x " \n\t"\
2914 "pmulhrsw " #s ", "#y " \n\t"
2915
2916 #include "dsputil_mmx_qns.h"
2917
2918 #undef DEF
2919 #undef SET_RND
2920 #undef SCALE_OFFSET
2921 #undef PMULHRW
2922 #undef PHADDD
2923 #endif //HAVE_SSSE3
2924
2925 #endif /* CONFIG_ENCODERS */
2926
2927 1611 #define PREFETCH(name, op) \
2928 1612 static void name(void *mem, int stride, int h){\
2929 1613 const uint8_t *p= mem;\
2930 1614 do{\
2931 1615 asm volatile(#op" %0" :: "m"(*p));\
2951 1635 put_pixels16_mmx(dst, src, stride, 16);
2952 1636 }
2953 1637 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2954 1638 avg_pixels16_mmx(dst, src, stride, 16);
2955 1639 }
2956
2957 /* FLAC specific */
2958 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
2959 double *autoc);
2960 1640
2961 1641 /* VC1 specific */
2962 1642 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
2963 1643
2964 1644 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
3318 1998 #endif
3319 1999
3320 2000 if (mm_flags & MM_MMX) {
3321 2001 const int idct_algo= avctx->idct_algo;
3322 2002
3323 #ifdef CONFIG_ENCODERS
3324 const int dct_algo = avctx->dct_algo;
3325 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
3326 if(mm_flags & MM_SSE2){
3327 c->fdct = ff_fdct_sse2;
3328 }else if(mm_flags & MM_MMXEXT){
3329 c->fdct = ff_fdct_mmx2;
3330 }else{
3331 c->fdct = ff_fdct_mmx;
3332 }
3333 }
3334 #endif //CONFIG_ENCODERS
3335 2003 if(avctx->lowres==0){
3336 2004 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
3337 2005 c->idct_put= ff_simple_idct_put_mmx;
3338 2006 c->idct_add= ff_simple_idct_add_mmx;
3339 2007 c->idct = ff_simple_idct_mmx;
3380 2048 c->idct = ff_idct_xvid_mmx;
3381 2049 }
3382 2050 }
3383 2051 }
3384 2052
3385 #ifdef CONFIG_ENCODERS
3386 c->get_pixels = get_pixels_mmx;
3387 c->diff_pixels = diff_pixels_mmx;
3388 #endif //CONFIG_ENCODERS
3389 2053 c->put_pixels_clamped = put_pixels_clamped_mmx;
3390 2054 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
3391 2055 c->add_pixels_clamped = add_pixels_clamped_mmx;
3392 2056 c->clear_blocks = clear_blocks_mmx;
3393 #ifdef CONFIG_ENCODERS
3394 c->pix_sum = pix_sum16_mmx;
3395 #endif //CONFIG_ENCODERS
3396 2057
3397 2058 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
3398 2059 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
3399 2060 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
3400 2061 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
3411 2072
3412 2073 c->gmc= gmc_mmx;
3413 2074
3414 2075 c->add_bytes= add_bytes_mmx;
3415 2076 c->add_bytes_l2= add_bytes_l2_mmx;
3416 #ifdef CONFIG_ENCODERS
3417 c->diff_bytes= diff_bytes_mmx;
3418 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
3419
3420 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
3421 c->hadamard8_diff[1]= hadamard8_diff_mmx;
3422
3423 c->pix_norm1 = pix_norm1_mmx;
3424 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
3425 c->sse[1] = sse8_mmx;
3426 c->vsad[4]= vsad_intra16_mmx;
3427
3428 c->nsse[0] = nsse16_mmx;
3429 c->nsse[1] = nsse8_mmx;
3430 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3431 c->vsad[0] = vsad16_mmx;
3432 }
3433
3434 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3435 c->try_8x8basis= try_8x8basis_mmx;
3436 }
3437 c->add_8x8basis= add_8x8basis_mmx;
3438
3439 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
3440
3441 #endif //CONFIG_ENCODERS
3442 2077
3443 2078 if (ENABLE_ANY_H263) {
3444 2079 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
3445 2080 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
3446 2081 }
3470 2105
3471 2106 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
3472 2107 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
3473 2108 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
3474 2109
3475 #ifdef CONFIG_ENCODERS
3476 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
3477 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
3478 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
3479 c->vsad[4]= vsad_intra16_mmx2;
3480 #endif //CONFIG_ENCODERS
3481
3482 2110 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
3483 2111 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
3484 2112
3485 2113 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3486 2114 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
3487 2115 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
3488 2116 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
3489 2117 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
3490 2118 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
3491 2119 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3492 #ifdef CONFIG_ENCODERS
3493 c->vsad[0] = vsad16_mmx2;
3494 #endif //CONFIG_ENCODERS
3495 2120 }
3496 2121
3497 2122 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
3498 2123 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
3499 2124 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
3566 2191
3567 2192 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
3568 2193 ff_vc1dsp_init_mmx(c, avctx);
3569 2194
3570 2195 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
3571 #ifdef CONFIG_ENCODERS
3572 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
3573 #endif //CONFIG_ENCODERS
3574 2196 } else if (mm_flags & MM_3DNOW) {
3575 2197 c->prefetch = prefetch_3dnow;
3576 2198
3577 2199 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
3578 2200 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
3664 2286 H264_QPEL_FUNCS(3, 3, ssse3);
3665 2287 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
3666 2288 }
3667 2289 #endif
3668 2290
3669 #ifdef CONFIG_ENCODERS
3670 if(mm_flags & MM_SSE2){
3671 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
3672 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
3673 c->hadamard8_diff[1]= hadamard8_diff_sse2;
3674 if (ENABLE_FLAC_ENCODER)
3675 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
3676 }
3677
3678 #ifdef HAVE_SSSE3
3679 if(mm_flags & MM_SSSE3){
3680 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3681 c->try_8x8basis= try_8x8basis_ssse3;
3682 }
3683 c->add_8x8basis= add_8x8basis_ssse3;
3684 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
3685 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
3686 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
3687 }
3688 #endif
3689 #endif
3690
3691 2291 #ifdef CONFIG_SNOW_DECODER
3692 2292 if(mm_flags & MM_SSE2 & 0){
3693 2293 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
3694 2294 #ifdef HAVE_7REGS
3695 2295 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
3706 2306 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3707 2307 }
3708 2308 #endif
3709 2309
3710 2310 if(mm_flags & MM_3DNOW){
3711 #ifdef CONFIG_ENCODERS
3712 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3713 c->try_8x8basis= try_8x8basis_3dnow;
3714 }
3715 c->add_8x8basis= add_8x8basis_3dnow;
3716 #endif //CONFIG_ENCODERS
3717 2311 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
3718 2312 c->vector_fmul = vector_fmul_3dnow;
3719 2313 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
3720 2314 c->float_to_int16 = float_to_int16_3dnow;
3721 2315 }
3730 2324 }
3731 2325 if(mm_flags & MM_3DNOW)
3732 2326 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
3733 2327 }
3734 2328
3735 #ifdef CONFIG_ENCODERS 2329 if (ENABLE_ENCODERS)
3736 dsputil_init_pix_mmx(c, avctx); 2330 dsputilenc_init_mmx(c, avctx);
3737 #endif //CONFIG_ENCODERS 2331
3738 2332 #if 0
3739 2333 // for speed testing
3740 2334 get_pixels = just_return;
3741 2335 put_pixels_clamped = just_return;
3742 2336 add_pixels_clamped = just_return;