comparison postproc/postprocess_template.c @ 7990:f4f9390651b2

using fewer registers ... to work around something
author michael
date Thu, 31 Oct 2002 01:42:59 +0000
parents a57c1fc0c2fc
children 772d6d27fd66
comparison
equal deleted inserted replaced
7989:31d6c10b8887 7990:f4f9390651b2
890 } 890 }
891 } 891 }
892 */ 892 */
893 #elif defined (HAVE_MMX) 893 #elif defined (HAVE_MMX)
894 src+= stride*4; 894 src+= stride*4;
895
896 asm volatile( 895 asm volatile(
897 "pxor %%mm7, %%mm7 \n\t" 896 "pxor %%mm7, %%mm7 \n\t"
898 "leal (%0, %1), %%eax \n\t"
899 "leal (%%eax, %1, 4), %%edx \n\t"
900 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars 897 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
901 "andl $0xFFFFFFF8, %%ecx \n\t" // align 898 "andl $0xFFFFFFF8, %%ecx \n\t" // align
902 // 0 1 2 3 4 5 6 7 899 // 0 1 2 3 4 5 6 7
903 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 900 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
904 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 901 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
906 "movq (%0), %%mm0 \n\t" 903 "movq (%0), %%mm0 \n\t"
907 "movq %%mm0, %%mm1 \n\t" 904 "movq %%mm0, %%mm1 \n\t"
908 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 905 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
909 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 906 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
910 907
911 "movq (%%eax), %%mm2 \n\t" 908 "movq (%0, %1), %%mm2 \n\t"
909 "leal (%0, %1, 2), %%eax \n\t"
912 "movq %%mm2, %%mm3 \n\t" 910 "movq %%mm2, %%mm3 \n\t"
913 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 911 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
914 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 912 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
915 913
916 "movq (%%eax, %1), %%mm4 \n\t" 914 "movq (%%eax), %%mm4 \n\t"
917 "movq %%mm4, %%mm5 \n\t" 915 "movq %%mm4, %%mm5 \n\t"
918 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 916 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
919 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 917 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
920 918
921 "paddw %%mm0, %%mm0 \n\t" // 2L0 919 "paddw %%mm0, %%mm0 \n\t" // 2L0
928 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 926 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
929 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 927 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
930 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 928 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
931 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 929 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
932 930
933 "movq (%%eax, %1, 2), %%mm2 \n\t" 931 "movq (%%eax, %1), %%mm2 \n\t"
934 "movq %%mm2, %%mm3 \n\t" 932 "movq %%mm2, %%mm3 \n\t"
935 "punpcklbw %%mm7, %%mm2 \n\t" // L3 933 "punpcklbw %%mm7, %%mm2 \n\t" // L3
936 "punpckhbw %%mm7, %%mm3 \n\t" // H3 934 "punpckhbw %%mm7, %%mm3 \n\t" // H3
937 935
938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 936 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
940 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
941 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
942 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 940 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
943 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 941 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
944 942
945 "movq (%0, %1, 4), %%mm0 \n\t" 943 "movq (%%eax, %1, 2), %%mm0 \n\t"
946 "movq %%mm0, %%mm1 \n\t" 944 "movq %%mm0, %%mm1 \n\t"
947 "punpcklbw %%mm7, %%mm0 \n\t" // L4 945 "punpcklbw %%mm7, %%mm0 \n\t" // L4
948 "punpckhbw %%mm7, %%mm1 \n\t" // H4 946 "punpckhbw %%mm7, %%mm1 \n\t" // H4
949 947
950 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 948 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
954 "paddw %%mm4, %%mm4 \n\t" // 2L2 952 "paddw %%mm4, %%mm4 \n\t" // 2L2
955 "paddw %%mm5, %%mm5 \n\t" // 2H2 953 "paddw %%mm5, %%mm5 \n\t" // 2H2
956 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 954 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
957 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 955 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
958 956
957 "leal (%%eax, %1), %0 \n\t"
959 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 958 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
960 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 959 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
961 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 960 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
962 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 961 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
963 //50 opcodes so far 962 //50 opcodes so far
964 "movq (%%edx), %%mm2 \n\t" 963 "movq (%0, %1, 2), %%mm2 \n\t"
965 "movq %%mm2, %%mm3 \n\t" 964 "movq %%mm2, %%mm3 \n\t"
966 "punpcklbw %%mm7, %%mm2 \n\t" // L5 965 "punpcklbw %%mm7, %%mm2 \n\t" // L5
967 "punpckhbw %%mm7, %%mm3 \n\t" // H5 966 "punpckhbw %%mm7, %%mm3 \n\t" // H5
968 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 967 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
969 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 968 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
970 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 969 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
971 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 970 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
972 971
973 "movq (%%edx, %1), %%mm6 \n\t" 972 "movq (%%eax, %1, 4), %%mm6 \n\t"
974 "punpcklbw %%mm7, %%mm6 \n\t" // L6 973 "punpcklbw %%mm7, %%mm6 \n\t" // L6
975 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 974 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
976 "movq (%%edx, %1), %%mm6 \n\t" 975 "movq (%%eax, %1, 4), %%mm6 \n\t"
977 "punpckhbw %%mm7, %%mm6 \n\t" // H6 976 "punpckhbw %%mm7, %%mm6 \n\t" // H6
978 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 977 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
979 978
980 "paddw %%mm0, %%mm0 \n\t" // 2L4 979 "paddw %%mm0, %%mm0 \n\t" // 2L4
981 "paddw %%mm1, %%mm1 \n\t" // 2H4 980 "paddw %%mm1, %%mm1 \n\t" // 2H4
985 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 984 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
986 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 985 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
987 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 986 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
988 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 987 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
989 988
990 "movq (%%edx, %1, 2), %%mm2 \n\t" 989 "movq (%0, %1, 4), %%mm2 \n\t"
991 "movq %%mm2, %%mm3 \n\t" 990 "movq %%mm2, %%mm3 \n\t"
992 "punpcklbw %%mm7, %%mm2 \n\t" // L7 991 "punpcklbw %%mm7, %%mm2 \n\t" // L7
993 "punpckhbw %%mm7, %%mm3 \n\t" // H7 992 "punpckhbw %%mm7, %%mm3 \n\t" // H7
994 993
995 "paddw %%mm2, %%mm2 \n\t" // 2L7 994 "paddw %%mm2, %%mm2 \n\t" // 2L7
1108 "pxor %%mm6, %%mm4 \n\t" 1107 "pxor %%mm6, %%mm4 \n\t"
1109 "pxor %%mm7, %%mm5 \n\t" 1108 "pxor %%mm7, %%mm5 \n\t"
1110 "psubw %%mm6, %%mm4 \n\t" 1109 "psubw %%mm6, %%mm4 \n\t"
1111 "psubw %%mm7, %%mm5 \n\t" 1110 "psubw %%mm7, %%mm5 \n\t"
1112 "packsswb %%mm5, %%mm4 \n\t" 1111 "packsswb %%mm5, %%mm4 \n\t"
1113 "movq (%%eax, %1, 2), %%mm0 \n\t" 1112 "movq (%0), %%mm0 \n\t"
1114 "paddb %%mm4, %%mm0 \n\t" 1113 "paddb %%mm4, %%mm0 \n\t"
1115 "movq %%mm0, (%%eax, %1, 2) \n\t" 1114 "movq %%mm0, (%0) \n\t"
1116 "movq (%0, %1, 4), %%mm0 \n\t" 1115 "movq (%0, %1), %%mm0 \n\t"
1117 "psubb %%mm4, %%mm0 \n\t" 1116 "psubb %%mm4, %%mm0 \n\t"
1118 "movq %%mm0, (%0, %1, 4) \n\t" 1117 "movq %%mm0, (%0, %1) \n\t"
1119 1118
1120 : 1119 : "+r" (src)
1121 : "r" (src), "r" (stride), "m" (c->pQPb) 1120 : "r" (stride), "m" (c->pQPb)
1122 : "%eax", "%edx", "%ecx" 1121 : "%eax", "%ecx"
1123 ); 1122 );
1124 #else 1123 #else
1125 const int l1= stride; 1124 const int l1= stride;
1126 const int l2= stride + l1; 1125 const int l2= stride + l1;
1127 const int l3= stride + l2; 1126 const int l3= stride + l2;