Mercurial > mplayer.hg
comparison postproc/postprocess_template.c @ 7990:f4f9390651b2
using fewer registers ... to work around something
author | michael |
---|---|
date | Thu, 31 Oct 2002 01:42:59 +0000 |
parents | a57c1fc0c2fc |
children | 772d6d27fd66 |
comparison
equal
deleted
inserted
replaced
7989:31d6c10b8887 | 7990:f4f9390651b2 |
---|---|
890 } | 890 } |
891 } | 891 } |
892 */ | 892 */ |
893 #elif defined (HAVE_MMX) | 893 #elif defined (HAVE_MMX) |
894 src+= stride*4; | 894 src+= stride*4; |
895 | |
896 asm volatile( | 895 asm volatile( |
897 "pxor %%mm7, %%mm7 \n\t" | 896 "pxor %%mm7, %%mm7 \n\t" |
898 "leal (%0, %1), %%eax \n\t" | |
899 "leal (%%eax, %1, 4), %%edx \n\t" | |
900 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars | 897 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
901 "andl $0xFFFFFFF8, %%ecx \n\t" // align | 898 "andl $0xFFFFFFF8, %%ecx \n\t" // align |
902 // 0 1 2 3 4 5 6 7 | 899 // 0 1 2 3 4 5 6 7 |
903 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 | 900 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
904 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 | 901 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 |
906 "movq (%0), %%mm0 \n\t" | 903 "movq (%0), %%mm0 \n\t" |
907 "movq %%mm0, %%mm1 \n\t" | 904 "movq %%mm0, %%mm1 \n\t" |
908 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 | 905 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
909 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 | 906 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
910 | 907 |
911 "movq (%%eax), %%mm2 \n\t" | 908 "movq (%0, %1), %%mm2 \n\t" |
909 "leal (%0, %1, 2), %%eax \n\t" | |
912 "movq %%mm2, %%mm3 \n\t" | 910 "movq %%mm2, %%mm3 \n\t" |
913 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 | 911 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
914 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 | 912 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
915 | 913 |
916 "movq (%%eax, %1), %%mm4 \n\t" | 914 "movq (%%eax), %%mm4 \n\t" |
917 "movq %%mm4, %%mm5 \n\t" | 915 "movq %%mm4, %%mm5 \n\t" |
918 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 | 916 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
919 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 | 917 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
920 | 918 |
921 "paddw %%mm0, %%mm0 \n\t" // 2L0 | 919 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
928 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 | 926 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
929 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 | 927 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
930 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 | 928 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
931 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 | 929 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
932 | 930 |
933 "movq (%%eax, %1, 2), %%mm2 \n\t" | 931 "movq (%%eax, %1), %%mm2 \n\t" |
934 "movq %%mm2, %%mm3 \n\t" | 932 "movq %%mm2, %%mm3 \n\t" |
935 "punpcklbw %%mm7, %%mm2 \n\t" // L3 | 933 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
936 "punpckhbw %%mm7, %%mm3 \n\t" // H3 | 934 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
937 | 935 |
938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | 936 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
940 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
941 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
942 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 940 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
943 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 941 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
944 | 942 |
945 "movq (%0, %1, 4), %%mm0 \n\t" | 943 "movq (%%eax, %1, 2), %%mm0 \n\t" |
946 "movq %%mm0, %%mm1 \n\t" | 944 "movq %%mm0, %%mm1 \n\t" |
947 "punpcklbw %%mm7, %%mm0 \n\t" // L4 | 945 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
948 "punpckhbw %%mm7, %%mm1 \n\t" // H4 | 946 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
949 | 947 |
950 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | 948 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
954 "paddw %%mm4, %%mm4 \n\t" // 2L2 | 952 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
955 "paddw %%mm5, %%mm5 \n\t" // 2H2 | 953 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
956 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | 954 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
957 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | 955 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
958 | 956 |
957 "leal (%%eax, %1), %0 \n\t" | |
959 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 | 958 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
960 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 | 959 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
961 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 | 960 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
962 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 | 961 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
963 //50 opcodes so far | 962 //50 opcodes so far |
964 "movq (%%edx), %%mm2 \n\t" | 963 "movq (%0, %1, 2), %%mm2 \n\t" |
965 "movq %%mm2, %%mm3 \n\t" | 964 "movq %%mm2, %%mm3 \n\t" |
966 "punpcklbw %%mm7, %%mm2 \n\t" // L5 | 965 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
967 "punpckhbw %%mm7, %%mm3 \n\t" // H5 | 966 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
968 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 | 967 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
969 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 | 968 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
970 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 | 969 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
971 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 | 970 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
972 | 971 |
973 "movq (%%edx, %1), %%mm6 \n\t" | 972 "movq (%%eax, %1, 4), %%mm6 \n\t" |
974 "punpcklbw %%mm7, %%mm6 \n\t" // L6 | 973 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
975 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 | 974 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
976 "movq (%%edx, %1), %%mm6 \n\t" | 975 "movq (%%eax, %1, 4), %%mm6 \n\t" |
977 "punpckhbw %%mm7, %%mm6 \n\t" // H6 | 976 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
978 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 | 977 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
979 | 978 |
980 "paddw %%mm0, %%mm0 \n\t" // 2L4 | 979 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
981 "paddw %%mm1, %%mm1 \n\t" // 2H4 | 980 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
985 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 | 984 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
986 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 | 985 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
987 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 | 986 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
988 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 | 987 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
989 | 988 |
990 "movq (%%edx, %1, 2), %%mm2 \n\t" | 989 "movq (%0, %1, 4), %%mm2 \n\t" |
991 "movq %%mm2, %%mm3 \n\t" | 990 "movq %%mm2, %%mm3 \n\t" |
992 "punpcklbw %%mm7, %%mm2 \n\t" // L7 | 991 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
993 "punpckhbw %%mm7, %%mm3 \n\t" // H7 | 992 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
994 | 993 |
995 "paddw %%mm2, %%mm2 \n\t" // 2L7 | 994 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
1108 "pxor %%mm6, %%mm4 \n\t" | 1107 "pxor %%mm6, %%mm4 \n\t" |
1109 "pxor %%mm7, %%mm5 \n\t" | 1108 "pxor %%mm7, %%mm5 \n\t" |
1110 "psubw %%mm6, %%mm4 \n\t" | 1109 "psubw %%mm6, %%mm4 \n\t" |
1111 "psubw %%mm7, %%mm5 \n\t" | 1110 "psubw %%mm7, %%mm5 \n\t" |
1112 "packsswb %%mm5, %%mm4 \n\t" | 1111 "packsswb %%mm5, %%mm4 \n\t" |
1113 "movq (%%eax, %1, 2), %%mm0 \n\t" | 1112 "movq (%0), %%mm0 \n\t" |
1114 "paddb %%mm4, %%mm0 \n\t" | 1113 "paddb %%mm4, %%mm0 \n\t" |
1115 "movq %%mm0, (%%eax, %1, 2) \n\t" | 1114 "movq %%mm0, (%0) \n\t" |
1116 "movq (%0, %1, 4), %%mm0 \n\t" | 1115 "movq (%0, %1), %%mm0 \n\t" |
1117 "psubb %%mm4, %%mm0 \n\t" | 1116 "psubb %%mm4, %%mm0 \n\t" |
1118 "movq %%mm0, (%0, %1, 4) \n\t" | 1117 "movq %%mm0, (%0, %1) \n\t" |
1119 | 1118 |
1120 : | 1119 : "+r" (src) |
1121 : "r" (src), "r" (stride), "m" (c->pQPb) | 1120 : "r" (stride), "m" (c->pQPb) |
1122 : "%eax", "%edx", "%ecx" | 1121 : "%eax", "%ecx" |
1123 ); | 1122 ); |
1124 #else | 1123 #else |
1125 const int l1= stride; | 1124 const int l1= stride; |
1126 const int l2= stride + l1; | 1125 const int l2= stride + l1; |
1127 const int l3= stride + l2; | 1126 const int l3= stride + l2; |