comparison postproc/swscale_template.c @ 2469:03abc2743ed6

downscale fixed some warnings

author   michael
date     Thu, 25 Oct 2001 14:31:11 +0000
parents  7d3542955132
children a6c5a537f30a
compares 2468:24427e03cd66 with 2469:03abc2743ed6
@@ -31,10 +31,12 @@
 TODO
 more intelligent missalignment avoidance for the horizontal scaler
 */
 
 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
+#define MIN(a,b) ((a) > (b) ? (b) : (a))
+#define MAX(a,b) ((a) < (b) ? (b) : (a))
 
 #ifdef HAVE_MMX2
 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
 #elif defined (HAVE_3DNOW)
 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
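A side note on the two new macros (a minimal standalone sketch, not part of the change): like all function-style min/max macros they duplicate their arguments textually, so arguments with side effects may be evaluated twice.

```c
#include <stdio.h>

#define MIN(a,b) ((a) > (b) ? (b) : (a))

int main(void)
{
    int i= 0;
    int v[]= {5, 1, 9};
    /* MIN(v[i++], 4) would expand to ((v[i++]) > (4) ? (4) : (v[i++]))
     * and could run i++ twice; keep macro arguments side-effect free. */
    int safe= MIN(v[i], 4);   /* fine: no side effects */
    printf("%d\n", safe);     /* prints 4 */
    return 0;
}
```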
@@ -100,10 +102,12 @@
 static int yuvtab_40cf[256];
 
 
 static uint8_t funnyYCode[10000];
 static uint8_t funnyUVCode[10000];
+
+static int canMMX2BeUsed=0;
 
 #define FULL_YSCALEYUV2RGB \
     "pxor %%mm7, %%mm7 \n\t"\
     "movd %6, %%mm6 \n\t" /*yalpha1*/\
     "punpcklwd %%mm6, %%mm6 \n\t"\
@@ -614,13 +618,13 @@
             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
             int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
             int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
 
             ((uint16_t*)dest)[0] =
-                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
-                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
-                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
+                (clip_table[(Y + yuvtab_3343[U]) >>13]>>3) |
+                ((clip_table[(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13]<<3)&0x07E0) |
+                ((clip_table[(Y + yuvtab_40cf[V]) >>13]<<8)&0xF800);
             dest+=2;
         }
     }
     else if(dstbpp==15)
     {
@@ -629,13 +633,13 @@
             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
             int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
             int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
 
             ((uint16_t*)dest)[0] =
-                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
-                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
-                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
+                (clip_table[(Y + yuvtab_3343[U]) >>13]>>3) |
+                ((clip_table[(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13]<<2)&0x03E0) |
+                ((clip_table[(Y + yuvtab_40cf[V]) >>13]<<7)&0x7C00);
             dest+=2;
         }
     }
 #endif
 }//FULL_UV_IPOL
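The parenthesization hunks above (and the analogous ones further down) do not change behaviour: C precedence already groups << before & before |, so the masks on the green and red fields were effective all along, and the added parentheses presumably just silence the compiler's "suggest parentheses" warnings referred to in the commit message. A minimal sketch of the equivalence for the 16 bpp (RGB565) packing, using hypothetical already-clipped channel values:

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    unsigned b= 0xFF, g= 0xFF, r= 0xFF;   /* toy 8-bit channel values */
    /* without parentheses this parses as (b>>3) | ((g<<3)&0x07E0) | ((r<<8)&0xF800),
     * because precedence is >>,<< first, then &, then |                          */
    uint16_t old_form= b>>3 | g<<3&0x07E0 | r<<8&0xF800;
    uint16_t new_form= (b>>3) | ((g<<3)&0x07E0) | ((r<<8)&0xF800);
    printf("%04X %04X\n", old_form, new_form);   /* FFFF FFFF - identical */
    return 0;
}
```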
@@ -722,13 +726,13 @@
             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
             int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
             int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
             ((uint16_t*)dest)[0] =
-                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
-                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
-                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
+                (clip_table[(Y + yuvtab_3343[U]) >>13]>>3) |
+                ((clip_table[(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13]<<3)&0x07E0) |
+                ((clip_table[(Y + yuvtab_40cf[V]) >>13]<<8)&0xF800);
             dest+=2;
         }
     }
     else if(dstbpp==15)
     {
@@ -737,13 +741,13 @@
             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
             int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
             int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
             ((uint16_t*)dest)[0] =
-                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
-                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
-                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
+                (clip_table[(Y + yuvtab_3343[U]) >>13]>>3) |
+                ((clip_table[(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13]<<2)&0x03E0) |
+                ((clip_table[(Y + yuvtab_40cf[V]) >>13]<<7)&0x7C00);
             dest+=2;
         }
     }
 #endif
 } //!FULL_UV_IPOL
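In this non-fullUVIpol variant the chroma buffers hold one sample per two luma pixels, which is why uvbuf0/uvbuf1 are indexed with i/2 while luma uses i. A tiny sketch of that sharing pattern with toy values:

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t luma[4]  = {10, 20, 30, 40};  /* one Y per output pixel        */
    uint16_t chroma[2]= {100, 200};        /* one U (or V) per pixel *pair* */
    for(int i=0; i<4; i++)
        printf("pixel %d: Y=%u U=%u\n", i,
               (unsigned)luma[i], (unsigned)chroma[i/2]);
    /* pixels 0,1 share chroma[0]; pixels 2,3 share chroma[1] */
    return 0;
}
```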
@@ -839,13 +843,13 @@
             int Y=yuvtab_2568[buf0[i]>>7];
             int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
             int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
             ((uint16_t*)dest)[0] =
-                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
-                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
-                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
+                (clip_table[(Y + yuvtab_3343[U]) >>13]>>3) |
+                ((clip_table[(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13]<<3)&0x07E0) |
+                ((clip_table[(Y + yuvtab_40cf[V]) >>13]<<8)&0xF800);
             dest+=2;
         }
     }
     else if(dstbpp==15)
     {
@@ -854,20 +858,273 @@
             int Y=yuvtab_2568[buf0[i]>>7];
             int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
             int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
             ((uint16_t*)dest)[0] =
-                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
-                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
-                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
+                (clip_table[(Y + yuvtab_3343[U]) >>13]>>3) |
+                ((clip_table[(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13]<<2)&0x03E0) |
+                ((clip_table[(Y + yuvtab_40cf[V]) >>13]<<7)&0x7C00);
             dest+=2;
         }
     }
 #endif
 }
 
 
+static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
+{
+    int i;
+    unsigned int xpos=0;
+    // *** horizontal scale Y line to temp buffer
+#ifdef ARCH_X86
+
+#ifdef HAVE_MMX2
+    if(canMMX2BeUsed)
+    {
+        asm volatile(
+            "pxor %%mm7, %%mm7 \n\t"
+            "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
+            "movd %5, %%mm6 \n\t" // xInc&0xFFFF
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "movq %%mm6, %%mm2 \n\t"
+            "psllq $16, %%mm2 \n\t"
+            "paddw %%mm6, %%mm2 \n\t"
+            "psllq $16, %%mm2 \n\t"
+            "paddw %%mm6, %%mm2 \n\t"
+            "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
+            "movq %%mm2, temp0 \n\t"
+            "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "xorl %%eax, %%eax \n\t" // i
+            "movl %0, %%esi \n\t" // src
+            "movl %1, %%edi \n\t" // buf1
+            "movl %3, %%edx \n\t" // (xInc*4)>>16
+            "xorl %%ecx, %%ecx \n\t"
+            "xorl %%ebx, %%ebx \n\t"
+            "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
+#ifdef HAVE_MMX2
+#define FUNNY_Y_CODE \
+            "prefetchnta 1024(%%esi) \n\t"\
+            "prefetchnta 1056(%%esi) \n\t"\
+            "prefetchnta 1088(%%esi) \n\t"\
+            "call funnyYCode \n\t"\
+            "movq temp0, %%mm2 \n\t"\
+            "xorl %%ecx, %%ecx \n\t"
+#else
+#define FUNNY_Y_CODE \
+            "call funnyYCode \n\t"\
+            "movq temp0, %%mm2 \n\t"\
+            "xorl %%ecx, %%ecx \n\t"
+#endif
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+
+        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
+           "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
+        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
+        );
+        for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
+    }
+    else
+    {
+#endif
+    //NO MMX just normal asm ...
+    asm volatile(
+        "xorl %%eax, %%eax \n\t" // i
+        "xorl %%ebx, %%ebx \n\t" // xx
+        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
+        "1: \n\t"
+        "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
+        "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
+        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+        "shll $16, %%edi \n\t"
+        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+        "movl %1, %%edi \n\t"
+        "shrl $9, %%esi \n\t"
+        "movw %%si, (%%edi, %%eax, 2) \n\t"
+        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
+        "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
+
+        "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
+        "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
+        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+        "shll $16, %%edi \n\t"
+        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+        "movl %1, %%edi \n\t"
+        "shrl $9, %%esi \n\t"
+        "movw %%si, 2(%%edi, %%eax, 2) \n\t"
+        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
+        "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
+
+
+        "addl $2, %%eax \n\t"
+        "cmpl %2, %%eax \n\t"
+        " jb 1b \n\t"
+
+
+        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
+        : "%eax", "%ebx", "%ecx", "%edi", "%esi"
+        );
+#ifdef HAVE_MMX2
+    } //if MMX2 cant be used
+#endif
+#else
+    for(i=0;i<dstWidth;i++){
+        register unsigned int xx=xpos>>16;
+        register unsigned int xalpha=(xpos&0xFFFF)>>9;
+        dst[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
+        xpos+=xInc;
+    }
+#endif
+}
+
+inline static void hcscale(uint16_t *dst, int dstWidth,
+                           uint8_t *src1, uint8_t *src2, int srcWidth, int xInc)
+{
+    int xpos=0;
+    int i;
+#ifdef ARCH_X86
+#ifdef HAVE_MMX2
+    if(canMMX2BeUsed)
+    {
+        asm volatile(
+            "pxor %%mm7, %%mm7 \n\t"
+            "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
+            "movd %5, %%mm6 \n\t" // xInc&0xFFFF
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "movq %%mm6, %%mm2 \n\t"
+            "psllq $16, %%mm2 \n\t"
+            "paddw %%mm6, %%mm2 \n\t"
+            "psllq $16, %%mm2 \n\t"
+            "paddw %%mm6, %%mm2 \n\t"
+            "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
+            "movq %%mm2, temp0 \n\t"
+            "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "xorl %%eax, %%eax \n\t" // i
+            "movl %0, %%esi \n\t" // src
+            "movl %1, %%edi \n\t" // buf1
+            "movl %3, %%edx \n\t" // (xInc*4)>>16
+            "xorl %%ecx, %%ecx \n\t"
+            "xorl %%ebx, %%ebx \n\t"
+            "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
+
+#ifdef HAVE_MMX2
+#define FUNNYUVCODE \
+            "prefetchnta 1024(%%esi) \n\t"\
+            "prefetchnta 1056(%%esi) \n\t"\
+            "prefetchnta 1088(%%esi) \n\t"\
+            "call funnyUVCode \n\t"\
+            "movq temp0, %%mm2 \n\t"\
+            "xorl %%ecx, %%ecx \n\t"
+#else
+#define FUNNYUVCODE \
+            "call funnyUVCode \n\t"\
+            "movq temp0, %%mm2 \n\t"\
+            "xorl %%ecx, %%ecx \n\t"
+#endif
+
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+
+            "xorl %%eax, %%eax \n\t" // i
+            "movl %6, %%esi \n\t" // src
+            "movl %1, %%edi \n\t" // buf1
+            "addl $4096, %%edi \n\t"
+
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+        :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
+           "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
+        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
+        );
+        for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--)
+        {
+            dst[i] = src1[srcWidth/2-1]*128;
+            dst[i+2048] = src2[srcWidth/2-1]*128;
+        }
+    }
+    else
+    {
+#endif
+    asm volatile(
+        "xorl %%eax, %%eax \n\t" // i
+        "xorl %%ebx, %%ebx \n\t" // xx
+        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
+        "1: \n\t"
+        "movl %0, %%esi \n\t"
+        "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
+        "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
+        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+        "shll $16, %%edi \n\t"
+        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+        "movl %1, %%edi \n\t"
+        "shrl $9, %%esi \n\t"
+        "movw %%si, (%%edi, %%eax, 2) \n\t"
+
+        "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
+        "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
+        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+        "shll $16, %%edi \n\t"
+        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+        "movl %1, %%edi \n\t"
+        "shrl $9, %%esi \n\t"
+        "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
+
+        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
+        "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
+        "addl $1, %%eax \n\t"
+        "cmpl %2, %%eax \n\t"
+        " jb 1b \n\t"
+
+        :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
+           "r" (src2)
+        : "%eax", "%ebx", "%ecx", "%edi", "%esi"
+        );
+#ifdef HAVE_MMX2
+    } //if MMX2 cant be used
+#endif
+#else
+    for(i=0;i<dstWidth;i++){
+        register unsigned int xx=xpos>>16;
+        register unsigned int xalpha=(xpos&0xFFFF)>>9;
+        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
+        dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
+        xpos+=xInc;
+    }
+#endif
+}
 
 
 // *** bilinear scaling and yuv->rgb conversion of yv12 slices:
 // *** Note: it's called multiple times while decoding a frame, first time y==0
 // *** Designed to upscale, but may work for downscale too.
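The C fallback at the end of the new hyscale() is the clearest statement of the algorithm: xpos walks the source position in 16.16 fixed point, the high 16 bits pick the sample pair, and bits 15..9 give a 7-bit blend weight (xalpha^127 equals 127-xalpha for values 0..127, so the two weights sum to 127 and results come out scaled by roughly 128, which is also why the MMX2 edge-clamp loops store src[...]*128). A standalone sketch with hypothetical toy data:

```c
#include <stdint.h>
#include <stdio.h>

/* same arithmetic as hyscale()'s C fallback path */
static void hyscale_c(uint16_t *dst, int dstWidth,
                      const uint8_t *src, unsigned xInc)
{
    unsigned xpos= 0;
    for(int i=0; i<dstWidth; i++){
        unsigned xx    = xpos>>16;          /* integer source position */
        unsigned xalpha= (xpos&0xFFFF)>>9;  /* fraction as 0..127      */
        dst[i]= src[xx]*(xalpha^127) + src[xx+1]*xalpha; /* ~128*lerp  */
        xpos+= xInc;
    }
}

int main(void)
{
    uint8_t  src[5]= {0, 100, 200, 200, 200};
    uint16_t dst[4];
    hyscale_c(dst, 4, src, (2u<<16)/4);     /* 2x upscale: xInc = 0.5 in 16.16 */
    for(int i=0; i<4; i++)
        printf("%u ", (unsigned)dst[i]);    /* 0 6400 12700 19100              */
    printf("\n");                           /* ~128 * {0, 50, 100, 150}; the   */
    return 0;                               /* 127-sum weights bias it slightly */
}
```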
@@ -896,11 +1153,10 @@
     // used to detect a horizontal size change
     static int old_dstw= -1;
     static int old_s_xinc= -1;
 #endif
 
-    int canMMX2BeUsed=0;
     int srcWidth= (dstw*s_xinc + 0x8000)>>16;
     int dstUVw= fullUVIpol ? dstw : dstw/2;
 
 
 #ifdef HAVE_MMX2
@@ -916,16 +1172,16 @@
     else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;
 
     if(fullUVIpol) s_xinc2= s_xinc>>1;
     else s_xinc2= s_xinc;
     // force calculation of the horizontal interpolation of the first line
-    s_last_ypos=-99;
-    s_last_y1pos=-99;
 
     if(y==0){
-        s_srcypos=-0x8000;
-        s_ypos=0;
+        s_last_ypos=-99;
+        s_last_y1pos=-99;
+        s_srcypos= s_yinc/2 - 0x8000;
+        s_ypos=0;
 #ifdef HAVE_MMX2
     // cant downscale !!!
     if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
     {
         uint8_t *fragment;
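The s_srcypos= s_yinc/2 - 0x8000 initialisation above (replacing the old -0x8000) centers the vertical resampling grid: in 16.16 fixed point, destination line n now maps to source position (n+0.5)*s_yinc - 0.5 lines, so pixel centers line up at any scale factor. A small sketch of the positions this produces, assuming a hypothetical 2x vertical upscale:

```c
#include <stdio.h>

int main(void)
{
    unsigned s_yinc= 0x8000;                 /* 0.5 in 16.16 => 2x upscale */
    int s_srcypos= (int)(s_yinc/2) - 0x8000; /* new initialisation         */
    for(int dstline=0; dstline<4; dstline++){
        printf("dst %d -> src %.3f\n", dstline, s_srcypos/65536.0);
        s_srcypos+= (int)s_yinc;             /* advance as the main loop does */
    }
    return 0;
}
/* dst 0 -> src -0.250
   dst 1 -> src  0.250
   dst 2 -> src  0.750
   dst 3 -> src  1.250   (the negative start is clamped to line 0 by the caller) */
```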
@@ -1058,11 +1314,10 @@
     }
 
 #endif // HAVE_MMX2
     } // reset counters
 
-
     while(1){
         unsigned char *dest=dstptr+dststride*s_ypos;
         int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line
         // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
         int srcuvpos= s_srcypos + s_yinc/2 - 0x8000;
@@ -1073,291 +1328,83 @@
         uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice
         uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice
         uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice
         int i;
 
-        // if this is before the first line than use only the first src line
-        if(y0==0) buf0= buf1;
-        if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, its not the same as y0==0
-
         if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are dupliactes anyway
 
-        // if this is after the last line than use only the last src line
-        if(y0>=y+h)
+/*      if(y0>=y+h)
         {
             buf1= buf0;
             s_last_ypos=y0;
         }
         if(y1>=(y+h)/2)
         {
             uvbuf1= uvbuf0;
             s_last_y1pos=y1;
         }
 
+*/
 
         s_ypos++; s_srcypos+=s_yinc;
 
         //only interpolate the src line horizontally if we didnt do it allready
-        if(s_last_ypos!=y0){
-            unsigned char *src=srcptr[0]+(y0-y)*stride[0];
-            unsigned int xpos=0;
-            s_last_ypos=y0;
-            // *** horizontal scale Y line to temp buffer
-#ifdef ARCH_X86
-
-#ifdef HAVE_MMX2
-            if(canMMX2BeUsed)
-            {
-                asm volatile(
-                    "pxor %%mm7, %%mm7 \n\t"
-                    "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
-                    "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "movq %%mm6, %%mm2 \n\t"
-                    "psllq $16, %%mm2 \n\t"
-                    "paddw %%mm6, %%mm2 \n\t"
-                    "psllq $16, %%mm2 \n\t"
-                    "paddw %%mm6, %%mm2 \n\t"
-                    "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF
-                    "movq %%mm2, temp0 \n\t"
-                    "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "xorl %%eax, %%eax \n\t" // i
-                    "movl %0, %%esi \n\t" // src
-                    "movl %1, %%edi \n\t" // buf1
-                    "movl %3, %%edx \n\t" // (s_xinc*4)>>16
-                    "xorl %%ecx, %%ecx \n\t"
-                    "xorl %%ebx, %%ebx \n\t"
-                    "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF
-#ifdef HAVE_MMX2
-#define FUNNY_Y_CODE \
-                    "prefetchnta 1024(%%esi) \n\t"\
-                    "prefetchnta 1056(%%esi) \n\t"\
-                    "prefetchnta 1088(%%esi) \n\t"\
-                    "call funnyYCode \n\t"\
-                    "movq temp0, %%mm2 \n\t"\
-                    "xorl %%ecx, %%ecx \n\t"
-#else
-#define FUNNY_Y_CODE \
-                    "call funnyYCode \n\t"\
-                    "movq temp0, %%mm2 \n\t"\
-                    "xorl %%ecx, %%ecx \n\t"
-#endif
-FUNNY_Y_CODE
-FUNNY_Y_CODE
-FUNNY_Y_CODE
-FUNNY_Y_CODE
-FUNNY_Y_CODE
-FUNNY_Y_CODE
-FUNNY_Y_CODE
-FUNNY_Y_CODE
-
-                :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16),
-                   "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF)
-                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
-                );
-                for(i=dstw-1; (i*s_xinc)>>16 >=srcWidth-1; i--) buf1[i] = src[srcWidth-1]*128;
-            }
-            else
-            {
-#endif
-            //NO MMX just normal asm ...
-            asm volatile(
-                "xorl %%eax, %%eax \n\t" // i
-                "xorl %%ebx, %%ebx \n\t" // xx
-                "xorl %%ecx, %%ecx \n\t" // 2*xalpha
-                "1: \n\t"
-                "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
-                "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
-                "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
-                "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
-                "shll $16, %%edi \n\t"
-                "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
-                "movl %1, %%edi \n\t"
-                "shrl $9, %%esi \n\t"
-                "movw %%si, (%%edi, %%eax, 2) \n\t"
-                "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF
-                "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
-
-                "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
-                "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
-                "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
-                "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
-                "shll $16, %%edi \n\t"
-                "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
-                "movl %1, %%edi \n\t"
-                "shrl $9, %%esi \n\t"
-                "movw %%si, 2(%%edi, %%eax, 2) \n\t"
-                "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF
-                "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
-
-
-                "addl $2, %%eax \n\t"
-                "cmpl %2, %%eax \n\t"
-                " jb 1b \n\t"
-
-
-                :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF)
-                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
-                );
-#ifdef HAVE_MMX2
-            } //if MMX2 cant be used
-#endif
-#else
-            for(i=0;i<dstw;i++){
-                register unsigned int xx=xpos>>16;
-                register unsigned int xalpha=(xpos&0xFFFF)>>9;
-                buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
-                xpos+=s_xinc;
-            }
-#endif
-        }
-        // *** horizontal scale U and V lines to temp buffer
-        if(s_last_y1pos!=y1){
-            unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
-            unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
-            int xpos=0;
-            s_last_y1pos= y1;
-#ifdef ARCH_X86
-#ifdef HAVE_MMX2
-            if(canMMX2BeUsed)
-            {
-                asm volatile(
-                    "pxor %%mm7, %%mm7 \n\t"
-                    "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
-                    "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "movq %%mm6, %%mm2 \n\t"
-                    "psllq $16, %%mm2 \n\t"
-                    "paddw %%mm6, %%mm2 \n\t"
-                    "psllq $16, %%mm2 \n\t"
-                    "paddw %%mm6, %%mm2 \n\t"
-                    "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFFFF
-                    "movq %%mm2, temp0 \n\t"
-                    "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "punpcklwd %%mm6, %%mm6 \n\t"
-                    "xorl %%eax, %%eax \n\t" // i
-                    "movl %0, %%esi \n\t" // src
-                    "movl %1, %%edi \n\t" // buf1
-                    "movl %3, %%edx \n\t" // (s_xinc*4)>>16
-                    "xorl %%ecx, %%ecx \n\t"
-                    "xorl %%ebx, %%ebx \n\t"
-                    "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF
-
-#ifdef HAVE_MMX2
-#define FUNNYUVCODE \
-                    "prefetchnta 1024(%%esi) \n\t"\
-                    "prefetchnta 1056(%%esi) \n\t"\
-                    "prefetchnta 1088(%%esi) \n\t"\
-                    "call funnyUVCode \n\t"\
-                    "movq temp0, %%mm2 \n\t"\
-                    "xorl %%ecx, %%ecx \n\t"
-#else
-#define FUNNYUVCODE \
-                    "call funnyUVCode \n\t"\
-                    "movq temp0, %%mm2 \n\t"\
-                    "xorl %%ecx, %%ecx \n\t"
-#endif
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-
-                    "xorl %%eax, %%eax \n\t" // i
-                    "movl %6, %%esi \n\t" // src
-                    "movl %1, %%edi \n\t" // buf1
-                    "addl $4096, %%edi \n\t"
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-                :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" ((s_xinc2*4)>>16),
-                   "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
-                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
-                );
-                for(i=dstUVw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--)
-                {
-                    uvbuf1[i] = src1[srcWidth/2-1]*128;
-                    uvbuf1[i+2048] = src2[srcWidth/2-1]*128;
-                }
-            }
-            else
-            {
-#endif
-            asm volatile(
-                "xorl %%eax, %%eax \n\t" // i
-                "xorl %%ebx, %%ebx \n\t" // xx
-                "xorl %%ecx, %%ecx \n\t" // 2*xalpha
-                "1: \n\t"
-                "movl %0, %%esi \n\t"
-                "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
-                "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
-                "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
-                "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
-                "shll $16, %%edi \n\t"
-                "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
-                "movl %1, %%edi \n\t"
-                "shrl $9, %%esi \n\t"
-                "movw %%si, (%%edi, %%eax, 2) \n\t"
-
-                "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
-                "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
-                "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
-                "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
-                "shll $16, %%edi \n\t"
-                "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
-                "movl %1, %%edi \n\t"
-                "shrl $9, %%esi \n\t"
-                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
-
-                "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF
-                "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
-                "addl $1, %%eax \n\t"
-                "cmpl %2, %%eax \n\t"
-                " jb 1b \n\t"
-
-                :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
-                   "r" (src2)
-                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
-                );
-#ifdef HAVE_MMX2
-            } //if MMX2 cant be used
-#endif
-#else
-            for(i=0;i<dstUVw;i++){
-                register unsigned int xx=xpos>>16;
-                register unsigned int xalpha=(xpos&0xFFFF)>>9;
-                uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
-                uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
-                xpos+=s_xinc2;
-            }
-#endif
-            // if this is the line before the first line
-            if(s_srcypos == s_xinc - 0x8000)
-            {
-                s_srcypos= s_yinc/2 - 0x8000;
-                continue;
-            }
-        }
+        if(s_last_ypos!=y0)
+        {
+            unsigned char *src;
+            // skip if first line has been horiz scaled alleady
+            if(s_last_ypos != y0-1)
+            {
+                // check if first line is before any available src lines
+                if(y0-1 < y) src=srcptr[0]+(0     )*stride[0];
+                else         src=srcptr[0]+(y0-y-1)*stride[0];
+
+                hyscale(buf0, dstw, src, srcWidth, s_xinc);
+            }
+            // check if second line is after any available src lines
+            if(y0-y >= h) src=srcptr[0]+(h-1 )*stride[0];
+            else          src=srcptr[0]+(y0-y)*stride[0];
+
+            // the min() is required to avoid reuseing lines which where not available
+            s_last_ypos= MIN(y0, y+h-1);
+            hyscale(buf1, dstw, src, srcWidth, s_xinc);
+        }
+//      printf("%d %d %d %d\n", y, y1, s_last_y1pos, h);
+        // *** horizontal scale U and V lines to temp buffer
+        if(s_last_y1pos!=y1)
+        {
+            uint8_t *src1, *src2;
+            // skip if first line has been horiz scaled alleady
+            if(s_last_y1pos != y1-1)
+            {
+                // check if first line is before any available src lines
+                if(y1-y/2-1 < 0)
+                {
+                    src1= srcptr[1]+(0)*stride[1];
+                    src2= srcptr[2]+(0)*stride[2];
+                }else{
+                    src1= srcptr[1]+(y1-y/2-1)*stride[1];
+                    src2= srcptr[2]+(y1-y/2-1)*stride[2];
+                }
+                hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2);
+            }
+
+            // check if second line is after any available src lines
+            if(y1 - y/2 >= h/2)
+            {
+                src1= srcptr[1]+(h/2-1)*stride[1];
+                src2= srcptr[2]+(h/2-1)*stride[2];
+            }else{
+                src1= srcptr[1]+(y1-y/2)*stride[1];
+                src2= srcptr[2]+(y1-y/2)*stride[2];
+            }
+            hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2);
+
+            // the min() is required to avoid reuseing lines which where not available
+            s_last_y1pos= MIN(y1, y/2+h/2-1);
+        }
 
     if(ABS(s_yinc - 0x10000) < 10)
         yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
     else
         yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
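Here 0x10000 is 1.0 in 16.16 fixed point, so this dispatch treats a vertical scale within 10/65536 of unity as unscaled and takes the yuv2rgb1 path (the un-blended buf0[i]>>7 hunks above, which skip the two-line blend, presumably belong to that path, versus the buf0/buf1 blend used elsewhere). A hypothetical standalone rendering of the test:

```c
#include <stdio.h>

#define ABS(a) ((a) > 0 ? (a) : (-(a)))

/* 16.16 fixed point: 0x10000 == 1.0 */
static const char *pick_path(int s_yinc)
{
    return ABS(s_yinc - 0x10000) < 10 ? "yuv2rgb1 (unscaled)"
                                      : "yuv2rgbX (scaled)";
}

int main(void)
{
    printf("%s\n", pick_path(0x10000));        /* exactly 1.0     -> yuv2rgb1 */
    printf("%s\n", pick_path(0x10004));        /* ~1.00006        -> yuv2rgb1 */
    printf("%s\n", pick_path((576<<16)/480));  /* 1.2x downscale  -> yuv2rgbX */
    return 0;
}
```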