Mercurial > mplayer.hg
comparison postproc/swscale_template.c @ 2576:437ed06579d8
c optimizations
bugfix
author | michael |
---|---|
date | Tue, 30 Oct 2001 22:24:38 +0000 |
parents | 37da7219ebaf |
children | 6d20d5d5829f |
Comparison legend: equal | deleted | inserted | replaced
2575:37da7219ebaf | 2576:437ed06579d8 |
---|---|
288 "1: \n\t"\ | 288 "1: \n\t"\ |
289 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ | 289 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
290 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ | 290 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
291 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | 291 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
292 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | 292 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
293 "paddw %%mm2, %%mm3 \n\t"\ | 293 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
294 "paddw %%mm5, %%mm4 \n\t"\ | 294 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ |
295 "psraw $5, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | 295 "psrlw $5, %%mm3 \n\t"\ |
296 "psraw $5, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | 296 "psrlw $5, %%mm4 \n\t"\ |
297 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ | 297 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ |
298 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ | 298 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ |
299 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | 299 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
300 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | 300 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
301 "pmulhw ugCoeff, %%mm3 \n\t"\ | 301 "pmulhw ugCoeff, %%mm3 \n\t"\ |
783 "m" (yalpha1), "m" (uvalpha1) | 783 "m" (yalpha1), "m" (uvalpha1) |
784 : "%eax" | 784 : "%eax" |
785 ); | 785 ); |
786 } | 786 } |
787 #else | 787 #else |
788 //FIXME unroll C loop and dont recalculate UV | |
789 asm volatile ("\n\t"::: "memory"); | 788 asm volatile ("\n\t"::: "memory"); |
790 | 789 |
791 if(dstbpp==32) | 790 if(dstbpp==32) |
792 { | 791 { |
793 for(i=0; i<dstw-1; i+=2){ | 792 for(i=0; i<dstw-1; i+=2){ |
896 if(fullUVIpol || allwaysIpol) | 895 if(fullUVIpol || allwaysIpol) |
897 { | 896 { |
898 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); | 897 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); |
899 return; | 898 return; |
900 } | 899 } |
900 if( yalpha > 2048 ) buf0 = buf1; | |
901 | |
901 #ifdef HAVE_MMX | 902 #ifdef HAVE_MMX |
902 if( yalpha > 2048 ) buf0 = buf1; | |
903 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster | 903 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster |
904 { | 904 { |
905 if(dstbpp == 32) | 905 if(dstbpp == 32) |
906 { | 906 { |
907 asm volatile( | 907 asm volatile( |
1011 : "%eax" | 1011 : "%eax" |
1012 ); | 1012 ); |
1013 } | 1013 } |
1014 } | 1014 } |
1015 #else | 1015 #else |
1016 //FIXME unroll C loop and dont recalculate UV | 1016 //FIXME write 2 versions (for even & odd lines) |
1017 asm volatile ("\n\t"::: "memory"); | 1017 asm volatile ("\n\t"::: "memory"); |
1018 | 1018 |
1019 if(dstbpp==32 || dstbpp==24) | 1019 if(dstbpp==32) |
1020 { | 1020 { |
1021 for(i=0;i<dstw;i++){ | 1021 for(i=0; i<dstw-1; i+=2){ |
1022 // vertical linear interpolation && yuv2rgb in a single step: | 1022 // vertical linear interpolation && yuv2rgb in a single step: |
1023 int Y=yuvtab_2568[buf0[i]>>7]; | 1023 int Y1=yuvtab_2568[buf0[i]>>7]; |
1024 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
1024 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); | 1025 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); |
1025 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); | 1026 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); |
1026 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | 1027 |
1027 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | 1028 int Cb= yuvtab_40cf[U]; |
1028 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | 1029 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; |
1029 dest+=dstbpp>>3; | 1030 int Cr= yuvtab_3343[V]; |
1031 | |
1032 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
1033 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
1034 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
1035 | |
1036 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
1037 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
1038 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
1039 } | |
1040 } | |
1041 if(dstbpp==24) | |
1042 { | |
1043 for(i=0; i<dstw-1; i+=2){ | |
1044 // vertical linear interpolation && yuv2rgb in a single step: | |
1045 int Y1=yuvtab_2568[buf0[i]>>7]; | |
1046 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
1047 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); | |
1048 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); | |
1049 | |
1050 int Cb= yuvtab_40cf[U]; | |
1051 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1052 int Cr= yuvtab_3343[V]; | |
1053 | |
1054 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
1055 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
1056 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
1057 | |
1058 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
1059 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
1060 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
1061 dest+=6; | |
1030 } | 1062 } |
1031 } | 1063 } |
1032 else if(dstbpp==16) | 1064 else if(dstbpp==16) |
1033 { | 1065 { |
1034 for(i=0;i<dstw;i++){ | 1066 for(i=0; i<dstw-1; i+=2){ |
1035 // vertical linear interpolation && yuv2rgb in a single step: | 1067 // vertical linear interpolation && yuv2rgb in a single step: |
1036 int Y=yuvtab_2568[buf0[i]>>7]; | 1068 int Y1=yuvtab_2568[buf0[i]>>7]; |
1069 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
1037 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); | 1070 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); |
1038 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); | 1071 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); |
1039 | 1072 |
1073 int Cb= yuvtab_40cf[U]; | |
1074 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1075 int Cr= yuvtab_3343[V]; | |
1076 | |
1040 ((uint16_t*)dest)[i] = | 1077 ((uint16_t*)dest)[i] = |
1041 (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) | | 1078 (clip_table[(Y1 + Cb) >>13]>>3) | |
1042 ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<3)&0x07E0) | | 1079 ((clip_table[(Y1 + Cg) >>13]<<3)&0x07E0) | |
1043 ((clip_table[(Y + yuvtab_3343[V]) >>13]<<8)&0xF800); | 1080 ((clip_table[(Y1 + Cr) >>13]<<8)&0xF800); |
1081 | |
1082 ((uint16_t*)dest)[i+1] = | |
1083 (clip_table[(Y2 + Cb) >>13]>>3) | | |
1084 ((clip_table[(Y2 + Cg) >>13]<<3)&0x07E0) | | |
1085 ((clip_table[(Y2 + Cr) >>13]<<8)&0xF800); | |
1044 } | 1086 } |
1045 } | 1087 } |
1046 else if(dstbpp==15) | 1088 else if(dstbpp==15) |
1047 { | 1089 { |
1048 for(i=0;i<dstw;i++){ | 1090 for(i=0; i<dstw-1; i+=2){ |
1049 // vertical linear interpolation && yuv2rgb in a single step: | 1091 // vertical linear interpolation && yuv2rgb in a single step: |
1050 int Y=yuvtab_2568[buf0[i]>>7]; | 1092 int Y1=yuvtab_2568[buf0[i]>>7]; |
1093 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
1051 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); | 1094 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); |
1052 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); | 1095 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); |
1053 | 1096 |
1097 int Cb= yuvtab_40cf[U]; | |
1098 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1099 int Cr= yuvtab_3343[V]; | |
1100 | |
1054 ((uint16_t*)dest)[i] = | 1101 ((uint16_t*)dest)[i] = |
1055 (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) | | 1102 (clip_table[(Y1 + Cb) >>13]>>3) | |
1056 ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<2)&0x03E0) | | 1103 ((clip_table[(Y1 + Cg) >>13]<<2)&0x03E0) | |
1057 ((clip_table[(Y + yuvtab_3343[V]) >>13]<<7)&0x7C00); | 1104 ((clip_table[(Y1 + Cr) >>13]<<7)&0x7C00); |
1105 ((uint16_t*)dest)[i+1] = | |
1106 (clip_table[(Y2 + Cb) >>13]>>3) | | |
1107 ((clip_table[(Y2 + Cg) >>13]<<2)&0x03E0) | | |
1108 ((clip_table[(Y2 + Cr) >>13]<<7)&0x7C00); | |
1058 } | 1109 } |
1059 } | 1110 } |
1060 #endif | 1111 #endif |
1061 } | 1112 } |
1062 | 1113 |