comparison postproc/swscale_template.c @ 2576:437ed06579d8

c optimizations bugfix
author michael
date Tue, 30 Oct 2001 22:24:38 +0000
parents 37da7219ebaf
children 6d20d5d5829f
comparison
equal deleted inserted replaced
2575:37da7219ebaf 2576:437ed06579d8
288 "1: \n\t"\ 288 "1: \n\t"\
289 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ 289 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
290 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ 290 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
291 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 291 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
292 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 292 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
293 "paddw %%mm2, %%mm3 \n\t"\ 293 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
294 "paddw %%mm5, %%mm4 \n\t"\ 294 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
295 "psraw $5, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 295 "psrlw $5, %%mm3 \n\t"\
296 "psraw $5, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 296 "psrlw $5, %%mm4 \n\t"\
297 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ 297 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
298 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ 298 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
299 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 299 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
300 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 300 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
301 "pmulhw ugCoeff, %%mm3 \n\t"\ 301 "pmulhw ugCoeff, %%mm3 \n\t"\
783 "m" (yalpha1), "m" (uvalpha1) 783 "m" (yalpha1), "m" (uvalpha1)
784 : "%eax" 784 : "%eax"
785 ); 785 );
786 } 786 }
787 #else 787 #else
788 //FIXME unroll C loop and dont recalculate UV
789 asm volatile ("\n\t"::: "memory"); 788 asm volatile ("\n\t"::: "memory");
790 789
791 if(dstbpp==32) 790 if(dstbpp==32)
792 { 791 {
793 for(i=0; i<dstw-1; i+=2){ 792 for(i=0; i<dstw-1; i+=2){
896 if(fullUVIpol || allwaysIpol) 895 if(fullUVIpol || allwaysIpol)
897 { 896 {
898 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); 897 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
899 return; 898 return;
900 } 899 }
900 if( yalpha > 2048 ) buf0 = buf1;
901
901 #ifdef HAVE_MMX 902 #ifdef HAVE_MMX
902 if( yalpha > 2048 ) buf0 = buf1;
903 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster 903 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
904 { 904 {
905 if(dstbpp == 32) 905 if(dstbpp == 32)
906 { 906 {
907 asm volatile( 907 asm volatile(
1011 : "%eax" 1011 : "%eax"
1012 ); 1012 );
1013 } 1013 }
1014 } 1014 }
1015 #else 1015 #else
1016 //FIXME unroll C loop and dont recalculate UV 1016 //FIXME write 2 versions (for even & odd lines)
1017 asm volatile ("\n\t"::: "memory"); 1017 asm volatile ("\n\t"::: "memory");
1018 1018
1019 if(dstbpp==32 || dstbpp==24) 1019 if(dstbpp==32)
1020 { 1020 {
1021 for(i=0;i<dstw;i++){ 1021 for(i=0; i<dstw-1; i+=2){
1022 // vertical linear interpolation && yuv2rgb in a single step: 1022 // vertical linear interpolation && yuv2rgb in a single step:
1023 int Y=yuvtab_2568[buf0[i]>>7]; 1023 int Y1=yuvtab_2568[buf0[i]>>7];
1024 int Y2=yuvtab_2568[buf0[i+1]>>7];
1024 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 1025 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
1025 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 1026 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
1026 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; 1027
1027 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; 1028 int Cb= yuvtab_40cf[U];
1028 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; 1029 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1029 dest+=dstbpp>>3; 1030 int Cr= yuvtab_3343[V];
1031
1032 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1033 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1034 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1035
1036 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1037 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1038 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1039 }
1040 }
1041 if(dstbpp==24)
1042 {
1043 for(i=0; i<dstw-1; i+=2){
1044 // vertical linear interpolation && yuv2rgb in a single step:
1045 int Y1=yuvtab_2568[buf0[i]>>7];
1046 int Y2=yuvtab_2568[buf0[i+1]>>7];
1047 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
1048 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
1049
1050 int Cb= yuvtab_40cf[U];
1051 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1052 int Cr= yuvtab_3343[V];
1053
1054 dest[0]=clip_table[((Y1 + Cb) >>13)];
1055 dest[1]=clip_table[((Y1 + Cg) >>13)];
1056 dest[2]=clip_table[((Y1 + Cr) >>13)];
1057
1058 dest[3]=clip_table[((Y2 + Cb) >>13)];
1059 dest[4]=clip_table[((Y2 + Cg) >>13)];
1060 dest[5]=clip_table[((Y2 + Cr) >>13)];
1061 dest+=6;
1030 } 1062 }
1031 } 1063 }
1032 else if(dstbpp==16) 1064 else if(dstbpp==16)
1033 { 1065 {
1034 for(i=0;i<dstw;i++){ 1066 for(i=0; i<dstw-1; i+=2){
1035 // vertical linear interpolation && yuv2rgb in a single step: 1067 // vertical linear interpolation && yuv2rgb in a single step:
1036 int Y=yuvtab_2568[buf0[i]>>7]; 1068 int Y1=yuvtab_2568[buf0[i]>>7];
1069 int Y2=yuvtab_2568[buf0[i+1]>>7];
1037 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 1070 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
1038 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 1071 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
1039 1072
1073 int Cb= yuvtab_40cf[U];
1074 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1075 int Cr= yuvtab_3343[V];
1076
1040 ((uint16_t*)dest)[i] = 1077 ((uint16_t*)dest)[i] =
1041 (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) | 1078 (clip_table[(Y1 + Cb) >>13]>>3) |
1042 ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<3)&0x07E0) | 1079 ((clip_table[(Y1 + Cg) >>13]<<3)&0x07E0) |
1043 ((clip_table[(Y + yuvtab_3343[V]) >>13]<<8)&0xF800); 1080 ((clip_table[(Y1 + Cr) >>13]<<8)&0xF800);
1081
1082 ((uint16_t*)dest)[i+1] =
1083 (clip_table[(Y2 + Cb) >>13]>>3) |
1084 ((clip_table[(Y2 + Cg) >>13]<<3)&0x07E0) |
1085 ((clip_table[(Y2 + Cr) >>13]<<8)&0xF800);
1044 } 1086 }
1045 } 1087 }
1046 else if(dstbpp==15) 1088 else if(dstbpp==15)
1047 { 1089 {
1048 for(i=0;i<dstw;i++){ 1090 for(i=0; i<dstw-1; i+=2){
1049 // vertical linear interpolation && yuv2rgb in a single step: 1091 // vertical linear interpolation && yuv2rgb in a single step:
1050 int Y=yuvtab_2568[buf0[i]>>7]; 1092 int Y1=yuvtab_2568[buf0[i]>>7];
1093 int Y2=yuvtab_2568[buf0[i+1]>>7];
1051 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 1094 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
1052 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 1095 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
1053 1096
1097 int Cb= yuvtab_40cf[U];
1098 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1099 int Cr= yuvtab_3343[V];
1100
1054 ((uint16_t*)dest)[i] = 1101 ((uint16_t*)dest)[i] =
1055 (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) | 1102 (clip_table[(Y1 + Cb) >>13]>>3) |
1056 ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<2)&0x03E0) | 1103 ((clip_table[(Y1 + Cg) >>13]<<2)&0x03E0) |
1057 ((clip_table[(Y + yuvtab_3343[V]) >>13]<<7)&0x7C00); 1104 ((clip_table[(Y1 + Cr) >>13]<<7)&0x7C00);
1105 ((uint16_t*)dest)[i+1] =
1106 (clip_table[(Y2 + Cb) >>13]>>3) |
1107 ((clip_table[(Y2 + Cg) >>13]<<2)&0x03E0) |
1108 ((clip_table[(Y2 + Cr) >>13]<<7)&0x7C00);
1058 } 1109 }
1059 } 1110 }
1060 #endif 1111 #endif
1061 } 1112 }
1062 1113