comparison postproc/swscale_template.c @ 2264:7851375ea156

increased precision of s_xinc and s_xinc2 (needed for the mmx2 bugfix)
moved mmx variables to the top to avoid alignment issues
mmx2 code should work fine now if and only if the input width is %16=0 and the output width is %32=0
reordered some code (5% faster with a simple -benchmark)
first-line bug fixed (i hope i didn't introduce any new bugs with that ...)
changed a lot of the vertical scale setup code, i hope i fixed something and didn't mess it up :)
a few known bugs left (rightmost line is wrong)
MMX2 code will only be used for upscaling & acceptable widths
16-bit dithering can be disabled
author michael
date Thu, 18 Oct 2001 22:27:13 +0000
parents 00a46cd41edd
children 3df32dabe98c
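The headline change below is horizontal-increment precision: s_xinc and s_xinc2 move from 8.8 to 16.16 fixed point (the new `s_xinc*= 256`). A minimal C sketch of what the scaler's C fallback loop computes at the new precision — the loop body mirrors the diff, while the standalone wrapper function is only for illustration:

    #include <stdint.h>

    // Horizontally scale one 8-bit line to dstw samples with a 16.16
    // fixed-point step. The fraction is reduced to 7 bits, so each
    // output sample is src[xx]*(127-alpha) + src[xx+1]*alpha, a 15-bit
    // value. Like the original, this reads one sample past the end of
    // src on the last pixel (see the NOTES comment in the diff).
    static void h_scale_line(uint16_t *dst, const uint8_t *src,
                             int dstw, unsigned int xinc /* 16.16 */)
    {
        unsigned int xpos = 0;
        for (int i = 0; i < dstw; i++) {
            unsigned int xx     = xpos >> 16;           // integer source index
            unsigned int xalpha = (xpos & 0xFFFF) >> 9; // 7-bit fraction
            dst[i] = src[xx] * (xalpha ^ 127) + src[xx + 1] * xalpha;
            xpos += xinc;
        }
    }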
diff -r 351aaf1eff87 -r 7851375ea156 postproc/swscale_template.c
@@ -1,30 +1,31 @@
 
 // Software scaling and colorspace conversion routines for MPlayer
+
+// Original C implementation by ?
+// current version mostly by Michael Niedermayer (michaelni@gmx.at)
 
 #include <inttypes.h>
 #include "../config.h"
 
-#undef HAVE_MMX2 //code is buggy
+//#undef HAVE_MMX2
 //#undef HAVE_MMX
+//#undef ARCH_X86
+#define DITHER16BPP
+#define ALT_ERROR
 
 #define RET 0xC3 //near return opcode
-
-// temporary storage for 4 yuv lines:
-// 16bit for now (mmx likes it more compact)
-static uint16_t pix_buf_y[4][2048];
-static uint16_t pix_buf_uv[2][2048*2];
-
-// clipping helper table for C implementations:
-static unsigned char clip_table[768];
-
-// yuv->rgb conversion tables:
-static int yuvtab_2568[256];
-static int yuvtab_3343[256];
-static int yuvtab_0c92[256];
-static int yuvtab_1a1e[256];
-static int yuvtab_40cf[256];
+/*
+NOTES
+
+known BUGS with known cause (no bug reports please!)
+line at the right (C, asm and MMX2)
+code reads 1 sample too much (might cause a sig11)
+
+TODO
+check alignment of everything
+*/
 
 static uint64_t yCoeff= 0x2568256825682568LL;
 static uint64_t ubCoeff= 0x3343334333433343LL;
 static uint64_t vrCoeff= 0x40cf40cf40cf40cfLL;
 static uint64_t ugCoeff= 0xE5E2E5E2E5E2E5E2LL;
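These coefficient constants are signed 16-bit fixed-point factors applied per lane with pmulhw, which keeps only the high word of the 16x16 product. Combined with the psllw $3 prescale used in the RGB macro further down, a coefficient c contributes with an effective weight of c/8192; for example yCoeff = 0x2568 = 9576 weights the luma term by 9576/8192, roughly 1.17. A one-line model of the instruction, for reference:

    #include <stdint.h>

    // What pmulhw computes per signed 16-bit lane: the high word of
    // the 32-bit product, i.e. (a*b)>>16.
    static inline int16_t pmulhw_lane(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b) >> 16);
    }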
@@ -44,13 +45,29 @@
 static uint64_t b16Mask= 0x001F001F001F001FLL;
 static uint64_t g16Mask= 0x07E007E007E007E0LL;
 static uint64_t r16Mask= 0xF800F800F800F800LL;
 static uint64_t temp0;
 
+
+// temporary storage for 4 yuv lines:
+// 16bit for now (mmx likes it more compact)
+static uint16_t pix_buf_y[4][2048];
+static uint16_t pix_buf_uv[2][2048*2];
+
+// clipping helper table for C implementations:
+static unsigned char clip_table[768];
+
+// yuv->rgb conversion tables:
+static int yuvtab_2568[256];
+static int yuvtab_3343[256];
+static int yuvtab_0c92[256];
+static int yuvtab_1a1e[256];
+static int yuvtab_40cf[256];
+
+
 static uint8_t funnyYCode[10000];
 static uint8_t funnyUVCode[10000];
-
 
 
 // *** bilinear scaling and yuv->rgb conversion of yv12 slices:
 // *** Note: it's called multiple times while decoding a frame, first time y==0
 // *** Designed to upscale, but may work for downscale too.
@@ -62,31 +79,47 @@
 
 // scaling factors:
 //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
 //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;
 
-unsigned int s_xinc2=s_xinc>>1;
+unsigned int s_xinc2;
 
-static int s_srcypos;
+static int s_srcypos; // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
 static int s_ypos;
+
+// last horizontally interpolated lines, used to avoid unnecessary calculations
 static int s_last_ypos;
+static int s_last_y1pos;
+
 static int static_dstw;
 
 #ifdef HAVE_MMX2
+// used to detect a horizontal size change
 static int old_dstw= -1;
 static int old_s_xinc= -1;
-#endif
-
-s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each other
+
+// difference between the requested xinc and the required one for the mmx2 routine
+static int s_xinc_diff=0;
+static int s_xinc2_diff=0;
+#endif
+int canMMX2BeUsed;
+
+// we need that precision at least for the mmx2 code
+s_xinc*= 256;
+s_xinc2=s_xinc>>1;
+canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0;
 
 if(y==0){
-    s_srcypos=-2*s_yinc;
-    s_ypos=-2;
-    s_last_ypos=-2;
+    s_srcypos= s_yinc/2 - 0x8000;
+    s_ypos=0;
+
+    // force calculation of the horizontal interpolation of the first line
+    s_last_ypos=-99;
+    s_last_y1pos=-99;
 #ifdef HAVE_MMX2
 // can't downscale !!!
-    if(old_s_xinc != s_xinc || old_dstw!=dstw)
+    if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
     {
        uint8_t *fragment;
       int imm8OfPShufW1;
       int imm8OfPShufW2;
       int fragmentLength;
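The new canMMX2BeUsed flag above encodes the commit message's "upscaling & acceptable widths" rule. With 16.16 increments, s_xinc <= 0x10000 means the source position advances at most one pixel per output pixel, i.e. upscaling; (dstw&31)==0 is the output-width granularity the generated fragment blocks need. Restated in plain C:

    // s_xinc is a 16.16 step: dst pixel i samples source position i*s_xinc.
    // step <= 1.0 means upscaling; dstw must be a multiple of 32 for the
    // generated code's block structure.
    static int can_use_mmx2(unsigned int s_xinc, int dstw)
    {
        return s_xinc <= 0x10000 && (dstw & 31) == 0;
    }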
@@ -100,36 +133,34 @@
 
       // create an optimized horizontal scaling routine
 
       //code fragment
 
-//    fragmentLength=0;
-//    printf("%d, %d\n", fragmentLength,imm8OfPShufW1);
-
       asm volatile(
          "jmp 9f  \n\t"
       // Begin
          "0:  \n\t"
-         "movq (%%esi, %%ebx), %%mm0  \n\t" //FIXME Alignment
+         "movq (%%esi), %%mm0  \n\t" //FIXME Alignment
          "movq %%mm0, %%mm1  \n\t"
          "psrlq $8, %%mm0  \n\t"
          "punpcklbw %%mm7, %%mm1  \n\t"
+         "movq %%mm2, %%mm3  \n\t"
          "punpcklbw %%mm7, %%mm0  \n\t"
+         "addw %%bx, %%cx  \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
          "pshufw $0xFF, %%mm1, %%mm1  \n\t"
          "1:  \n\t"
+         "adcl %%edx, %%esi  \n\t" //xx+= (4*s_xinc)>>16 + carry
          "pshufw $0xFF, %%mm0, %%mm0  \n\t"
          "2:  \n\t"
+         "psrlw $9, %%mm3  \n\t"
          "psubw %%mm1, %%mm0  \n\t"
-         "psraw $1, %%mm0  \n\t"
-         "pmullw %%mm2, %%mm0  \n\t"
+         "pmullw %%mm3, %%mm0  \n\t"
+         "paddw %%mm6, %%mm2  \n\t" // 2*alpha += xpos&0xFFFF
          "psllw $7, %%mm1  \n\t"
          "paddw %%mm1, %%mm0  \n\t"
+
          "movq %%mm0, (%%edi, %%eax)  \n\t"
-         "paddb %%mm6, %%mm2  \n\t" // 2*alpha += xpos&0xFF
-
-         "addb %%ch, %%cl  \n\t" //2*xalpha += (4*s_xinc)&0xFF
-         "adcl %%edx, %%ebx  \n\t" //xx+= (4*s_xinc)>>8 + carry
 
          "addl $8, %%eax  \n\t"
       // End
          "9:  \n\t"
 //       "int $3\n\t"
145 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2), 176 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
146 "=r" (fragmentLength) 177 "=r" (fragmentLength)
147 ); 178 );
148 179
149 xpos= xx=xalpha= 0; 180 xpos= xx=xalpha= 0;
150 //FIXME choose size and or xinc so that they fit exactly 181
182 /* choose xinc so that all 8 parts fit exactly
183 Note: we cannot use just 1 part because it would not fit in the code cache */
184 s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10;
185 // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
186 #ifdef ALT_ERROR
187 s_xinc2_diff+= ((0x10000/(dstw/8)));
188 #endif
189 s_xinc_diff= s_xinc2_diff*2;
190
191 s_xinc2+= s_xinc2_diff;
192 s_xinc+= s_xinc_diff;
151 for(i=0; i<dstw/8; i++) 193 for(i=0; i<dstw/8; i++)
152 { 194 {
153 int xx=xpos>>8; 195 int xx=xpos>>16;
154 196
155 if((i&3) == 0) 197 if((i&3) == 0)
156 { 198 {
157 int a=0; 199 int a=0;
158 int b=((xpos+s_xinc)>>8) - xx; 200 int b=((xpos+s_xinc)>>16) - xx;
159 int c=((xpos+s_xinc*2)>>8) - xx; 201 int c=((xpos+s_xinc*2)>>16) - xx;
160 int d=((xpos+s_xinc*3)>>8) - xx; 202 int d=((xpos+s_xinc*3)>>16) - xx;
161 203
162 memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); 204 memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);
163 205
164 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]= 206 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
165 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]= 207 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
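The s_xinc2_diff computation above exists because the fragments hard-code the per-block pshufw offsets, so every 8-pixel block has to see the same fractional phase; the code therefore nudges the increment so that dstw/8 steps leave (nearly) no residual fraction. A simplified model of the idea, without the +10 and ALT_ERROR fudge terms:

    // Nudge a 16.16 increment so that (dstw/8) steps carry no residual
    // fraction, making every 8-pixel block see the same sample offsets.
    // Illustrative only; the real code adds small correction terms.
    static unsigned int fit_xinc(unsigned int xinc, int dstw)
    {
        int blocks = dstw / 8;
        int err = (int)((xinc * blocks) & 0xFFFF); // leftover fraction
        return xinc - err / blocks;                // spread it over the steps
    }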
@@ -172,18 +214,18 @@
 
       xpos= xx=xalpha= 0;
       //FIXME choose size and/or xinc so that they fit exactly
       for(i=0; i<dstw/8; i++)
       {
-         int xx=xpos>>8;
+         int xx=xpos>>16;
 
          if((i&3) == 0)
          {
           int a=0;
-          int b=((xpos+s_xinc2)>>8) - xx;
-          int c=((xpos+s_xinc2*2)>>8) - xx;
-          int d=((xpos+s_xinc2*3)>>8) - xx;
+          int b=((xpos+s_xinc2)>>16) - xx;
+          int c=((xpos+s_xinc2*2)>>16) - xx;
+          int d=((xpos+s_xinc2*3)>>16) - xx;
 
           memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
 
           funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
           funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
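The imm8OfPShufW1/imm8OfPShufW2 bytes patched here receive the four per-pixel integer offsets a..d. pshufw's immediate selects source words through four 2-bit fields, so the packing is presumably a | b<<2 | c<<4 | d<<6 (the actual assignment falls in the lines elided from this hunk):

    #include <stdint.h>

    // pshufw's imm8 holds four 2-bit word selectors; packing the
    // per-pixel offsets a,b,c,d (each 0..3) this way lets one pshufw
    // fetch the right source word for each of 4 output pixels.
    static uint8_t pack_pshufw_imm(int a, int b, int c, int d)
    {
        return (uint8_t)(a | (b << 2) | (c << 4) | (d << 6));
    }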
@@ -195,276 +237,299 @@
       }
 //    funnyCode[0]= RET;
 
 
    }
-#endif
+
+   if(canMMX2BeUsed)
+   {
+      s_xinc+= s_xinc_diff;
+      s_xinc2+= s_xinc2_diff;
+   }
+#endif // HAVE_MMX2
 } // reset counters
 
 while(1){
    unsigned char *dest=dstptr+dststride*s_ypos;
-   int y0=2+(s_srcypos>>16);
-   int y1=1+(s_srcypos>>17);
+   int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line
+   // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
+   int srcuvpos= s_srcypos + s_yinc/2 - 0x8000;
+   int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
    int yalpha=(s_srcypos&0xFFFF)>>7;
    int yalpha1=yalpha^511;
-   int uvalpha=((s_srcypos>>1)&0xFFFF)>>7;
+   int uvalpha=(srcuvpos&0x1FFFF)>>8;
    int uvalpha1=uvalpha^511;
-   uint16_t *buf0=pix_buf_y[y0&3];
-   uint16_t *buf1=pix_buf_y[((y0+1)&3)];
-   uint16_t *uvbuf0=pix_buf_uv[y1&1];
-   uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1];
+   uint16_t *buf0=pix_buf_y[y0&1];        // top line of the interpolated slice
+   uint16_t *buf1=pix_buf_y[((y0+1)&1)];  // bottom line of the interpolated slice
+   uint16_t *uvbuf0=pix_buf_uv[y1&1];     // top line of the interpolated slice
+   uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice
    int i;
 
-   if(y0>=y+h) break;
+   // if this is before the first line then use only the first src line
+   if(y0==0) buf0= buf1;
+   if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, it's not the same as y0==0
+
+   if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway
+
+   // if this is after the last line then use only the last src line
+   if(y0>=y+h)
+   {
+      buf1= buf0;
+      s_last_ypos=y0;
+   }
+   if(y1>=(y+h)/2)
+   {
+      uvbuf1= uvbuf0;
+      s_last_y1pos=y1;
+   }
+
 
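The rewritten vertical setup above tracks the destination line's center in 16.16 source coordinates: y0/y1 take the ceiling of the position to find the first source line below it, and the 9-bit blend factors come from the fractional part. In C:

    // s_srcypos is the dst line's center in 16.16 source coordinates.
    static void v_setup(int srcypos, int *y0, int *yalpha)
    {
        *y0     = (srcypos + 0xFFFF) >> 16;  // ceil: first source line below
        *yalpha = (srcypos & 0xFFFF) >> 7;   // 0..511 blend factor
    }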
    s_ypos++; s_srcypos+=s_yinc;
 
+   //only interpolate the src line horizontally if we didn't do it already
    if(s_last_ypos!=y0){
       unsigned char *src=srcptr[0]+(y0-y)*stride[0];
       unsigned int xpos=0;
       s_last_ypos=y0;
       // *** horizontal scale Y line to temp buffer
-      // this loop should be rewritten in MMX assembly!!!!
-#ifdef HAVE_MMX2
-   asm volatile(
-      "pxor %%mm7, %%mm7  \n\t"
-      "pxor %%mm2, %%mm2  \n\t" // 2*xalpha
-      "movd %5, %%mm6  \n\t" // s_xinc&0xFF
-      "punpcklwd %%mm6, %%mm6  \n\t"
-      "punpcklwd %%mm6, %%mm6  \n\t"
-      "movq %%mm6, %%mm2  \n\t"
-      "psllq $16, %%mm2  \n\t"
-      "paddb %%mm6, %%mm2  \n\t"
-      "psllq $16, %%mm2  \n\t"
-      "paddb %%mm6, %%mm2  \n\t"
-      "psllq $16, %%mm2  \n\t" //0,t,2t,3t  t=s_xinc&0xFF
-      "movq %%mm2, temp0  \n\t"
-      "movd %4, %%mm6  \n\t" //(s_xinc*4)&0xFF
-      "punpcklwd %%mm6, %%mm6  \n\t"
-      "punpcklwd %%mm6, %%mm6  \n\t"
-      "xorl %%eax, %%eax  \n\t" // i
-      "xorl %%ebx, %%ebx  \n\t" // xx
-      "movl %0, %%esi  \n\t" // src
-      "movl %1, %%edi  \n\t" // buf1
-      "movl %3, %%edx  \n\t" // (s_xinc*4)>>8
-      "xorl %%ecx, %%ecx  \n\t"
-      "movb %4, %%ch  \n\t" // (s_xinc*4)&0xFF
-//    "int $3\n\t"
-      "call funnyYCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyYCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyYCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyYCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyYCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyYCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyYCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyYCode  \n\t"
-      :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8),
-      "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF)
-      : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
-      );
-
-#elif defined (ARCH_X86)
+#ifdef ARCH_X86
+
+#ifdef HAVE_MMX2
+   if(canMMX2BeUsed)
+   {
+      asm volatile(
+         "pxor %%mm7, %%mm7  \n\t"
+         "pxor %%mm2, %%mm2  \n\t" // 2*xalpha
+         "movd %5, %%mm6  \n\t" // s_xinc&0xFFFF
+         "punpcklwd %%mm6, %%mm6  \n\t"
+         "punpcklwd %%mm6, %%mm6  \n\t"
+         "movq %%mm6, %%mm2  \n\t"
+         "psllq $16, %%mm2  \n\t"
+         "paddw %%mm6, %%mm2  \n\t"
+         "psllq $16, %%mm2  \n\t"
+         "paddw %%mm6, %%mm2  \n\t"
+         "psllq $16, %%mm2  \n\t" //0,t,2t,3t  t=s_xinc&0xFFFF
+         "movq %%mm2, temp0  \n\t"
+         "movd %4, %%mm6  \n\t" //(s_xinc*4)&0xFFFF
+         "punpcklwd %%mm6, %%mm6  \n\t"
+         "punpcklwd %%mm6, %%mm6  \n\t"
+         "xorl %%eax, %%eax  \n\t" // i
+         "movl %0, %%esi  \n\t" // src
+         "movl %1, %%edi  \n\t" // buf1
+         "movl %3, %%edx  \n\t" // (s_xinc*4)>>16
+         "xorl %%ecx, %%ecx  \n\t"
+         "xorl %%ebx, %%ebx  \n\t"
+         "movw %4, %%bx  \n\t" // (s_xinc*4)&0xFFFF
+//       "int $3\n\t"
+         "call funnyYCode  \n\t"
+         "movq temp0, %%mm2  \n\t"
+         "xorl %%ecx, %%ecx  \n\t"
+         "call funnyYCode  \n\t"
+         "movq temp0, %%mm2  \n\t"
+         "xorl %%ecx, %%ecx  \n\t"
+         "call funnyYCode  \n\t"
+         "movq temp0, %%mm2  \n\t"
+         "xorl %%ecx, %%ecx  \n\t"
+         "call funnyYCode  \n\t"
+         "movq temp0, %%mm2  \n\t"
+         "xorl %%ecx, %%ecx  \n\t"
+         "call funnyYCode  \n\t"
+         "movq temp0, %%mm2  \n\t"
+         "xorl %%ecx, %%ecx  \n\t"
+         "call funnyYCode  \n\t"
+         "movq temp0, %%mm2  \n\t"
+         "xorl %%ecx, %%ecx  \n\t"
+         "call funnyYCode  \n\t"
+         "movq temp0, %%mm2  \n\t"
+         "xorl %%ecx, %%ecx  \n\t"
+         "call funnyYCode  \n\t"
+         :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16),
+         "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF)
+         : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
+         );
+   }
+   else
+   {
+#endif
    //NO MMX just normal asm ... FIXME try/write funny MMX2 variant
    //FIXME add prefetch
    asm volatile(
       "xorl %%eax, %%eax  \n\t" // i
       "xorl %%ebx, %%ebx  \n\t" // xx
       "xorl %%ecx, %%ecx  \n\t" // 2*xalpha
       "1:  \n\t"
       "movzbl  (%0, %%ebx), %%edi  \n\t" //src[xx]
       "movzbl 1(%0, %%ebx), %%esi  \n\t" //src[xx+1]
       "subl %%edi, %%esi  \n\t" //src[xx+1] - src[xx]
       "imull %%ecx, %%esi  \n\t" //(src[xx+1] - src[xx])*2*xalpha
-      "shll $8, %%edi  \n\t"
+      "shll $16, %%edi  \n\t"
       "addl %%edi, %%esi  \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
       "movl %1, %%edi  \n\t"
-      "shrl $1, %%esi  \n\t"
+      "shrl $9, %%esi  \n\t"
       "movw %%si, (%%edi, %%eax, 2)  \n\t"
-      "addb %4, %%cl  \n\t" //2*xalpha += s_xinc&0xFF
-      "adcl %3, %%ebx  \n\t" //xx+= s_xinc>>8 + carry
+      "addw %4, %%cx  \n\t" //2*xalpha += s_xinc&0xFFFF
+      "adcl %3, %%ebx  \n\t" //xx+= s_xinc>>16 + carry
 
       "movzbl (%0, %%ebx), %%edi  \n\t" //src[xx]
       "movzbl 1(%0, %%ebx), %%esi  \n\t" //src[xx+1]
       "subl %%edi, %%esi  \n\t" //src[xx+1] - src[xx]
       "imull %%ecx, %%esi  \n\t" //(src[xx+1] - src[xx])*2*xalpha
-      "shll $8, %%edi  \n\t"
+      "shll $16, %%edi  \n\t"
       "addl %%edi, %%esi  \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
       "movl %1, %%edi  \n\t"
-      "shrl $1, %%esi  \n\t"
+      "shrl $9, %%esi  \n\t"
       "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
-      "addb %4, %%cl  \n\t" //2*xalpha += s_xinc&0xFF
-      "adcl %3, %%ebx  \n\t" //xx+= s_xinc>>8 + carry
+      "addw %4, %%cx  \n\t" //2*xalpha += s_xinc&0xFFFF
+      "adcl %3, %%ebx  \n\t" //xx+= s_xinc>>16 + carry
 
 
       "addl $2, %%eax  \n\t"
       "cmpl %2, %%eax  \n\t"
       " jb 1b  \n\t"
 
 
-      :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF)
+      :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF)
       : "%eax", "%ebx", "%ecx", "%edi", "%esi"
       );
+#ifdef HAVE_MMX2
+   } //if MMX2 can't be used
+#endif
 #else
    for(i=0;i<dstw;i++){
-      register unsigned int xx=xpos>>8;
-      register unsigned int xalpha=(xpos&0xFF)>>1;
+      register unsigned int xx=xpos>>16;
+      register unsigned int xalpha=(xpos&0xFFFF)>>9;
       buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
       xpos+=s_xinc;
    }
 #endif
+   }
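Both horizontal paths now store 15-bit values (a pixel-times-127 scale) in the 16-bit line buffers: shrl $9 keeps a 7-bit alpha, and the vertical blend later drops the factor again with a shift by 7. A numeric spot check of that arithmetic:

    #include <assert.h>

    // 15-bit intermediate scale, checked on one sample pair:
    static void demo(void)
    {
        unsigned int xpos = 0x8000;               // halfway between samples
        unsigned int a    = (xpos & 0xFFFF) >> 9; // 64
        int v = 200 * (a ^ 127) + 100 * a;        // 200*63 + 100*64 = 19000
        assert((v >> 7) == 148);  // ~ the 150 midpoint, biased by the 127 scale
    }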
    // *** horizontal scale U and V lines to temp buffer
-   if(!(y0&1)){
+   if(s_last_y1pos!=y1){
       unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
       unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
-      xpos=0;
-      // this loop should be rewritten in MMX assembly!!!!
-#ifdef HAVE_MMX2
-   asm volatile(
+      int xpos=0;
+      s_last_y1pos= y1;
+#ifdef ARCH_X86
+#ifdef HAVE_MMX2
+   if(canMMX2BeUsed)
+   {
+      asm volatile(
       "pxor %%mm7, %%mm7  \n\t"
       "pxor %%mm2, %%mm2  \n\t" // 2*xalpha
-      "movd %5, %%mm6  \n\t" // s_xinc&0xFF
+      "movd %5, %%mm6  \n\t" // s_xinc&0xFFFF
       "punpcklwd %%mm6, %%mm6  \n\t"
       "punpcklwd %%mm6, %%mm6  \n\t"
       "movq %%mm6, %%mm2  \n\t"
       "psllq $16, %%mm2  \n\t"
-      "paddb %%mm6, %%mm2  \n\t"
+      "paddw %%mm6, %%mm2  \n\t"
       "psllq $16, %%mm2  \n\t"
-      "paddb %%mm6, %%mm2  \n\t"
-      "psllq $16, %%mm2  \n\t" //0,t,2t,3t  t=s_xinc&0xFF
+      "paddw %%mm6, %%mm2  \n\t"
+      "psllq $16, %%mm2  \n\t" //0,t,2t,3t  t=s_xinc&0xFFFF
       "movq %%mm2, temp0  \n\t"
-      "movd %4, %%mm6  \n\t" //(s_xinc*4)&0xFF
+      "movd %4, %%mm6  \n\t" //(s_xinc*4)&0xFFFF
       "punpcklwd %%mm6, %%mm6  \n\t"
       "punpcklwd %%mm6, %%mm6  \n\t"
       "xorl %%eax, %%eax  \n\t" // i
-      "xorl %%ebx, %%ebx  \n\t" // xx
       "movl %0, %%esi  \n\t" // src
       "movl %1, %%edi  \n\t" // buf1
-      "movl %3, %%edx  \n\t" // (s_xinc*4)>>8
+      "movl %3, %%edx  \n\t" // (s_xinc*4)>>16
       "xorl %%ecx, %%ecx  \n\t"
-      "movb %4, %%ch  \n\t" // (s_xinc*4)&0xFF
+      "xorl %%ebx, %%ebx  \n\t"
+      "movw %4, %%bx  \n\t" // (s_xinc*4)&0xFFFF
+
 //    "int $3\n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
+#define FUNNYUVCODE \
+      "call funnyUVCode  \n\t"\
+      "movq temp0, %%mm2  \n\t"\
+      "xorl %%ecx, %%ecx  \n\t"
+
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+
 
       "xorl %%eax, %%eax  \n\t" // i
-      "xorl %%ebx, %%ebx  \n\t" // xx
       "movl %6, %%esi  \n\t" // src
       "movl %1, %%edi  \n\t" // buf1
       "addl $4096, %%edi  \n\t"
 
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-      "movq temp0, %%mm2  \n\t"
-      "xorb %%cl, %%cl  \n\t"
-      "call funnyUVCode  \n\t"
-
-      :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8),
-      "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2)
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+FUNNYUVCODE
+
+      :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16),
+      "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
       : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
       );
-
-#elif defined (ARCH_X86)
+   }
+   else
+   {
+#endif
    asm volatile(
       "xorl %%eax, %%eax  \n\t" // i
       "xorl %%ebx, %%ebx  \n\t" // xx
       "xorl %%ecx, %%ecx  \n\t" // 2*xalpha
       "1:  \n\t"
       "movl %0, %%esi  \n\t"
       "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
       "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
       "subl %%edi, %%esi  \n\t" //src[xx+1] - src[xx]
       "imull %%ecx, %%esi  \n\t" //(src[xx+1] - src[xx])*2*xalpha
-      "shll $8, %%edi  \n\t"
+      "shll $16, %%edi  \n\t"
       "addl %%edi, %%esi  \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
       "movl %1, %%edi  \n\t"
-      "shrl $1, %%esi  \n\t"
+      "shrl $9, %%esi  \n\t"
       "movw %%si, (%%edi, %%eax, 2)  \n\t"
 
       "movzbl (%5, %%ebx), %%edi  \n\t" //src[xx]
       "movzbl 1(%5, %%ebx), %%esi  \n\t" //src[xx+1]
       "subl %%edi, %%esi  \n\t" //src[xx+1] - src[xx]
       "imull %%ecx, %%esi  \n\t" //(src[xx+1] - src[xx])*2*xalpha
-      "shll $8, %%edi  \n\t"
+      "shll $16, %%edi  \n\t"
       "addl %%edi, %%esi  \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
       "movl %1, %%edi  \n\t"
-      "shrl $1, %%esi  \n\t"
+      "shrl $9, %%esi  \n\t"
       "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
 
-      "addb %4, %%cl  \n\t" //2*xalpha += s_xinc&0xFF
-      "adcl %3, %%ebx  \n\t" //xx+= s_xinc>>8 + carry
+      "addw %4, %%cx  \n\t" //2*xalpha += s_xinc&0xFFFF
+      "adcl %3, %%ebx  \n\t" //xx+= s_xinc>>16 + carry
       "addl $1, %%eax  \n\t"
       "cmpl %2, %%eax  \n\t"
       " jb 1b  \n\t"
 
 
-      :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF),
+      :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
       "r" (src2)
       : "%eax", "%ebx", "%ecx", "%edi", "%esi"
       );
+#ifdef HAVE_MMX2
+   } //if MMX2 can't be used
+#endif
 #else
    for(i=0;i<dstw;i++){
-      register unsigned int xx=xpos>>8;
-      register unsigned int xalpha=(xpos&0xFF)>>1;
+      register unsigned int xx=xpos>>16;
+      register unsigned int xalpha=(xpos&0xFFFF)>>9;
       uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
       uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
       xpos+=s_xinc2;
-   }
-#endif
    }
-   if(!y0) continue;
-   }
+#endif
+   }
+
 
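Thanks to the new s_last_ypos/s_last_y1pos checks, the horizontal pass now runs once per source line rather than once per destination line, which is where most of the commit's upscaling speedup comes from. The pattern, sketched with illustrative names rather than the file's own:

    #include <stdint.h>

    static int last_line = -99;   // illustrative mirror of s_last_ypos

    // Redo the horizontal interpolation only when the integer source
    // line advances; consecutive dst lines falling between the same two
    // src lines reuse the buffers untouched.
    static void maybe_h_scale(int y0, const uint8_t *src, uint16_t *buf,
                              int dstw, unsigned int xinc /* 16.16 */)
    {
        if (last_line == y0)
            return;               // buffer already holds this source line
        last_line = y0;
        unsigned int xpos = 0;
        for (int i = 0; i < dstw; i++) {
            unsigned int xx = xpos >> 16;
            unsigned int a  = (xpos & 0xFFFF) >> 9;
            buf[i] = src[xx] * (a ^ 127) + src[xx + 1] * a;
            xpos += xinc;
        }
    }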
 // Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
 // Re: Note1: ok n*4 for now
 // Note2: instead of using lookup tabs, mmx version could do the multiply...
 // Re: Note2: yep
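The YSCALEYUV2RGB macro whose body follows opens in the lines elided before this hunk; judging by the register use, those lines presumably broadcast yalpha1/uvalpha1 into %%mm6/%%mm5. Its per-sample luma blend, transcribed from the macro's own comments into C:

    #include <stdint.h>

    // result = buf0*yalpha1 + buf1*(1-yalpha1), with yalpha1 = yalpha^511,
    // computed as ((buf0-buf1)*yalpha1)>>16 + buf1>>7 on the 15-bit buffers.
    static int v_blend(int16_t b0, int16_t b1, int yalpha1 /* 0..511 */)
    {
        return (((b0 - b1) * yalpha1) >> 16) + (b1 >> 7);
    }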
487 "punpcklwd %%mm5, %%mm5 \n\t"\ 552 "punpcklwd %%mm5, %%mm5 \n\t"\
488 "xorl %%eax, %%eax \n\t"\ 553 "xorl %%eax, %%eax \n\t"\
489 "1: \n\t"\ 554 "1: \n\t"\
490 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ 555 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
491 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ 556 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
557 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
558 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
492 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 559 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
560 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
493 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 561 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
562 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
494 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ 563 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
564 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
565 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
495 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 566 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
567 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
569 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
496 "psubw w10, %%mm1 \n\t" /* Y-16*/\ 570 "psubw w10, %%mm1 \n\t" /* Y-16*/\
571 "psubw w80, %%mm3 \n\t" /* (U-128)*/\
497 "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ 572 "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\
573 "psllw $3, %%mm3 \n\t" /*(U-128)8*/\
498 "pmulhw yCoeff, %%mm1 \n\t"\ 574 "pmulhw yCoeff, %%mm1 \n\t"\
499 \ 575 \
500 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
501 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
502 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
503 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
504 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
505 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
506 "psubw w80, %%mm3 \n\t" /* (U-128)*/\
507 "psllw $3, %%mm3 \n\t" /*(U-128)8*/\
508 \ 576 \
509 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
510 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
511 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
512 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 577 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
578 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
579 "pmulhw ubCoeff, %%mm3 \n\t"\
513 "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\ 580 "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
581 "pmulhw ugCoeff, %%mm2 \n\t"\
514 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 582 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
515 "psubw w80, %%mm0 \n\t" /* (V-128)*/\ 583 "psubw w80, %%mm0 \n\t" /* (V-128)*/\
516 "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ 584 "psllw $3, %%mm0 \n\t" /* (V-128)8*/\
517 \ 585 \
518 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
519 "pmulhw ubCoeff, %%mm3 \n\t"\
520 "paddw %%mm1, %%mm3 \n\t" /* B*/\
521 \ 586 \
522 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ 587 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
523 "pmulhw vrCoeff, %%mm0 \n\t"\ 588 "pmulhw vrCoeff, %%mm0 \n\t"\
589 "pmulhw vgCoeff, %%mm4 \n\t"\
590 "paddw %%mm1, %%mm3 \n\t" /* B*/\
524 "paddw %%mm1, %%mm0 \n\t" /* R*/\ 591 "paddw %%mm1, %%mm0 \n\t" /* R*/\
592 "packuswb %%mm3, %%mm3 \n\t"\
525 \ 593 \
526 "pmulhw ugCoeff, %%mm2 \n\t"\ 594 "packuswb %%mm0, %%mm0 \n\t"\
527 "pmulhw vgCoeff, %%mm4 \n\t"\
528 "paddw %%mm4, %%mm2 \n\t"\ 595 "paddw %%mm4, %%mm2 \n\t"\
529 "paddw %%mm2, %%mm1 \n\t" /* G*/\ 596 "paddw %%mm2, %%mm1 \n\t" /* G*/\
530 \ 597 \
531 "packuswb %%mm3, %%mm3 \n\t"\
532 "packuswb %%mm0, %%mm0 \n\t"\
533 "packuswb %%mm1, %%mm1 \n\t" 598 "packuswb %%mm1, %%mm1 \n\t"
534 599
535 YSCALEYUV2RGB 600 YSCALEYUV2RGB
536 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG 601 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
537 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 602 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
@@ -608,13 +673,15 @@
    else if(dstbpp==16)
    {
       asm volatile(
 
 YSCALEYUV2RGB
+#ifdef DITHER16BPP
       "paddusb g16Dither, %%mm1  \n\t"
       "paddusb b16Dither, %%mm0  \n\t"
       "paddusb b16Dither, %%mm3  \n\t"
+#endif
       "punpcklbw %%mm7, %%mm1  \n\t" // 0G0G0G0G
       "punpcklbw %%mm7, %%mm3  \n\t" // 0B0B0B0B
       "punpcklbw %%mm7, %%mm0  \n\t" // 0R0R0R0R
 
       "psrlw $3, %%mm3  \n\t"
@@ -697,12 +764,10 @@
 #ifdef HAVE_3DNOW
    asm volatile("femms");
 #elif defined (HAVE_MMX)
    asm volatile("emms");
 #endif
-
-
 }
 
 
 void SwScale_Init(){
    // generating tables:
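The table generation in SwScale_Init continues past the end of this hunk. For reference, a plausible initialization — not verified against the rest of the file — for the clip_table declared near the top, which lets the C paths clamp a possibly out-of-range value with a single lookup instead of two branches:

    // clip_table[i] clamps (i - 256) to 0..255, so code can index
    // clip_table[x + 256] for x in roughly -256..511.
    static void init_clip_table(unsigned char clip_table[768])
    {
        for (int i = 0; i < 768; i++) {
            int v = i - 256;
            clip_table[i] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }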