Mercurial > mplayer.hg
comparison postproc/swscale_template.c @ 2264:7851375ea156
increased precision of s_xinc and s_xinc2 (needed for the mmx2 bugfix)
moved mmx variables to top to avoid alignment issues
mmx2 code should work fine now if and only if the input width is %16=0 and the output width is %32=0
reordered some code (5% faster with a simple -benchmark)
first line bug fixed (i hope i didn't introduce any new bugs with that ...)
changed a lot of the vertical scale setup code, i hope i fixed something and didn't mess it up :)
a few known bugs left (rightmost line is wrong)
MMX2 code will only be used for upscaling & acceptable widths
16bit dithering can be disabled
author | michael |
---|---|
date | Thu, 18 Oct 2001 22:27:13 +0000 |
parents | 00a46cd41edd |
children | 3df32dabe98c |
comparison
equal
deleted
inserted
replaced
2263:351aaf1eff87 | 2264:7851375ea156 |
---|---|
1 | 1 |
2 // Software scaling and colorspace conversion routines for MPlayer | 2 // Software scaling and colorspace conversion routines for MPlayer |
3 | |
4 // Orginal C implementation by ? | |
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at) | |
3 | 6 |
4 #include <inttypes.h> | 7 #include <inttypes.h> |
5 #include "../config.h" | 8 #include "../config.h" |
6 | 9 |
7 #undef HAVE_MMX2 //code is buggy | 10 //#undef HAVE_MMX2 |
8 //#undef HAVE_MMX | 11 //#undef HAVE_MMX |
12 //#undef ARCH_X86 | |
13 #define DITHER16BPP | |
14 #define ALT_ERROR | |
9 | 15 |
10 #define RET 0xC3 //near return opcode | 16 #define RET 0xC3 //near return opcode |
11 | 17 /* |
12 // temporary storage for 4 yuv lines: | 18 NOTES |
13 // 16bit for now (mmx likes it more compact) | 19 |
14 static uint16_t pix_buf_y[4][2048]; | 20 known BUGS with known cause (no bugreports please!) |
15 static uint16_t pix_buf_uv[2][2048*2]; | 21 line at the right (c,asm and mmx2) |
16 | 22 code reads 1 sample too much (might cause a sig11) |
17 // clipping helper table for C implementations: | 23 |
18 static unsigned char clip_table[768]; | 24 TODO |
19 | 25 check alignment off everything |
20 // yuv->rgb conversion tables: | 26 */ |
21 static int yuvtab_2568[256]; | |
22 static int yuvtab_3343[256]; | |
23 static int yuvtab_0c92[256]; | |
24 static int yuvtab_1a1e[256]; | |
25 static int yuvtab_40cf[256]; | |
26 | 27 |
27 static uint64_t yCoeff= 0x2568256825682568LL; | 28 static uint64_t yCoeff= 0x2568256825682568LL; |
28 static uint64_t ubCoeff= 0x3343334333433343LL; | 29 static uint64_t ubCoeff= 0x3343334333433343LL; |
29 static uint64_t vrCoeff= 0x40cf40cf40cf40cfLL; | 30 static uint64_t vrCoeff= 0x40cf40cf40cf40cfLL; |
30 static uint64_t ugCoeff= 0xE5E2E5E2E5E2E5E2LL; | 31 static uint64_t ugCoeff= 0xE5E2E5E2E5E2E5E2LL; |
44 static uint64_t b16Mask= 0x001F001F001F001FLL; | 45 static uint64_t b16Mask= 0x001F001F001F001FLL; |
45 static uint64_t g16Mask= 0x07E007E007E007E0LL; | 46 static uint64_t g16Mask= 0x07E007E007E007E0LL; |
46 static uint64_t r16Mask= 0xF800F800F800F800LL; | 47 static uint64_t r16Mask= 0xF800F800F800F800LL; |
47 static uint64_t temp0; | 48 static uint64_t temp0; |
48 | 49 |
50 | |
51 // temporary storage for 4 yuv lines: | |
52 // 16bit for now (mmx likes it more compact) | |
53 static uint16_t pix_buf_y[4][2048]; | |
54 static uint16_t pix_buf_uv[2][2048*2]; | |
55 | |
56 // clipping helper table for C implementations: | |
57 static unsigned char clip_table[768]; | |
58 | |
59 // yuv->rgb conversion tables: | |
60 static int yuvtab_2568[256]; | |
61 static int yuvtab_3343[256]; | |
62 static int yuvtab_0c92[256]; | |
63 static int yuvtab_1a1e[256]; | |
64 static int yuvtab_40cf[256]; | |
65 | |
66 | |
49 static uint8_t funnyYCode[10000]; | 67 static uint8_t funnyYCode[10000]; |
50 static uint8_t funnyUVCode[10000]; | 68 static uint8_t funnyUVCode[10000]; |
51 | |
52 | 69 |
53 | 70 |
54 // *** bilinear scaling and yuv->rgb conversion of yv12 slices: | 71 // *** bilinear scaling and yuv->rgb conversion of yv12 slices: |
55 // *** Note: it's called multiple times while decoding a frame, first time y==0 | 72 // *** Note: it's called multiple times while decoding a frame, first time y==0 |
56 // *** Designed to upscale, but may work for downscale too. | 73 // *** Designed to upscale, but may work for downscale too. |
62 | 79 |
63 // scaling factors: | 80 // scaling factors: |
64 //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; | 81 //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; |
65 //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; | 82 //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; |
66 | 83 |
67 unsigned int s_xinc2=s_xinc>>1; | 84 unsigned int s_xinc2; |
68 | 85 |
69 static int s_srcypos; | 86 static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) |
70 static int s_ypos; | 87 static int s_ypos; |
88 | |
89 // last horzontally interpolated lines, used to avoid unnecessary calculations | |
71 static int s_last_ypos; | 90 static int s_last_ypos; |
91 static int s_last_y1pos; | |
92 | |
72 static int static_dstw; | 93 static int static_dstw; |
73 | 94 |
74 #ifdef HAVE_MMX2 | 95 #ifdef HAVE_MMX2 |
96 // used to detect a horizontal size change | |
75 static int old_dstw= -1; | 97 static int old_dstw= -1; |
76 static int old_s_xinc= -1; | 98 static int old_s_xinc= -1; |
77 #endif | 99 |
78 | 100 // difference between the requested xinc and the required one for the mmx2 routine |
79 s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each other | 101 static int s_xinc_diff=0; |
102 static int s_xinc2_diff=0; | |
103 #endif | |
104 int canMMX2BeUsed; | |
105 | |
106 // we need that precission at least for the mmx2 code | |
107 s_xinc*= 256; | |
108 s_xinc2=s_xinc>>1; | |
109 canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0; | |
80 | 110 |
81 if(y==0){ | 111 if(y==0){ |
82 s_srcypos=-2*s_yinc; | 112 s_srcypos= s_yinc/2 - 0x8000; |
83 s_ypos=-2; | 113 s_ypos=0; |
84 s_last_ypos=-2; | 114 |
115 // force calculation of the horizontal interpolation of the first line | |
116 s_last_ypos=-99; | |
117 s_last_y1pos=-99; | |
85 #ifdef HAVE_MMX2 | 118 #ifdef HAVE_MMX2 |
86 // cant downscale !!! | 119 // cant downscale !!! |
87 if(old_s_xinc != s_xinc || old_dstw!=dstw) | 120 if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed) |
88 { | 121 { |
89 uint8_t *fragment; | 122 uint8_t *fragment; |
90 int imm8OfPShufW1; | 123 int imm8OfPShufW1; |
91 int imm8OfPShufW2; | 124 int imm8OfPShufW2; |
92 int fragmentLength; | 125 int fragmentLength; |
100 | 133 |
101 // create an optimized horizontal scaling routine | 134 // create an optimized horizontal scaling routine |
102 | 135 |
103 //code fragment | 136 //code fragment |
104 | 137 |
105 // fragmentLength=0; | |
106 // printf("%d, %d\n", fragmentLength,imm8OfPShufW1); | |
107 | |
108 asm volatile( | 138 asm volatile( |
109 "jmp 9f \n\t" | 139 "jmp 9f \n\t" |
110 // Begin | 140 // Begin |
111 "0: \n\t" | 141 "0: \n\t" |
112 "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment | 142 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment |
113 "movq %%mm0, %%mm1 \n\t" | 143 "movq %%mm0, %%mm1 \n\t" |
114 "psrlq $8, %%mm0 \n\t" | 144 "psrlq $8, %%mm0 \n\t" |
115 "punpcklbw %%mm7, %%mm1 \n\t" | 145 "punpcklbw %%mm7, %%mm1 \n\t" |
146 "movq %%mm2, %%mm3 \n\t" | |
116 "punpcklbw %%mm7, %%mm0 \n\t" | 147 "punpcklbw %%mm7, %%mm0 \n\t" |
148 "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF | |
117 "pshufw $0xFF, %%mm1, %%mm1 \n\t" | 149 "pshufw $0xFF, %%mm1, %%mm1 \n\t" |
118 "1: \n\t" | 150 "1: \n\t" |
151 "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry | |
119 "pshufw $0xFF, %%mm0, %%mm0 \n\t" | 152 "pshufw $0xFF, %%mm0, %%mm0 \n\t" |
120 "2: \n\t" | 153 "2: \n\t" |
154 "psrlw $9, %%mm3 \n\t" | |
121 "psubw %%mm1, %%mm0 \n\t" | 155 "psubw %%mm1, %%mm0 \n\t" |
122 "psraw $1, %%mm0 \n\t" | 156 "pmullw %%mm3, %%mm0 \n\t" |
123 "pmullw %%mm2, %%mm0 \n\t" | 157 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF |
124 "psllw $7, %%mm1 \n\t" | 158 "psllw $7, %%mm1 \n\t" |
125 "paddw %%mm1, %%mm0 \n\t" | 159 "paddw %%mm1, %%mm0 \n\t" |
160 | |
126 "movq %%mm0, (%%edi, %%eax) \n\t" | 161 "movq %%mm0, (%%edi, %%eax) \n\t" |
127 "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF | |
128 | |
129 "addb %%ch, %%cl \n\t" //2*xalpha += (4*s_xinc)&0xFF | |
130 "adcl %%edx, %%ebx \n\t" //xx+= (4*s_xinc)>>8 + carry | |
131 | 162 |
132 "addl $8, %%eax \n\t" | 163 "addl $8, %%eax \n\t" |
133 // End | 164 // End |
134 "9: \n\t" | 165 "9: \n\t" |
135 // "int $3\n\t" | 166 // "int $3\n\t" |
145 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2), | 176 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2), |
146 "=r" (fragmentLength) | 177 "=r" (fragmentLength) |
147 ); | 178 ); |
148 | 179 |
149 xpos= xx=xalpha= 0; | 180 xpos= xx=xalpha= 0; |
150 //FIXME choose size and or xinc so that they fit exactly | 181 |
182 /* choose xinc so that all 8 parts fit exactly | |
183 Note: we cannot use just 1 part because it would not fit in the code cache */ | |
184 s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10; | |
185 // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8)); | |
186 #ifdef ALT_ERROR | |
187 s_xinc2_diff+= ((0x10000/(dstw/8))); | |
188 #endif | |
189 s_xinc_diff= s_xinc2_diff*2; | |
190 | |
191 s_xinc2+= s_xinc2_diff; | |
192 s_xinc+= s_xinc_diff; | |
151 for(i=0; i<dstw/8; i++) | 193 for(i=0; i<dstw/8; i++) |
152 { | 194 { |
153 int xx=xpos>>8; | 195 int xx=xpos>>16; |
154 | 196 |
155 if((i&3) == 0) | 197 if((i&3) == 0) |
156 { | 198 { |
157 int a=0; | 199 int a=0; |
158 int b=((xpos+s_xinc)>>8) - xx; | 200 int b=((xpos+s_xinc)>>16) - xx; |
159 int c=((xpos+s_xinc*2)>>8) - xx; | 201 int c=((xpos+s_xinc*2)>>16) - xx; |
160 int d=((xpos+s_xinc*3)>>8) - xx; | 202 int d=((xpos+s_xinc*3)>>16) - xx; |
161 | 203 |
162 memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); | 204 memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); |
163 | 205 |
164 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]= | 206 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]= |
165 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]= | 207 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]= |
172 | 214 |
173 xpos= xx=xalpha= 0; | 215 xpos= xx=xalpha= 0; |
174 //FIXME choose size and or xinc so that they fit exactly | 216 //FIXME choose size and or xinc so that they fit exactly |
175 for(i=0; i<dstw/8; i++) | 217 for(i=0; i<dstw/8; i++) |
176 { | 218 { |
177 int xx=xpos>>8; | 219 int xx=xpos>>16; |
178 | 220 |
179 if((i&3) == 0) | 221 if((i&3) == 0) |
180 { | 222 { |
181 int a=0; | 223 int a=0; |
182 int b=((xpos+s_xinc2)>>8) - xx; | 224 int b=((xpos+s_xinc2)>>16) - xx; |
183 int c=((xpos+s_xinc2*2)>>8) - xx; | 225 int c=((xpos+s_xinc2*2)>>16) - xx; |
184 int d=((xpos+s_xinc2*3)>>8) - xx; | 226 int d=((xpos+s_xinc2*3)>>16) - xx; |
185 | 227 |
186 memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | 228 memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); |
187 | 229 |
188 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]= | 230 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]= |
189 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]= | 231 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]= |
195 } | 237 } |
196 // funnyCode[0]= RET; | 238 // funnyCode[0]= RET; |
197 | 239 |
198 | 240 |
199 } | 241 } |
200 #endif | 242 |
243 if(canMMX2BeUsed) | |
244 { | |
245 s_xinc+= s_xinc_diff; | |
246 s_xinc2+= s_xinc2_diff; | |
247 } | |
248 #endif // HAVE_MMX2 | |
201 } // reset counters | 249 } // reset counters |
202 | 250 |
203 while(1){ | 251 while(1){ |
204 unsigned char *dest=dstptr+dststride*s_ypos; | 252 unsigned char *dest=dstptr+dststride*s_ypos; |
205 int y0=2+(s_srcypos>>16); | 253 int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line |
206 int y1=1+(s_srcypos>>17); | 254 // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) |
255 int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; | |
256 int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line | |
207 int yalpha=(s_srcypos&0xFFFF)>>7; | 257 int yalpha=(s_srcypos&0xFFFF)>>7; |
208 int yalpha1=yalpha^511; | 258 int yalpha1=yalpha^511; |
209 int uvalpha=((s_srcypos>>1)&0xFFFF)>>7; | 259 int uvalpha=(srcuvpos&0x1FFFF)>>8; |
210 int uvalpha1=uvalpha^511; | 260 int uvalpha1=uvalpha^511; |
211 uint16_t *buf0=pix_buf_y[y0&3]; | 261 uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice |
212 uint16_t *buf1=pix_buf_y[((y0+1)&3)]; | 262 uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice |
213 uint16_t *uvbuf0=pix_buf_uv[y1&1]; | 263 uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice |
214 uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1]; | 264 uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice |
215 int i; | 265 int i; |
216 | 266 |
217 if(y0>=y+h) break; | 267 // if this is before the first line than use only the first src line |
268 if(y0==0) buf0= buf1; | |
269 if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, its not the same as y0==0 | |
270 | |
271 if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are dupliactes anyway | |
272 | |
273 // if this is after the last line than use only the last src line | |
274 if(y0>=y+h) | |
275 { | |
276 buf1= buf0; | |
277 s_last_ypos=y0; | |
278 } | |
279 if(y1>=(y+h)/2) | |
280 { | |
281 uvbuf1= uvbuf0; | |
282 s_last_y1pos=y1; | |
283 } | |
284 | |
218 | 285 |
219 s_ypos++; s_srcypos+=s_yinc; | 286 s_ypos++; s_srcypos+=s_yinc; |
220 | 287 |
288 //only interpolate the src line horizontally if we didnt do it allready | |
221 if(s_last_ypos!=y0){ | 289 if(s_last_ypos!=y0){ |
222 unsigned char *src=srcptr[0]+(y0-y)*stride[0]; | 290 unsigned char *src=srcptr[0]+(y0-y)*stride[0]; |
223 unsigned int xpos=0; | 291 unsigned int xpos=0; |
224 s_last_ypos=y0; | 292 s_last_ypos=y0; |
225 // *** horizontal scale Y line to temp buffer | 293 // *** horizontal scale Y line to temp buffer |
226 // this loop should be rewritten in MMX assembly!!!! | 294 #ifdef ARCH_X86 |
227 #ifdef HAVE_MMX2 | 295 |
228 asm volatile( | 296 #ifdef HAVE_MMX2 |
229 "pxor %%mm7, %%mm7 \n\t" | 297 if(canMMX2BeUsed) |
230 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | 298 { |
231 "movd %5, %%mm6 \n\t" // s_xinc&0xFF | 299 asm volatile( |
232 "punpcklwd %%mm6, %%mm6 \n\t" | 300 "pxor %%mm7, %%mm7 \n\t" |
233 "punpcklwd %%mm6, %%mm6 \n\t" | 301 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha |
234 "movq %%mm6, %%mm2 \n\t" | 302 "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF |
235 "psllq $16, %%mm2 \n\t" | 303 "punpcklwd %%mm6, %%mm6 \n\t" |
236 "paddb %%mm6, %%mm2 \n\t" | 304 "punpcklwd %%mm6, %%mm6 \n\t" |
237 "psllq $16, %%mm2 \n\t" | 305 "movq %%mm6, %%mm2 \n\t" |
238 "paddb %%mm6, %%mm2 \n\t" | 306 "psllq $16, %%mm2 \n\t" |
239 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | 307 "paddw %%mm6, %%mm2 \n\t" |
240 "movq %%mm2, temp0 \n\t" | 308 "psllq $16, %%mm2 \n\t" |
241 "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF | 309 "paddw %%mm6, %%mm2 \n\t" |
242 "punpcklwd %%mm6, %%mm6 \n\t" | 310 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF |
243 "punpcklwd %%mm6, %%mm6 \n\t" | 311 "movq %%mm2, temp0 \n\t" |
244 "xorl %%eax, %%eax \n\t" // i | 312 "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF |
245 "xorl %%ebx, %%ebx \n\t" // xx | 313 "punpcklwd %%mm6, %%mm6 \n\t" |
246 "movl %0, %%esi \n\t" // src | 314 "punpcklwd %%mm6, %%mm6 \n\t" |
247 "movl %1, %%edi \n\t" // buf1 | 315 "xorl %%eax, %%eax \n\t" // i |
248 "movl %3, %%edx \n\t" // (s_xinc*4)>>8 | 316 "movl %0, %%esi \n\t" // src |
249 "xorl %%ecx, %%ecx \n\t" | 317 "movl %1, %%edi \n\t" // buf1 |
250 "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF | 318 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 |
251 // "int $3\n\t" | 319 "xorl %%ecx, %%ecx \n\t" |
252 "call funnyYCode \n\t" | 320 "xorl %%ebx, %%ebx \n\t" |
253 "movq temp0, %%mm2 \n\t" | 321 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF |
254 "xorb %%cl, %%cl \n\t" | 322 // "int $3\n\t" |
255 "call funnyYCode \n\t" | 323 "call funnyYCode \n\t" |
256 "movq temp0, %%mm2 \n\t" | 324 "movq temp0, %%mm2 \n\t" |
257 "xorb %%cl, %%cl \n\t" | 325 "xorl %%ecx, %%ecx \n\t" |
258 "call funnyYCode \n\t" | 326 "call funnyYCode \n\t" |
259 "movq temp0, %%mm2 \n\t" | 327 "movq temp0, %%mm2 \n\t" |
260 "xorb %%cl, %%cl \n\t" | 328 "xorl %%ecx, %%ecx \n\t" |
261 "call funnyYCode \n\t" | 329 "call funnyYCode \n\t" |
262 "movq temp0, %%mm2 \n\t" | 330 "movq temp0, %%mm2 \n\t" |
263 "xorb %%cl, %%cl \n\t" | 331 "xorl %%ecx, %%ecx \n\t" |
264 "call funnyYCode \n\t" | 332 "call funnyYCode \n\t" |
265 "movq temp0, %%mm2 \n\t" | 333 "movq temp0, %%mm2 \n\t" |
266 "xorb %%cl, %%cl \n\t" | 334 "xorl %%ecx, %%ecx \n\t" |
267 "call funnyYCode \n\t" | 335 "call funnyYCode \n\t" |
268 "movq temp0, %%mm2 \n\t" | 336 "movq temp0, %%mm2 \n\t" |
269 "xorb %%cl, %%cl \n\t" | 337 "xorl %%ecx, %%ecx \n\t" |
270 "call funnyYCode \n\t" | 338 "call funnyYCode \n\t" |
271 "movq temp0, %%mm2 \n\t" | 339 "movq temp0, %%mm2 \n\t" |
272 "xorb %%cl, %%cl \n\t" | 340 "xorl %%ecx, %%ecx \n\t" |
273 "call funnyYCode \n\t" | 341 "call funnyYCode \n\t" |
274 :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8), | 342 "movq temp0, %%mm2 \n\t" |
275 "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF) | 343 "xorl %%ecx, %%ecx \n\t" |
276 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | 344 "call funnyYCode \n\t" |
277 ); | 345 :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), |
278 | 346 "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) |
279 #elif defined (ARCH_X86) | 347 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
348 ); | |
349 } | |
350 else | |
351 { | |
352 #endif | |
280 //NO MMX just normal asm ... FIXME try/write funny MMX2 variant | 353 //NO MMX just normal asm ... FIXME try/write funny MMX2 variant |
281 //FIXME add prefetch | 354 //FIXME add prefetch |
282 asm volatile( | 355 asm volatile( |
283 "xorl %%eax, %%eax \n\t" // i | 356 "xorl %%eax, %%eax \n\t" // i |
284 "xorl %%ebx, %%ebx \n\t" // xx | 357 "xorl %%ebx, %%ebx \n\t" // xx |
286 "1: \n\t" | 359 "1: \n\t" |
287 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | 360 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] |
288 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | 361 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] |
289 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | 362 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
290 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | 363 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha |
291 "shll $8, %%edi \n\t" | 364 "shll $16, %%edi \n\t" |
292 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | 365 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) |
293 "movl %1, %%edi \n\t" | 366 "movl %1, %%edi \n\t" |
294 "shrl $1, %%esi \n\t" | 367 "shrl $9, %%esi \n\t" |
295 "movw %%si, (%%edi, %%eax, 2) \n\t" | 368 "movw %%si, (%%edi, %%eax, 2) \n\t" |
296 "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | 369 "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF |
297 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | 370 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry |
298 | 371 |
299 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | 372 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] |
300 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | 373 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] |
301 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | 374 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
302 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | 375 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha |
303 "shll $8, %%edi \n\t" | 376 "shll $16, %%edi \n\t" |
304 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | 377 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) |
305 "movl %1, %%edi \n\t" | 378 "movl %1, %%edi \n\t" |
306 "shrl $1, %%esi \n\t" | 379 "shrl $9, %%esi \n\t" |
307 "movw %%si, 2(%%edi, %%eax, 2) \n\t" | 380 "movw %%si, 2(%%edi, %%eax, 2) \n\t" |
308 "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | 381 "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF |
309 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | 382 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry |
310 | 383 |
311 | 384 |
312 "addl $2, %%eax \n\t" | 385 "addl $2, %%eax \n\t" |
313 "cmpl %2, %%eax \n\t" | 386 "cmpl %2, %%eax \n\t" |
314 " jb 1b \n\t" | 387 " jb 1b \n\t" |
315 | 388 |
316 | 389 |
317 :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF) | 390 :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF) |
318 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | 391 : "%eax", "%ebx", "%ecx", "%edi", "%esi" |
319 ); | 392 ); |
393 #ifdef HAVE_MMX2 | |
394 } //if MMX2 cant be used | |
395 #endif | |
320 #else | 396 #else |
321 for(i=0;i<dstw;i++){ | 397 for(i=0;i<dstw;i++){ |
322 register unsigned int xx=xpos>>8; | 398 register unsigned int xx=xpos>>16; |
323 register unsigned int xalpha=(xpos&0xFF)>>1; | 399 register unsigned int xalpha=(xpos&0xFFFF)>>9; |
324 buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); | 400 buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); |
325 xpos+=s_xinc; | 401 xpos+=s_xinc; |
326 } | 402 } |
327 #endif | 403 #endif |
404 } | |
328 // *** horizontal scale U and V lines to temp buffer | 405 // *** horizontal scale U and V lines to temp buffer |
329 if(!(y0&1)){ | 406 if(s_last_y1pos!=y1){ |
330 unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; | 407 unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; |
331 unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; | 408 unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; |
332 xpos=0; | 409 int xpos=0; |
333 // this loop should be rewritten in MMX assembly!!!! | 410 s_last_y1pos= y1; |
334 #ifdef HAVE_MMX2 | 411 #ifdef ARCH_X86 |
335 asm volatile( | 412 #ifdef HAVE_MMX2 |
413 if(canMMX2BeUsed) | |
414 { | |
415 asm volatile( | |
336 "pxor %%mm7, %%mm7 \n\t" | 416 "pxor %%mm7, %%mm7 \n\t" |
337 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | 417 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha |
338 "movd %5, %%mm6 \n\t" // s_xinc&0xFF | 418 "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF |
339 "punpcklwd %%mm6, %%mm6 \n\t" | 419 "punpcklwd %%mm6, %%mm6 \n\t" |
340 "punpcklwd %%mm6, %%mm6 \n\t" | 420 "punpcklwd %%mm6, %%mm6 \n\t" |
341 "movq %%mm6, %%mm2 \n\t" | 421 "movq %%mm6, %%mm2 \n\t" |
342 "psllq $16, %%mm2 \n\t" | 422 "psllq $16, %%mm2 \n\t" |
343 "paddb %%mm6, %%mm2 \n\t" | 423 "paddw %%mm6, %%mm2 \n\t" |
344 "psllq $16, %%mm2 \n\t" | 424 "psllq $16, %%mm2 \n\t" |
345 "paddb %%mm6, %%mm2 \n\t" | 425 "paddw %%mm6, %%mm2 \n\t" |
346 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | 426 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFFFF |
347 "movq %%mm2, temp0 \n\t" | 427 "movq %%mm2, temp0 \n\t" |
348 "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF | 428 "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF |
349 "punpcklwd %%mm6, %%mm6 \n\t" | 429 "punpcklwd %%mm6, %%mm6 \n\t" |
350 "punpcklwd %%mm6, %%mm6 \n\t" | 430 "punpcklwd %%mm6, %%mm6 \n\t" |
351 "xorl %%eax, %%eax \n\t" // i | 431 "xorl %%eax, %%eax \n\t" // i |
352 "xorl %%ebx, %%ebx \n\t" // xx | |
353 "movl %0, %%esi \n\t" // src | 432 "movl %0, %%esi \n\t" // src |
354 "movl %1, %%edi \n\t" // buf1 | 433 "movl %1, %%edi \n\t" // buf1 |
355 "movl %3, %%edx \n\t" // (s_xinc*4)>>8 | 434 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 |
356 "xorl %%ecx, %%ecx \n\t" | 435 "xorl %%ecx, %%ecx \n\t" |
357 "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF | 436 "xorl %%ebx, %%ebx \n\t" |
437 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | |
438 | |
358 // "int $3\n\t" | 439 // "int $3\n\t" |
359 "call funnyUVCode \n\t" | 440 #define FUNNYUVCODE \ |
360 "movq temp0, %%mm2 \n\t" | 441 "call funnyUVCode \n\t"\ |
361 "xorb %%cl, %%cl \n\t" | 442 "movq temp0, %%mm2 \n\t"\ |
362 "call funnyUVCode \n\t" | 443 "xorl %%ecx, %%ecx \n\t" |
363 "movq temp0, %%mm2 \n\t" | 444 |
364 "xorb %%cl, %%cl \n\t" | 445 FUNNYUVCODE |
365 "call funnyUVCode \n\t" | 446 FUNNYUVCODE |
366 "movq temp0, %%mm2 \n\t" | 447 FUNNYUVCODE |
367 "xorb %%cl, %%cl \n\t" | 448 FUNNYUVCODE |
368 "call funnyUVCode \n\t" | 449 |
369 "movq temp0, %%mm2 \n\t" | 450 FUNNYUVCODE |
370 "xorb %%cl, %%cl \n\t" | 451 FUNNYUVCODE |
371 "call funnyUVCode \n\t" | 452 FUNNYUVCODE |
372 "movq temp0, %%mm2 \n\t" | 453 FUNNYUVCODE |
373 "xorb %%cl, %%cl \n\t" | 454 |
374 "call funnyUVCode \n\t" | 455 |
375 "movq temp0, %%mm2 \n\t" | |
376 "xorb %%cl, %%cl \n\t" | |
377 "call funnyUVCode \n\t" | |
378 "movq temp0, %%mm2 \n\t" | |
379 "xorb %%cl, %%cl \n\t" | |
380 "call funnyUVCode \n\t" | |
381 | 456 |
382 "xorl %%eax, %%eax \n\t" // i | 457 "xorl %%eax, %%eax \n\t" // i |
383 "xorl %%ebx, %%ebx \n\t" // xx | |
384 "movl %6, %%esi \n\t" // src | 458 "movl %6, %%esi \n\t" // src |
385 "movl %1, %%edi \n\t" // buf1 | 459 "movl %1, %%edi \n\t" // buf1 |
386 "addl $4096, %%edi \n\t" | 460 "addl $4096, %%edi \n\t" |
387 | 461 |
388 "call funnyUVCode \n\t" | 462 FUNNYUVCODE |
389 "movq temp0, %%mm2 \n\t" | 463 FUNNYUVCODE |
390 "xorb %%cl, %%cl \n\t" | 464 FUNNYUVCODE |
391 "call funnyUVCode \n\t" | 465 FUNNYUVCODE |
392 "movq temp0, %%mm2 \n\t" | 466 |
393 "xorb %%cl, %%cl \n\t" | 467 FUNNYUVCODE |
394 "call funnyUVCode \n\t" | 468 FUNNYUVCODE |
395 "movq temp0, %%mm2 \n\t" | 469 FUNNYUVCODE |
396 "xorb %%cl, %%cl \n\t" | 470 FUNNYUVCODE |
397 "call funnyUVCode \n\t" | 471 |
398 "movq temp0, %%mm2 \n\t" | 472 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), |
399 "xorb %%cl, %%cl \n\t" | 473 "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) |
400 "call funnyUVCode \n\t" | |
401 "movq temp0, %%mm2 \n\t" | |
402 "xorb %%cl, %%cl \n\t" | |
403 "call funnyUVCode \n\t" | |
404 "movq temp0, %%mm2 \n\t" | |
405 "xorb %%cl, %%cl \n\t" | |
406 "call funnyUVCode \n\t" | |
407 "movq temp0, %%mm2 \n\t" | |
408 "xorb %%cl, %%cl \n\t" | |
409 "call funnyUVCode \n\t" | |
410 | |
411 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8), | |
412 "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2) | |
413 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | 474 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
414 ); | 475 ); |
415 | 476 } |
416 #elif defined (ARCH_X86) | 477 else |
478 { | |
479 #endif | |
417 asm volatile( | 480 asm volatile( |
418 "xorl %%eax, %%eax \n\t" // i | 481 "xorl %%eax, %%eax \n\t" // i |
419 "xorl %%ebx, %%ebx \n\t" // xx | 482 "xorl %%ebx, %%ebx \n\t" // xx |
420 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | 483 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
421 "1: \n\t" | 484 "1: \n\t" |
422 "movl %0, %%esi \n\t" | 485 "movl %0, %%esi \n\t" |
423 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx] | 486 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx] |
424 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | 487 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] |
425 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | 488 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
426 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | 489 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha |
427 "shll $8, %%edi \n\t" | 490 "shll $16, %%edi \n\t" |
428 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | 491 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) |
429 "movl %1, %%edi \n\t" | 492 "movl %1, %%edi \n\t" |
430 "shrl $1, %%esi \n\t" | 493 "shrl $9, %%esi \n\t" |
431 "movw %%si, (%%edi, %%eax, 2) \n\t" | 494 "movw %%si, (%%edi, %%eax, 2) \n\t" |
432 | 495 |
433 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | 496 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] |
434 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | 497 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] |
435 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | 498 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
436 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | 499 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha |
437 "shll $8, %%edi \n\t" | 500 "shll $16, %%edi \n\t" |
438 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | 501 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) |
439 "movl %1, %%edi \n\t" | 502 "movl %1, %%edi \n\t" |
440 "shrl $1, %%esi \n\t" | 503 "shrl $9, %%esi \n\t" |
441 "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | 504 "movw %%si, 4096(%%edi, %%eax, 2)\n\t" |
442 | 505 |
443 "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | 506 "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF |
444 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | 507 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry |
445 "addl $1, %%eax \n\t" | 508 "addl $1, %%eax \n\t" |
446 "cmpl %2, %%eax \n\t" | 509 "cmpl %2, %%eax \n\t" |
447 " jb 1b \n\t" | 510 " jb 1b \n\t" |
448 | 511 |
449 | 512 |
450 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF), | 513 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), |
451 "r" (src2) | 514 "r" (src2) |
452 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | 515 : "%eax", "%ebx", "%ecx", "%edi", "%esi" |
453 ); | 516 ); |
517 #ifdef HAVE_MMX2 | |
518 } //if MMX2 cant be used | |
519 #endif | |
454 #else | 520 #else |
455 for(i=0;i<dstw;i++){ | 521 for(i=0;i<dstw;i++){ |
456 register unsigned int xx=xpos>>8; | 522 register unsigned int xx=xpos>>16; |
457 register unsigned int xalpha=(xpos&0xFF)>>1; | 523 register unsigned int xalpha=(xpos&0xFFFF)>>9; |
458 uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | 524 uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); |
459 uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | 525 uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); |
460 xpos+=s_xinc2; | 526 xpos+=s_xinc2; |
461 } | |
462 #endif | |
463 } | 527 } |
464 if(!y0) continue; | 528 #endif |
465 } | 529 } |
530 | |
466 | 531 |
467 // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... | 532 // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... |
468 // Re: Note1: ok n*4 for now | 533 // Re: Note1: ok n*4 for now |
469 // Note2: instead of using lookup tabs, mmx version could do the multiply... | 534 // Note2: instead of using lookup tabs, mmx version could do the multiply... |
470 // Re: Note2: yep | 535 // Re: Note2: yep |
487 "punpcklwd %%mm5, %%mm5 \n\t"\ | 552 "punpcklwd %%mm5, %%mm5 \n\t"\ |
488 "xorl %%eax, %%eax \n\t"\ | 553 "xorl %%eax, %%eax \n\t"\ |
489 "1: \n\t"\ | 554 "1: \n\t"\ |
490 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ | 555 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ |
491 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ | 556 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ |
557 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
558 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
492 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | 559 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
560 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
493 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | 561 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
562 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
494 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ | 563 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ |
564 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
565 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ | |
495 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | 566 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
567 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
569 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
496 "psubw w10, %%mm1 \n\t" /* Y-16*/\ | 570 "psubw w10, %%mm1 \n\t" /* Y-16*/\ |
571 "psubw w80, %%mm3 \n\t" /* (U-128)*/\ | |
497 "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ | 572 "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ |
573 "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ | |
498 "pmulhw yCoeff, %%mm1 \n\t"\ | 574 "pmulhw yCoeff, %%mm1 \n\t"\ |
499 \ | 575 \ |
500 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
501 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
502 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
503 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
504 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ | |
505 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
506 "psubw w80, %%mm3 \n\t" /* (U-128)*/\ | |
507 "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ | |
508 \ | 576 \ |
509 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
510 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |
511 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
512 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | 577 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
578 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
579 "pmulhw ubCoeff, %%mm3 \n\t"\ | |
513 "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\ | 580 "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\ |
581 "pmulhw ugCoeff, %%mm2 \n\t"\ | |
514 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | 582 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
515 "psubw w80, %%mm0 \n\t" /* (V-128)*/\ | 583 "psubw w80, %%mm0 \n\t" /* (V-128)*/\ |
516 "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ | 584 "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ |
517 \ | 585 \ |
518 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
519 "pmulhw ubCoeff, %%mm3 \n\t"\ | |
520 "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |
521 \ | 586 \ |
522 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | 587 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ |
523 "pmulhw vrCoeff, %%mm0 \n\t"\ | 588 "pmulhw vrCoeff, %%mm0 \n\t"\ |
589 "pmulhw vgCoeff, %%mm4 \n\t"\ | |
590 "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |
524 "paddw %%mm1, %%mm0 \n\t" /* R*/\ | 591 "paddw %%mm1, %%mm0 \n\t" /* R*/\ |
592 "packuswb %%mm3, %%mm3 \n\t"\ | |
525 \ | 593 \ |
526 "pmulhw ugCoeff, %%mm2 \n\t"\ | 594 "packuswb %%mm0, %%mm0 \n\t"\ |
527 "pmulhw vgCoeff, %%mm4 \n\t"\ | |
528 "paddw %%mm4, %%mm2 \n\t"\ | 595 "paddw %%mm4, %%mm2 \n\t"\ |
529 "paddw %%mm2, %%mm1 \n\t" /* G*/\ | 596 "paddw %%mm2, %%mm1 \n\t" /* G*/\ |
530 \ | 597 \ |
531 "packuswb %%mm3, %%mm3 \n\t"\ | |
532 "packuswb %%mm0, %%mm0 \n\t"\ | |
533 "packuswb %%mm1, %%mm1 \n\t" | 598 "packuswb %%mm1, %%mm1 \n\t" |
534 | 599 |
535 YSCALEYUV2RGB | 600 YSCALEYUV2RGB |
536 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | 601 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
537 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | 602 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
608 else if(dstbpp==16) | 673 else if(dstbpp==16) |
609 { | 674 { |
610 asm volatile( | 675 asm volatile( |
611 | 676 |
612 YSCALEYUV2RGB | 677 YSCALEYUV2RGB |
678 #ifdef DITHER16BPP | |
613 "paddusb g16Dither, %%mm1 \n\t" | 679 "paddusb g16Dither, %%mm1 \n\t" |
614 "paddusb b16Dither, %%mm0 \n\t" | 680 "paddusb b16Dither, %%mm0 \n\t" |
615 "paddusb b16Dither, %%mm3 \n\t" | 681 "paddusb b16Dither, %%mm3 \n\t" |
682 #endif | |
616 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | 683 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
617 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | 684 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
618 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | 685 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
619 | 686 |
620 "psrlw $3, %%mm3 \n\t" | 687 "psrlw $3, %%mm3 \n\t" |
697 #ifdef HAVE_3DNOW | 764 #ifdef HAVE_3DNOW |
698 asm volatile("femms"); | 765 asm volatile("femms"); |
699 #elif defined (HAVE_MMX) | 766 #elif defined (HAVE_MMX) |
700 asm volatile("emms"); | 767 asm volatile("emms"); |
701 #endif | 768 #endif |
702 | |
703 | |
704 } | 769 } |
705 | 770 |
706 | 771 |
707 void SwScale_Init(){ | 772 void SwScale_Init(){ |
708 // generating tables: | 773 // generating tables: |