comparison libswscale/rgb2rgb_template.c @ 23140:4d3870361b73

cosmetics attack, part I: Remove all tabs and prettyprint/reindent the code.
author diego
date Sat, 28 Apr 2007 11:44:49 +0000
parents 10425310d2da
children f9a8f92087ef
comparing 23139:10425310d2da (parent) with 23140:4d3870361b73
1 /* 1 /*
2 *
3 * rgb2rgb.c, Software RGB to RGB convertor 2 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor 3 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor 4 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor 5 * Software YUV to RGB convertor
7 * Written by Nick Kurshev. 6 * Written by Nick Kurshev.
51 #endif 50 #endif
52 51
53 #ifdef HAVE_3DNOW 52 #ifdef HAVE_3DNOW
54 #define PREFETCH "prefetch" 53 #define PREFETCH "prefetch"
55 #define PREFETCHW "prefetchw" 54 #define PREFETCHW "prefetchw"
56 #define PAVGB "pavgusb" 55 #define PAVGB "pavgusb"
57 #elif defined ( HAVE_MMX2 ) 56 #elif defined ( HAVE_MMX2 )
58 #define PREFETCH "prefetchnta" 57 #define PREFETCH "prefetchnta"
59 #define PREFETCHW "prefetcht0" 58 #define PREFETCHW "prefetcht0"
60 #define PAVGB "pavgb" 59 #define PAVGB "pavgb"
61 #else 60 #else
62 #ifdef __APPLE__ 61 #ifdef __APPLE__
63 #define PREFETCH "#" 62 #define PREFETCH "#"
64 #define PREFETCHW "#" 63 #define PREFETCHW "#"
65 #else 64 #else
83 #define SFENCE " # nop" 82 #define SFENCE " # nop"
84 #endif 83 #endif
85 84
86 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size) 85 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
87 { 86 {
88 uint8_t *dest = dst; 87 uint8_t *dest = dst;
89 const uint8_t *s = src; 88 const uint8_t *s = src;
90 const uint8_t *end; 89 const uint8_t *end;
91 #ifdef HAVE_MMX 90 #ifdef HAVE_MMX
92 const uint8_t *mm_end; 91 const uint8_t *mm_end;
93 #endif 92 #endif
94 end = s + src_size; 93 end = s + src_size;
95 #ifdef HAVE_MMX 94 #ifdef HAVE_MMX
96 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 95 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
97 mm_end = end - 23; 96 mm_end = end - 23;
98 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); 97 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
99 while(s < mm_end) 98 while (s < mm_end)
100 { 99 {
101 __asm __volatile( 100 __asm __volatile(
102 PREFETCH" 32%1\n\t" 101 PREFETCH" 32%1 \n\t"
103 "movd %1, %%mm0\n\t" 102 "movd %1, %%mm0 \n\t"
104 "punpckldq 3%1, %%mm0\n\t" 103 "punpckldq 3%1, %%mm0 \n\t"
105 "movd 6%1, %%mm1\n\t" 104 "movd 6%1, %%mm1 \n\t"
106 "punpckldq 9%1, %%mm1\n\t" 105 "punpckldq 9%1, %%mm1 \n\t"
107 "movd 12%1, %%mm2\n\t" 106 "movd 12%1, %%mm2 \n\t"
108 "punpckldq 15%1, %%mm2\n\t" 107 "punpckldq 15%1, %%mm2 \n\t"
109 "movd 18%1, %%mm3\n\t" 108 "movd 18%1, %%mm3 \n\t"
110 "punpckldq 21%1, %%mm3\n\t" 109 "punpckldq 21%1, %%mm3 \n\t"
111 "pand %%mm7, %%mm0\n\t" 110 "pand %%mm7, %%mm0 \n\t"
112 "pand %%mm7, %%mm1\n\t" 111 "pand %%mm7, %%mm1 \n\t"
113 "pand %%mm7, %%mm2\n\t" 112 "pand %%mm7, %%mm2 \n\t"
114 "pand %%mm7, %%mm3\n\t" 113 "pand %%mm7, %%mm3 \n\t"
115 MOVNTQ" %%mm0, %0\n\t" 114 MOVNTQ" %%mm0, %0 \n\t"
116 MOVNTQ" %%mm1, 8%0\n\t" 115 MOVNTQ" %%mm1, 8%0 \n\t"
117 MOVNTQ" %%mm2, 16%0\n\t" 116 MOVNTQ" %%mm2, 16%0 \n\t"
118 MOVNTQ" %%mm3, 24%0" 117 MOVNTQ" %%mm3, 24%0"
119 :"=m"(*dest) 118 :"=m"(*dest)
120 :"m"(*s) 119 :"m"(*s)
121 :"memory"); 120 :"memory");
122 dest += 32; 121 dest += 32;
123 s += 24; 122 s += 24;
124 } 123 }
125 __asm __volatile(SFENCE:::"memory"); 124 __asm __volatile(SFENCE:::"memory");
126 __asm __volatile(EMMS:::"memory"); 125 __asm __volatile(EMMS:::"memory");
127 #endif 126 #endif
128 while(s < end) 127 while (s < end)
129 { 128 {
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
131 *dest++ = 0;
132 *dest++ = s[2];
133 *dest++ = s[1];
134 *dest++ = s[0];
135 s+=3;
136 #else
137 *dest++ = *s++;
138 *dest++ = *s++;
139 *dest++ = *s++;
140 *dest++ = 0;
141 #endif
142 }
143 }
144
145 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
146 {
147 uint8_t *dest = dst;
148 const uint8_t *s = src;
149 const uint8_t *end;
150 #ifdef HAVE_MMX
151 const uint8_t *mm_end;
152 #endif
153 end = s + src_size;
154 #ifdef HAVE_MMX
155 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
156 mm_end = end - 31;
157 while (s < mm_end)
158 {
159 __asm __volatile(
160 PREFETCH" 32%1 \n\t"
161 "movq %1, %%mm0 \n\t"
162 "movq 8%1, %%mm1 \n\t"
163 "movq 16%1, %%mm4 \n\t"
164 "movq 24%1, %%mm5 \n\t"
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
169 "psrlq $8, %%mm2 \n\t"
170 "psrlq $8, %%mm3 \n\t"
171 "psrlq $8, %%mm6 \n\t"
172 "psrlq $8, %%mm7 \n\t"
173 "pand %2, %%mm0 \n\t"
174 "pand %2, %%mm1 \n\t"
175 "pand %2, %%mm4 \n\t"
176 "pand %2, %%mm5 \n\t"
177 "pand %3, %%mm2 \n\t"
178 "pand %3, %%mm3 \n\t"
179 "pand %3, %%mm6 \n\t"
180 "pand %3, %%mm7 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "por %%mm6, %%mm4 \n\t"
184 "por %%mm7, %%mm5 \n\t"
185
186 "movq %%mm1, %%mm2 \n\t"
187 "movq %%mm4, %%mm3 \n\t"
188 "psllq $48, %%mm2 \n\t"
189 "psllq $32, %%mm3 \n\t"
190 "pand %4, %%mm2 \n\t"
191 "pand %5, %%mm3 \n\t"
192 "por %%mm2, %%mm0 \n\t"
193 "psrlq $16, %%mm1 \n\t"
194 "psrlq $32, %%mm4 \n\t"
195 "psllq $16, %%mm5 \n\t"
196 "por %%mm3, %%mm1 \n\t"
197 "pand %6, %%mm5 \n\t"
198 "por %%mm5, %%mm4 \n\t"
199
200 MOVNTQ" %%mm0, %0 \n\t"
201 MOVNTQ" %%mm1, 8%0 \n\t"
202 MOVNTQ" %%mm4, 16%0"
203 :"=m"(*dest)
204 :"m"(*s),"m"(mask24l),
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
206 :"memory");
207 dest += 24;
208 s += 32;
209 }
210 __asm __volatile(SFENCE:::"memory");
211 __asm __volatile(EMMS:::"memory");
212 #endif
213 while (s < end)
214 {
130 #ifdef WORDS_BIGENDIAN 215 #ifdef WORDS_BIGENDIAN
131 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ 216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
132 *dest++ = 0; 217 s++;
133 *dest++ = s[2]; 218 dest[2] = *s++;
134 *dest++ = s[1]; 219 dest[1] = *s++;
135 *dest++ = s[0]; 220 dest[0] = *s++;
136 s+=3; 221 dest += 3;
137 #else 222 #else
138 *dest++ = *s++; 223 *dest++ = *s++;
139 *dest++ = *s++; 224 *dest++ = *s++;
140 *dest++ = *s++; 225 *dest++ = *s++;
141 *dest++ = 0; 226 s++;
142 #endif 227 #endif
143 } 228 }
144 }
145
146 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
147 {
148 uint8_t *dest = dst;
149 const uint8_t *s = src;
150 const uint8_t *end;
151 #ifdef HAVE_MMX
152 const uint8_t *mm_end;
153 #endif
154 end = s + src_size;
155 #ifdef HAVE_MMX
156 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
157 mm_end = end - 31;
158 while(s < mm_end)
159 {
160 __asm __volatile(
161 PREFETCH" 32%1\n\t"
162 "movq %1, %%mm0\n\t"
163 "movq 8%1, %%mm1\n\t"
164 "movq 16%1, %%mm4\n\t"
165 "movq 24%1, %%mm5\n\t"
166 "movq %%mm0, %%mm2\n\t"
167 "movq %%mm1, %%mm3\n\t"
168 "movq %%mm4, %%mm6\n\t"
169 "movq %%mm5, %%mm7\n\t"
170 "psrlq $8, %%mm2\n\t"
171 "psrlq $8, %%mm3\n\t"
172 "psrlq $8, %%mm6\n\t"
173 "psrlq $8, %%mm7\n\t"
174 "pand %2, %%mm0\n\t"
175 "pand %2, %%mm1\n\t"
176 "pand %2, %%mm4\n\t"
177 "pand %2, %%mm5\n\t"
178 "pand %3, %%mm2\n\t"
179 "pand %3, %%mm3\n\t"
180 "pand %3, %%mm6\n\t"
181 "pand %3, %%mm7\n\t"
182 "por %%mm2, %%mm0\n\t"
183 "por %%mm3, %%mm1\n\t"
184 "por %%mm6, %%mm4\n\t"
185 "por %%mm7, %%mm5\n\t"
186
187 "movq %%mm1, %%mm2\n\t"
188 "movq %%mm4, %%mm3\n\t"
189 "psllq $48, %%mm2\n\t"
190 "psllq $32, %%mm3\n\t"
191 "pand %4, %%mm2\n\t"
192 "pand %5, %%mm3\n\t"
193 "por %%mm2, %%mm0\n\t"
194 "psrlq $16, %%mm1\n\t"
195 "psrlq $32, %%mm4\n\t"
196 "psllq $16, %%mm5\n\t"
197 "por %%mm3, %%mm1\n\t"
198 "pand %6, %%mm5\n\t"
199 "por %%mm5, %%mm4\n\t"
200
201 MOVNTQ" %%mm0, %0\n\t"
202 MOVNTQ" %%mm1, 8%0\n\t"
203 MOVNTQ" %%mm4, 16%0"
204 :"=m"(*dest)
205 :"m"(*s),"m"(mask24l),
206 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
207 :"memory");
208 dest += 24;
209 s += 32;
210 }
211 __asm __volatile(SFENCE:::"memory");
212 __asm __volatile(EMMS:::"memory");
213 #endif
214 while(s < end)
215 {
216 #ifdef WORDS_BIGENDIAN
217 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
218 s++;
219 dest[2] = *s++;
220 dest[1] = *s++;
221 dest[0] = *s++;
222 dest += 3;
223 #else
224 *dest++ = *s++;
225 *dest++ = *s++;
226 *dest++ = *s++;
227 s++;
228 #endif
229 }
230 } 229 }
231 230
232 /* 231 /*
233 Original by Strepto/Astral 232 Original by Strepto/Astral
234 ported to gcc & bugfixed : A'rpi 233 ported to gcc & bugfixed : A'rpi
235 MMX2, 3DNOW optimization by Nick Kurshev 234 MMX2, 3DNOW optimization by Nick Kurshev
236 32bit c version, and and&add trick by Michael Niedermayer 235 32bit c version, and and&add trick by Michael Niedermayer
237 */ 236 */
238 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size) 237 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
239 { 238 {
240 register const uint8_t* s=src; 239 register const uint8_t* s=src;
241 register uint8_t* d=dst; 240 register uint8_t* d=dst;
242 register const uint8_t *end; 241 register const uint8_t *end;
243 const uint8_t *mm_end; 242 const uint8_t *mm_end;
244 end = s + src_size; 243 end = s + src_size;
245 #ifdef HAVE_MMX 244 #ifdef HAVE_MMX
246 __asm __volatile(PREFETCH" %0"::"m"(*s)); 245 __asm __volatile(PREFETCH" %0"::"m"(*s));
247 __asm __volatile("movq %0, %%mm4"::"m"(mask15s)); 246 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
248 mm_end = end - 15; 247 mm_end = end - 15;
249 while(s<mm_end) 248 while (s<mm_end)
250 { 249 {
251 __asm __volatile( 250 __asm __volatile(
252 PREFETCH" 32%1\n\t" 251 PREFETCH" 32%1 \n\t"
253 "movq %1, %%mm0\n\t" 252 "movq %1, %%mm0 \n\t"
254 "movq 8%1, %%mm2\n\t" 253 "movq 8%1, %%mm2 \n\t"
255 "movq %%mm0, %%mm1\n\t" 254 "movq %%mm0, %%mm1 \n\t"
256 "movq %%mm2, %%mm3\n\t" 255 "movq %%mm2, %%mm3 \n\t"
257 "pand %%mm4, %%mm0\n\t" 256 "pand %%mm4, %%mm0 \n\t"
258 "pand %%mm4, %%mm2\n\t" 257 "pand %%mm4, %%mm2 \n\t"
259 "paddw %%mm1, %%mm0\n\t" 258 "paddw %%mm1, %%mm0 \n\t"
260 "paddw %%mm3, %%mm2\n\t" 259 "paddw %%mm3, %%mm2 \n\t"
261 MOVNTQ" %%mm0, %0\n\t" 260 MOVNTQ" %%mm0, %0 \n\t"
262 MOVNTQ" %%mm2, 8%0" 261 MOVNTQ" %%mm2, 8%0"
263 :"=m"(*d) 262 :"=m"(*d)
264 :"m"(*s) 263 :"m"(*s)
265 ); 264 );
266 d+=16; 265 d+=16;
267 s+=16; 266 s+=16;
268 } 267 }
269 __asm __volatile(SFENCE:::"memory"); 268 __asm __volatile(SFENCE:::"memory");
270 __asm __volatile(EMMS:::"memory"); 269 __asm __volatile(EMMS:::"memory");
271 #endif 270 #endif
272 mm_end = end - 3; 271 mm_end = end - 3;
273 while(s < mm_end) 272 while (s < mm_end)
274 { 273 {
275 register unsigned x= *((uint32_t *)s); 274 register unsigned x= *((uint32_t *)s);
276 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); 275 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
277 d+=4; 276 d+=4;
278 s+=4; 277 s+=4;
279 } 278 }
280 if(s < end) 279 if (s < end)
281 { 280 {
282 register unsigned short x= *((uint16_t *)s); 281 register unsigned short x= *((uint16_t *)s);
283 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); 282 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
284 } 283 }
285 } 284 }
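The "and&add trick" credited in the comment above is what the plain-C tail of
rgb15to16 relies on: (x&0x7FFF7FFF) + (x&0x7FE07FE0). A minimal standalone
sketch of the same idea for one pair of packed RGB15 pixels (illustrative
only; the helper name is hypothetical and not part of the diff):

    #include <stdint.h>

    /* Two RGB15 (5.5.5, bit 15 clear) pixels packed in one 32-bit word. */
    static uint32_t rgb15to16_pair(uint32_t x)
    {
        /* x & 0x7FFF7FFF keeps blue (bits 0-4) plus green|red (bits 5-14)
         * of both pixels; x & 0x7FE07FE0 keeps only green|red. Adding the
         * two doubles the green|red field, i.e. shifts it left by one bit,
         * turning 5.5.5 into 5.6.5 with the new low green bit zero. The
         * per-pixel sum never exceeds 16 bits, so no carry can leak into
         * the neighbouring pixel. */
        return (x & 0x7FFF7FFFu) + (x & 0x7FE07FE0u);
    }

The MMX loop above applies the same addition eight pixels per iteration with paddw.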
286 285
287 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size) 286 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
288 { 287 {
289 register const uint8_t* s=src; 288 register const uint8_t* s=src;
290 register uint8_t* d=dst; 289 register uint8_t* d=dst;
291 register const uint8_t *end; 290 register const uint8_t *end;
292 const uint8_t *mm_end; 291 const uint8_t *mm_end;
293 end = s + src_size; 292 end = s + src_size;
294 #ifdef HAVE_MMX 293 #ifdef HAVE_MMX
295 __asm __volatile(PREFETCH" %0"::"m"(*s)); 294 __asm __volatile(PREFETCH" %0"::"m"(*s));
296 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg)); 295 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
297 __asm __volatile("movq %0, %%mm6"::"m"(mask15b)); 296 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
298 mm_end = end - 15; 297 mm_end = end - 15;
299 while(s<mm_end) 298 while (s<mm_end)
300 { 299 {
301 __asm __volatile( 300 __asm __volatile(
302 PREFETCH" 32%1\n\t" 301 PREFETCH" 32%1 \n\t"
303 "movq %1, %%mm0\n\t" 302 "movq %1, %%mm0 \n\t"
304 "movq 8%1, %%mm2\n\t" 303 "movq 8%1, %%mm2 \n\t"
305 "movq %%mm0, %%mm1\n\t" 304 "movq %%mm0, %%mm1 \n\t"
306 "movq %%mm2, %%mm3\n\t" 305 "movq %%mm2, %%mm3 \n\t"
307 "psrlq $1, %%mm0\n\t" 306 "psrlq $1, %%mm0 \n\t"
308 "psrlq $1, %%mm2\n\t" 307 "psrlq $1, %%mm2 \n\t"
309 "pand %%mm7, %%mm0\n\t" 308 "pand %%mm7, %%mm0 \n\t"
310 "pand %%mm7, %%mm2\n\t" 309 "pand %%mm7, %%mm2 \n\t"
311 "pand %%mm6, %%mm1\n\t" 310 "pand %%mm6, %%mm1 \n\t"
312 "pand %%mm6, %%mm3\n\t" 311 "pand %%mm6, %%mm3 \n\t"
313 "por %%mm1, %%mm0\n\t" 312 "por %%mm1, %%mm0 \n\t"
314 "por %%mm3, %%mm2\n\t" 313 "por %%mm3, %%mm2 \n\t"
315 MOVNTQ" %%mm0, %0\n\t" 314 MOVNTQ" %%mm0, %0 \n\t"
316 MOVNTQ" %%mm2, 8%0" 315 MOVNTQ" %%mm2, 8%0"
317 :"=m"(*d) 316 :"=m"(*d)
318 :"m"(*s) 317 :"m"(*s)
319 ); 318 );
320 d+=16; 319 d+=16;
321 s+=16; 320 s+=16;
322 } 321 }
323 __asm __volatile(SFENCE:::"memory"); 322 __asm __volatile(SFENCE:::"memory");
324 __asm __volatile(EMMS:::"memory"); 323 __asm __volatile(EMMS:::"memory");
325 #endif 324 #endif
326 mm_end = end - 3; 325 mm_end = end - 3;
327 while(s < mm_end) 326 while (s < mm_end)
328 { 327 {
329 register uint32_t x= *((uint32_t *)s); 328 register uint32_t x= *((uint32_t *)s);
330 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); 329 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
331 s+=4; 330 s+=4;
332 d+=4; 331 d+=4;
333 } 332 }
334 if(s < end) 333 if (s < end)
335 { 334 {
336 register uint16_t x= *((uint16_t *)s); 335 register uint16_t x= *((uint16_t *)s);
337 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); 336 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
338 s+=2; 337 s+=2;
339 d+=2; 338 d+=2;
340 } 339 }
341 } 340 }
342 341
343 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size) 342 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344 { 343 {
345 const uint8_t *s = src; 344 const uint8_t *s = src;
346 const uint8_t *end; 345 const uint8_t *end;
347 #ifdef HAVE_MMX 346 #ifdef HAVE_MMX
348 const uint8_t *mm_end; 347 const uint8_t *mm_end;
349 #endif 348 #endif
350 uint16_t *d = (uint16_t *)dst; 349 uint16_t *d = (uint16_t *)dst;
351 end = s + src_size; 350 end = s + src_size;
352 #ifdef HAVE_MMX 351 #ifdef HAVE_MMX
353 mm_end = end - 15; 352 mm_end = end - 15;
354 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) 353 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
355 asm volatile( 354 asm volatile(
356 "movq %3, %%mm5 \n\t" 355 "movq %3, %%mm5 \n\t"
357 "movq %4, %%mm6 \n\t" 356 "movq %4, %%mm6 \n\t"
358 "movq %5, %%mm7 \n\t" 357 "movq %5, %%mm7 \n\t"
359 "jmp 2f \n\t" 358 "jmp 2f \n\t"
360 ASMALIGN(4) 359 ASMALIGN(4)
361 "1: \n\t" 360 "1: \n\t"
362 PREFETCH" 32(%1) \n\t" 361 PREFETCH" 32(%1) \n\t"
363 "movd (%1), %%mm0 \n\t" 362 "movd (%1), %%mm0 \n\t"
364 "movd 4(%1), %%mm3 \n\t" 363 "movd 4(%1), %%mm3 \n\t"
365 "punpckldq 8(%1), %%mm0 \n\t" 364 "punpckldq 8(%1), %%mm0 \n\t"
366 "punpckldq 12(%1), %%mm3 \n\t" 365 "punpckldq 12(%1), %%mm3 \n\t"
367 "movq %%mm0, %%mm1 \n\t" 366 "movq %%mm0, %%mm1 \n\t"
368 "movq %%mm3, %%mm4 \n\t" 367 "movq %%mm3, %%mm4 \n\t"
369 "pand %%mm6, %%mm0 \n\t" 368 "pand %%mm6, %%mm0 \n\t"
370 "pand %%mm6, %%mm3 \n\t" 369 "pand %%mm6, %%mm3 \n\t"
371 "pmaddwd %%mm7, %%mm0 \n\t" 370 "pmaddwd %%mm7, %%mm0 \n\t"
372 "pmaddwd %%mm7, %%mm3 \n\t" 371 "pmaddwd %%mm7, %%mm3 \n\t"
373 "pand %%mm5, %%mm1 \n\t" 372 "pand %%mm5, %%mm1 \n\t"
374 "pand %%mm5, %%mm4 \n\t" 373 "pand %%mm5, %%mm4 \n\t"
375 "por %%mm1, %%mm0 \n\t" 374 "por %%mm1, %%mm0 \n\t"
376 "por %%mm4, %%mm3 \n\t" 375 "por %%mm4, %%mm3 \n\t"
377 "psrld $5, %%mm0 \n\t" 376 "psrld $5, %%mm0 \n\t"
378 "pslld $11, %%mm3 \n\t" 377 "pslld $11, %%mm3 \n\t"
379 "por %%mm3, %%mm0 \n\t" 378 "por %%mm3, %%mm0 \n\t"
380 MOVNTQ" %%mm0, (%0) \n\t" 379 MOVNTQ" %%mm0, (%0) \n\t"
381 "add $16, %1 \n\t" 380 "add $16, %1 \n\t"
382 "add $8, %0 \n\t" 381 "add $8, %0 \n\t"
383 "2: \n\t" 382 "2: \n\t"
384 "cmp %2, %1 \n\t" 383 "cmp %2, %1 \n\t"
385 " jb 1b \n\t" 384 " jb 1b \n\t"
386 : "+r" (d), "+r"(s) 385 : "+r" (d), "+r"(s)
387 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) 386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
388 ); 387 );
389 #else 388 #else
390 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 389 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
391 __asm __volatile( 390 __asm __volatile(
392 "movq %0, %%mm7\n\t" 391 "movq %0, %%mm7 \n\t"
393 "movq %1, %%mm6\n\t" 392 "movq %1, %%mm6 \n\t"
394 ::"m"(red_16mask),"m"(green_16mask)); 393 ::"m"(red_16mask),"m"(green_16mask));
395 while(s < mm_end) 394 while (s < mm_end)
396 { 395 {
397 __asm __volatile( 396 __asm __volatile(
398 PREFETCH" 32%1\n\t" 397 PREFETCH" 32%1 \n\t"
399 "movd %1, %%mm0\n\t" 398 "movd %1, %%mm0 \n\t"
400 "movd 4%1, %%mm3\n\t" 399 "movd 4%1, %%mm3 \n\t"
401 "punpckldq 8%1, %%mm0\n\t" 400 "punpckldq 8%1, %%mm0 \n\t"
402 "punpckldq 12%1, %%mm3\n\t" 401 "punpckldq 12%1, %%mm3 \n\t"
403 "movq %%mm0, %%mm1\n\t" 402 "movq %%mm0, %%mm1 \n\t"
404 "movq %%mm0, %%mm2\n\t" 403 "movq %%mm0, %%mm2 \n\t"
405 "movq %%mm3, %%mm4\n\t" 404 "movq %%mm3, %%mm4 \n\t"
406 "movq %%mm3, %%mm5\n\t" 405 "movq %%mm3, %%mm5 \n\t"
407 "psrlq $3, %%mm0\n\t" 406 "psrlq $3, %%mm0 \n\t"
408 "psrlq $3, %%mm3\n\t" 407 "psrlq $3, %%mm3 \n\t"
409 "pand %2, %%mm0\n\t" 408 "pand %2, %%mm0 \n\t"
410 "pand %2, %%mm3\n\t" 409 "pand %2, %%mm3 \n\t"
411 "psrlq $5, %%mm1\n\t" 410 "psrlq $5, %%mm1 \n\t"
412 "psrlq $5, %%mm4\n\t" 411 "psrlq $5, %%mm4 \n\t"
413 "pand %%mm6, %%mm1\n\t" 412 "pand %%mm6, %%mm1 \n\t"
414 "pand %%mm6, %%mm4\n\t" 413 "pand %%mm6, %%mm4 \n\t"
415 "psrlq $8, %%mm2\n\t" 414 "psrlq $8, %%mm2 \n\t"
416 "psrlq $8, %%mm5\n\t" 415 "psrlq $8, %%mm5 \n\t"
417 "pand %%mm7, %%mm2\n\t" 416 "pand %%mm7, %%mm2 \n\t"
418 "pand %%mm7, %%mm5\n\t" 417 "pand %%mm7, %%mm5 \n\t"
419 "por %%mm1, %%mm0\n\t" 418 "por %%mm1, %%mm0 \n\t"
420 "por %%mm4, %%mm3\n\t" 419 "por %%mm4, %%mm3 \n\t"
421 "por %%mm2, %%mm0\n\t" 420 "por %%mm2, %%mm0 \n\t"
422 "por %%mm5, %%mm3\n\t" 421 "por %%mm5, %%mm3 \n\t"
423 "psllq $16, %%mm3\n\t" 422 "psllq $16, %%mm3 \n\t"
424 "por %%mm3, %%mm0\n\t" 423 "por %%mm3, %%mm0 \n\t"
425 MOVNTQ" %%mm0, %0\n\t" 424 MOVNTQ" %%mm0, %0 \n\t"
426 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
427 d += 4; 426 d += 4;
428 s += 16; 427 s += 16;
429 } 428 }
430 #endif 429 #endif
431 __asm __volatile(SFENCE:::"memory"); 430 __asm __volatile(SFENCE:::"memory");
432 __asm __volatile(EMMS:::"memory"); 431 __asm __volatile(EMMS:::"memory");
433 #endif 432 #endif
434 while(s < end) 433 while (s < end)
435 { 434 {
436 register int rgb = *(uint32_t*)s; s += 4; 435 register int rgb = *(uint32_t*)s; s += 4;
437 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); 436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
438 } 437 }
439 } 438 }
440 439
441 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) 440 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
442 { 441 {
443 const uint8_t *s = src; 442 const uint8_t *s = src;
444 const uint8_t *end; 443 const uint8_t *end;
445 #ifdef HAVE_MMX 444 #ifdef HAVE_MMX
446 const uint8_t *mm_end; 445 const uint8_t *mm_end;
447 #endif 446 #endif
448 uint16_t *d = (uint16_t *)dst; 447 uint16_t *d = (uint16_t *)dst;
449 end = s + src_size; 448 end = s + src_size;
450 #ifdef HAVE_MMX 449 #ifdef HAVE_MMX
451 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 450 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
452 __asm __volatile( 451 __asm __volatile(
453 "movq %0, %%mm7\n\t" 452 "movq %0, %%mm7 \n\t"
454 "movq %1, %%mm6\n\t" 453 "movq %1, %%mm6 \n\t"
455 ::"m"(red_16mask),"m"(green_16mask)); 454 ::"m"(red_16mask),"m"(green_16mask));
456 mm_end = end - 15; 455 mm_end = end - 15;
457 while(s < mm_end) 456 while (s < mm_end)
458 { 457 {
459 __asm __volatile( 458 __asm __volatile(
460 PREFETCH" 32%1\n\t" 459 PREFETCH" 32%1 \n\t"
461 "movd %1, %%mm0\n\t" 460 "movd %1, %%mm0 \n\t"
462 "movd 4%1, %%mm3\n\t" 461 "movd 4%1, %%mm3 \n\t"
463 "punpckldq 8%1, %%mm0\n\t" 462 "punpckldq 8%1, %%mm0 \n\t"
464 "punpckldq 12%1, %%mm3\n\t" 463 "punpckldq 12%1, %%mm3 \n\t"
465 "movq %%mm0, %%mm1\n\t" 464 "movq %%mm0, %%mm1 \n\t"
466 "movq %%mm0, %%mm2\n\t" 465 "movq %%mm0, %%mm2 \n\t"
467 "movq %%mm3, %%mm4\n\t" 466 "movq %%mm3, %%mm4 \n\t"
468 "movq %%mm3, %%mm5\n\t" 467 "movq %%mm3, %%mm5 \n\t"
469 "psllq $8, %%mm0\n\t" 468 "psllq $8, %%mm0 \n\t"
470 "psllq $8, %%mm3\n\t" 469 "psllq $8, %%mm3 \n\t"
471 "pand %%mm7, %%mm0\n\t" 470 "pand %%mm7, %%mm0 \n\t"
472 "pand %%mm7, %%mm3\n\t" 471 "pand %%mm7, %%mm3 \n\t"
473 "psrlq $5, %%mm1\n\t" 472 "psrlq $5, %%mm1 \n\t"
474 "psrlq $5, %%mm4\n\t" 473 "psrlq $5, %%mm4 \n\t"
475 "pand %%mm6, %%mm1\n\t" 474 "pand %%mm6, %%mm1 \n\t"
476 "pand %%mm6, %%mm4\n\t" 475 "pand %%mm6, %%mm4 \n\t"
477 "psrlq $19, %%mm2\n\t" 476 "psrlq $19, %%mm2 \n\t"
478 "psrlq $19, %%mm5\n\t" 477 "psrlq $19, %%mm5 \n\t"
479 "pand %2, %%mm2\n\t" 478 "pand %2, %%mm2 \n\t"
480 "pand %2, %%mm5\n\t" 479 "pand %2, %%mm5 \n\t"
481 "por %%mm1, %%mm0\n\t" 480 "por %%mm1, %%mm0 \n\t"
482 "por %%mm4, %%mm3\n\t" 481 "por %%mm4, %%mm3 \n\t"
483 "por %%mm2, %%mm0\n\t" 482 "por %%mm2, %%mm0 \n\t"
484 "por %%mm5, %%mm3\n\t" 483 "por %%mm5, %%mm3 \n\t"
485 "psllq $16, %%mm3\n\t" 484 "psllq $16, %%mm3 \n\t"
486 "por %%mm3, %%mm0\n\t" 485 "por %%mm3, %%mm0 \n\t"
487 MOVNTQ" %%mm0, %0\n\t" 486 MOVNTQ" %%mm0, %0 \n\t"
488 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
489 d += 4; 488 d += 4;
490 s += 16; 489 s += 16;
491 } 490 }
492 __asm __volatile(SFENCE:::"memory"); 491 __asm __volatile(SFENCE:::"memory");
493 __asm __volatile(EMMS:::"memory"); 492 __asm __volatile(EMMS:::"memory");
494 #endif 493 #endif
495 while(s < end) 494 while (s < end)
496 { 495 {
497 register int rgb = *(uint32_t*)s; s += 4; 496 register int rgb = *(uint32_t*)s; s += 4;
498 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); 497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
499 } 498 }
500 } 499 }
501 500
502 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size) 501 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
503 { 502 {
504 const uint8_t *s = src; 503 const uint8_t *s = src;
505 const uint8_t *end; 504 const uint8_t *end;
506 #ifdef HAVE_MMX 505 #ifdef HAVE_MMX
507 const uint8_t *mm_end; 506 const uint8_t *mm_end;
508 #endif 507 #endif
509 uint16_t *d = (uint16_t *)dst; 508 uint16_t *d = (uint16_t *)dst;
510 end = s + src_size; 509 end = s + src_size;
511 #ifdef HAVE_MMX 510 #ifdef HAVE_MMX
512 mm_end = end - 15; 511 mm_end = end - 15;
513 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) 512 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
514 asm volatile( 513 asm volatile(
515 "movq %3, %%mm5 \n\t" 514 "movq %3, %%mm5 \n\t"
516 "movq %4, %%mm6 \n\t" 515 "movq %4, %%mm6 \n\t"
517 "movq %5, %%mm7 \n\t" 516 "movq %5, %%mm7 \n\t"
518 "jmp 2f \n\t" 517 "jmp 2f \n\t"
519 ASMALIGN(4) 518 ASMALIGN(4)
520 "1: \n\t" 519 "1: \n\t"
521 PREFETCH" 32(%1) \n\t" 520 PREFETCH" 32(%1) \n\t"
522 "movd (%1), %%mm0 \n\t" 521 "movd (%1), %%mm0 \n\t"
523 "movd 4(%1), %%mm3 \n\t" 522 "movd 4(%1), %%mm3 \n\t"
524 "punpckldq 8(%1), %%mm0 \n\t" 523 "punpckldq 8(%1), %%mm0 \n\t"
525 "punpckldq 12(%1), %%mm3 \n\t" 524 "punpckldq 12(%1), %%mm3 \n\t"
526 "movq %%mm0, %%mm1 \n\t" 525 "movq %%mm0, %%mm1 \n\t"
527 "movq %%mm3, %%mm4 \n\t" 526 "movq %%mm3, %%mm4 \n\t"
528 "pand %%mm6, %%mm0 \n\t" 527 "pand %%mm6, %%mm0 \n\t"
529 "pand %%mm6, %%mm3 \n\t" 528 "pand %%mm6, %%mm3 \n\t"
530 "pmaddwd %%mm7, %%mm0 \n\t" 529 "pmaddwd %%mm7, %%mm0 \n\t"
531 "pmaddwd %%mm7, %%mm3 \n\t" 530 "pmaddwd %%mm7, %%mm3 \n\t"
532 "pand %%mm5, %%mm1 \n\t" 531 "pand %%mm5, %%mm1 \n\t"
533 "pand %%mm5, %%mm4 \n\t" 532 "pand %%mm5, %%mm4 \n\t"
534 "por %%mm1, %%mm0 \n\t" 533 "por %%mm1, %%mm0 \n\t"
535 "por %%mm4, %%mm3 \n\t" 534 "por %%mm4, %%mm3 \n\t"
536 "psrld $6, %%mm0 \n\t" 535 "psrld $6, %%mm0 \n\t"
537 "pslld $10, %%mm3 \n\t" 536 "pslld $10, %%mm3 \n\t"
538 "por %%mm3, %%mm0 \n\t" 537 "por %%mm3, %%mm0 \n\t"
539 MOVNTQ" %%mm0, (%0) \n\t" 538 MOVNTQ" %%mm0, (%0) \n\t"
540 "add $16, %1 \n\t" 539 "add $16, %1 \n\t"
541 "add $8, %0 \n\t" 540 "add $8, %0 \n\t"
542 "2: \n\t" 541 "2: \n\t"
543 "cmp %2, %1 \n\t" 542 "cmp %2, %1 \n\t"
544 " jb 1b \n\t" 543 " jb 1b \n\t"
545 : "+r" (d), "+r"(s) 544 : "+r" (d), "+r"(s)
546 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) 545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
547 ); 546 );
548 #else 547 #else
549 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 548 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
550 __asm __volatile( 549 __asm __volatile(
551 "movq %0, %%mm7\n\t" 550 "movq %0, %%mm7 \n\t"
552 "movq %1, %%mm6\n\t" 551 "movq %1, %%mm6 \n\t"
553 ::"m"(red_15mask),"m"(green_15mask)); 552 ::"m"(red_15mask),"m"(green_15mask));
554 while(s < mm_end) 553 while (s < mm_end)
555 { 554 {
556 __asm __volatile( 555 __asm __volatile(
557 PREFETCH" 32%1\n\t" 556 PREFETCH" 32%1 \n\t"
558 "movd %1, %%mm0\n\t" 557 "movd %1, %%mm0 \n\t"
559 "movd 4%1, %%mm3\n\t" 558 "movd 4%1, %%mm3 \n\t"
560 "punpckldq 8%1, %%mm0\n\t" 559 "punpckldq 8%1, %%mm0 \n\t"
561 "punpckldq 12%1, %%mm3\n\t" 560 "punpckldq 12%1, %%mm3 \n\t"
562 "movq %%mm0, %%mm1\n\t" 561 "movq %%mm0, %%mm1 \n\t"
563 "movq %%mm0, %%mm2\n\t" 562 "movq %%mm0, %%mm2 \n\t"
564 "movq %%mm3, %%mm4\n\t" 563 "movq %%mm3, %%mm4 \n\t"
565 "movq %%mm3, %%mm5\n\t" 564 "movq %%mm3, %%mm5 \n\t"
566 "psrlq $3, %%mm0\n\t" 565 "psrlq $3, %%mm0 \n\t"
567 "psrlq $3, %%mm3\n\t" 566 "psrlq $3, %%mm3 \n\t"
568 "pand %2, %%mm0\n\t" 567 "pand %2, %%mm0 \n\t"
569 "pand %2, %%mm3\n\t" 568 "pand %2, %%mm3 \n\t"
570 "psrlq $6, %%mm1\n\t" 569 "psrlq $6, %%mm1 \n\t"
571 "psrlq $6, %%mm4\n\t" 570 "psrlq $6, %%mm4 \n\t"
572 "pand %%mm6, %%mm1\n\t" 571 "pand %%mm6, %%mm1 \n\t"
573 "pand %%mm6, %%mm4\n\t" 572 "pand %%mm6, %%mm4 \n\t"
574 "psrlq $9, %%mm2\n\t" 573 "psrlq $9, %%mm2 \n\t"
575 "psrlq $9, %%mm5\n\t" 574 "psrlq $9, %%mm5 \n\t"
576 "pand %%mm7, %%mm2\n\t" 575 "pand %%mm7, %%mm2 \n\t"
577 "pand %%mm7, %%mm5\n\t" 576 "pand %%mm7, %%mm5 \n\t"
578 "por %%mm1, %%mm0\n\t" 577 "por %%mm1, %%mm0 \n\t"
579 "por %%mm4, %%mm3\n\t" 578 "por %%mm4, %%mm3 \n\t"
580 "por %%mm2, %%mm0\n\t" 579 "por %%mm2, %%mm0 \n\t"
581 "por %%mm5, %%mm3\n\t" 580 "por %%mm5, %%mm3 \n\t"
582 "psllq $16, %%mm3\n\t" 581 "psllq $16, %%mm3 \n\t"
583 "por %%mm3, %%mm0\n\t" 582 "por %%mm3, %%mm0 \n\t"
584 MOVNTQ" %%mm0, %0\n\t" 583 MOVNTQ" %%mm0, %0 \n\t"
585 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
586 d += 4; 585 d += 4;
587 s += 16; 586 s += 16;
588 } 587 }
589 #endif 588 #endif
590 __asm __volatile(SFENCE:::"memory"); 589 __asm __volatile(SFENCE:::"memory");
591 __asm __volatile(EMMS:::"memory"); 590 __asm __volatile(EMMS:::"memory");
592 #endif 591 #endif
593 while(s < end) 592 while (s < end)
594 { 593 {
595 register int rgb = *(uint32_t*)s; s += 4; 594 register int rgb = *(uint32_t*)s; s += 4;
596 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); 595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
597 } 596 }
598 } 597 }
599 598
600 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) 599 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
601 { 600 {
602 const uint8_t *s = src; 601 const uint8_t *s = src;
603 const uint8_t *end; 602 const uint8_t *end;
604 #ifdef HAVE_MMX 603 #ifdef HAVE_MMX
605 const uint8_t *mm_end; 604 const uint8_t *mm_end;
606 #endif 605 #endif
607 uint16_t *d = (uint16_t *)dst; 606 uint16_t *d = (uint16_t *)dst;
608 end = s + src_size; 607 end = s + src_size;
609 #ifdef HAVE_MMX 608 #ifdef HAVE_MMX
610 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 609 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
611 __asm __volatile( 610 __asm __volatile(
612 "movq %0, %%mm7\n\t" 611 "movq %0, %%mm7 \n\t"
613 "movq %1, %%mm6\n\t" 612 "movq %1, %%mm6 \n\t"
614 ::"m"(red_15mask),"m"(green_15mask)); 613 ::"m"(red_15mask),"m"(green_15mask));
615 mm_end = end - 15; 614 mm_end = end - 15;
616 while(s < mm_end) 615 while (s < mm_end)
617 { 616 {
618 __asm __volatile( 617 __asm __volatile(
619 PREFETCH" 32%1\n\t" 618 PREFETCH" 32%1 \n\t"
620 "movd %1, %%mm0\n\t" 619 "movd %1, %%mm0 \n\t"
621 "movd 4%1, %%mm3\n\t" 620 "movd 4%1, %%mm3 \n\t"
622 "punpckldq 8%1, %%mm0\n\t" 621 "punpckldq 8%1, %%mm0 \n\t"
623 "punpckldq 12%1, %%mm3\n\t" 622 "punpckldq 12%1, %%mm3 \n\t"
624 "movq %%mm0, %%mm1\n\t" 623 "movq %%mm0, %%mm1 \n\t"
625 "movq %%mm0, %%mm2\n\t" 624 "movq %%mm0, %%mm2 \n\t"
626 "movq %%mm3, %%mm4\n\t" 625 "movq %%mm3, %%mm4 \n\t"
627 "movq %%mm3, %%mm5\n\t" 626 "movq %%mm3, %%mm5 \n\t"
628 "psllq $7, %%mm0\n\t" 627 "psllq $7, %%mm0 \n\t"
629 "psllq $7, %%mm3\n\t" 628 "psllq $7, %%mm3 \n\t"
630 "pand %%mm7, %%mm0\n\t" 629 "pand %%mm7, %%mm0 \n\t"
631 "pand %%mm7, %%mm3\n\t" 630 "pand %%mm7, %%mm3 \n\t"
632 "psrlq $6, %%mm1\n\t" 631 "psrlq $6, %%mm1 \n\t"
633 "psrlq $6, %%mm4\n\t" 632 "psrlq $6, %%mm4 \n\t"
634 "pand %%mm6, %%mm1\n\t" 633 "pand %%mm6, %%mm1 \n\t"
635 "pand %%mm6, %%mm4\n\t" 634 "pand %%mm6, %%mm4 \n\t"
636 "psrlq $19, %%mm2\n\t" 635 "psrlq $19, %%mm2 \n\t"
637 "psrlq $19, %%mm5\n\t" 636 "psrlq $19, %%mm5 \n\t"
638 "pand %2, %%mm2\n\t" 637 "pand %2, %%mm2 \n\t"
639 "pand %2, %%mm5\n\t" 638 "pand %2, %%mm5 \n\t"
640 "por %%mm1, %%mm0\n\t" 639 "por %%mm1, %%mm0 \n\t"
641 "por %%mm4, %%mm3\n\t" 640 "por %%mm4, %%mm3 \n\t"
642 "por %%mm2, %%mm0\n\t" 641 "por %%mm2, %%mm0 \n\t"
643 "por %%mm5, %%mm3\n\t" 642 "por %%mm5, %%mm3 \n\t"
644 "psllq $16, %%mm3\n\t" 643 "psllq $16, %%mm3 \n\t"
645 "por %%mm3, %%mm0\n\t" 644 "por %%mm3, %%mm0 \n\t"
646 MOVNTQ" %%mm0, %0\n\t" 645 MOVNTQ" %%mm0, %0 \n\t"
647 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
648 d += 4; 647 d += 4;
649 s += 16; 648 s += 16;
650 } 649 }
651 __asm __volatile(SFENCE:::"memory"); 650 __asm __volatile(SFENCE:::"memory");
652 __asm __volatile(EMMS:::"memory"); 651 __asm __volatile(EMMS:::"memory");
653 #endif 652 #endif
654 while(s < end) 653 while (s < end)
655 { 654 {
656 register int rgb = *(uint32_t*)s; s += 4; 655 register int rgb = *(uint32_t*)s; s += 4;
657 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); 656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
658 } 657 }
659 } 658 }
660 659
661 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size) 660 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
662 { 661 {
663 const uint8_t *s = src; 662 const uint8_t *s = src;
664 const uint8_t *end; 663 const uint8_t *end;
665 #ifdef HAVE_MMX 664 #ifdef HAVE_MMX
666 const uint8_t *mm_end; 665 const uint8_t *mm_end;
667 #endif 666 #endif
668 uint16_t *d = (uint16_t *)dst; 667 uint16_t *d = (uint16_t *)dst;
669 end = s + src_size; 668 end = s + src_size;
670 #ifdef HAVE_MMX 669 #ifdef HAVE_MMX
671 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 670 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
672 __asm __volatile( 671 __asm __volatile(
673 "movq %0, %%mm7\n\t" 672 "movq %0, %%mm7 \n\t"
674 "movq %1, %%mm6\n\t" 673 "movq %1, %%mm6 \n\t"
675 ::"m"(red_16mask),"m"(green_16mask)); 674 ::"m"(red_16mask),"m"(green_16mask));
676 mm_end = end - 11; 675 mm_end = end - 11;
677 while(s < mm_end) 676 while (s < mm_end)
678 { 677 {
679 __asm __volatile( 678 __asm __volatile(
680 PREFETCH" 32%1\n\t" 679 PREFETCH" 32%1 \n\t"
681 "movd %1, %%mm0\n\t" 680 "movd %1, %%mm0 \n\t"
682 "movd 3%1, %%mm3\n\t" 681 "movd 3%1, %%mm3 \n\t"
683 "punpckldq 6%1, %%mm0\n\t" 682 "punpckldq 6%1, %%mm0 \n\t"
684 "punpckldq 9%1, %%mm3\n\t" 683 "punpckldq 9%1, %%mm3 \n\t"
685 "movq %%mm0, %%mm1\n\t" 684 "movq %%mm0, %%mm1 \n\t"
686 "movq %%mm0, %%mm2\n\t" 685 "movq %%mm0, %%mm2 \n\t"
687 "movq %%mm3, %%mm4\n\t" 686 "movq %%mm3, %%mm4 \n\t"
688 "movq %%mm3, %%mm5\n\t" 687 "movq %%mm3, %%mm5 \n\t"
689 "psrlq $3, %%mm0\n\t" 688 "psrlq $3, %%mm0 \n\t"
690 "psrlq $3, %%mm3\n\t" 689 "psrlq $3, %%mm3 \n\t"
691 "pand %2, %%mm0\n\t" 690 "pand %2, %%mm0 \n\t"
692 "pand %2, %%mm3\n\t" 691 "pand %2, %%mm3 \n\t"
693 "psrlq $5, %%mm1\n\t" 692 "psrlq $5, %%mm1 \n\t"
694 "psrlq $5, %%mm4\n\t" 693 "psrlq $5, %%mm4 \n\t"
695 "pand %%mm6, %%mm1\n\t" 694 "pand %%mm6, %%mm1 \n\t"
696 "pand %%mm6, %%mm4\n\t" 695 "pand %%mm6, %%mm4 \n\t"
697 "psrlq $8, %%mm2\n\t" 696 "psrlq $8, %%mm2 \n\t"
698 "psrlq $8, %%mm5\n\t" 697 "psrlq $8, %%mm5 \n\t"
699 "pand %%mm7, %%mm2\n\t" 698 "pand %%mm7, %%mm2 \n\t"
700 "pand %%mm7, %%mm5\n\t" 699 "pand %%mm7, %%mm5 \n\t"
701 "por %%mm1, %%mm0\n\t" 700 "por %%mm1, %%mm0 \n\t"
702 "por %%mm4, %%mm3\n\t" 701 "por %%mm4, %%mm3 \n\t"
703 "por %%mm2, %%mm0\n\t" 702 "por %%mm2, %%mm0 \n\t"
704 "por %%mm5, %%mm3\n\t" 703 "por %%mm5, %%mm3 \n\t"
705 "psllq $16, %%mm3\n\t" 704 "psllq $16, %%mm3 \n\t"
706 "por %%mm3, %%mm0\n\t" 705 "por %%mm3, %%mm0 \n\t"
707 MOVNTQ" %%mm0, %0\n\t" 706 MOVNTQ" %%mm0, %0 \n\t"
708 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
709 d += 4; 708 d += 4;
710 s += 12; 709 s += 12;
711 } 710 }
712 __asm __volatile(SFENCE:::"memory"); 711 __asm __volatile(SFENCE:::"memory");
713 __asm __volatile(EMMS:::"memory"); 712 __asm __volatile(EMMS:::"memory");
714 #endif 713 #endif
715 while(s < end) 714 while (s < end)
716 { 715 {
717 const int b= *s++; 716 const int b = *s++;
718 const int g= *s++; 717 const int g = *s++;
719 const int r= *s++; 718 const int r = *s++;
720 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); 719 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
721 } 720 }
722 } 721 }
723 722
724 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) 723 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
725 { 724 {
726 const uint8_t *s = src; 725 const uint8_t *s = src;
727 const uint8_t *end; 726 const uint8_t *end;
728 #ifdef HAVE_MMX 727 #ifdef HAVE_MMX
729 const uint8_t *mm_end; 728 const uint8_t *mm_end;
730 #endif 729 #endif
731 uint16_t *d = (uint16_t *)dst; 730 uint16_t *d = (uint16_t *)dst;
732 end = s + src_size; 731 end = s + src_size;
733 #ifdef HAVE_MMX 732 #ifdef HAVE_MMX
734 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 733 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
735 __asm __volatile( 734 __asm __volatile(
736 "movq %0, %%mm7\n\t" 735 "movq %0, %%mm7 \n\t"
737 "movq %1, %%mm6\n\t" 736 "movq %1, %%mm6 \n\t"
738 ::"m"(red_16mask),"m"(green_16mask)); 737 ::"m"(red_16mask),"m"(green_16mask));
739 mm_end = end - 15; 738 mm_end = end - 15;
740 while(s < mm_end) 739 while (s < mm_end)
741 { 740 {
742 __asm __volatile( 741 __asm __volatile(
743 PREFETCH" 32%1\n\t" 742 PREFETCH" 32%1 \n\t"
744 "movd %1, %%mm0\n\t" 743 "movd %1, %%mm0 \n\t"
745 "movd 3%1, %%mm3\n\t" 744 "movd 3%1, %%mm3 \n\t"
746 "punpckldq 6%1, %%mm0\n\t" 745 "punpckldq 6%1, %%mm0 \n\t"
747 "punpckldq 9%1, %%mm3\n\t" 746 "punpckldq 9%1, %%mm3 \n\t"
748 "movq %%mm0, %%mm1\n\t" 747 "movq %%mm0, %%mm1 \n\t"
749 "movq %%mm0, %%mm2\n\t" 748 "movq %%mm0, %%mm2 \n\t"
750 "movq %%mm3, %%mm4\n\t" 749 "movq %%mm3, %%mm4 \n\t"
751 "movq %%mm3, %%mm5\n\t" 750 "movq %%mm3, %%mm5 \n\t"
752 "psllq $8, %%mm0\n\t" 751 "psllq $8, %%mm0 \n\t"
753 "psllq $8, %%mm3\n\t" 752 "psllq $8, %%mm3 \n\t"
754 "pand %%mm7, %%mm0\n\t" 753 "pand %%mm7, %%mm0 \n\t"
755 "pand %%mm7, %%mm3\n\t" 754 "pand %%mm7, %%mm3 \n\t"
756 "psrlq $5, %%mm1\n\t" 755 "psrlq $5, %%mm1 \n\t"
757 "psrlq $5, %%mm4\n\t" 756 "psrlq $5, %%mm4 \n\t"
758 "pand %%mm6, %%mm1\n\t" 757 "pand %%mm6, %%mm1 \n\t"
759 "pand %%mm6, %%mm4\n\t" 758 "pand %%mm6, %%mm4 \n\t"
760 "psrlq $19, %%mm2\n\t" 759 "psrlq $19, %%mm2 \n\t"
761 "psrlq $19, %%mm5\n\t" 760 "psrlq $19, %%mm5 \n\t"
762 "pand %2, %%mm2\n\t" 761 "pand %2, %%mm2 \n\t"
763 "pand %2, %%mm5\n\t" 762 "pand %2, %%mm5 \n\t"
764 "por %%mm1, %%mm0\n\t" 763 "por %%mm1, %%mm0 \n\t"
765 "por %%mm4, %%mm3\n\t" 764 "por %%mm4, %%mm3 \n\t"
766 "por %%mm2, %%mm0\n\t" 765 "por %%mm2, %%mm0 \n\t"
767 "por %%mm5, %%mm3\n\t" 766 "por %%mm5, %%mm3 \n\t"
768 "psllq $16, %%mm3\n\t" 767 "psllq $16, %%mm3 \n\t"
769 "por %%mm3, %%mm0\n\t" 768 "por %%mm3, %%mm0 \n\t"
770 MOVNTQ" %%mm0, %0\n\t" 769 MOVNTQ" %%mm0, %0 \n\t"
771 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
772 d += 4; 771 d += 4;
773 s += 12; 772 s += 12;
774 } 773 }
775 __asm __volatile(SFENCE:::"memory"); 774 __asm __volatile(SFENCE:::"memory");
776 __asm __volatile(EMMS:::"memory"); 775 __asm __volatile(EMMS:::"memory");
777 #endif 776 #endif
778 while(s < end) 777 while (s < end)
779 { 778 {
780 const int r= *s++; 779 const int r = *s++;
781 const int g= *s++; 780 const int g = *s++;
782 const int b= *s++; 781 const int b = *s++;
783 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); 782 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
784 } 783 }
785 } 784 }
786 785
787 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size) 786 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
788 { 787 {
789 const uint8_t *s = src; 788 const uint8_t *s = src;
790 const uint8_t *end; 789 const uint8_t *end;
791 #ifdef HAVE_MMX 790 #ifdef HAVE_MMX
792 const uint8_t *mm_end; 791 const uint8_t *mm_end;
793 #endif 792 #endif
794 uint16_t *d = (uint16_t *)dst; 793 uint16_t *d = (uint16_t *)dst;
795 end = s + src_size; 794 end = s + src_size;
796 #ifdef HAVE_MMX 795 #ifdef HAVE_MMX
797 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 796 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
798 __asm __volatile( 797 __asm __volatile(
799 "movq %0, %%mm7\n\t" 798 "movq %0, %%mm7 \n\t"
800 "movq %1, %%mm6\n\t" 799 "movq %1, %%mm6 \n\t"
801 ::"m"(red_15mask),"m"(green_15mask)); 800 ::"m"(red_15mask),"m"(green_15mask));
802 mm_end = end - 11; 801 mm_end = end - 11;
803 while(s < mm_end) 802 while (s < mm_end)
804 { 803 {
805 __asm __volatile( 804 __asm __volatile(
806 PREFETCH" 32%1\n\t" 805 PREFETCH" 32%1 \n\t"
807 "movd %1, %%mm0\n\t" 806 "movd %1, %%mm0 \n\t"
808 "movd 3%1, %%mm3\n\t" 807 "movd 3%1, %%mm3 \n\t"
809 "punpckldq 6%1, %%mm0\n\t" 808 "punpckldq 6%1, %%mm0 \n\t"
810 "punpckldq 9%1, %%mm3\n\t" 809 "punpckldq 9%1, %%mm3 \n\t"
811 "movq %%mm0, %%mm1\n\t" 810 "movq %%mm0, %%mm1 \n\t"
812 "movq %%mm0, %%mm2\n\t" 811 "movq %%mm0, %%mm2 \n\t"
813 "movq %%mm3, %%mm4\n\t" 812 "movq %%mm3, %%mm4 \n\t"
814 "movq %%mm3, %%mm5\n\t" 813 "movq %%mm3, %%mm5 \n\t"
815 "psrlq $3, %%mm0\n\t" 814 "psrlq $3, %%mm0 \n\t"
816 "psrlq $3, %%mm3\n\t" 815 "psrlq $3, %%mm3 \n\t"
817 "pand %2, %%mm0\n\t" 816 "pand %2, %%mm0 \n\t"
818 "pand %2, %%mm3\n\t" 817 "pand %2, %%mm3 \n\t"
819 "psrlq $6, %%mm1\n\t" 818 "psrlq $6, %%mm1 \n\t"
820 "psrlq $6, %%mm4\n\t" 819 "psrlq $6, %%mm4 \n\t"
821 "pand %%mm6, %%mm1\n\t" 820 "pand %%mm6, %%mm1 \n\t"
822 "pand %%mm6, %%mm4\n\t" 821 "pand %%mm6, %%mm4 \n\t"
823 "psrlq $9, %%mm2\n\t" 822 "psrlq $9, %%mm2 \n\t"
824 "psrlq $9, %%mm5\n\t" 823 "psrlq $9, %%mm5 \n\t"
825 "pand %%mm7, %%mm2\n\t" 824 "pand %%mm7, %%mm2 \n\t"
826 "pand %%mm7, %%mm5\n\t" 825 "pand %%mm7, %%mm5 \n\t"
827 "por %%mm1, %%mm0\n\t" 826 "por %%mm1, %%mm0 \n\t"
828 "por %%mm4, %%mm3\n\t" 827 "por %%mm4, %%mm3 \n\t"
829 "por %%mm2, %%mm0\n\t" 828 "por %%mm2, %%mm0 \n\t"
830 "por %%mm5, %%mm3\n\t" 829 "por %%mm5, %%mm3 \n\t"
831 "psllq $16, %%mm3\n\t" 830 "psllq $16, %%mm3 \n\t"
832 "por %%mm3, %%mm0\n\t" 831 "por %%mm3, %%mm0 \n\t"
833 MOVNTQ" %%mm0, %0\n\t" 832 MOVNTQ" %%mm0, %0 \n\t"
834 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
835 d += 4; 834 d += 4;
836 s += 12; 835 s += 12;
837 } 836 }
838 __asm __volatile(SFENCE:::"memory"); 837 __asm __volatile(SFENCE:::"memory");
839 __asm __volatile(EMMS:::"memory"); 838 __asm __volatile(EMMS:::"memory");
840 #endif 839 #endif
841 while(s < end) 840 while (s < end)
842 { 841 {
843 const int b= *s++; 842 const int b = *s++;
844 const int g= *s++; 843 const int g = *s++;
845 const int r= *s++; 844 const int r = *s++;
846 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); 845 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
847 } 846 }
848 } 847 }
849 848
850 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) 849 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
851 { 850 {
852 const uint8_t *s = src; 851 const uint8_t *s = src;
853 const uint8_t *end; 852 const uint8_t *end;
854 #ifdef HAVE_MMX 853 #ifdef HAVE_MMX
855 const uint8_t *mm_end; 854 const uint8_t *mm_end;
856 #endif 855 #endif
857 uint16_t *d = (uint16_t *)dst; 856 uint16_t *d = (uint16_t *)dst;
858 end = s + src_size; 857 end = s + src_size;
859 #ifdef HAVE_MMX 858 #ifdef HAVE_MMX
860 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 859 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
861 __asm __volatile( 860 __asm __volatile(
862 "movq %0, %%mm7\n\t" 861 "movq %0, %%mm7 \n\t"
863 "movq %1, %%mm6\n\t" 862 "movq %1, %%mm6 \n\t"
864 ::"m"(red_15mask),"m"(green_15mask)); 863 ::"m"(red_15mask),"m"(green_15mask));
865 mm_end = end - 15; 864 mm_end = end - 15;
866 while(s < mm_end) 865 while (s < mm_end)
867 { 866 {
868 __asm __volatile( 867 __asm __volatile(
869 PREFETCH" 32%1\n\t" 868 PREFETCH" 32%1 \n\t"
870 "movd %1, %%mm0\n\t" 869 "movd %1, %%mm0 \n\t"
871 "movd 3%1, %%mm3\n\t" 870 "movd 3%1, %%mm3 \n\t"
872 "punpckldq 6%1, %%mm0\n\t" 871 "punpckldq 6%1, %%mm0 \n\t"
873 "punpckldq 9%1, %%mm3\n\t" 872 "punpckldq 9%1, %%mm3 \n\t"
874 "movq %%mm0, %%mm1\n\t" 873 "movq %%mm0, %%mm1 \n\t"
875 "movq %%mm0, %%mm2\n\t" 874 "movq %%mm0, %%mm2 \n\t"
876 "movq %%mm3, %%mm4\n\t" 875 "movq %%mm3, %%mm4 \n\t"
877 "movq %%mm3, %%mm5\n\t" 876 "movq %%mm3, %%mm5 \n\t"
878 "psllq $7, %%mm0\n\t" 877 "psllq $7, %%mm0 \n\t"
879 "psllq $7, %%mm3\n\t" 878 "psllq $7, %%mm3 \n\t"
880 "pand %%mm7, %%mm0\n\t" 879 "pand %%mm7, %%mm0 \n\t"
881 "pand %%mm7, %%mm3\n\t" 880 "pand %%mm7, %%mm3 \n\t"
882 "psrlq $6, %%mm1\n\t" 881 "psrlq $6, %%mm1 \n\t"
883 "psrlq $6, %%mm4\n\t" 882 "psrlq $6, %%mm4 \n\t"
884 "pand %%mm6, %%mm1\n\t" 883 "pand %%mm6, %%mm1 \n\t"
885 "pand %%mm6, %%mm4\n\t" 884 "pand %%mm6, %%mm4 \n\t"
886 "psrlq $19, %%mm2\n\t" 885 "psrlq $19, %%mm2 \n\t"
887 "psrlq $19, %%mm5\n\t" 886 "psrlq $19, %%mm5 \n\t"
888 "pand %2, %%mm2\n\t" 887 "pand %2, %%mm2 \n\t"
889 "pand %2, %%mm5\n\t" 888 "pand %2, %%mm5 \n\t"
890 "por %%mm1, %%mm0\n\t" 889 "por %%mm1, %%mm0 \n\t"
891 "por %%mm4, %%mm3\n\t" 890 "por %%mm4, %%mm3 \n\t"
892 "por %%mm2, %%mm0\n\t" 891 "por %%mm2, %%mm0 \n\t"
893 "por %%mm5, %%mm3\n\t" 892 "por %%mm5, %%mm3 \n\t"
894 "psllq $16, %%mm3\n\t" 893 "psllq $16, %%mm3 \n\t"
895 "por %%mm3, %%mm0\n\t" 894 "por %%mm3, %%mm0 \n\t"
896 MOVNTQ" %%mm0, %0\n\t" 895 MOVNTQ" %%mm0, %0 \n\t"
897 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
898 d += 4; 897 d += 4;
899 s += 12; 898 s += 12;
900 } 899 }
901 __asm __volatile(SFENCE:::"memory"); 900 __asm __volatile(SFENCE:::"memory");
902 __asm __volatile(EMMS:::"memory"); 901 __asm __volatile(EMMS:::"memory");
903 #endif 902 #endif
904 while(s < end) 903 while (s < end)
905 { 904 {
906 const int r= *s++; 905 const int r = *s++;
907 const int g= *s++; 906 const int g = *s++;
908 const int b= *s++; 907 const int b = *s++;
909 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); 908 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
910 } 909 }
911 } 910 }
912 911
913 /* 912 /*
914 I use here less accurate approximation by simply 913 I use here less accurate approximation by simply
915 left-shifting the input 914 left-shifting the input
933 | 932 |
934 Original Bits 933 Original Bits
935 */ 934 */
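The approximation the comment describes is easiest to see on a single 5-bit
channel; the scalar tail of rgb15to24 below does exactly the left-shift
variant. A short illustrative sketch (helper names are hypothetical, not from
the source):

    #include <stdint.h>

    /* Plain left shift, as used below: the low three bits stay zero,
     * so full intensity 31 maps to 248 rather than 255. */
    static uint8_t expand5_shift(uint8_t c5)
    {
        return (uint8_t)(c5 << 3);
    }

    /* Exact alternative for comparison: replicate the top bits so that
     * 0 maps to 0 and 31 maps to 255. */
    static uint8_t expand5_replicate(uint8_t c5)
    {
        return (uint8_t)((c5 << 3) | (c5 >> 2));
    }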
936 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size) 935 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
937 { 936 {
938 const uint16_t *end; 937 const uint16_t *end;
939 #ifdef HAVE_MMX 938 #ifdef HAVE_MMX
940 const uint16_t *mm_end; 939 const uint16_t *mm_end;
941 #endif 940 #endif
942 uint8_t *d = (uint8_t *)dst; 941 uint8_t *d = (uint8_t *)dst;
943 const uint16_t *s = (uint16_t *)src; 942 const uint16_t *s = (uint16_t *)src;
944 end = s + src_size/2; 943 end = s + src_size/2;
945 #ifdef HAVE_MMX 944 #ifdef HAVE_MMX
946 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 945 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
947 mm_end = end - 7; 946 mm_end = end - 7;
948 while(s < mm_end) 947 while (s < mm_end)
949 { 948 {
950 __asm __volatile( 949 __asm __volatile(
951 PREFETCH" 32%1\n\t" 950 PREFETCH" 32%1 \n\t"
952 "movq %1, %%mm0\n\t" 951 "movq %1, %%mm0 \n\t"
953 "movq %1, %%mm1\n\t" 952 "movq %1, %%mm1 \n\t"
954 "movq %1, %%mm2\n\t" 953 "movq %1, %%mm2 \n\t"
955 "pand %2, %%mm0\n\t" 954 "pand %2, %%mm0 \n\t"
956 "pand %3, %%mm1\n\t" 955 "pand %3, %%mm1 \n\t"
957 "pand %4, %%mm2\n\t" 956 "pand %4, %%mm2 \n\t"
958 "psllq $3, %%mm0\n\t" 957 "psllq $3, %%mm0 \n\t"
959 "psrlq $2, %%mm1\n\t" 958 "psrlq $2, %%mm1 \n\t"
960 "psrlq $7, %%mm2\n\t" 959 "psrlq $7, %%mm2 \n\t"
961 "movq %%mm0, %%mm3\n\t" 960 "movq %%mm0, %%mm3 \n\t"
962 "movq %%mm1, %%mm4\n\t" 961 "movq %%mm1, %%mm4 \n\t"
963 "movq %%mm2, %%mm5\n\t" 962 "movq %%mm2, %%mm5 \n\t"
964 "punpcklwd %5, %%mm0\n\t" 963 "punpcklwd %5, %%mm0 \n\t"
965 "punpcklwd %5, %%mm1\n\t" 964 "punpcklwd %5, %%mm1 \n\t"
966 "punpcklwd %5, %%mm2\n\t" 965 "punpcklwd %5, %%mm2 \n\t"
967 "punpckhwd %5, %%mm3\n\t" 966 "punpckhwd %5, %%mm3 \n\t"
968 "punpckhwd %5, %%mm4\n\t" 967 "punpckhwd %5, %%mm4 \n\t"
969 "punpckhwd %5, %%mm5\n\t" 968 "punpckhwd %5, %%mm5 \n\t"
970 "psllq $8, %%mm1\n\t" 969 "psllq $8, %%mm1 \n\t"
971 "psllq $16, %%mm2\n\t" 970 "psllq $16, %%mm2 \n\t"
972 "por %%mm1, %%mm0\n\t" 971 "por %%mm1, %%mm0 \n\t"
973 "por %%mm2, %%mm0\n\t" 972 "por %%mm2, %%mm0 \n\t"
974 "psllq $8, %%mm4\n\t" 973 "psllq $8, %%mm4 \n\t"
975 "psllq $16, %%mm5\n\t" 974 "psllq $16, %%mm5 \n\t"
976 "por %%mm4, %%mm3\n\t" 975 "por %%mm4, %%mm3 \n\t"
977 "por %%mm5, %%mm3\n\t" 976 "por %%mm5, %%mm3 \n\t"
978 977
979 "movq %%mm0, %%mm6\n\t" 978 "movq %%mm0, %%mm6 \n\t"
980 "movq %%mm3, %%mm7\n\t" 979 "movq %%mm3, %%mm7 \n\t"
981 980
982 "movq 8%1, %%mm0\n\t" 981 "movq 8%1, %%mm0 \n\t"
983 "movq 8%1, %%mm1\n\t" 982 "movq 8%1, %%mm1 \n\t"
984 "movq 8%1, %%mm2\n\t" 983 "movq 8%1, %%mm2 \n\t"
985 "pand %2, %%mm0\n\t" 984 "pand %2, %%mm0 \n\t"
986 "pand %3, %%mm1\n\t" 985 "pand %3, %%mm1 \n\t"
987 "pand %4, %%mm2\n\t" 986 "pand %4, %%mm2 \n\t"
988 "psllq $3, %%mm0\n\t" 987 "psllq $3, %%mm0 \n\t"
989 "psrlq $2, %%mm1\n\t" 988 "psrlq $2, %%mm1 \n\t"
990 "psrlq $7, %%mm2\n\t" 989 "psrlq $7, %%mm2 \n\t"
991 "movq %%mm0, %%mm3\n\t" 990 "movq %%mm0, %%mm3 \n\t"
992 "movq %%mm1, %%mm4\n\t" 991 "movq %%mm1, %%mm4 \n\t"
993 "movq %%mm2, %%mm5\n\t" 992 "movq %%mm2, %%mm5 \n\t"
994 "punpcklwd %5, %%mm0\n\t" 993 "punpcklwd %5, %%mm0 \n\t"
995 "punpcklwd %5, %%mm1\n\t" 994 "punpcklwd %5, %%mm1 \n\t"
996 "punpcklwd %5, %%mm2\n\t" 995 "punpcklwd %5, %%mm2 \n\t"
997 "punpckhwd %5, %%mm3\n\t" 996 "punpckhwd %5, %%mm3 \n\t"
998 "punpckhwd %5, %%mm4\n\t" 997 "punpckhwd %5, %%mm4 \n\t"
999 "punpckhwd %5, %%mm5\n\t" 998 "punpckhwd %5, %%mm5 \n\t"
1000 "psllq $8, %%mm1\n\t" 999 "psllq $8, %%mm1 \n\t"
1001 "psllq $16, %%mm2\n\t" 1000 "psllq $16, %%mm2 \n\t"
1002 "por %%mm1, %%mm0\n\t" 1001 "por %%mm1, %%mm0 \n\t"
1003 "por %%mm2, %%mm0\n\t" 1002 "por %%mm2, %%mm0 \n\t"
1004 "psllq $8, %%mm4\n\t" 1003 "psllq $8, %%mm4 \n\t"
1005 "psllq $16, %%mm5\n\t" 1004 "psllq $16, %%mm5 \n\t"
1006 "por %%mm4, %%mm3\n\t" 1005 "por %%mm4, %%mm3 \n\t"
1007 "por %%mm5, %%mm3\n\t" 1006 "por %%mm5, %%mm3 \n\t"
1008 1007
1009 :"=m"(*d) 1008 :"=m"(*d)
1010 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) 1009 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1011 :"memory"); 1010 :"memory");
1012 /* Borrowed 32 to 24 */ 1011 /* Borrowed 32 to 24 */
1013 __asm __volatile( 1012 __asm __volatile(
1014 "movq %%mm0, %%mm4\n\t" 1013 "movq %%mm0, %%mm4 \n\t"
1015 "movq %%mm3, %%mm5\n\t" 1014 "movq %%mm3, %%mm5 \n\t"
1016 "movq %%mm6, %%mm0\n\t" 1015 "movq %%mm6, %%mm0 \n\t"
1017 "movq %%mm7, %%mm1\n\t" 1016 "movq %%mm7, %%mm1 \n\t"
1018 1017
1019 "movq %%mm4, %%mm6\n\t" 1018 "movq %%mm4, %%mm6 \n\t"
1020 "movq %%mm5, %%mm7\n\t" 1019 "movq %%mm5, %%mm7 \n\t"
1021 "movq %%mm0, %%mm2\n\t" 1020 "movq %%mm0, %%mm2 \n\t"
1022 "movq %%mm1, %%mm3\n\t" 1021 "movq %%mm1, %%mm3 \n\t"
1023 1022
1024 "psrlq $8, %%mm2\n\t" 1023 "psrlq $8, %%mm2 \n\t"
1025 "psrlq $8, %%mm3\n\t" 1024 "psrlq $8, %%mm3 \n\t"
1026 "psrlq $8, %%mm6\n\t" 1025 "psrlq $8, %%mm6 \n\t"
1027 "psrlq $8, %%mm7\n\t" 1026 "psrlq $8, %%mm7 \n\t"
1028 "pand %2, %%mm0\n\t" 1027 "pand %2, %%mm0 \n\t"
1029 "pand %2, %%mm1\n\t" 1028 "pand %2, %%mm1 \n\t"
1030 "pand %2, %%mm4\n\t" 1029 "pand %2, %%mm4 \n\t"
1031 "pand %2, %%mm5\n\t" 1030 "pand %2, %%mm5 \n\t"
1032 "pand %3, %%mm2\n\t" 1031 "pand %3, %%mm2 \n\t"
1033 "pand %3, %%mm3\n\t" 1032 "pand %3, %%mm3 \n\t"
1034 "pand %3, %%mm6\n\t" 1033 "pand %3, %%mm6 \n\t"
1035 "pand %3, %%mm7\n\t" 1034 "pand %3, %%mm7 \n\t"
1036 "por %%mm2, %%mm0\n\t" 1035 "por %%mm2, %%mm0 \n\t"
1037 "por %%mm3, %%mm1\n\t" 1036 "por %%mm3, %%mm1 \n\t"
1038 "por %%mm6, %%mm4\n\t" 1037 "por %%mm6, %%mm4 \n\t"
1039 "por %%mm7, %%mm5\n\t" 1038 "por %%mm7, %%mm5 \n\t"
1040 1039
1041 "movq %%mm1, %%mm2\n\t" 1040 "movq %%mm1, %%mm2 \n\t"
1042 "movq %%mm4, %%mm3\n\t" 1041 "movq %%mm4, %%mm3 \n\t"
1043 "psllq $48, %%mm2\n\t" 1042 "psllq $48, %%mm2 \n\t"
1044 "psllq $32, %%mm3\n\t" 1043 "psllq $32, %%mm3 \n\t"
1045 "pand %4, %%mm2\n\t" 1044 "pand %4, %%mm2 \n\t"
1046 "pand %5, %%mm3\n\t" 1045 "pand %5, %%mm3 \n\t"
1047 "por %%mm2, %%mm0\n\t" 1046 "por %%mm2, %%mm0 \n\t"
1048 "psrlq $16, %%mm1\n\t" 1047 "psrlq $16, %%mm1 \n\t"
1049 "psrlq $32, %%mm4\n\t" 1048 "psrlq $32, %%mm4 \n\t"
1050 "psllq $16, %%mm5\n\t" 1049 "psllq $16, %%mm5 \n\t"
1051 "por %%mm3, %%mm1\n\t" 1050 "por %%mm3, %%mm1 \n\t"
1052 "pand %6, %%mm5\n\t" 1051 "pand %6, %%mm5 \n\t"
1053 "por %%mm5, %%mm4\n\t" 1052 "por %%mm5, %%mm4 \n\t"
1054 1053
1055 MOVNTQ" %%mm0, %0\n\t" 1054 MOVNTQ" %%mm0, %0 \n\t"
1056 MOVNTQ" %%mm1, 8%0\n\t" 1055 MOVNTQ" %%mm1, 8%0 \n\t"
1057 MOVNTQ" %%mm4, 16%0" 1056 MOVNTQ" %%mm4, 16%0"
1058 1057
1059 :"=m"(*d) 1058 :"=m"(*d)
1060 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 1059 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1061 :"memory"); 1060 :"memory");
1062 d += 24; 1061 d += 24;
1063 s += 8; 1062 s += 8;
1064 } 1063 }
1065 __asm __volatile(SFENCE:::"memory"); 1064 __asm __volatile(SFENCE:::"memory");
1066 __asm __volatile(EMMS:::"memory"); 1065 __asm __volatile(EMMS:::"memory");
1067 #endif 1066 #endif
1068 while(s < end) 1067 while (s < end)
1069 { 1068 {
1070 register uint16_t bgr; 1069 register uint16_t bgr;
1071 bgr = *s++; 1070 bgr = *s++;
1072 *d++ = (bgr&0x1F)<<3; 1071 *d++ = (bgr&0x1F)<<3;
1073 *d++ = (bgr&0x3E0)>>2; 1072 *d++ = (bgr&0x3E0)>>2;
1074 *d++ = (bgr&0x7C00)>>7; 1073 *d++ = (bgr&0x7C00)>>7;
1075 } 1074 }
1076 } 1075 }
1077 1076
1078 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size) 1077 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1079 { 1078 {
1080 const uint16_t *end; 1079 const uint16_t *end;
1081 #ifdef HAVE_MMX 1080 #ifdef HAVE_MMX
1082 const uint16_t *mm_end; 1081 const uint16_t *mm_end;
1083 #endif 1082 #endif
1084 uint8_t *d = (uint8_t *)dst; 1083 uint8_t *d = (uint8_t *)dst;
1085 const uint16_t *s = (const uint16_t *)src; 1084 const uint16_t *s = (const uint16_t *)src;
1086 end = s + src_size/2; 1085 end = s + src_size/2;
1087 #ifdef HAVE_MMX 1086 #ifdef HAVE_MMX
1088 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 1087 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1089 mm_end = end - 7; 1088 mm_end = end - 7;
1090 while(s < mm_end) 1089 while (s < mm_end)
1091 { 1090 {
1092 __asm __volatile( 1091 __asm __volatile(
1093 PREFETCH" 32%1\n\t" 1092 PREFETCH" 32%1 \n\t"
1094 "movq %1, %%mm0\n\t" 1093 "movq %1, %%mm0 \n\t"
1095 "movq %1, %%mm1\n\t" 1094 "movq %1, %%mm1 \n\t"
1096 "movq %1, %%mm2\n\t" 1095 "movq %1, %%mm2 \n\t"
1097 "pand %2, %%mm0\n\t" 1096 "pand %2, %%mm0 \n\t"
1098 "pand %3, %%mm1\n\t" 1097 "pand %3, %%mm1 \n\t"
1099 "pand %4, %%mm2\n\t" 1098 "pand %4, %%mm2 \n\t"
1100 "psllq $3, %%mm0\n\t" 1099 "psllq $3, %%mm0 \n\t"
1101 "psrlq $3, %%mm1\n\t" 1100 "psrlq $3, %%mm1 \n\t"
1102 "psrlq $8, %%mm2\n\t" 1101 "psrlq $8, %%mm2 \n\t"
1103 "movq %%mm0, %%mm3\n\t" 1102 "movq %%mm0, %%mm3 \n\t"
1104 "movq %%mm1, %%mm4\n\t" 1103 "movq %%mm1, %%mm4 \n\t"
1105 "movq %%mm2, %%mm5\n\t" 1104 "movq %%mm2, %%mm5 \n\t"
1106 "punpcklwd %5, %%mm0\n\t" 1105 "punpcklwd %5, %%mm0 \n\t"
1107 "punpcklwd %5, %%mm1\n\t" 1106 "punpcklwd %5, %%mm1 \n\t"
1108 "punpcklwd %5, %%mm2\n\t" 1107 "punpcklwd %5, %%mm2 \n\t"
1109 "punpckhwd %5, %%mm3\n\t" 1108 "punpckhwd %5, %%mm3 \n\t"
1110 "punpckhwd %5, %%mm4\n\t" 1109 "punpckhwd %5, %%mm4 \n\t"
1111 "punpckhwd %5, %%mm5\n\t" 1110 "punpckhwd %5, %%mm5 \n\t"
1112 "psllq $8, %%mm1\n\t" 1111 "psllq $8, %%mm1 \n\t"
1113 "psllq $16, %%mm2\n\t" 1112 "psllq $16, %%mm2 \n\t"
1114 "por %%mm1, %%mm0\n\t" 1113 "por %%mm1, %%mm0 \n\t"
1115 "por %%mm2, %%mm0\n\t" 1114 "por %%mm2, %%mm0 \n\t"
1116 "psllq $8, %%mm4\n\t" 1115 "psllq $8, %%mm4 \n\t"
1117 "psllq $16, %%mm5\n\t" 1116 "psllq $16, %%mm5 \n\t"
1118 "por %%mm4, %%mm3\n\t" 1117 "por %%mm4, %%mm3 \n\t"
1119 "por %%mm5, %%mm3\n\t" 1118 "por %%mm5, %%mm3 \n\t"
1120 1119
1121 "movq %%mm0, %%mm6\n\t" 1120 "movq %%mm0, %%mm6 \n\t"
1122 "movq %%mm3, %%mm7\n\t" 1121 "movq %%mm3, %%mm7 \n\t"
1123 1122
1124 "movq 8%1, %%mm0\n\t" 1123 "movq 8%1, %%mm0 \n\t"
1125 "movq 8%1, %%mm1\n\t" 1124 "movq 8%1, %%mm1 \n\t"
1126 "movq 8%1, %%mm2\n\t" 1125 "movq 8%1, %%mm2 \n\t"
1127 "pand %2, %%mm0\n\t" 1126 "pand %2, %%mm0 \n\t"
1128 "pand %3, %%mm1\n\t" 1127 "pand %3, %%mm1 \n\t"
1129 "pand %4, %%mm2\n\t" 1128 "pand %4, %%mm2 \n\t"
1130 "psllq $3, %%mm0\n\t" 1129 "psllq $3, %%mm0 \n\t"
1131 "psrlq $3, %%mm1\n\t" 1130 "psrlq $3, %%mm1 \n\t"
1132 "psrlq $8, %%mm2\n\t" 1131 "psrlq $8, %%mm2 \n\t"
1133 "movq %%mm0, %%mm3\n\t" 1132 "movq %%mm0, %%mm3 \n\t"
1134 "movq %%mm1, %%mm4\n\t" 1133 "movq %%mm1, %%mm4 \n\t"
1135 "movq %%mm2, %%mm5\n\t" 1134 "movq %%mm2, %%mm5 \n\t"
1136 "punpcklwd %5, %%mm0\n\t" 1135 "punpcklwd %5, %%mm0 \n\t"
1137 "punpcklwd %5, %%mm1\n\t" 1136 "punpcklwd %5, %%mm1 \n\t"
1138 "punpcklwd %5, %%mm2\n\t" 1137 "punpcklwd %5, %%mm2 \n\t"
1139 "punpckhwd %5, %%mm3\n\t" 1138 "punpckhwd %5, %%mm3 \n\t"
1140 "punpckhwd %5, %%mm4\n\t" 1139 "punpckhwd %5, %%mm4 \n\t"
1141 "punpckhwd %5, %%mm5\n\t" 1140 "punpckhwd %5, %%mm5 \n\t"
1142 "psllq $8, %%mm1\n\t" 1141 "psllq $8, %%mm1 \n\t"
1143 "psllq $16, %%mm2\n\t" 1142 "psllq $16, %%mm2 \n\t"
1144 "por %%mm1, %%mm0\n\t" 1143 "por %%mm1, %%mm0 \n\t"
1145 "por %%mm2, %%mm0\n\t" 1144 "por %%mm2, %%mm0 \n\t"
1146 "psllq $8, %%mm4\n\t" 1145 "psllq $8, %%mm4 \n\t"
1147 "psllq $16, %%mm5\n\t" 1146 "psllq $16, %%mm5 \n\t"
1148 "por %%mm4, %%mm3\n\t" 1147 "por %%mm4, %%mm3 \n\t"
1149 "por %%mm5, %%mm3\n\t" 1148 "por %%mm5, %%mm3 \n\t"
1150 :"=m"(*d) 1149 :"=m"(*d)
1151 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) 1150 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1152 :"memory"); 1151 :"memory");
1153 /* Borrowed 32 to 24 */ 1152 /* Borrowed 32 to 24 */
1154 __asm __volatile( 1153 __asm __volatile(
1155 "movq %%mm0, %%mm4\n\t" 1154 "movq %%mm0, %%mm4 \n\t"
1156 "movq %%mm3, %%mm5\n\t" 1155 "movq %%mm3, %%mm5 \n\t"
1157 "movq %%mm6, %%mm0\n\t" 1156 "movq %%mm6, %%mm0 \n\t"
1158 "movq %%mm7, %%mm1\n\t" 1157 "movq %%mm7, %%mm1 \n\t"
1159 1158
1160 "movq %%mm4, %%mm6\n\t" 1159 "movq %%mm4, %%mm6 \n\t"
1161 "movq %%mm5, %%mm7\n\t" 1160 "movq %%mm5, %%mm7 \n\t"
1162 "movq %%mm0, %%mm2\n\t" 1161 "movq %%mm0, %%mm2 \n\t"
1163 "movq %%mm1, %%mm3\n\t" 1162 "movq %%mm1, %%mm3 \n\t"
1164 1163
1165 "psrlq $8, %%mm2\n\t" 1164 "psrlq $8, %%mm2 \n\t"
1166 "psrlq $8, %%mm3\n\t" 1165 "psrlq $8, %%mm3 \n\t"
1167 "psrlq $8, %%mm6\n\t" 1166 "psrlq $8, %%mm6 \n\t"
1168 "psrlq $8, %%mm7\n\t" 1167 "psrlq $8, %%mm7 \n\t"
1169 "pand %2, %%mm0\n\t" 1168 "pand %2, %%mm0 \n\t"
1170 "pand %2, %%mm1\n\t" 1169 "pand %2, %%mm1 \n\t"
1171 "pand %2, %%mm4\n\t" 1170 "pand %2, %%mm4 \n\t"
1172 "pand %2, %%mm5\n\t" 1171 "pand %2, %%mm5 \n\t"
1173 "pand %3, %%mm2\n\t" 1172 "pand %3, %%mm2 \n\t"
1174 "pand %3, %%mm3\n\t" 1173 "pand %3, %%mm3 \n\t"
1175 "pand %3, %%mm6\n\t" 1174 "pand %3, %%mm6 \n\t"
1176 "pand %3, %%mm7\n\t" 1175 "pand %3, %%mm7 \n\t"
1177 "por %%mm2, %%mm0\n\t" 1176 "por %%mm2, %%mm0 \n\t"
1178 "por %%mm3, %%mm1\n\t" 1177 "por %%mm3, %%mm1 \n\t"
1179 "por %%mm6, %%mm4\n\t" 1178 "por %%mm6, %%mm4 \n\t"
1180 "por %%mm7, %%mm5\n\t" 1179 "por %%mm7, %%mm5 \n\t"
1181 1180
1182 "movq %%mm1, %%mm2\n\t" 1181 "movq %%mm1, %%mm2 \n\t"
1183 "movq %%mm4, %%mm3\n\t" 1182 "movq %%mm4, %%mm3 \n\t"
1184 "psllq $48, %%mm2\n\t" 1183 "psllq $48, %%mm2 \n\t"
1185 "psllq $32, %%mm3\n\t" 1184 "psllq $32, %%mm3 \n\t"
1186 "pand %4, %%mm2\n\t" 1185 "pand %4, %%mm2 \n\t"
1187 "pand %5, %%mm3\n\t" 1186 "pand %5, %%mm3 \n\t"
1188 "por %%mm2, %%mm0\n\t" 1187 "por %%mm2, %%mm0 \n\t"
1189 "psrlq $16, %%mm1\n\t" 1188 "psrlq $16, %%mm1 \n\t"
1190 "psrlq $32, %%mm4\n\t" 1189 "psrlq $32, %%mm4 \n\t"
1191 "psllq $16, %%mm5\n\t" 1190 "psllq $16, %%mm5 \n\t"
1192 "por %%mm3, %%mm1\n\t" 1191 "por %%mm3, %%mm1 \n\t"
1193 "pand %6, %%mm5\n\t" 1192 "pand %6, %%mm5 \n\t"
1194 "por %%mm5, %%mm4\n\t" 1193 "por %%mm5, %%mm4 \n\t"
1195 1194
1196 MOVNTQ" %%mm0, %0\n\t" 1195 MOVNTQ" %%mm0, %0 \n\t"
1197 MOVNTQ" %%mm1, 8%0\n\t" 1196 MOVNTQ" %%mm1, 8%0 \n\t"
1198 MOVNTQ" %%mm4, 16%0" 1197 MOVNTQ" %%mm4, 16%0"
1199 1198
1200 :"=m"(*d) 1199 :"=m"(*d)
1201 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 1200 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1202 :"memory"); 1201 :"memory");
1203 d += 24; 1202 d += 24;
1204 s += 8; 1203 s += 8;
1205 } 1204 }
1206 __asm __volatile(SFENCE:::"memory"); 1205 __asm __volatile(SFENCE:::"memory");
1207 __asm __volatile(EMMS:::"memory"); 1206 __asm __volatile(EMMS:::"memory");
1208 #endif 1207 #endif
1209 while(s < end) 1208 while (s < end)
1210 { 1209 {
1211 register uint16_t bgr; 1210 register uint16_t bgr;
1212 bgr = *s++; 1211 bgr = *s++;
1213 *d++ = (bgr&0x1F)<<3; 1212 *d++ = (bgr&0x1F)<<3;
1214 *d++ = (bgr&0x7E0)>>3; 1213 *d++ = (bgr&0x7E0)>>3;
1215 *d++ = (bgr&0xF800)>>8; 1214 *d++ = (bgr&0xF800)>>8;
1216 } 1215 }
1217 } 1216 }
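The scalar tail above spells out the RGB565 layout that the MMX masks implement: blue in the low 5 bits, green in the middle 6, red in the top 5, each field shifted up to an 8-bit value with the low bits left at zero. A standalone worked example for a single pixel follows; the helper name is made up for illustration and is not part of this file:

    #include <stdint.h>
    #include <stdio.h>

    /* Expand one RGB565 word into 8-bit B, G, R, mirroring the C tail loop
     * above (low bits are not replicated, just zero-filled). */
    static void rgb565_to_bgr24_pixel(uint16_t bgr, uint8_t out[3])
    {
        out[0] = (bgr & 0x001F) << 3;   /* B: 5 bits -> bits 7..3 */
        out[1] = (bgr & 0x07E0) >> 3;   /* G: 6 bits -> bits 7..2 */
        out[2] = (bgr & 0xF800) >> 8;   /* R: 5 bits -> bits 7..3 */
    }

    int main(void)
    {
        uint8_t p[3];
        rgb565_to_bgr24_pixel(0xFFFF, p);               /* white */
        printf("%02X %02X %02X\n", p[0], p[1], p[2]);   /* prints F8 FC F8 */
        return 0;
    }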
1218 1217
1219 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size) 1218 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1220 { 1219 {
1221 const uint16_t *end; 1220 const uint16_t *end;
1222 #ifdef HAVE_MMX 1221 #ifdef HAVE_MMX
1223 const uint16_t *mm_end; 1222 const uint16_t *mm_end;
1224 #endif 1223 #endif
1225 uint8_t *d = (uint8_t *)dst; 1224 uint8_t *d = (uint8_t *)dst;
1226 const uint16_t *s = (const uint16_t *)src; 1225 const uint16_t *s = (const uint16_t *)src;
1227 end = s + src_size/2; 1226 end = s + src_size/2;
1228 #ifdef HAVE_MMX 1227 #ifdef HAVE_MMX
1229 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 1228 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1230 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); 1229 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1231 mm_end = end - 3; 1230 mm_end = end - 3;
1232 while(s < mm_end) 1231 while (s < mm_end)
1233 { 1232 {
1234 __asm __volatile( 1233 __asm __volatile(
1235 PREFETCH" 32%1\n\t" 1234 PREFETCH" 32%1 \n\t"
1236 "movq %1, %%mm0\n\t" 1235 "movq %1, %%mm0 \n\t"
1237 "movq %1, %%mm1\n\t" 1236 "movq %1, %%mm1 \n\t"
1238 "movq %1, %%mm2\n\t" 1237 "movq %1, %%mm2 \n\t"
1239 "pand %2, %%mm0\n\t" 1238 "pand %2, %%mm0 \n\t"
1240 "pand %3, %%mm1\n\t" 1239 "pand %3, %%mm1 \n\t"
1241 "pand %4, %%mm2\n\t" 1240 "pand %4, %%mm2 \n\t"
1242 "psllq $3, %%mm0\n\t" 1241 "psllq $3, %%mm0 \n\t"
1243 "psrlq $2, %%mm1\n\t" 1242 "psrlq $2, %%mm1 \n\t"
1244 "psrlq $7, %%mm2\n\t" 1243 "psrlq $7, %%mm2 \n\t"
1245 "movq %%mm0, %%mm3\n\t" 1244 "movq %%mm0, %%mm3 \n\t"
1246 "movq %%mm1, %%mm4\n\t" 1245 "movq %%mm1, %%mm4 \n\t"
1247 "movq %%mm2, %%mm5\n\t" 1246 "movq %%mm2, %%mm5 \n\t"
1248 "punpcklwd %%mm7, %%mm0\n\t" 1247 "punpcklwd %%mm7, %%mm0 \n\t"
1249 "punpcklwd %%mm7, %%mm1\n\t" 1248 "punpcklwd %%mm7, %%mm1 \n\t"
1250 "punpcklwd %%mm7, %%mm2\n\t" 1249 "punpcklwd %%mm7, %%mm2 \n\t"
1251 "punpckhwd %%mm7, %%mm3\n\t" 1250 "punpckhwd %%mm7, %%mm3 \n\t"
1252 "punpckhwd %%mm7, %%mm4\n\t" 1251 "punpckhwd %%mm7, %%mm4 \n\t"
1253 "punpckhwd %%mm7, %%mm5\n\t" 1252 "punpckhwd %%mm7, %%mm5 \n\t"
1254 "psllq $8, %%mm1\n\t" 1253 "psllq $8, %%mm1 \n\t"
1255 "psllq $16, %%mm2\n\t" 1254 "psllq $16, %%mm2 \n\t"
1256 "por %%mm1, %%mm0\n\t" 1255 "por %%mm1, %%mm0 \n\t"
1257 "por %%mm2, %%mm0\n\t" 1256 "por %%mm2, %%mm0 \n\t"
1258 "psllq $8, %%mm4\n\t" 1257 "psllq $8, %%mm4 \n\t"
1259 "psllq $16, %%mm5\n\t" 1258 "psllq $16, %%mm5 \n\t"
1260 "por %%mm4, %%mm3\n\t" 1259 "por %%mm4, %%mm3 \n\t"
1261 "por %%mm5, %%mm3\n\t" 1260 "por %%mm5, %%mm3 \n\t"
1262 MOVNTQ" %%mm0, %0\n\t" 1261 MOVNTQ" %%mm0, %0 \n\t"
1263 MOVNTQ" %%mm3, 8%0\n\t" 1262 MOVNTQ" %%mm3, 8%0 \n\t"
1264 :"=m"(*d) 1263 :"=m"(*d)
1265 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) 1264 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1266 :"memory"); 1265 :"memory");
1267 d += 16; 1266 d += 16;
1268 s += 4; 1267 s += 4;
1269 } 1268 }
1270 __asm __volatile(SFENCE:::"memory"); 1269 __asm __volatile(SFENCE:::"memory");
1271 __asm __volatile(EMMS:::"memory"); 1270 __asm __volatile(EMMS:::"memory");
1272 #endif 1271 #endif
1273 while(s < end) 1272 while (s < end)
1274 { 1273 {
1275 #if 0 //slightly slower on athlon 1274 #if 0 //slightly slower on athlon
1276 int bgr= *s++; 1275 int bgr= *s++;
1277 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9); 1276 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1278 #else 1277 #else
1279 register uint16_t bgr; 1278 register uint16_t bgr;
1280 bgr = *s++; 1279 bgr = *s++;
1281 #ifdef WORDS_BIGENDIAN 1280 #ifdef WORDS_BIGENDIAN
1282 *d++ = 0; 1281 *d++ = 0;
1283 *d++ = (bgr&0x7C00)>>7; 1282 *d++ = (bgr&0x7C00)>>7;
1284 *d++ = (bgr&0x3E0)>>2; 1283 *d++ = (bgr&0x3E0)>>2;
1285 *d++ = (bgr&0x1F)<<3; 1284 *d++ = (bgr&0x1F)<<3;
1286 #else 1285 #else
1287 *d++ = (bgr&0x1F)<<3; 1286 *d++ = (bgr&0x1F)<<3;
1288 *d++ = (bgr&0x3E0)>>2; 1287 *d++ = (bgr&0x3E0)>>2;
1289 *d++ = (bgr&0x7C00)>>7; 1288 *d++ = (bgr&0x7C00)>>7;
1290 *d++ = 0; 1289 *d++ = 0;
1291 #endif 1290 #endif
1292 1291
1293 #endif 1292 #endif
1294 } 1293 }
1295 } 1294 }
1296 1295
1297 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size) 1296 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1298 { 1297 {
1299 const uint16_t *end; 1298 const uint16_t *end;
1300 #ifdef HAVE_MMX 1299 #ifdef HAVE_MMX
1301 const uint16_t *mm_end; 1300 const uint16_t *mm_end;
1302 #endif 1301 #endif
1303 uint8_t *d = (uint8_t *)dst; 1302 uint8_t *d = (uint8_t *)dst;
1304 const uint16_t *s = (uint16_t *)src; 1303 const uint16_t *s = (uint16_t *)src;
1305 end = s + src_size/2; 1304 end = s + src_size/2;
1306 #ifdef HAVE_MMX 1305 #ifdef HAVE_MMX
1307 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 1306 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1308 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); 1307 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1309 mm_end = end - 3; 1308 mm_end = end - 3;
1310 while(s < mm_end) 1309 while (s < mm_end)
1311 { 1310 {
1312 __asm __volatile( 1311 __asm __volatile(
1313 PREFETCH" 32%1\n\t" 1312 PREFETCH" 32%1 \n\t"
1314 "movq %1, %%mm0\n\t" 1313 "movq %1, %%mm0 \n\t"
1315 "movq %1, %%mm1\n\t" 1314 "movq %1, %%mm1 \n\t"
1316 "movq %1, %%mm2\n\t" 1315 "movq %1, %%mm2 \n\t"
1317 "pand %2, %%mm0\n\t" 1316 "pand %2, %%mm0 \n\t"
1318 "pand %3, %%mm1\n\t" 1317 "pand %3, %%mm1 \n\t"
1319 "pand %4, %%mm2\n\t" 1318 "pand %4, %%mm2 \n\t"
1320 "psllq $3, %%mm0\n\t" 1319 "psllq $3, %%mm0 \n\t"
1321 "psrlq $3, %%mm1\n\t" 1320 "psrlq $3, %%mm1 \n\t"
1322 "psrlq $8, %%mm2\n\t" 1321 "psrlq $8, %%mm2 \n\t"
1323 "movq %%mm0, %%mm3\n\t" 1322 "movq %%mm0, %%mm3 \n\t"
1324 "movq %%mm1, %%mm4\n\t" 1323 "movq %%mm1, %%mm4 \n\t"
1325 "movq %%mm2, %%mm5\n\t" 1324 "movq %%mm2, %%mm5 \n\t"
1326 "punpcklwd %%mm7, %%mm0\n\t" 1325 "punpcklwd %%mm7, %%mm0 \n\t"
1327 "punpcklwd %%mm7, %%mm1\n\t" 1326 "punpcklwd %%mm7, %%mm1 \n\t"
1328 "punpcklwd %%mm7, %%mm2\n\t" 1327 "punpcklwd %%mm7, %%mm2 \n\t"
1329 "punpckhwd %%mm7, %%mm3\n\t" 1328 "punpckhwd %%mm7, %%mm3 \n\t"
1330 "punpckhwd %%mm7, %%mm4\n\t" 1329 "punpckhwd %%mm7, %%mm4 \n\t"
1331 "punpckhwd %%mm7, %%mm5\n\t" 1330 "punpckhwd %%mm7, %%mm5 \n\t"
1332 "psllq $8, %%mm1\n\t" 1331 "psllq $8, %%mm1 \n\t"
1333 "psllq $16, %%mm2\n\t" 1332 "psllq $16, %%mm2 \n\t"
1334 "por %%mm1, %%mm0\n\t" 1333 "por %%mm1, %%mm0 \n\t"
1335 "por %%mm2, %%mm0\n\t" 1334 "por %%mm2, %%mm0 \n\t"
1336 "psllq $8, %%mm4\n\t" 1335 "psllq $8, %%mm4 \n\t"
1337 "psllq $16, %%mm5\n\t" 1336 "psllq $16, %%mm5 \n\t"
1338 "por %%mm4, %%mm3\n\t" 1337 "por %%mm4, %%mm3 \n\t"
1339 "por %%mm5, %%mm3\n\t" 1338 "por %%mm5, %%mm3 \n\t"
1340 MOVNTQ" %%mm0, %0\n\t" 1339 MOVNTQ" %%mm0, %0 \n\t"
1341 MOVNTQ" %%mm3, 8%0\n\t" 1340 MOVNTQ" %%mm3, 8%0 \n\t"
1342 :"=m"(*d) 1341 :"=m"(*d)
1343 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) 1342 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1344 :"memory"); 1343 :"memory");
1345 d += 16; 1344 d += 16;
1346 s += 4; 1345 s += 4;
1347 } 1346 }
1348 __asm __volatile(SFENCE:::"memory"); 1347 __asm __volatile(SFENCE:::"memory");
1349 __asm __volatile(EMMS:::"memory"); 1348 __asm __volatile(EMMS:::"memory");
1350 #endif 1349 #endif
1351 while(s < end) 1350 while (s < end)
1352 { 1351 {
1353 register uint16_t bgr; 1352 register uint16_t bgr;
1354 bgr = *s++; 1353 bgr = *s++;
1355 #ifdef WORDS_BIGENDIAN 1354 #ifdef WORDS_BIGENDIAN
1356 *d++ = 0; 1355 *d++ = 0;
1357 *d++ = (bgr&0xF800)>>8; 1356 *d++ = (bgr&0xF800)>>8;
1358 *d++ = (bgr&0x7E0)>>3; 1357 *d++ = (bgr&0x7E0)>>3;
1359 *d++ = (bgr&0x1F)<<3; 1358 *d++ = (bgr&0x1F)<<3;
1360 #else 1359 #else
1361 *d++ = (bgr&0x1F)<<3; 1360 *d++ = (bgr&0x1F)<<3;
1362 *d++ = (bgr&0x7E0)>>3; 1361 *d++ = (bgr&0x7E0)>>3;
1363 *d++ = (bgr&0xF800)>>8; 1362 *d++ = (bgr&0xF800)>>8;
1364 *d++ = 0; 1363 *d++ = 0;
1365 #endif 1364 #endif
1366 } 1365 }
1367 } 1366 }
1368 1367
1369 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) 1368 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1370 { 1369 {
1371 long idx = 15 - src_size; 1370 long idx = 15 - src_size;
1372 uint8_t *s = (uint8_t *) src-idx, *d = dst-idx; 1371 uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1373 #ifdef HAVE_MMX 1372 #ifdef HAVE_MMX
1374 __asm __volatile( 1373 __asm __volatile(
1375 "test %0, %0 \n\t" 1374 "test %0, %0 \n\t"
1376 "jns 2f \n\t" 1375 "jns 2f \n\t"
1377 PREFETCH" (%1, %0) \n\t" 1376 PREFETCH" (%1, %0) \n\t"
1378 "movq %3, %%mm7 \n\t" 1377 "movq %3, %%mm7 \n\t"
1379 "pxor %4, %%mm7 \n\t" 1378 "pxor %4, %%mm7 \n\t"
1380 "movq %%mm7, %%mm6 \n\t" 1379 "movq %%mm7, %%mm6 \n\t"
1381 "pxor %5, %%mm7 \n\t" 1380 "pxor %5, %%mm7 \n\t"
1382 ASMALIGN(4) 1381 ASMALIGN(4)
1383 "1: \n\t" 1382 "1: \n\t"
1384 PREFETCH" 32(%1, %0) \n\t" 1383 PREFETCH" 32(%1, %0) \n\t"
1385 "movq (%1, %0), %%mm0 \n\t" 1384 "movq (%1, %0), %%mm0 \n\t"
1386 "movq 8(%1, %0), %%mm1 \n\t" 1385 "movq 8(%1, %0), %%mm1 \n\t"
1387 # ifdef HAVE_MMX2 1386 # ifdef HAVE_MMX2
1388 "pshufw $177, %%mm0, %%mm3 \n\t" 1387 "pshufw $177, %%mm0, %%mm3 \n\t"
1389 "pshufw $177, %%mm1, %%mm5 \n\t" 1388 "pshufw $177, %%mm1, %%mm5 \n\t"
1390 "pand %%mm7, %%mm0 \n\t" 1389 "pand %%mm7, %%mm0 \n\t"
1391 "pand %%mm6, %%mm3 \n\t" 1390 "pand %%mm6, %%mm3 \n\t"
1392 "pand %%mm7, %%mm1 \n\t" 1391 "pand %%mm7, %%mm1 \n\t"
1393 "pand %%mm6, %%mm5 \n\t" 1392 "pand %%mm6, %%mm5 \n\t"
1394 "por %%mm3, %%mm0 \n\t" 1393 "por %%mm3, %%mm0 \n\t"
1395 "por %%mm5, %%mm1 \n\t" 1394 "por %%mm5, %%mm1 \n\t"
1396 # else 1395 # else
1397 "movq %%mm0, %%mm2 \n\t" 1396 "movq %%mm0, %%mm2 \n\t"
1398 "movq %%mm1, %%mm4 \n\t" 1397 "movq %%mm1, %%mm4 \n\t"
1399 "pand %%mm7, %%mm0 \n\t" 1398 "pand %%mm7, %%mm0 \n\t"
1400 "pand %%mm6, %%mm2 \n\t" 1399 "pand %%mm6, %%mm2 \n\t"
1401 "pand %%mm7, %%mm1 \n\t" 1400 "pand %%mm7, %%mm1 \n\t"
1402 "pand %%mm6, %%mm4 \n\t" 1401 "pand %%mm6, %%mm4 \n\t"
1403 "movq %%mm2, %%mm3 \n\t" 1402 "movq %%mm2, %%mm3 \n\t"
1404 "movq %%mm4, %%mm5 \n\t" 1403 "movq %%mm4, %%mm5 \n\t"
1405 "pslld $16, %%mm2 \n\t" 1404 "pslld $16, %%mm2 \n\t"
1406 "psrld $16, %%mm3 \n\t" 1405 "psrld $16, %%mm3 \n\t"
1407 "pslld $16, %%mm4 \n\t" 1406 "pslld $16, %%mm4 \n\t"
1408 "psrld $16, %%mm5 \n\t" 1407 "psrld $16, %%mm5 \n\t"
1409 "por %%mm2, %%mm0 \n\t" 1408 "por %%mm2, %%mm0 \n\t"
1410 "por %%mm4, %%mm1 \n\t" 1409 "por %%mm4, %%mm1 \n\t"
1411 "por %%mm3, %%mm0 \n\t" 1410 "por %%mm3, %%mm0 \n\t"
1412 "por %%mm5, %%mm1 \n\t" 1411 "por %%mm5, %%mm1 \n\t"
1413 # endif 1412 # endif
1414 MOVNTQ" %%mm0, (%2, %0) \n\t" 1413 MOVNTQ" %%mm0, (%2, %0) \n\t"
1415 MOVNTQ" %%mm1, 8(%2, %0) \n\t" 1414 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1416 "add $16, %0 \n\t" 1415 "add $16, %0 \n\t"
1417 "js 1b \n\t" 1416 "js 1b \n\t"
1418 SFENCE" \n\t" 1417 SFENCE" \n\t"
1419 EMMS" \n\t" 1418 EMMS" \n\t"
1420 "2: \n\t" 1419 "2: \n\t"
1421 : "+&r"(idx) 1420 : "+&r"(idx)
1422 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) 1421 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1423 : "memory"); 1422 : "memory");
1424 #endif 1423 #endif
1425 for (; idx<15; idx+=4) { 1424 for (; idx<15; idx+=4) {
1426 register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00; 1425 register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00;
1427 v &= 0xff00ff; 1426 v &= 0xff00ff;
1428 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); 1427 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1429 } 1428 }
1430 } 1429 }
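The C fallback above relies on a word-level trick: g keeps bytes 1 and 3 in place, while the two remaining bytes are exchanged by shifting the masked word 16 bits each way. A small self-contained sketch of that swap with a worked value (illustration only, not part of this file):

    #include <stdint.h>
    #include <assert.h>

    /* Swap bytes 0 and 2 of a 32-bit word and keep bytes 1 and 3,
     * exactly the operation the rgb32tobgr32 C fallback performs. */
    static uint32_t swap_bytes_0_2(uint32_t v)
    {
        uint32_t g = v & 0xFF00FF00u;       /* bytes 1 and 3 stay put      */
        v &= 0x00FF00FFu;                   /* isolate bytes 0 and 2       */
        return (v >> 16) + g + (v << 16);   /* exchange the isolated bytes */
    }

    int main(void)
    {
        /* 0x11223344 -> 0x11443322: byte 0 (0x44) and byte 2 (0x22) trade places. */
        assert(swap_bytes_0_2(0x11223344u) == 0x11443322u);
        return 0;
    }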
1431 1430
1432 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) 1431 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1433 { 1432 {
1434 unsigned i; 1433 unsigned i;
1435 #ifdef HAVE_MMX 1434 #ifdef HAVE_MMX
1436 long mmx_size= 23 - src_size; 1435 long mmx_size= 23 - src_size;
1437 asm volatile ( 1436 asm volatile (
1438 "test %%"REG_a", %%"REG_a" \n\t" 1437 "test %%"REG_a", %%"REG_a" \n\t"
1439 "jns 2f \n\t" 1438 "jns 2f \n\t"
1440 "movq "MANGLE(mask24r)", %%mm5 \n\t" 1439 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1441 "movq "MANGLE(mask24g)", %%mm6 \n\t" 1440 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1442 "movq "MANGLE(mask24b)", %%mm7 \n\t" 1441 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1443 ASMALIGN(4) 1442 ASMALIGN(4)
1444 "1: \n\t" 1443 "1: \n\t"
1445 PREFETCH" 32(%1, %%"REG_a") \n\t" 1444 PREFETCH" 32(%1, %%"REG_a") \n\t"
1446 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 1445 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1447 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG 1446 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1448 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B 1447 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1449 "psllq $16, %%mm0 \n\t" // 00 BGR BGR 1448 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1450 "pand %%mm5, %%mm0 \n\t" 1449 "pand %%mm5, %%mm0 \n\t"
1451 "pand %%mm6, %%mm1 \n\t" 1450 "pand %%mm6, %%mm1 \n\t"
1452 "pand %%mm7, %%mm2 \n\t" 1451 "pand %%mm7, %%mm2 \n\t"
1453 "por %%mm0, %%mm1 \n\t" 1452 "por %%mm0, %%mm1 \n\t"
1454 "por %%mm2, %%mm1 \n\t" 1453 "por %%mm2, %%mm1 \n\t"
1455 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 1454 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1456 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG 1455 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1457 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B 1456 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1458 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR 1457 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1459 "pand %%mm7, %%mm0 \n\t" 1458 "pand %%mm7, %%mm0 \n\t"
1460 "pand %%mm5, %%mm1 \n\t" 1459 "pand %%mm5, %%mm1 \n\t"
1461 "pand %%mm6, %%mm2 \n\t" 1460 "pand %%mm6, %%mm2 \n\t"
1462 "por %%mm0, %%mm1 \n\t" 1461 "por %%mm0, %%mm1 \n\t"
1463 "por %%mm2, %%mm1 \n\t" 1462 "por %%mm2, %%mm1 \n\t"
1464 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B 1463 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1465 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R 1464 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1466 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR 1465 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1467 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG 1466 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1468 "pand %%mm6, %%mm0 \n\t" 1467 "pand %%mm6, %%mm0 \n\t"
1469 "pand %%mm7, %%mm1 \n\t" 1468 "pand %%mm7, %%mm1 \n\t"
1470 "pand %%mm5, %%mm2 \n\t" 1469 "pand %%mm5, %%mm2 \n\t"
1471 "por %%mm0, %%mm1 \n\t" 1470 "por %%mm0, %%mm1 \n\t"
1472 "por %%mm2, %%mm1 \n\t" 1471 "por %%mm2, %%mm1 \n\t"
1473 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t" 1472 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1474 "add $24, %%"REG_a" \n\t" 1473 "add $24, %%"REG_a" \n\t"
1475 " js 1b \n\t" 1474 " js 1b \n\t"
1476 "2: \n\t" 1475 "2: \n\t"
1477 : "+a" (mmx_size) 1476 : "+a" (mmx_size)
1478 : "r" (src-mmx_size), "r"(dst-mmx_size) 1477 : "r" (src-mmx_size), "r"(dst-mmx_size)
1479 ); 1478 );
1480 1479
1481 __asm __volatile(SFENCE:::"memory"); 1480 __asm __volatile(SFENCE:::"memory");
1482 __asm __volatile(EMMS:::"memory"); 1481 __asm __volatile(EMMS:::"memory");
1483 1482
1484 if(mmx_size==23) return; //finished, was a multiple of 8 1483 if (mmx_size==23) return; //finished, was a multiple of 8
1485 1484
1486 src+= src_size; 1485 src+= src_size;
1487 dst+= src_size; 1486 dst+= src_size;
1488 src_size= 23-mmx_size; 1487 src_size= 23-mmx_size;
1489 src-= src_size; 1488 src-= src_size;
1490 dst-= src_size; 1489 dst-= src_size;
1491 #endif 1490 #endif
1492 for(i=0; i<src_size; i+=3) 1491 for (i=0; i<src_size; i+=3)
1493 { 1492 {
1494 register uint8_t x; 1493 register uint8_t x;
1495 x = src[i + 2]; 1494 x = src[i + 2];
1496 dst[i + 1] = src[i + 1]; 1495 dst[i + 1] = src[i + 1];
1497 dst[i + 2] = src[i + 0]; 1496 dst[i + 2] = src[i + 0];
1498 dst[i + 0] = x; 1497 dst[i + 0] = x;
1499 } 1498 }
1500 } 1499 }
1501 1500
1502 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 1501 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1503 long width, long height, 1502 long width, long height,
1504 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) 1503 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1505 { 1504 {
1506 long y; 1505 long y;
1507 const long chromWidth= width>>1; 1506 const long chromWidth= width>>1;
1508 for(y=0; y<height; y++) 1507 for (y=0; y<height; y++)
1509 { 1508 {
1510 #ifdef HAVE_MMX 1509 #ifdef HAVE_MMX
1511 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway) 1510 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1512 asm volatile( 1511 asm volatile(
1513 "xor %%"REG_a", %%"REG_a" \n\t" 1512 "xor %%"REG_a", %%"REG_a" \n\t"
1514 ASMALIGN(4) 1513 ASMALIGN(4)
1515 "1: \n\t" 1514 "1: \n\t"
1516 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" 1515 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1517 PREFETCH" 32(%2, %%"REG_a") \n\t" 1516 PREFETCH" 32(%2, %%"REG_a") \n\t"
1518 PREFETCH" 32(%3, %%"REG_a") \n\t" 1517 PREFETCH" 32(%3, %%"REG_a") \n\t"
1519 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) 1518 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1520 "movq %%mm0, %%mm2 \n\t" // U(0) 1519 "movq %%mm0, %%mm2 \n\t" // U(0)
1521 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) 1520 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1522 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1521 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1523 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 1522 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1524 1523
1525 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) 1524 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1526 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) 1525 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1527 "movq %%mm3, %%mm4 \n\t" // Y(0) 1526 "movq %%mm3, %%mm4 \n\t" // Y(0)
1528 "movq %%mm5, %%mm6 \n\t" // Y(8) 1527 "movq %%mm5, %%mm6 \n\t" // Y(8)
1529 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) 1528 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1530 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) 1529 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1531 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) 1530 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1532 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) 1531 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1533 1532
1534 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t" 1533 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1535 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" 1534 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1536 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t" 1535 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1537 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" 1536 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1538 1537
1539 "add $8, %%"REG_a" \n\t" 1538 "add $8, %%"REG_a" \n\t"
1540 "cmp %4, %%"REG_a" \n\t" 1539 "cmp %4, %%"REG_a" \n\t"
1541 " jb 1b \n\t" 1540 " jb 1b \n\t"
1542 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 1541 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1543 : "%"REG_a 1542 : "%"REG_a
1544 ); 1543 );
1545 #else 1544 #else
1546 1545
1547 #if defined ARCH_ALPHA && defined HAVE_MVI 1546 #if defined ARCH_ALPHA && defined HAVE_MVI
1548 #define pl2yuy2(n) \ 1547 #define pl2yuy2(n) \
1549 y1 = yc[n]; \ 1548 y1 = yc[n]; \
1550 y2 = yc2[n]; \ 1549 y2 = yc2[n]; \
1551 u = uc[n]; \ 1550 u = uc[n]; \
1552 v = vc[n]; \ 1551 v = vc[n]; \
1553 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \ 1552 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1554 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \ 1553 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1555 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \ 1554 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1556 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \ 1555 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1557 yuv1 = (u << 8) + (v << 24); \ 1556 yuv1 = (u << 8) + (v << 24); \
1558 yuv2 = yuv1 + y2; \ 1557 yuv2 = yuv1 + y2; \
1559 yuv1 += y1; \ 1558 yuv1 += y1; \
1560 qdst[n] = yuv1; \ 1559 qdst[n] = yuv1; \
1561 qdst2[n] = yuv2; 1560 qdst2[n] = yuv2;
1562 1561
1563 int i; 1562 int i;
1564 uint64_t *qdst = (uint64_t *) dst; 1563 uint64_t *qdst = (uint64_t *) dst;
1565 uint64_t *qdst2 = (uint64_t *) (dst + dstStride); 1564 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1566 const uint32_t *yc = (uint32_t *) ysrc; 1565 const uint32_t *yc = (uint32_t *) ysrc;
1567 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride); 1566 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1568 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc; 1567 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1569 for(i = 0; i < chromWidth; i += 8){ 1568 for (i = 0; i < chromWidth; i += 8){
1570 uint64_t y1, y2, yuv1, yuv2; 1569 uint64_t y1, y2, yuv1, yuv2;
1571 uint64_t u, v; 1570 uint64_t u, v;
1572 /* Prefetch */ 1571 /* Prefetch */
1573 asm("ldq $31,64(%0)" :: "r"(yc)); 1572 asm("ldq $31,64(%0)" :: "r"(yc));
1574 asm("ldq $31,64(%0)" :: "r"(yc2)); 1573 asm("ldq $31,64(%0)" :: "r"(yc2));
1575 asm("ldq $31,64(%0)" :: "r"(uc)); 1574 asm("ldq $31,64(%0)" :: "r"(uc));
1576 asm("ldq $31,64(%0)" :: "r"(vc)); 1575 asm("ldq $31,64(%0)" :: "r"(vc));
1577 1576
1578 pl2yuy2(0); 1577 pl2yuy2(0);
1579 pl2yuy2(1); 1578 pl2yuy2(1);
1580 pl2yuy2(2); 1579 pl2yuy2(2);
1581 pl2yuy2(3); 1580 pl2yuy2(3);
1582 1581
1583 yc += 4; 1582 yc += 4;
1584 yc2 += 4; 1583 yc2 += 4;
1585 uc += 4; 1584 uc += 4;
1586 vc += 4; 1585 vc += 4;
1587 qdst += 4; 1586 qdst += 4;
1588 qdst2 += 4; 1587 qdst2 += 4;
1589 } 1588 }
1590 y++; 1589 y++;
1591 ysrc += lumStride; 1590 ysrc += lumStride;
1592 dst += dstStride; 1591 dst += dstStride;
1593 1592
1594 #elif __WORDSIZE >= 64 1593 #elif __WORDSIZE >= 64
1595 int i; 1594 int i;
1596 uint64_t *ldst = (uint64_t *) dst; 1595 uint64_t *ldst = (uint64_t *) dst;
1597 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 1596 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1598 for(i = 0; i < chromWidth; i += 2){ 1597 for (i = 0; i < chromWidth; i += 2){
1599 uint64_t k, l; 1598 uint64_t k, l;
1600 k = yc[0] + (uc[0] << 8) + 1599 k = yc[0] + (uc[0] << 8) +
1601 (yc[1] << 16) + (vc[0] << 24); 1600 (yc[1] << 16) + (vc[0] << 24);
1602 l = yc[2] + (uc[1] << 8) + 1601 l = yc[2] + (uc[1] << 8) +
1603 (yc[3] << 16) + (vc[1] << 24); 1602 (yc[3] << 16) + (vc[1] << 24);
1604 *ldst++ = k + (l << 32); 1603 *ldst++ = k + (l << 32);
1605 yc += 4; 1604 yc += 4;
1606 uc += 2; 1605 uc += 2;
1607 vc += 2; 1606 vc += 2;
1608 } 1607 }
1609 1608
1610 #else 1609 #else
1611 int i, *idst = (int32_t *) dst; 1610 int i, *idst = (int32_t *) dst;
1612 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 1611 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1613 for(i = 0; i < chromWidth; i++){ 1612 for (i = 0; i < chromWidth; i++){
1614 #ifdef WORDS_BIGENDIAN 1613 #ifdef WORDS_BIGENDIAN
1615 *idst++ = (yc[0] << 24)+ (uc[0] << 16) + 1614 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1616 (yc[1] << 8) + (vc[0] << 0); 1615 (yc[1] << 8) + (vc[0] << 0);
1617 #else 1616 #else
1618 *idst++ = yc[0] + (uc[0] << 8) + 1617 *idst++ = yc[0] + (uc[0] << 8) +
1619 (yc[1] << 16) + (vc[0] << 24); 1618 (yc[1] << 16) + (vc[0] << 24);
1620 #endif 1619 #endif
1621 yc += 2; 1620 yc += 2;
1622 uc++; 1621 uc++;
1623 vc++; 1622 vc++;
1624 } 1623 }
1625 #endif 1624 #endif
1626 #endif 1625 #endif
1627 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) 1626 if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1628 { 1627 {
1629 usrc += chromStride; 1628 usrc += chromStride;
1630 vsrc += chromStride; 1629 vsrc += chromStride;
1631 } 1630 }
1632 ysrc += lumStride; 1631 ysrc += lumStride;
1633 dst += dstStride; 1632 dst += dstStride;
1634 } 1633 }
1635 #ifdef HAVE_MMX 1634 #ifdef HAVE_MMX
1636 asm( EMMS" \n\t" 1635 asm( EMMS" \n\t"
1637 SFENCE" \n\t" 1636 SFENCE" \n\t"
1638 :::"memory"); 1637 :::"memory");
1639 #endif 1638 #endif
1640 } 1639 }
1641 1640
1642 /** 1641 /**
1643 * 1642 *
1644 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a 1643 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1645 * problem for anyone then tell me, and I'll fix it) 1644 * problem for anyone then tell me, and I'll fix it)
1646 */ 1645 */
1647 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 1646 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1648 long width, long height, 1647 long width, long height,
1649 long lumStride, long chromStride, long dstStride) 1648 long lumStride, long chromStride, long dstStride)
1650 { 1649 {
1651 //FIXME interpolate chroma 1650 //FIXME interpolate chroma
1652 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); 1651 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1653 } 1652 }
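A note on the stride parameters for callers: the YV12 input is planar with quarter-resolution chroma, and the YUY2 output packs two pixels into four bytes. Below is a minimal invocation sketch for a tightly packed frame; the wrapper name and the assumption that the dispatching yv12toyuy2 entry point (rather than a specific RENAME()d variant) is called are illustrative, not taken from this file:

    /* Hypothetical caller: convert a tightly packed width x height YV12
     * frame to YUY2.  Assumes width is a multiple of 16 and height a
     * multiple of 2, as the comment above requires. */
    static void convert_yv12_frame(const uint8_t *y, const uint8_t *u,
                                   const uint8_t *v, uint8_t *dst,
                                   long width, long height)
    {
        long lumStride   = width;       /* 1 byte per luma sample         */
        long chromStride = width / 2;   /* chroma is subsampled 2x2       */
        long dstStride   = width * 2;   /* YUY2: 2 bytes per output pixel */
        yv12toyuy2(y, u, v, dst, width, height,
                   lumStride, chromStride, dstStride);
    }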
1654 1653
1655 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 1654 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1656 long width, long height, 1655 long width, long height,
1657 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) 1656 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1658 { 1657 {
1659 long y; 1658 long y;
1660 const long chromWidth= width>>1; 1659 const long chromWidth= width>>1;
1661 for(y=0; y<height; y++) 1660 for (y=0; y<height; y++)
1662 { 1661 {
1663 #ifdef HAVE_MMX 1662 #ifdef HAVE_MMX
1664 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway) 1663 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1665 asm volatile( 1664 asm volatile(
1666 "xor %%"REG_a", %%"REG_a" \n\t" 1665 "xor %%"REG_a", %%"REG_a" \n\t"
1667 ASMALIGN(4) 1666 ASMALIGN(4)
1668 "1: \n\t" 1667 "1: \n\t"
1669 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" 1668 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1670 PREFETCH" 32(%2, %%"REG_a") \n\t" 1669 PREFETCH" 32(%2, %%"REG_a") \n\t"
1671 PREFETCH" 32(%3, %%"REG_a") \n\t" 1670 PREFETCH" 32(%3, %%"REG_a") \n\t"
1672 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) 1671 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1673 "movq %%mm0, %%mm2 \n\t" // U(0) 1672 "movq %%mm0, %%mm2 \n\t" // U(0)
1674 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) 1673 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1675 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1674 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1676 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 1675 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1677 1676
1678 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) 1677 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1679 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) 1678 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1680 "movq %%mm0, %%mm4 \n\t" // Y(0) 1679 "movq %%mm0, %%mm4 \n\t" // Y(0)
1681 "movq %%mm2, %%mm6 \n\t" // Y(8) 1680 "movq %%mm2, %%mm6 \n\t" // Y(8)
1682 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) 1681 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1683 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) 1682 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1684 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) 1683 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1685 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) 1684 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1686 1685
1687 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t" 1686 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1688 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" 1687 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1689 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t" 1688 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1690 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" 1689 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1691 1690
1692 "add $8, %%"REG_a" \n\t" 1691 "add $8, %%"REG_a" \n\t"
1693 "cmp %4, %%"REG_a" \n\t" 1692 "cmp %4, %%"REG_a" \n\t"
1694 " jb 1b \n\t" 1693 " jb 1b \n\t"
1695 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 1694 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1696 : "%"REG_a 1695 : "%"REG_a
1697 ); 1696 );
1698 #else 1697 #else
1699 //FIXME adapt the Alpha asm code from yv12->yuy2 1698 //FIXME adapt the Alpha asm code from yv12->yuy2
1700 1699
1701 #if __WORDSIZE >= 64 1700 #if __WORDSIZE >= 64
1702 int i; 1701 int i;
1703 uint64_t *ldst = (uint64_t *) dst; 1702 uint64_t *ldst = (uint64_t *) dst;
1704 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 1703 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1705 for(i = 0; i < chromWidth; i += 2){ 1704 for (i = 0; i < chromWidth; i += 2){
1706 uint64_t k, l; 1705 uint64_t k, l;
1707 k = uc[0] + (yc[0] << 8) + 1706 k = uc[0] + (yc[0] << 8) +
1708 (vc[0] << 16) + (yc[1] << 24); 1707 (vc[0] << 16) + (yc[1] << 24);
1709 l = uc[1] + (yc[2] << 8) + 1708 l = uc[1] + (yc[2] << 8) +
1710 (vc[1] << 16) + (yc[3] << 24); 1709 (vc[1] << 16) + (yc[3] << 24);
1711 *ldst++ = k + (l << 32); 1710 *ldst++ = k + (l << 32);
1712 yc += 4; 1711 yc += 4;
1713 uc += 2; 1712 uc += 2;
1714 vc += 2; 1713 vc += 2;
1715 } 1714 }
1716 1715
1717 #else 1716 #else
1718 int i, *idst = (int32_t *) dst; 1717 int i, *idst = (int32_t *) dst;
1719 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 1718 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1720 for(i = 0; i < chromWidth; i++){ 1719 for (i = 0; i < chromWidth; i++){
1721 #ifdef WORDS_BIGENDIAN 1720 #ifdef WORDS_BIGENDIAN
1722 *idst++ = (uc[0] << 24)+ (yc[0] << 16) + 1721 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1723 (vc[0] << 8) + (yc[1] << 0); 1722 (vc[0] << 8) + (yc[1] << 0);
1724 #else 1723 #else
1725 *idst++ = uc[0] + (yc[0] << 8) + 1724 *idst++ = uc[0] + (yc[0] << 8) +
1726 (vc[0] << 16) + (yc[1] << 24); 1725 (vc[0] << 16) + (yc[1] << 24);
1727 #endif 1726 #endif
1728 yc += 2; 1727 yc += 2;
1729 uc++; 1728 uc++;
1730 vc++; 1729 vc++;
1731 } 1730 }
1732 #endif 1731 #endif
1733 #endif 1732 #endif
1734 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) 1733 if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1735 { 1734 {
1736 usrc += chromStride; 1735 usrc += chromStride;
1737 vsrc += chromStride; 1736 vsrc += chromStride;
1738 } 1737 }
1739 ysrc += lumStride; 1738 ysrc += lumStride;
1740 dst += dstStride; 1739 dst += dstStride;
1741 } 1740 }
1742 #ifdef HAVE_MMX 1741 #ifdef HAVE_MMX
1743 asm( EMMS" \n\t" 1742 asm( EMMS" \n\t"
1744 SFENCE" \n\t" 1743 SFENCE" \n\t"
1745 :::"memory"); 1744 :::"memory");
1746 #endif 1745 #endif
1747 } 1746 }
1748 1747
1749 /** 1748 /**
1750 * 1749 *
1751 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a 1750 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1752 * problem for anyone then tell me, and I'll fix it) 1751 * problem for anyone then tell me, and I'll fix it)
1753 */ 1752 */
1754 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 1753 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1755 long width, long height, 1754 long width, long height,
1756 long lumStride, long chromStride, long dstStride) 1755 long lumStride, long chromStride, long dstStride)
1757 { 1756 {
1758 //FIXME interpolate chroma 1757 //FIXME interpolate chroma
1759 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); 1758 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1760 } 1759 }
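The only difference from the yuy2 wrapper above is the byte order inside each 4-byte group: YUY2 stores Y0 U Y1 V, UYVY stores U Y0 V Y1, which is what the swapped punpck operand order and the C fallbacks express. A byte-wise sketch of both packings for one pixel pair (illustrative helper, endianness-independent):

    /* Pack one horizontal pixel pair (Y0, Y1 sharing U, V) in both
     * packed 4:2:2 layouts produced by the two wrappers above. */
    static void pack_422_pair(uint8_t y0, uint8_t y1, uint8_t u, uint8_t v,
                              uint8_t yuy2[4], uint8_t uyvy[4])
    {
        yuy2[0] = y0;  yuy2[1] = u;   yuy2[2] = y1;  yuy2[3] = v;
        uyvy[0] = u;   uyvy[1] = y0;  uyvy[2] = v;   uyvy[3] = y1;
    }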
1761 1760
1762 /** 1761 /**
1763 * 1762 *
1764 * width should be a multiple of 16 1763 * width should be a multiple of 16
1765 */ 1764 */
1766 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 1765 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1767 long width, long height, 1766 long width, long height,
1768 long lumStride, long chromStride, long dstStride) 1767 long lumStride, long chromStride, long dstStride)
1769 { 1768 {
1770 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); 1769 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1771 } 1770 }
1772 1771
1773 /** 1772 /**
1774 * 1773 *
1775 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a 1774 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1776 * problem for anyone then tell me, and I'll fix it) 1775 * problem for anyone then tell me, and I'll fix it)
1777 */ 1776 */
1778 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 1777 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1779 long width, long height, 1778 long width, long height,
1780 long lumStride, long chromStride, long srcStride) 1779 long lumStride, long chromStride, long srcStride)
1781 { 1780 {
1782 long y; 1781 long y;
1783 const long chromWidth= width>>1; 1782 const long chromWidth= width>>1;
1784 for(y=0; y<height; y+=2) 1783 for (y=0; y<height; y+=2)
1785 { 1784 {
1786 #ifdef HAVE_MMX 1785 #ifdef HAVE_MMX
1787 asm volatile( 1786 asm volatile(
1788 "xor %%"REG_a", %%"REG_a" \n\t" 1787 "xor %%"REG_a", %%"REG_a" \n\t"
1789 "pcmpeqw %%mm7, %%mm7 \n\t" 1788 "pcmpeqw %%mm7, %%mm7 \n\t"
1790 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 1789 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1791 ASMALIGN(4) 1790 ASMALIGN(4)
1792 "1: \n\t" 1791 "1: \n\t"
1793 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 1792 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1794 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 1793 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1795 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) 1794 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1796 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) 1795 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1797 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) 1796 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1798 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) 1797 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1799 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) 1798 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1800 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) 1799 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1801 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) 1800 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1802 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1801 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1803 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) 1802 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1804 1803
1805 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t" 1804 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1806 1805
1807 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8) 1806 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1808 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12) 1807 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1809 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) 1808 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1810 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) 1809 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1811 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) 1810 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1812 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) 1811 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1813 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) 1812 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1814 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) 1813 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1815 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) 1814 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1816 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) 1815 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1817 1816
1818 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t" 1817 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1819 1818
1820 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) 1819 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1821 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) 1820 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1822 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) 1821 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1823 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) 1822 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1824 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) 1823 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1825 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) 1824 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1826 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) 1825 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1827 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) 1826 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1828 1827
1829 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" 1828 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1830 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" 1829 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1831 1830
1832 "add $8, %%"REG_a" \n\t" 1831 "add $8, %%"REG_a" \n\t"
1833 "cmp %4, %%"REG_a" \n\t" 1832 "cmp %4, %%"REG_a" \n\t"
1834 " jb 1b \n\t" 1833 " jb 1b \n\t"
1835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 1834 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1836 : "memory", "%"REG_a 1835 : "memory", "%"REG_a
1837 ); 1836 );
1838 1837
1839 ydst += lumStride; 1838 ydst += lumStride;
1840 src += srcStride; 1839 src += srcStride;
1841 1840
1842 asm volatile( 1841 asm volatile(
1843 "xor %%"REG_a", %%"REG_a" \n\t" 1842 "xor %%"REG_a", %%"REG_a" \n\t"
1844 ASMALIGN(4) 1843 ASMALIGN(4)
1845 "1: \n\t" 1844 "1: \n\t"
1846 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 1845 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1847 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 1846 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1848 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) 1847 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1849 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8) 1848 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1850 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12) 1849 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1851 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) 1850 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1852 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) 1851 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1853 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) 1852 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1854 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) 1853 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1855 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) 1854 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1856 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) 1855 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1857 1856
1858 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t" 1857 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1859 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t" 1858 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1860 1859
1861 "add $8, %%"REG_a" \n\t" 1860 "add $8, %%"REG_a" \n\t"
1862 "cmp %4, %%"REG_a" \n\t" 1861 "cmp %4, %%"REG_a" \n\t"
1863 " jb 1b \n\t" 1862 " jb 1b \n\t"
1864 1863
1865 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 1864 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1866 : "memory", "%"REG_a 1865 : "memory", "%"REG_a
1867 ); 1866 );
1868 #else 1867 #else
1869 long i; 1868 long i;
1870 for(i=0; i<chromWidth; i++) 1869 for (i=0; i<chromWidth; i++)
1871 { 1870 {
1872 ydst[2*i+0] = src[4*i+0]; 1871 ydst[2*i+0] = src[4*i+0];
1873 udst[i] = src[4*i+1]; 1872 udst[i] = src[4*i+1];
1874 ydst[2*i+1] = src[4*i+2]; 1873 ydst[2*i+1] = src[4*i+2];
1875 vdst[i] = src[4*i+3]; 1874 vdst[i] = src[4*i+3];
1876 } 1875 }
1877 ydst += lumStride; 1876 ydst += lumStride;
1878 src += srcStride; 1877 src += srcStride;
1879 1878
1880 for(i=0; i<chromWidth; i++) 1879 for (i=0; i<chromWidth; i++)
1881 { 1880 {
1882 ydst[2*i+0] = src[4*i+0]; 1881 ydst[2*i+0] = src[4*i+0];
1883 ydst[2*i+1] = src[4*i+2]; 1882 ydst[2*i+1] = src[4*i+2];
1884 } 1883 }
1885 #endif 1884 #endif
1886 udst += chromStride; 1885 udst += chromStride;
1887 vdst += chromStride; 1886 vdst += chromStride;
1888 ydst += lumStride; 1887 ydst += lumStride;
1889 src += srcStride; 1888 src += srcStride;
1890 } 1889 }
1891 #ifdef HAVE_MMX 1890 #ifdef HAVE_MMX
1892 asm volatile( EMMS" \n\t" 1891 asm volatile( EMMS" \n\t"
1893 SFENCE" \n\t" 1892 SFENCE" \n\t"
1894 :::"memory"); 1893 :::"memory");
1895 #endif 1894 #endif
1896 } 1895 }
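Going the other way, the destination is plain 4:2:0: width*height bytes of luma plus (width/2)*(height/2) bytes each for U and V, with chroma sampled from every other input line. A hedged allocation-and-call sketch (buffer names are illustrative; yuy2toyv12 is assumed to be the dispatching entry point):

    #include <stdint.h>
    #include <stdlib.h>

    /* Hypothetical caller: split a packed YUY2 frame into YV12 planes.
     * Assumes width is a multiple of 16 and height a multiple of 2. */
    static int split_yuy2_frame(const uint8_t *src, long width, long height)
    {
        uint8_t *ydst = malloc(width * height);
        uint8_t *udst = malloc((width / 2) * (height / 2));
        uint8_t *vdst = malloc((width / 2) * (height / 2));
        if (!ydst || !udst || !vdst) {
            free(ydst); free(udst); free(vdst);
            return -1;
        }
        yuy2toyv12(src, ydst, udst, vdst, width, height,
                   width /*lumStride*/, width / 2 /*chromStride*/,
                   width * 2 /*srcStride*/);
        /* ... consume the planes ... */
        free(ydst); free(udst); free(vdst);
        return 0;
    }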
1897 1896
1898 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, 1897 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1899 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 1898 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1900 long width, long height, long lumStride, long chromStride) 1899 long width, long height, long lumStride, long chromStride)
1901 { 1900 {
1902 /* Y Plane */ 1901 /* Y Plane */
1903 memcpy(ydst, ysrc, width*height); 1902 memcpy(ydst, ysrc, width*height);
1904 1903
1905 /* XXX: implement upscaling for U,V */ 1904 /* XXX: implement upscaling for U,V */
1906 } 1905 }
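The XXX above is the missing half of the conversion: YVU9 chroma is subsampled 4x4 while YV12 chroma is 2x2, so every source chroma sample has to cover a 2x2 block of the destination plane. One possible, deliberately simple nearest-neighbour sketch of that upscaling; this is not the author's implementation, only an illustration of what the XXX asks for:

    #include <stdint.h>

    /* Possible U/V upscaling for yvu9toyv12: replicate each source sample
     * into a 2x2 block.  srcW/srcH are the YVU9 chroma dimensions
     * (width/4 x height/4); the destination plane is twice as large in
     * each direction. */
    static void chroma_upscale2x(const uint8_t *src, uint8_t *dst,
                                 long srcW, long srcH,
                                 long srcStride, long dstStride)
    {
        long x, y;
        for (y = 0; y < srcH; y++)
            for (x = 0; x < srcW; x++) {
                uint8_t c = src[y*srcStride + x];
                dst[(2*y  )*dstStride + 2*x    ] = c;
                dst[(2*y  )*dstStride + 2*x + 1] = c;
                dst[(2*y+1)*dstStride + 2*x    ] = c;
                dst[(2*y+1)*dstStride + 2*x + 1] = c;
            }
    }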
1907 1906
1908 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride) 1907 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1909 { 1908 {
1910 long x,y; 1909 long x,y;
1911 1910
1912 dst[0]= src[0]; 1911 dst[0]= src[0];
1913 1912
1914 // first line 1913 // first line
1915 for(x=0; x<srcWidth-1; x++){ 1914 for (x=0; x<srcWidth-1; x++){
1916 dst[2*x+1]= (3*src[x] + src[x+1])>>2; 1915 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1917 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; 1916 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1918 } 1917 }
1919 dst[2*srcWidth-1]= src[srcWidth-1]; 1918 dst[2*srcWidth-1]= src[srcWidth-1];
1920 1919
1921 dst+= dstStride; 1920 dst+= dstStride;
1922 1921
1923 for(y=1; y<srcHeight; y++){ 1922 for (y=1; y<srcHeight; y++){
1924 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1923 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1925 const long mmxSize= srcWidth&~15; 1924 const long mmxSize= srcWidth&~15;
1926 asm volatile( 1925 asm volatile(
1927 "mov %4, %%"REG_a" \n\t" 1926 "mov %4, %%"REG_a" \n\t"
1928 "1: \n\t" 1927 "1: \n\t"
1929 "movq (%0, %%"REG_a"), %%mm0 \n\t" 1928 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1930 "movq (%1, %%"REG_a"), %%mm1 \n\t" 1929 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1931 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" 1930 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1932 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" 1931 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1933 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" 1932 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1934 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" 1933 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1935 PAVGB" %%mm0, %%mm5 \n\t" 1934 PAVGB" %%mm0, %%mm5 \n\t"
1936 PAVGB" %%mm0, %%mm3 \n\t" 1935 PAVGB" %%mm0, %%mm3 \n\t"
1937 PAVGB" %%mm0, %%mm5 \n\t" 1936 PAVGB" %%mm0, %%mm5 \n\t"
1938 PAVGB" %%mm0, %%mm3 \n\t" 1937 PAVGB" %%mm0, %%mm3 \n\t"
1939 PAVGB" %%mm1, %%mm4 \n\t" 1938 PAVGB" %%mm1, %%mm4 \n\t"
1940 PAVGB" %%mm1, %%mm2 \n\t" 1939 PAVGB" %%mm1, %%mm2 \n\t"
1941 PAVGB" %%mm1, %%mm4 \n\t" 1940 PAVGB" %%mm1, %%mm4 \n\t"
1942 PAVGB" %%mm1, %%mm2 \n\t" 1941 PAVGB" %%mm1, %%mm2 \n\t"
1943 "movq %%mm5, %%mm7 \n\t" 1942 "movq %%mm5, %%mm7 \n\t"
1944 "movq %%mm4, %%mm6 \n\t" 1943 "movq %%mm4, %%mm6 \n\t"
1945 "punpcklbw %%mm3, %%mm5 \n\t" 1944 "punpcklbw %%mm3, %%mm5 \n\t"
1946 "punpckhbw %%mm3, %%mm7 \n\t" 1945 "punpckhbw %%mm3, %%mm7 \n\t"
1947 "punpcklbw %%mm2, %%mm4 \n\t" 1946 "punpcklbw %%mm2, %%mm4 \n\t"
1948 "punpckhbw %%mm2, %%mm6 \n\t" 1947 "punpckhbw %%mm2, %%mm6 \n\t"
1949 #if 1 1948 #if 1
1950 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t" 1949 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1951 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t" 1950 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1952 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t" 1951 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1953 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t" 1952 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1954 #else 1953 #else
1955 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" 1954 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1956 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t" 1955 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1957 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" 1956 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1958 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t" 1957 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1959 #endif 1958 #endif
1960 "add $8, %%"REG_a" \n\t" 1959 "add $8, %%"REG_a" \n\t"
1961 " js 1b \n\t" 1960 " js 1b \n\t"
1962 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), 1961 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1963 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), 1962 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1964 "g" (-mmxSize) 1963 "g" (-mmxSize)
1965 : "%"REG_a 1964 : "%"REG_a
1966 1965
1967 ); 1966 );
1968 #else 1967 #else
1969 const long mmxSize=1; 1968 const long mmxSize=1;
1970 #endif 1969 #endif
1971 dst[0 ]= (3*src[0] + src[srcStride])>>2; 1970 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1972 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; 1971 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1973 1972
1974 for(x=mmxSize-1; x<srcWidth-1; x++){ 1973 for (x=mmxSize-1; x<srcWidth-1; x++){
1975 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; 1974 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1976 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; 1975 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1977 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; 1976 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1978 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; 1977 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1979 } 1978 }
1980 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; 1979 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1981 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; 1980 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1982 1981
1983 dst+=dstStride*2; 1982 dst+=dstStride*2;
1984 src+=srcStride; 1983 src+=srcStride;
1985 } 1984 }
1986 1985
1987 // last line 1986 // last line
1988 #if 1 1987 #if 1
1989 dst[0]= src[0]; 1988 dst[0]= src[0];
1990 1989
1991 for(x=0; x<srcWidth-1; x++){ 1990 for (x=0; x<srcWidth-1; x++){
1992 dst[2*x+1]= (3*src[x] + src[x+1])>>2; 1991 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1993 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; 1992 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1994 } 1993 }
1995 dst[2*srcWidth-1]= src[srcWidth-1]; 1994 dst[2*srcWidth-1]= src[srcWidth-1];
1996 #else 1995 #else
1997 for(x=0; x<srcWidth; x++){ 1996 for (x=0; x<srcWidth; x++){
1998 dst[2*x+0]= 1997 dst[2*x+0]=
1999 dst[2*x+1]= src[x]; 1998 dst[2*x+1]= src[x];
2000 } 1999 }
2001 #endif 2000 #endif
2002 2001
2003 #ifdef HAVE_MMX 2002 #ifdef HAVE_MMX
2004 asm volatile( EMMS" \n\t" 2003 asm volatile( EMMS" \n\t"
2005 SFENCE" \n\t" 2004 SFENCE" \n\t"
2006 :::"memory"); 2005 :::"memory");
2007 #endif 2006 #endif
2008 } 2007 }
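planar2x doubles a plane with 3:1 weighting: each output sample sits a quarter of the way between two input samples, so it is (3*near + far)>>2 both horizontally and vertically; the chained PAVGB instructions in the MMX path approximate the same weights with rounding. A scalar worked example for one row (illustration only, not part of this file):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* One input row and its horizontally doubled version, using the
         * same (3*near + far) >> 2 weighting as the C code in planar2x. */
        uint8_t src[4] = { 0, 64, 128, 255 };
        uint8_t dst[8];
        int x;

        dst[0] = src[0];                            /* left edge kept as-is  */
        for (x = 0; x < 3; x++) {
            dst[2*x + 1] = (3*src[x] + src[x+1]) >> 2;
            dst[2*x + 2] = (src[x] + 3*src[x+1]) >> 2;
        }
        dst[7] = src[3];                            /* right edge kept as-is */

        for (x = 0; x < 8; x++)
            printf("%d ", dst[x]);                  /* 0 16 48 80 112 159 223 255 */
        printf("\n");
        return 0;
    }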
2009 2008
2010 /** 2009 /**
2011 * 2010 *
2012 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a 2011 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
2013 * problem for anyone then tell me, and I'll fix it) 2012 * problem for anyone then tell me, and I'll fix it)
2014 * chrominance data is only taken from every second line, others are ignored. FIXME: write HQ version 2013 * chrominance data is only taken from every second line, others are ignored. FIXME: write HQ version
2015 */ 2014 */
2016 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 2015 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2017 long width, long height, 2016 long width, long height,
2018 long lumStride, long chromStride, long srcStride) 2017 long lumStride, long chromStride, long srcStride)
2019 { 2018 {
2020 long y; 2019 long y;
2021 const long chromWidth= width>>1; 2020 const long chromWidth= width>>1;
2022 for(y=0; y<height; y+=2) 2021 for (y=0; y<height; y+=2)
2023 { 2022 {
2024 #ifdef HAVE_MMX 2023 #ifdef HAVE_MMX
2025 asm volatile( 2024 asm volatile(
2026 "xorl %%eax, %%eax \n\t" 2025 "xorl %%eax, %%eax \n\t"
2027 "pcmpeqw %%mm7, %%mm7 \n\t" 2026 "pcmpeqw %%mm7, %%mm7 \n\t"
2028 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 2027 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2029 ASMALIGN(4) 2028 ASMALIGN(4)
2030 "1: \n\t" 2029 "1: \n\t"
2031 PREFETCH" 64(%0, %%eax, 4) \n\t" 2030 PREFETCH" 64(%0, %%eax, 4) \n\t"
2032 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) 2031 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2033 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) 2032 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2034 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) 2033 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2035 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) 2034 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2036 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) 2035 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2037 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) 2036 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2038 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) 2037 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2039 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) 2038 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2040 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 2039 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2041 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) 2040 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2042 2041
2043 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" 2042 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2044 2043
2045 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) 2044 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2046 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) 2045 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2047 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) 2046 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2048 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) 2047 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2049 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) 2048 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2050 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) 2049 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2051 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) 2050 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2052 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) 2051 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2053 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) 2052 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2054 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) 2053 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2055 2054
2056 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" 2055 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2057 2056
2058 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) 2057 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2059 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) 2058 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2060 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) 2059 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2061 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) 2060 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2062 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) 2061 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2063 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) 2062 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2064 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) 2063 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2065 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) 2064 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2066 2065
2067 MOVNTQ" %%mm0, (%3, %%eax) \n\t" 2066 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2068 MOVNTQ" %%mm2, (%2, %%eax) \n\t" 2067 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2069 2068
2070 "addl $8, %%eax \n\t" 2069 "addl $8, %%eax \n\t"
2071 "cmpl %4, %%eax \n\t" 2070 "cmpl %4, %%eax \n\t"
2072 " jb 1b \n\t" 2071 " jb 1b \n\t"
2073 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 2072 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2074 : "memory", "%eax" 2073 : "memory", "%eax"
2075 ); 2074 );
2076 2075
2077 ydst += lumStride; 2076 ydst += lumStride;
2078 src += srcStride; 2077 src += srcStride;
2079 2078
2080 asm volatile( 2079 asm volatile(
2081 "xorl %%eax, %%eax \n\t" 2080 "xorl %%eax, %%eax \n\t"
2082 ASMALIGN(4) 2081 ASMALIGN(4)
2083 "1: \n\t" 2082 "1: \n\t"
2084 PREFETCH" 64(%0, %%eax, 4) \n\t" 2083 PREFETCH" 64(%0, %%eax, 4) \n\t"
2085 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) 2084 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2086 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) 2085 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2087 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) 2086 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2088 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) 2087 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2089 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) 2088 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2090 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) 2089 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2091 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) 2090 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2092 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) 2091 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2093 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) 2092 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2094 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) 2093 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2095 2094
2096 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" 2095 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2097 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" 2096 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2098 2097
2099 "addl $8, %%eax \n\t" 2098 "addl $8, %%eax \n\t"
2100 "cmpl %4, %%eax \n\t" 2099 "cmpl %4, %%eax \n\t"
2101 " jb 1b \n\t" 2100 " jb 1b \n\t"
2102 2101
2103 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 2102 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2104 : "memory", "%eax" 2103 : "memory", "%eax"
2105 ); 2104 );
2106 #else 2105 #else
2107 long i; 2106 long i;
2108 for(i=0; i<chromWidth; i++) 2107 for (i=0; i<chromWidth; i++)
2109 { 2108 {
2110 udst[i] = src[4*i+0]; 2109 udst[i] = src[4*i+0];
2111 ydst[2*i+0] = src[4*i+1]; 2110 ydst[2*i+0] = src[4*i+1];
2112 vdst[i] = src[4*i+2]; 2111 vdst[i] = src[4*i+2];
2113 ydst[2*i+1] = src[4*i+3]; 2112 ydst[2*i+1] = src[4*i+3];
2114 } 2113 }
2115 ydst += lumStride; 2114 ydst += lumStride;
2116 src += srcStride; 2115 src += srcStride;
2117 2116
2118 for(i=0; i<chromWidth; i++) 2117 for (i=0; i<chromWidth; i++)
2119 { 2118 {
2120 ydst[2*i+0] = src[4*i+1]; 2119 ydst[2*i+0] = src[4*i+1];
2121 ydst[2*i+1] = src[4*i+3]; 2120 ydst[2*i+1] = src[4*i+3];
2122 } 2121 }
2123 #endif 2122 #endif
2124 udst += chromStride; 2123 udst += chromStride;
2125 vdst += chromStride; 2124 vdst += chromStride;
2126 ydst += lumStride; 2125 ydst += lumStride;
2127 src += srcStride; 2126 src += srcStride;
2128 } 2127 }
2129 #ifdef HAVE_MMX 2128 #ifdef HAVE_MMX
2130 asm volatile( EMMS" \n\t" 2129 asm volatile( EMMS" \n\t"
2131 SFENCE" \n\t" 2130 SFENCE" \n\t"
2132 :::"memory"); 2131 :::"memory");
2133 #endif 2132 #endif
2134 } 2133 }
2135 2134
2136 /** 2135 /**
2137 * 2136 *
2138 * height and width should both be multiples of 2 (if this is a 2137 * height and width should both be multiples of 2 (if this is a
2139 * problem for anyone then tell me, and I'll fix it) 2138 * problem for anyone then tell me, and I'll fix it)
2140 * chrominance data is only taken from every second line, the others are ignored in the C version. FIXME: write an HQ version 2139 * chrominance data is only taken from every second line, the others are ignored in the C version. FIXME: write an HQ version
2141 */ 2140 */
2142 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 2141 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2143 long width, long height, 2142 long width, long height,
2144 long lumStride, long chromStride, long srcStride) 2143 long lumStride, long chromStride, long srcStride)
2145 { 2144 {
2146 long y; 2145 long y;
2147 const long chromWidth= width>>1; 2146 const long chromWidth= width>>1;
2148 #ifdef HAVE_MMX 2147 #ifdef HAVE_MMX
2149 for(y=0; y<height-2; y+=2) 2148 for (y=0; y<height-2; y+=2)
2150 { 2149 {
2151 long i; 2150 long i;
2152 for(i=0; i<2; i++) 2151 for (i=0; i<2; i++)
2153 { 2152 {
2154 asm volatile( 2153 asm volatile(
2155 "mov %2, %%"REG_a" \n\t" 2154 "mov %2, %%"REG_a" \n\t"
2156 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" 2155 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2157 "movq "MANGLE(w1111)", %%mm5 \n\t" 2156 "movq "MANGLE(w1111)", %%mm5 \n\t"
2158 "pxor %%mm7, %%mm7 \n\t" 2157 "pxor %%mm7, %%mm7 \n\t"
2159 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t" 2158 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2160 ASMALIGN(4) 2159 ASMALIGN(4)
2161 "1: \n\t" 2160 "1: \n\t"
2162 PREFETCH" 64(%0, %%"REG_d") \n\t" 2161 PREFETCH" 64(%0, %%"REG_d") \n\t"
2163 "movd (%0, %%"REG_d"), %%mm0 \n\t" 2162 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2164 "movd 3(%0, %%"REG_d"), %%mm1 \n\t" 2163 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2165 "punpcklbw %%mm7, %%mm0 \n\t" 2164 "punpcklbw %%mm7, %%mm0 \n\t"
2166 "punpcklbw %%mm7, %%mm1 \n\t" 2165 "punpcklbw %%mm7, %%mm1 \n\t"
2167 "movd 6(%0, %%"REG_d"), %%mm2 \n\t" 2166 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2168 "movd 9(%0, %%"REG_d"), %%mm3 \n\t" 2167 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2169 "punpcklbw %%mm7, %%mm2 \n\t" 2168 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "punpcklbw %%mm7, %%mm3 \n\t" 2169 "punpcklbw %%mm7, %%mm3 \n\t"
2171 "pmaddwd %%mm6, %%mm0 \n\t" 2170 "pmaddwd %%mm6, %%mm0 \n\t"
2172 "pmaddwd %%mm6, %%mm1 \n\t" 2171 "pmaddwd %%mm6, %%mm1 \n\t"
2173 "pmaddwd %%mm6, %%mm2 \n\t" 2172 "pmaddwd %%mm6, %%mm2 \n\t"
2174 "pmaddwd %%mm6, %%mm3 \n\t" 2173 "pmaddwd %%mm6, %%mm3 \n\t"
2175 #ifndef FAST_BGR2YV12 2174 #ifndef FAST_BGR2YV12
2176 "psrad $8, %%mm0 \n\t" 2175 "psrad $8, %%mm0 \n\t"
2177 "psrad $8, %%mm1 \n\t" 2176 "psrad $8, %%mm1 \n\t"
2178 "psrad $8, %%mm2 \n\t" 2177 "psrad $8, %%mm2 \n\t"
2179 "psrad $8, %%mm3 \n\t" 2178 "psrad $8, %%mm3 \n\t"
2180 #endif 2179 #endif
2181 "packssdw %%mm1, %%mm0 \n\t" 2180 "packssdw %%mm1, %%mm0 \n\t"
2182 "packssdw %%mm3, %%mm2 \n\t" 2181 "packssdw %%mm3, %%mm2 \n\t"
2183 "pmaddwd %%mm5, %%mm0 \n\t" 2182 "pmaddwd %%mm5, %%mm0 \n\t"
2184 "pmaddwd %%mm5, %%mm2 \n\t" 2183 "pmaddwd %%mm5, %%mm2 \n\t"
2185 "packssdw %%mm2, %%mm0 \n\t" 2184 "packssdw %%mm2, %%mm0 \n\t"
2186 "psraw $7, %%mm0 \n\t" 2185 "psraw $7, %%mm0 \n\t"
2187 2186
2188 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" 2187 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2189 "movd 15(%0, %%"REG_d"), %%mm1 \n\t" 2188 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2190 "punpcklbw %%mm7, %%mm4 \n\t" 2189 "punpcklbw %%mm7, %%mm4 \n\t"
2191 "punpcklbw %%mm7, %%mm1 \n\t" 2190 "punpcklbw %%mm7, %%mm1 \n\t"
2192 "movd 18(%0, %%"REG_d"), %%mm2 \n\t" 2191 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2193 "movd 21(%0, %%"REG_d"), %%mm3 \n\t" 2192 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2194 "punpcklbw %%mm7, %%mm2 \n\t" 2193 "punpcklbw %%mm7, %%mm2 \n\t"
2195 "punpcklbw %%mm7, %%mm3 \n\t" 2194 "punpcklbw %%mm7, %%mm3 \n\t"
2196 "pmaddwd %%mm6, %%mm4 \n\t" 2195 "pmaddwd %%mm6, %%mm4 \n\t"
2197 "pmaddwd %%mm6, %%mm1 \n\t" 2196 "pmaddwd %%mm6, %%mm1 \n\t"
2198 "pmaddwd %%mm6, %%mm2 \n\t" 2197 "pmaddwd %%mm6, %%mm2 \n\t"
2199 "pmaddwd %%mm6, %%mm3 \n\t" 2198 "pmaddwd %%mm6, %%mm3 \n\t"
2200 #ifndef FAST_BGR2YV12 2199 #ifndef FAST_BGR2YV12
2201 "psrad $8, %%mm4 \n\t" 2200 "psrad $8, %%mm4 \n\t"
2202 "psrad $8, %%mm1 \n\t" 2201 "psrad $8, %%mm1 \n\t"
2203 "psrad $8, %%mm2 \n\t" 2202 "psrad $8, %%mm2 \n\t"
2204 "psrad $8, %%mm3 \n\t" 2203 "psrad $8, %%mm3 \n\t"
2205 #endif 2204 #endif
2206 "packssdw %%mm1, %%mm4 \n\t" 2205 "packssdw %%mm1, %%mm4 \n\t"
2207 "packssdw %%mm3, %%mm2 \n\t" 2206 "packssdw %%mm3, %%mm2 \n\t"
2208 "pmaddwd %%mm5, %%mm4 \n\t" 2207 "pmaddwd %%mm5, %%mm4 \n\t"
2209 "pmaddwd %%mm5, %%mm2 \n\t" 2208 "pmaddwd %%mm5, %%mm2 \n\t"
2210 "add $24, %%"REG_d" \n\t" 2209 "add $24, %%"REG_d" \n\t"
2211 "packssdw %%mm2, %%mm4 \n\t" 2210 "packssdw %%mm2, %%mm4 \n\t"
2212 "psraw $7, %%mm4 \n\t" 2211 "psraw $7, %%mm4 \n\t"
2213 2212
2214 "packuswb %%mm4, %%mm0 \n\t" 2213 "packuswb %%mm4, %%mm0 \n\t"
2215 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" 2214 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2216 2215
2217 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" 2216 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2218 "add $8, %%"REG_a" \n\t" 2217 "add $8, %%"REG_a" \n\t"
2219 " js 1b \n\t" 2218 " js 1b \n\t"
2220 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) 2219 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2221 : "%"REG_a, "%"REG_d 2220 : "%"REG_a, "%"REG_d
2222 ); 2221 );
2223 ydst += lumStride; 2222 ydst += lumStride;
2224 src += srcStride; 2223 src += srcStride;
2225 } 2224 }
2226 src -= srcStride*2; 2225 src -= srcStride*2;
2227 asm volatile( 2226 asm volatile(
2228 "mov %4, %%"REG_a" \n\t" 2227 "mov %4, %%"REG_a" \n\t"
2229 "movq "MANGLE(w1111)", %%mm5 \n\t" 2228 "movq "MANGLE(w1111)", %%mm5 \n\t"
2230 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" 2229 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2231 "pxor %%mm7, %%mm7 \n\t" 2230 "pxor %%mm7, %%mm7 \n\t"
2232 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t" 2231 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2233 "add %%"REG_d", %%"REG_d" \n\t" 2232 "add %%"REG_d", %%"REG_d" \n\t"
2234 ASMALIGN(4) 2233 ASMALIGN(4)
2235 "1: \n\t" 2234 "1: \n\t"
2236 PREFETCH" 64(%0, %%"REG_d") \n\t" 2235 PREFETCH" 64(%0, %%"REG_d") \n\t"
2237 PREFETCH" 64(%1, %%"REG_d") \n\t" 2236 PREFETCH" 64(%1, %%"REG_d") \n\t"
2238 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2237 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2239 "movq (%0, %%"REG_d"), %%mm0 \n\t" 2238 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2240 "movq (%1, %%"REG_d"), %%mm1 \n\t" 2239 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2241 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" 2240 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2242 "movq 6(%1, %%"REG_d"), %%mm3 \n\t" 2241 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2243 PAVGB" %%mm1, %%mm0 \n\t" 2242 PAVGB" %%mm1, %%mm0 \n\t"
2244 PAVGB" %%mm3, %%mm2 \n\t" 2243 PAVGB" %%mm3, %%mm2 \n\t"
2245 "movq %%mm0, %%mm1 \n\t" 2244 "movq %%mm0, %%mm1 \n\t"
2246 "movq %%mm2, %%mm3 \n\t" 2245 "movq %%mm2, %%mm3 \n\t"
2247 "psrlq $24, %%mm0 \n\t" 2246 "psrlq $24, %%mm0 \n\t"
2248 "psrlq $24, %%mm2 \n\t" 2247 "psrlq $24, %%mm2 \n\t"
2249 PAVGB" %%mm1, %%mm0 \n\t" 2248 PAVGB" %%mm1, %%mm0 \n\t"
2250 PAVGB" %%mm3, %%mm2 \n\t" 2249 PAVGB" %%mm3, %%mm2 \n\t"
2251 "punpcklbw %%mm7, %%mm0 \n\t" 2250 "punpcklbw %%mm7, %%mm0 \n\t"
2252 "punpcklbw %%mm7, %%mm2 \n\t" 2251 "punpcklbw %%mm7, %%mm2 \n\t"
2253 #else 2252 #else
2254 "movd (%0, %%"REG_d"), %%mm0 \n\t" 2253 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2255 "movd (%1, %%"REG_d"), %%mm1 \n\t" 2254 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2256 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" 2255 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2257 "movd 3(%1, %%"REG_d"), %%mm3 \n\t" 2256 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2258 "punpcklbw %%mm7, %%mm0 \n\t" 2257 "punpcklbw %%mm7, %%mm0 \n\t"
2259 "punpcklbw %%mm7, %%mm1 \n\t" 2258 "punpcklbw %%mm7, %%mm1 \n\t"
2260 "punpcklbw %%mm7, %%mm2 \n\t" 2259 "punpcklbw %%mm7, %%mm2 \n\t"
2261 "punpcklbw %%mm7, %%mm3 \n\t" 2260 "punpcklbw %%mm7, %%mm3 \n\t"
2262 "paddw %%mm1, %%mm0 \n\t" 2261 "paddw %%mm1, %%mm0 \n\t"
2263 "paddw %%mm3, %%mm2 \n\t" 2262 "paddw %%mm3, %%mm2 \n\t"
2264 "paddw %%mm2, %%mm0 \n\t" 2263 "paddw %%mm2, %%mm0 \n\t"
2265 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" 2264 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2266 "movd 6(%1, %%"REG_d"), %%mm1 \n\t" 2265 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2267 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" 2266 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2268 "movd 9(%1, %%"REG_d"), %%mm3 \n\t" 2267 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2269 "punpcklbw %%mm7, %%mm4 \n\t" 2268 "punpcklbw %%mm7, %%mm4 \n\t"
2270 "punpcklbw %%mm7, %%mm1 \n\t" 2269 "punpcklbw %%mm7, %%mm1 \n\t"
2271 "punpcklbw %%mm7, %%mm2 \n\t" 2270 "punpcklbw %%mm7, %%mm2 \n\t"
2272 "punpcklbw %%mm7, %%mm3 \n\t" 2271 "punpcklbw %%mm7, %%mm3 \n\t"
2273 "paddw %%mm1, %%mm4 \n\t" 2272 "paddw %%mm1, %%mm4 \n\t"
2274 "paddw %%mm3, %%mm2 \n\t" 2273 "paddw %%mm3, %%mm2 \n\t"
2275 "paddw %%mm4, %%mm2 \n\t" 2274 "paddw %%mm4, %%mm2 \n\t"
2276 "psrlw $2, %%mm0 \n\t" 2275 "psrlw $2, %%mm0 \n\t"
2277 "psrlw $2, %%mm2 \n\t" 2276 "psrlw $2, %%mm2 \n\t"
2278 #endif 2277 #endif
2279 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" 2278 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2280 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" 2279 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2281 2280
2282 "pmaddwd %%mm0, %%mm1 \n\t" 2281 "pmaddwd %%mm0, %%mm1 \n\t"
2283 "pmaddwd %%mm2, %%mm3 \n\t" 2282 "pmaddwd %%mm2, %%mm3 \n\t"
2284 "pmaddwd %%mm6, %%mm0 \n\t" 2283 "pmaddwd %%mm6, %%mm0 \n\t"
2285 "pmaddwd %%mm6, %%mm2 \n\t" 2284 "pmaddwd %%mm6, %%mm2 \n\t"
2286 #ifndef FAST_BGR2YV12 2285 #ifndef FAST_BGR2YV12
2287 "psrad $8, %%mm0 \n\t" 2286 "psrad $8, %%mm0 \n\t"
2288 "psrad $8, %%mm1 \n\t" 2287 "psrad $8, %%mm1 \n\t"
2289 "psrad $8, %%mm2 \n\t" 2288 "psrad $8, %%mm2 \n\t"
2290 "psrad $8, %%mm3 \n\t" 2289 "psrad $8, %%mm3 \n\t"
2291 #endif 2290 #endif
2292 "packssdw %%mm2, %%mm0 \n\t" 2291 "packssdw %%mm2, %%mm0 \n\t"
2293 "packssdw %%mm3, %%mm1 \n\t" 2292 "packssdw %%mm3, %%mm1 \n\t"
2294 "pmaddwd %%mm5, %%mm0 \n\t" 2293 "pmaddwd %%mm5, %%mm0 \n\t"
2295 "pmaddwd %%mm5, %%mm1 \n\t" 2294 "pmaddwd %%mm5, %%mm1 \n\t"
2296 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 2295 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2297 "psraw $7, %%mm0 \n\t" 2296 "psraw $7, %%mm0 \n\t"
2298 2297
2299 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2298 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2300 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" 2299 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2301 "movq 12(%1, %%"REG_d"), %%mm1 \n\t" 2300 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2302 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" 2301 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2303 "movq 18(%1, %%"REG_d"), %%mm3 \n\t" 2302 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2304 PAVGB" %%mm1, %%mm4 \n\t" 2303 PAVGB" %%mm1, %%mm4 \n\t"
2305 PAVGB" %%mm3, %%mm2 \n\t" 2304 PAVGB" %%mm3, %%mm2 \n\t"
2306 "movq %%mm4, %%mm1 \n\t" 2305 "movq %%mm4, %%mm1 \n\t"
2307 "movq %%mm2, %%mm3 \n\t" 2306 "movq %%mm2, %%mm3 \n\t"
2308 "psrlq $24, %%mm4 \n\t" 2307 "psrlq $24, %%mm4 \n\t"
2309 "psrlq $24, %%mm2 \n\t" 2308 "psrlq $24, %%mm2 \n\t"
2310 PAVGB" %%mm1, %%mm4 \n\t" 2309 PAVGB" %%mm1, %%mm4 \n\t"
2311 PAVGB" %%mm3, %%mm2 \n\t" 2310 PAVGB" %%mm3, %%mm2 \n\t"
2312 "punpcklbw %%mm7, %%mm4 \n\t" 2311 "punpcklbw %%mm7, %%mm4 \n\t"
2313 "punpcklbw %%mm7, %%mm2 \n\t" 2312 "punpcklbw %%mm7, %%mm2 \n\t"
2314 #else 2313 #else
2315 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" 2314 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2316 "movd 12(%1, %%"REG_d"), %%mm1 \n\t" 2315 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2317 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" 2316 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2318 "movd 15(%1, %%"REG_d"), %%mm3 \n\t" 2317 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2319 "punpcklbw %%mm7, %%mm4 \n\t" 2318 "punpcklbw %%mm7, %%mm4 \n\t"
2320 "punpcklbw %%mm7, %%mm1 \n\t" 2319 "punpcklbw %%mm7, %%mm1 \n\t"
2321 "punpcklbw %%mm7, %%mm2 \n\t" 2320 "punpcklbw %%mm7, %%mm2 \n\t"
2322 "punpcklbw %%mm7, %%mm3 \n\t" 2321 "punpcklbw %%mm7, %%mm3 \n\t"
2323 "paddw %%mm1, %%mm4 \n\t" 2322 "paddw %%mm1, %%mm4 \n\t"
2324 "paddw %%mm3, %%mm2 \n\t" 2323 "paddw %%mm3, %%mm2 \n\t"
2325 "paddw %%mm2, %%mm4 \n\t" 2324 "paddw %%mm2, %%mm4 \n\t"
2326 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" 2325 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2327 "movd 18(%1, %%"REG_d"), %%mm1 \n\t" 2326 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2328 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" 2327 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2329 "movd 21(%1, %%"REG_d"), %%mm3 \n\t" 2328 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2330 "punpcklbw %%mm7, %%mm5 \n\t" 2329 "punpcklbw %%mm7, %%mm5 \n\t"
2331 "punpcklbw %%mm7, %%mm1 \n\t" 2330 "punpcklbw %%mm7, %%mm1 \n\t"
2332 "punpcklbw %%mm7, %%mm2 \n\t" 2331 "punpcklbw %%mm7, %%mm2 \n\t"
2333 "punpcklbw %%mm7, %%mm3 \n\t" 2332 "punpcklbw %%mm7, %%mm3 \n\t"
2334 "paddw %%mm1, %%mm5 \n\t" 2333 "paddw %%mm1, %%mm5 \n\t"
2335 "paddw %%mm3, %%mm2 \n\t" 2334 "paddw %%mm3, %%mm2 \n\t"
2336 "paddw %%mm5, %%mm2 \n\t" 2335 "paddw %%mm5, %%mm2 \n\t"
2337 "movq "MANGLE(w1111)", %%mm5 \n\t" 2336 "movq "MANGLE(w1111)", %%mm5 \n\t"
2338 "psrlw $2, %%mm4 \n\t" 2337 "psrlw $2, %%mm4 \n\t"
2339 "psrlw $2, %%mm2 \n\t" 2338 "psrlw $2, %%mm2 \n\t"
2340 #endif 2339 #endif
2341 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" 2340 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2342 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" 2341 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2343 2342
2344 "pmaddwd %%mm4, %%mm1 \n\t" 2343 "pmaddwd %%mm4, %%mm1 \n\t"
2345 "pmaddwd %%mm2, %%mm3 \n\t" 2344 "pmaddwd %%mm2, %%mm3 \n\t"
2346 "pmaddwd %%mm6, %%mm4 \n\t" 2345 "pmaddwd %%mm6, %%mm4 \n\t"
2347 "pmaddwd %%mm6, %%mm2 \n\t" 2346 "pmaddwd %%mm6, %%mm2 \n\t"
2348 #ifndef FAST_BGR2YV12 2347 #ifndef FAST_BGR2YV12
2349 "psrad $8, %%mm4 \n\t" 2348 "psrad $8, %%mm4 \n\t"
2350 "psrad $8, %%mm1 \n\t" 2349 "psrad $8, %%mm1 \n\t"
2351 "psrad $8, %%mm2 \n\t" 2350 "psrad $8, %%mm2 \n\t"
2352 "psrad $8, %%mm3 \n\t" 2351 "psrad $8, %%mm3 \n\t"
2353 #endif 2352 #endif
2354 "packssdw %%mm2, %%mm4 \n\t" 2353 "packssdw %%mm2, %%mm4 \n\t"
2355 "packssdw %%mm3, %%mm1 \n\t" 2354 "packssdw %%mm3, %%mm1 \n\t"
2356 "pmaddwd %%mm5, %%mm4 \n\t" 2355 "pmaddwd %%mm5, %%mm4 \n\t"
2357 "pmaddwd %%mm5, %%mm1 \n\t" 2356 "pmaddwd %%mm5, %%mm1 \n\t"
2358 "add $24, %%"REG_d" \n\t" 2357 "add $24, %%"REG_d" \n\t"
2359 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 2358 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2360 "psraw $7, %%mm4 \n\t" 2359 "psraw $7, %%mm4 \n\t"
2361 2360
2362 "movq %%mm0, %%mm1 \n\t" 2361 "movq %%mm0, %%mm1 \n\t"
2363 "punpckldq %%mm4, %%mm0 \n\t" 2362 "punpckldq %%mm4, %%mm0 \n\t"
2364 "punpckhdq %%mm4, %%mm1 \n\t" 2363 "punpckhdq %%mm4, %%mm1 \n\t"
2365 "packsswb %%mm1, %%mm0 \n\t" 2364 "packsswb %%mm1, %%mm0 \n\t"
2366 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" 2365 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2367 "movd %%mm0, (%2, %%"REG_a") \n\t" 2366 "movd %%mm0, (%2, %%"REG_a") \n\t"
2368 "punpckhdq %%mm0, %%mm0 \n\t" 2367 "punpckhdq %%mm0, %%mm0 \n\t"
2369 "movd %%mm0, (%3, %%"REG_a") \n\t" 2368 "movd %%mm0, (%3, %%"REG_a") \n\t"
2370 "add $4, %%"REG_a" \n\t" 2369 "add $4, %%"REG_a" \n\t"
2371 " js 1b \n\t" 2370 " js 1b \n\t"
2372 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) 2371 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2373 : "%"REG_a, "%"REG_d 2372 : "%"REG_a, "%"REG_d
2374 ); 2373 );
2375 2374
2376 udst += chromStride; 2375 udst += chromStride;
2377 vdst += chromStride; 2376 vdst += chromStride;
2378 src += srcStride*2; 2377 src += srcStride*2;
2379 } 2378 }
2380 2379
2381 asm volatile( EMMS" \n\t" 2380 asm volatile( EMMS" \n\t"
2382 SFENCE" \n\t" 2381 SFENCE" \n\t"
2383 :::"memory"); 2382 :::"memory");
2384 #else 2383 #else
2385 y=0; 2384 y=0;
2386 #endif 2385 #endif
2387 for(; y<height; y+=2) 2386 for (; y<height; y+=2)
2388 { 2387 {
2389 long i; 2388 long i;
2390 for(i=0; i<chromWidth; i++) 2389 for (i=0; i<chromWidth; i++)
2391 { 2390 {
2392 unsigned int b= src[6*i+0]; 2391 unsigned int b = src[6*i+0];
2393 unsigned int g= src[6*i+1]; 2392 unsigned int g = src[6*i+1];
2394 unsigned int r= src[6*i+2]; 2393 unsigned int r = src[6*i+2];
2395 2394
2396 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 2395 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2397 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; 2396 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2398 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; 2397 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2399 2398
2400 udst[i] = U; 2399 udst[i] = U;
2401 vdst[i] = V; 2400 vdst[i] = V;
2402 ydst[2*i] = Y; 2401 ydst[2*i] = Y;
2403 2402
2404 b= src[6*i+3]; 2403 b = src[6*i+3];
2405 g= src[6*i+4]; 2404 g = src[6*i+4];
2406 r= src[6*i+5]; 2405 r = src[6*i+5];
2407 2406
2408 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 2407 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2409 ydst[2*i+1] = Y; 2408 ydst[2*i+1] = Y;
2410 } 2409 }
2411 ydst += lumStride; 2410 ydst += lumStride;
2412 src += srcStride; 2411 src += srcStride;
2413 2412
2414 for(i=0; i<chromWidth; i++) 2413 for (i=0; i<chromWidth; i++)
2415 { 2414 {
2416 unsigned int b= src[6*i+0]; 2415 unsigned int b = src[6*i+0];
2417 unsigned int g= src[6*i+1]; 2416 unsigned int g = src[6*i+1];
2418 unsigned int r= src[6*i+2]; 2417 unsigned int r = src[6*i+2];
2419 2418
2420 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 2419 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2421 2420
2422 ydst[2*i] = Y; 2421 ydst[2*i] = Y;
2423 2422
2424 b= src[6*i+3]; 2423 b = src[6*i+3];
2425 g= src[6*i+4]; 2424 g = src[6*i+4];
2426 r= src[6*i+5]; 2425 r = src[6*i+5];
2427 2426
2428 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 2427 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2429 ydst[2*i+1] = Y; 2428 ydst[2*i+1] = Y;
2430 } 2429 }
2431 udst += chromStride; 2430 udst += chromStride;
2432 vdst += chromStride; 2431 vdst += chromStride;
2433 ydst += lumStride; 2432 ydst += lumStride;
2434 src += srcStride; 2433 src += srcStride;
2435 } 2434 }
2436 } 2435 }
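The comment above pins down the calling contract: width and height must both be even, and chroma is sampled from every second line. A hedged usage sketch follows; it assumes the rgb24toyv12 function pointer declared in rgb2rgb.h has already been bound by the init routine at the end of this file, and note that the scalar path above reads the source bytes in B,G,R order. The buffer handling and helper name are illustrative only.

    #include <stdint.h>
    #include <stdlib.h>
    #include "rgb2rgb.h"   /* declares the rgb24toyv12 function pointer */

    /* Sketch: convert one packed 24-bit frame into Y, U and V planes.
       width and height must both be even, per the comment above.      */
    static int convert_frame_to_yv12(const uint8_t *src24, long width, long height)
    {
        uint8_t *y = malloc(width * height);
        uint8_t *u = malloc((width / 2) * (height / 2));
        uint8_t *v = malloc((width / 2) * (height / 2));
        if (!y || !u || !v) { free(y); free(u); free(v); return -1; }

        rgb24toyv12(src24, y, u, v, width, height,
                    width      /* lumStride   */,
                    width / 2  /* chromStride */,
                    width * 3  /* srcStride   */);

        /* ... consume the planes ... */
        free(y); free(u); free(v);
        return 0;
    }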
2437 2436
2438 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, 2437 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2439 long width, long height, long src1Stride, 2438 long width, long height, long src1Stride,
2440 long src2Stride, long dstStride){ 2439 long src2Stride, long dstStride){
2441 long h; 2440 long h;
2442 2441
2443 for(h=0; h < height; h++) 2442 for (h=0; h < height; h++)
2444 { 2443 {
2445 long w; 2444 long w;
2446 2445
2447 #ifdef HAVE_MMX 2446 #ifdef HAVE_MMX
2448 #ifdef HAVE_SSE2 2447 #ifdef HAVE_SSE2
2449 asm( 2448 asm(
2450 "xor %%"REG_a", %%"REG_a" \n\t" 2449 "xor %%"REG_a", %%"REG_a" \n\t"
2451 "1: \n\t" 2450 "1: \n\t"
2452 PREFETCH" 64(%1, %%"REG_a") \n\t" 2451 PREFETCH" 64(%1, %%"REG_a") \n\t"
2453 PREFETCH" 64(%2, %%"REG_a") \n\t" 2452 PREFETCH" 64(%2, %%"REG_a") \n\t"
2454 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" 2453 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2455 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" 2454 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2456 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" 2455 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2457 "punpcklbw %%xmm2, %%xmm0 \n\t" 2456 "punpcklbw %%xmm2, %%xmm0 \n\t"
2458 "punpckhbw %%xmm2, %%xmm1 \n\t" 2457 "punpckhbw %%xmm2, %%xmm1 \n\t"
2459 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t" 2458 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2460 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t" 2459 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2461 "add $16, %%"REG_a" \n\t" 2460 "add $16, %%"REG_a" \n\t"
2462 "cmp %3, %%"REG_a" \n\t" 2461 "cmp %3, %%"REG_a" \n\t"
2463 " jb 1b \n\t" 2462 " jb 1b \n\t"
2464 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) 2463 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2465 : "memory", "%"REG_a"" 2464 : "memory", "%"REG_a""
2466 ); 2465 );
2467 #else 2466 #else
2468 asm( 2467 asm(
2469 "xor %%"REG_a", %%"REG_a" \n\t" 2468 "xor %%"REG_a", %%"REG_a" \n\t"
2470 "1: \n\t" 2469 "1: \n\t"
2471 PREFETCH" 64(%1, %%"REG_a") \n\t" 2470 PREFETCH" 64(%1, %%"REG_a") \n\t"
2472 PREFETCH" 64(%2, %%"REG_a") \n\t" 2471 PREFETCH" 64(%2, %%"REG_a") \n\t"
2473 "movq (%1, %%"REG_a"), %%mm0 \n\t" 2472 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2474 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" 2473 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2475 "movq %%mm0, %%mm1 \n\t" 2474 "movq %%mm0, %%mm1 \n\t"
2476 "movq %%mm2, %%mm3 \n\t" 2475 "movq %%mm2, %%mm3 \n\t"
2477 "movq (%2, %%"REG_a"), %%mm4 \n\t" 2476 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2478 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" 2477 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2479 "punpcklbw %%mm4, %%mm0 \n\t" 2478 "punpcklbw %%mm4, %%mm0 \n\t"
2480 "punpckhbw %%mm4, %%mm1 \n\t" 2479 "punpckhbw %%mm4, %%mm1 \n\t"
2481 "punpcklbw %%mm5, %%mm2 \n\t" 2480 "punpcklbw %%mm5, %%mm2 \n\t"
2482 "punpckhbw %%mm5, %%mm3 \n\t" 2481 "punpckhbw %%mm5, %%mm3 \n\t"
2483 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t" 2482 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2484 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t" 2483 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2485 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t" 2484 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2486 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t" 2485 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2487 "add $16, %%"REG_a" \n\t" 2486 "add $16, %%"REG_a" \n\t"
2488 "cmp %3, %%"REG_a" \n\t" 2487 "cmp %3, %%"REG_a" \n\t"
2489 " jb 1b \n\t" 2488 " jb 1b \n\t"
2490 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) 2489 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2491 : "memory", "%"REG_a 2490 : "memory", "%"REG_a
2492 ); 2491 );
2493 #endif 2492 #endif
2494 for(w= (width&(~15)); w < width; w++) 2493 for (w= (width&(~15)); w < width; w++)
2495 { 2494 {
2496 dest[2*w+0] = src1[w]; 2495 dest[2*w+0] = src1[w];
2497 dest[2*w+1] = src2[w]; 2496 dest[2*w+1] = src2[w];
2498 } 2497 }
2499 #else 2498 #else
2500 for(w=0; w < width; w++) 2499 for (w=0; w < width; w++)
2501 { 2500 {
2502 dest[2*w+0] = src1[w]; 2501 dest[2*w+0] = src1[w];
2503 dest[2*w+1] = src2[w]; 2502 dest[2*w+1] = src2[w];
2504 } 2503 }
2505 #endif 2504 #endif
2506 dest += dstStride; 2505 dest += dstStride;
2507 src1 += src1Stride; 2506 src1 += src1Stride;
2508 src2 += src2Stride; 2507 src2 += src2Stride;
2509 } 2508 }
2510 #ifdef HAVE_MMX 2509 #ifdef HAVE_MMX
2511 asm( 2510 asm(
2512 EMMS" \n\t" 2511 EMMS" \n\t"
2513 SFENCE" \n\t" 2512 SFENCE" \n\t"
2514 ::: "memory" 2513 ::: "memory"
2515 ); 2514 );
2516 #endif 2515 #endif
2517 } 2516 }
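The scalar tail above is the whole contract in two lines: dest[2*w+0] = src1[w] and dest[2*w+1] = src2[w], repeated per row with the given strides. A hedged usage sketch, e.g. zipping planar U and V into one interleaved UVUV... plane; the interleaveBytes pointer from rgb2rgb.h is assumed to be initialized, and since the SSE2 path loads the sources with movdqa, 16-byte alignment of the input planes is advisable.

    #include <stdint.h>
    #include <stdlib.h>
    #include "rgb2rgb.h"   /* declares the interleaveBytes function pointer */

    /* Sketch: merge two chromW x chromH planes into one plane holding
       chromW U/V byte pairs per row (strides equal the widths here).  */
    static uint8_t *merge_uv_planes(uint8_t *u, uint8_t *v, long chromW, long chromH)
    {
        uint8_t *uv = malloc(2 * chromW * chromH);
        if (!uv)
            return NULL;
        interleaveBytes(u, v, uv, chromW, chromH,
                        chromW      /* src1Stride */,
                        chromW      /* src2Stride */,
                        2 * chromW  /* dstStride  */);
        return uv;
    }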
2518 2517
2519 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, 2518 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2520 uint8_t *dst1, uint8_t *dst2, 2519 uint8_t *dst1, uint8_t *dst2,
2521 long width, long height, 2520 long width, long height,
2522 long srcStride1, long srcStride2, 2521 long srcStride1, long srcStride2,
2523 long dstStride1, long dstStride2) 2522 long dstStride1, long dstStride2)
2524 { 2523 {
2525 long y,x,w,h; 2524 long y,x,w,h;
2526 w=width/2; h=height/2; 2525 w=width/2; h=height/2;
2527 #ifdef HAVE_MMX 2526 #ifdef HAVE_MMX
2528 asm volatile( 2527 asm volatile(
2529 PREFETCH" %0\n\t" 2528 PREFETCH" %0 \n\t"
2530 PREFETCH" %1\n\t" 2529 PREFETCH" %1 \n\t"
2531 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); 2530 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2532 #endif 2531 #endif
2533 for(y=0;y<h;y++){ 2532 for (y=0;y<h;y++){
2534 const uint8_t* s1=src1+srcStride1*(y>>1); 2533 const uint8_t* s1=src1+srcStride1*(y>>1);
2535 uint8_t* d=dst1+dstStride1*y; 2534 uint8_t* d=dst1+dstStride1*y;
2536 x=0; 2535 x=0;
2537 #ifdef HAVE_MMX 2536 #ifdef HAVE_MMX
2538 for(;x<w-31;x+=32) 2537 for (;x<w-31;x+=32)
2539 { 2538 {
2540 asm volatile( 2539 asm volatile(
2541 PREFETCH" 32%1\n\t" 2540 PREFETCH" 32%1 \n\t"
2542 "movq %1, %%mm0\n\t" 2541 "movq %1, %%mm0 \n\t"
2543 "movq 8%1, %%mm2\n\t" 2542 "movq 8%1, %%mm2 \n\t"
2544 "movq 16%1, %%mm4\n\t" 2543 "movq 16%1, %%mm4 \n\t"
2545 "movq 24%1, %%mm6\n\t" 2544 "movq 24%1, %%mm6 \n\t"
2546 "movq %%mm0, %%mm1\n\t" 2545 "movq %%mm0, %%mm1 \n\t"
2547 "movq %%mm2, %%mm3\n\t" 2546 "movq %%mm2, %%mm3 \n\t"
2548 "movq %%mm4, %%mm5\n\t" 2547 "movq %%mm4, %%mm5 \n\t"
2549 "movq %%mm6, %%mm7\n\t" 2548 "movq %%mm6, %%mm7 \n\t"
2550 "punpcklbw %%mm0, %%mm0\n\t" 2549 "punpcklbw %%mm0, %%mm0 \n\t"
2551 "punpckhbw %%mm1, %%mm1\n\t" 2550 "punpckhbw %%mm1, %%mm1 \n\t"
2552 "punpcklbw %%mm2, %%mm2\n\t" 2551 "punpcklbw %%mm2, %%mm2 \n\t"
2553 "punpckhbw %%mm3, %%mm3\n\t" 2552 "punpckhbw %%mm3, %%mm3 \n\t"
2554 "punpcklbw %%mm4, %%mm4\n\t" 2553 "punpcklbw %%mm4, %%mm4 \n\t"
2555 "punpckhbw %%mm5, %%mm5\n\t" 2554 "punpckhbw %%mm5, %%mm5 \n\t"
2556 "punpcklbw %%mm6, %%mm6\n\t" 2555 "punpcklbw %%mm6, %%mm6 \n\t"
2557 "punpckhbw %%mm7, %%mm7\n\t" 2556 "punpckhbw %%mm7, %%mm7 \n\t"
2558 MOVNTQ" %%mm0, %0\n\t" 2557 MOVNTQ" %%mm0, %0 \n\t"
2559 MOVNTQ" %%mm1, 8%0\n\t" 2558 MOVNTQ" %%mm1, 8%0 \n\t"
2560 MOVNTQ" %%mm2, 16%0\n\t" 2559 MOVNTQ" %%mm2, 16%0 \n\t"
2561 MOVNTQ" %%mm3, 24%0\n\t" 2560 MOVNTQ" %%mm3, 24%0 \n\t"
2562 MOVNTQ" %%mm4, 32%0\n\t" 2561 MOVNTQ" %%mm4, 32%0 \n\t"
2563 MOVNTQ" %%mm5, 40%0\n\t" 2562 MOVNTQ" %%mm5, 40%0 \n\t"
2564 MOVNTQ" %%mm6, 48%0\n\t" 2563 MOVNTQ" %%mm6, 48%0 \n\t"
2565 MOVNTQ" %%mm7, 56%0" 2564 MOVNTQ" %%mm7, 56%0"
2566 :"=m"(d[2*x]) 2565 :"=m"(d[2*x])
2567 :"m"(s1[x]) 2566 :"m"(s1[x])
2568 :"memory"); 2567 :"memory");
2569 } 2568 }
2570 #endif 2569 #endif
2571 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; 2570 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2572 } 2571 }
2573 for(y=0;y<h;y++){ 2572 for (y=0;y<h;y++){
2574 const uint8_t* s2=src2+srcStride2*(y>>1); 2573 const uint8_t* s2=src2+srcStride2*(y>>1);
2575 uint8_t* d=dst2+dstStride2*y; 2574 uint8_t* d=dst2+dstStride2*y;
2576 x=0; 2575 x=0;
2577 #ifdef HAVE_MMX 2576 #ifdef HAVE_MMX
2578 for(;x<w-31;x+=32) 2577 for (;x<w-31;x+=32)
2579 { 2578 {
2580 asm volatile( 2579 asm volatile(
2581 PREFETCH" 32%1\n\t" 2580 PREFETCH" 32%1 \n\t"
2582 "movq %1, %%mm0\n\t" 2581 "movq %1, %%mm0 \n\t"
2583 "movq 8%1, %%mm2\n\t" 2582 "movq 8%1, %%mm2 \n\t"
2584 "movq 16%1, %%mm4\n\t" 2583 "movq 16%1, %%mm4 \n\t"
2585 "movq 24%1, %%mm6\n\t" 2584 "movq 24%1, %%mm6 \n\t"
2586 "movq %%mm0, %%mm1\n\t" 2585 "movq %%mm0, %%mm1 \n\t"
2587 "movq %%mm2, %%mm3\n\t" 2586 "movq %%mm2, %%mm3 \n\t"
2588 "movq %%mm4, %%mm5\n\t" 2587 "movq %%mm4, %%mm5 \n\t"
2589 "movq %%mm6, %%mm7\n\t" 2588 "movq %%mm6, %%mm7 \n\t"
2590 "punpcklbw %%mm0, %%mm0\n\t" 2589 "punpcklbw %%mm0, %%mm0 \n\t"
2591 "punpckhbw %%mm1, %%mm1\n\t" 2590 "punpckhbw %%mm1, %%mm1 \n\t"
2592 "punpcklbw %%mm2, %%mm2\n\t" 2591 "punpcklbw %%mm2, %%mm2 \n\t"
2593 "punpckhbw %%mm3, %%mm3\n\t" 2592 "punpckhbw %%mm3, %%mm3 \n\t"
2594 "punpcklbw %%mm4, %%mm4\n\t" 2593 "punpcklbw %%mm4, %%mm4 \n\t"
2595 "punpckhbw %%mm5, %%mm5\n\t" 2594 "punpckhbw %%mm5, %%mm5 \n\t"
2596 "punpcklbw %%mm6, %%mm6\n\t" 2595 "punpcklbw %%mm6, %%mm6 \n\t"
2597 "punpckhbw %%mm7, %%mm7\n\t" 2596 "punpckhbw %%mm7, %%mm7 \n\t"
2598 MOVNTQ" %%mm0, %0\n\t" 2597 MOVNTQ" %%mm0, %0 \n\t"
2599 MOVNTQ" %%mm1, 8%0\n\t" 2598 MOVNTQ" %%mm1, 8%0 \n\t"
2600 MOVNTQ" %%mm2, 16%0\n\t" 2599 MOVNTQ" %%mm2, 16%0 \n\t"
2601 MOVNTQ" %%mm3, 24%0\n\t" 2600 MOVNTQ" %%mm3, 24%0 \n\t"
2602 MOVNTQ" %%mm4, 32%0\n\t" 2601 MOVNTQ" %%mm4, 32%0 \n\t"
2603 MOVNTQ" %%mm5, 40%0\n\t" 2602 MOVNTQ" %%mm5, 40%0 \n\t"
2604 MOVNTQ" %%mm6, 48%0\n\t" 2603 MOVNTQ" %%mm6, 48%0 \n\t"
2605 MOVNTQ" %%mm7, 56%0" 2604 MOVNTQ" %%mm7, 56%0"
2606 :"=m"(d[2*x]) 2605 :"=m"(d[2*x])
2607 :"m"(s2[x]) 2606 :"m"(s2[x])
2608 :"memory"); 2607 :"memory");
2609 } 2608 }
2610 #endif 2609 #endif
2611 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; 2610 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2612 } 2611 }
2613 #ifdef HAVE_MMX 2612 #ifdef HAVE_MMX
2614 asm( 2613 asm(
2615 EMMS" \n\t" 2614 EMMS" \n\t"
2616 SFENCE" \n\t" 2615 SFENCE" \n\t"
2617 ::: "memory" 2616 ::: "memory"
2618 ); 2617 );
2619 #endif 2618 #endif
2620 } 2619 }
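Stripped of the MMX unrolling, vu9_to_vu12() just upsamples each quarter-size plane by 2x in both directions through pixel replication: the source row is indexed with y>>1 and every source byte is written twice. A minimal scalar sketch of that per-plane doubling (the helper name is illustrative):

    #include <stdint.h>

    /* Replicate every source pixel into a 2x2 destination block,
       turning a (w x h) plane into a (2w x 2h) plane.            */
    static void double_plane_ref(const uint8_t *src, long srcStride,
                                 uint8_t *dst, long dstStride,
                                 long w, long h)
    {
        long x, y;
        for (y = 0; y < 2*h; y++) {
            const uint8_t *s = src + srcStride * (y >> 1);
            uint8_t       *d = dst + dstStride * y;
            for (x = 0; x < w; x++)
                d[2*x] = d[2*x+1] = s[x];
        }
    }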
2621 2620
2622 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, 2621 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2623 uint8_t *dst, 2622 uint8_t *dst,
2624 long width, long height, 2623 long width, long height,
2625 long srcStride1, long srcStride2, 2624 long srcStride1, long srcStride2,
2626 long srcStride3, long dstStride) 2625 long srcStride3, long dstStride)
2627 { 2626 {
2628 long y,x,w,h; 2627 long y,x,w,h;
2629 w=width/2; h=height; 2628 w=width/2; h=height;
2630 for(y=0;y<h;y++){ 2629 for (y=0;y<h;y++){
2631 const uint8_t* yp=src1+srcStride1*y; 2630 const uint8_t* yp=src1+srcStride1*y;
2632 const uint8_t* up=src2+srcStride2*(y>>2); 2631 const uint8_t* up=src2+srcStride2*(y>>2);
2633 const uint8_t* vp=src3+srcStride3*(y>>2); 2632 const uint8_t* vp=src3+srcStride3*(y>>2);
2634 uint8_t* d=dst+dstStride*y; 2633 uint8_t* d=dst+dstStride*y;
2635 x=0; 2634 x=0;
2636 #ifdef HAVE_MMX 2635 #ifdef HAVE_MMX
2637 for(;x<w-7;x+=8) 2636 for (;x<w-7;x+=8)
2638 { 2637 {
2639 asm volatile( 2638 asm volatile(
2640 PREFETCH" 32(%1, %0)\n\t" 2639 PREFETCH" 32(%1, %0) \n\t"
2641 PREFETCH" 32(%2, %0)\n\t" 2640 PREFETCH" 32(%2, %0) \n\t"
2642 PREFETCH" 32(%3, %0)\n\t" 2641 PREFETCH" 32(%3, %0) \n\t"
2643 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ 2642 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2644 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */ 2643 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2645 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */ 2644 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2646 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ 2645 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2647 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */ 2646 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2648 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */ 2647 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2649 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */ 2648 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2650 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */ 2649 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2651 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */ 2650 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2652 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */ 2651 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2653 2652
2654 "movq %%mm1, %%mm6\n\t" 2653 "movq %%mm1, %%mm6 \n\t"
2655 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/ 2654 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2656 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ 2655 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2657 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ 2656 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2658 MOVNTQ" %%mm0, (%4, %0, 8)\n\t" 2657 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2659 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t" 2658 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2660 2659
2661 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/ 2660 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2662 "movq 8(%1, %0, 4), %%mm0\n\t" 2661 "movq 8(%1, %0, 4), %%mm0 \n\t"
2663 "movq %%mm0, %%mm3\n\t" 2662 "movq %%mm0, %%mm3 \n\t"
2664 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/ 2663 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2665 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/ 2664 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2666 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t" 2665 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2667 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t" 2666 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2668 2667
2669 "movq %%mm4, %%mm6\n\t" 2668 "movq %%mm4, %%mm6 \n\t"
2670 "movq 16(%1, %0, 4), %%mm0\n\t" 2669 "movq 16(%1, %0, 4), %%mm0 \n\t"
2671 "movq %%mm0, %%mm3\n\t" 2670 "movq %%mm0, %%mm3 \n\t"
2672 "punpcklbw %%mm5, %%mm4\n\t" 2671 "punpcklbw %%mm5, %%mm4 \n\t"
2673 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/ 2672 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2674 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/ 2673 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2675 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t" 2674 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2676 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t" 2675 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2677 2676
2678 "punpckhbw %%mm5, %%mm6\n\t" 2677 "punpckhbw %%mm5, %%mm6 \n\t"
2679 "movq 24(%1, %0, 4), %%mm0\n\t" 2678 "movq 24(%1, %0, 4), %%mm0 \n\t"
2680 "movq %%mm0, %%mm3\n\t" 2679 "movq %%mm0, %%mm3 \n\t"
2681 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/ 2680 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2682 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/ 2681 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2683 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t" 2682 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2684 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t" 2683 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2685 2684
2686 : "+r" (x) 2685 : "+r" (x)
2687 : "r"(yp), "r" (up), "r"(vp), "r"(d) 2686 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2688 :"memory"); 2687 :"memory");
2689 } 2688 }
2690 #endif 2689 #endif
2691 for(; x<w; x++) 2690 for (; x<w; x++)
2692 { 2691 {
2693 const long x2= x<<2; 2692 const long x2 = x<<2;
2694 d[8*x+0]=yp[x2]; 2693 d[8*x+0] = yp[x2];
2695 d[8*x+1]=up[x]; 2694 d[8*x+1] = up[x];
2696 d[8*x+2]=yp[x2+1]; 2695 d[8*x+2] = yp[x2+1];
2697 d[8*x+3]=vp[x]; 2696 d[8*x+3] = vp[x];
2698 d[8*x+4]=yp[x2+2]; 2697 d[8*x+4] = yp[x2+2];
2699 d[8*x+5]=up[x]; 2698 d[8*x+5] = up[x];
2700 d[8*x+6]=yp[x2+3]; 2699 d[8*x+6] = yp[x2+3];
2701 d[8*x+7]=vp[x]; 2700 d[8*x+7] = vp[x];
2702 } 2701 }
2703 } 2702 }
2704 #ifdef HAVE_MMX 2703 #ifdef HAVE_MMX
2705 asm( 2704 asm(
2706 EMMS" \n\t" 2705 EMMS" \n\t"
2707 SFENCE" \n\t" 2706 SFENCE" \n\t"
2708 ::: "memory" 2707 ::: "memory"
2709 ); 2708 );
2710 #endif 2709 #endif
2711 } 2710 }
2712 2711
2713 static inline void RENAME(rgb2rgb_init)(void){ 2712 static inline void RENAME(rgb2rgb_init)(void){
2714 rgb15to16= RENAME(rgb15to16); 2713 rgb15to16 = RENAME(rgb15to16);
2715 rgb15to24= RENAME(rgb15to24); 2714 rgb15to24 = RENAME(rgb15to24);
2716 rgb15to32= RENAME(rgb15to32); 2715 rgb15to32 = RENAME(rgb15to32);
2717 rgb16to24= RENAME(rgb16to24); 2716 rgb16to24 = RENAME(rgb16to24);
2718 rgb16to32= RENAME(rgb16to32); 2717 rgb16to32 = RENAME(rgb16to32);
2719 rgb16to15= RENAME(rgb16to15); 2718 rgb16to15 = RENAME(rgb16to15);
2720 rgb24to16= RENAME(rgb24to16); 2719 rgb24to16 = RENAME(rgb24to16);
2721 rgb24to15= RENAME(rgb24to15); 2720 rgb24to15 = RENAME(rgb24to15);
2722 rgb24to32= RENAME(rgb24to32); 2721 rgb24to32 = RENAME(rgb24to32);
2723 rgb32to16= RENAME(rgb32to16); 2722 rgb32to16 = RENAME(rgb32to16);
2724 rgb32to15= RENAME(rgb32to15); 2723 rgb32to15 = RENAME(rgb32to15);
2725 rgb32to24= RENAME(rgb32to24); 2724 rgb32to24 = RENAME(rgb32to24);
2726 rgb24tobgr15= RENAME(rgb24tobgr15); 2725 rgb24tobgr15 = RENAME(rgb24tobgr15);
2727 rgb24tobgr16= RENAME(rgb24tobgr16); 2726 rgb24tobgr16 = RENAME(rgb24tobgr16);
2728 rgb24tobgr24= RENAME(rgb24tobgr24); 2727 rgb24tobgr24 = RENAME(rgb24tobgr24);
2729 rgb32tobgr32= RENAME(rgb32tobgr32); 2728 rgb32tobgr32 = RENAME(rgb32tobgr32);
2730 rgb32tobgr16= RENAME(rgb32tobgr16); 2729 rgb32tobgr16 = RENAME(rgb32tobgr16);
2731 rgb32tobgr15= RENAME(rgb32tobgr15); 2730 rgb32tobgr15 = RENAME(rgb32tobgr15);
2732 yv12toyuy2= RENAME(yv12toyuy2); 2731 yv12toyuy2 = RENAME(yv12toyuy2);
2733 yv12touyvy= RENAME(yv12touyvy); 2732 yv12touyvy = RENAME(yv12touyvy);
2734 yuv422ptoyuy2= RENAME(yuv422ptoyuy2); 2733 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2735 yuy2toyv12= RENAME(yuy2toyv12); 2734 yuy2toyv12 = RENAME(yuy2toyv12);
2736 // uyvytoyv12= RENAME(uyvytoyv12); 2735 // uyvytoyv12 = RENAME(uyvytoyv12);
2737 // yvu9toyv12= RENAME(yvu9toyv12); 2736 // yvu9toyv12 = RENAME(yvu9toyv12);
2738 planar2x= RENAME(planar2x); 2737 planar2x = RENAME(planar2x);
2739 rgb24toyv12= RENAME(rgb24toyv12); 2738 rgb24toyv12 = RENAME(rgb24toyv12);
2740 interleaveBytes= RENAME(interleaveBytes); 2739 interleaveBytes = RENAME(interleaveBytes);
2741 vu9_to_vu12= RENAME(vu9_to_vu12); 2740 vu9_to_vu12 = RENAME(vu9_to_vu12);
2742 yvu9_to_yuy2= RENAME(yvu9_to_yuy2); 2741 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2743 } 2742 }
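RENAME(rgb2rgb_init) only rebinds the public function pointers declared in rgb2rgb.h to this template's RENAME()d instances. A hedged sketch of how such a template is commonly instantiated and selected; the macro bodies, the HAVE_MMX guard usage and the dispatch function shown here are illustrative, not a copy of the real rgb2rgb.c.

    /* The wrapper includes the template once per CPU flavour, giving
       every static function in it a distinct suffix.                 */
    #define RENAME(a) a ## _C
    #include "rgb2rgb_template.c"
    #undef RENAME

    #define HAVE_MMX
    #define RENAME(a) a ## _MMX
    #include "rgb2rgb_template.c"
    #undef RENAME

    /* At runtime, one init is chosen according to the CPU capabilities. */
    static void pick_rgb2rgb_impl(int cpu_has_mmx)
    {
        if (cpu_has_mmx)
            rgb2rgb_init_MMX();   /* RENAME(rgb2rgb_init) with the _MMX suffix */
        else
            rgb2rgb_init_C();
    }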