comparison libswscale/rgb2rgb_template.c @ 29480:a4d8dee13834

Indent libswscale: - Use 4 spaces throughout for indentation; - Fix inconsistent indentation; - Indent function calls and declarations aligning arguments on multiple lines to the column after the opening parentheses; - Align asm code to the column 4 spaces after the call to __asm__(); - Align cases in switch statements to the same column as "switch".
author ramiro
date Sun, 16 Aug 2009 00:32:04 +0000
parents 0673fad0546f
children c080f1f5c07e
comparison
equal deleted inserted replaced
29479:55f33b0748c9 29480:a4d8dee13834
75 { 75 {
76 uint8_t *dest = dst; 76 uint8_t *dest = dst;
77 const uint8_t *s = src; 77 const uint8_t *s = src;
78 const uint8_t *end; 78 const uint8_t *end;
79 #if HAVE_MMX 79 #if HAVE_MMX
80 const uint8_t *mm_end; 80 const uint8_t *mm_end;
81 #endif 81 #endif
82 end = s + src_size; 82 end = s + src_size;
83 #if HAVE_MMX 83 #if HAVE_MMX
84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
85 mm_end = end - 23; 85 mm_end = end - 23;
86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); 86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
87 while (s < mm_end) 87 while (s < mm_end)
88 { 88 {
89 __asm__ volatile( 89 __asm__ volatile(
90 PREFETCH" 32%1 \n\t" 90 PREFETCH" 32%1 \n\t"
91 "movd %1, %%mm0 \n\t" 91 "movd %1, %%mm0 \n\t"
92 "punpckldq 3%1, %%mm0 \n\t" 92 "punpckldq 3%1, %%mm0 \n\t"
93 "movd 6%1, %%mm1 \n\t" 93 "movd 6%1, %%mm1 \n\t"
94 "punpckldq 9%1, %%mm1 \n\t" 94 "punpckldq 9%1, %%mm1 \n\t"
105 MOVNTQ" %%mm2, 16%0 \n\t" 105 MOVNTQ" %%mm2, 16%0 \n\t"
106 MOVNTQ" %%mm3, 24%0" 106 MOVNTQ" %%mm3, 24%0"
107 :"=m"(*dest) 107 :"=m"(*dest)
108 :"m"(*s) 108 :"m"(*s)
109 :"memory"); 109 :"memory");
110 dest += 32; 110 dest += 32;
111 s += 24; 111 s += 24;
112 } 112 }
113 __asm__ volatile(SFENCE:::"memory"); 113 __asm__ volatile(SFENCE:::"memory");
114 __asm__ volatile(EMMS:::"memory"); 114 __asm__ volatile(EMMS:::"memory");
115 #endif 115 #endif
116 while (s < end) 116 while (s < end)
117 { 117 {
118 #if HAVE_BIGENDIAN 118 #if HAVE_BIGENDIAN
119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ 119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
144 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 144 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
145 mm_end = end - 31; 145 mm_end = end - 31;
146 while (s < mm_end) 146 while (s < mm_end)
147 { 147 {
148 __asm__ volatile( 148 __asm__ volatile(
149 PREFETCH" 32%1 \n\t" 149 PREFETCH" 32%1 \n\t"
150 "movq %1, %%mm0 \n\t" 150 "movq %1, %%mm0 \n\t"
151 "movq 8%1, %%mm1 \n\t" 151 "movq 8%1, %%mm1 \n\t"
152 "movq 16%1, %%mm4 \n\t" 152 "movq 16%1, %%mm4 \n\t"
153 "movq 24%1, %%mm5 \n\t" 153 "movq 24%1, %%mm5 \n\t"
154 "movq %%mm0, %%mm2 \n\t" 154 "movq %%mm0, %%mm2 \n\t"
155 "movq %%mm1, %%mm3 \n\t" 155 "movq %%mm1, %%mm3 \n\t"
156 "movq %%mm4, %%mm6 \n\t" 156 "movq %%mm4, %%mm6 \n\t"
157 "movq %%mm5, %%mm7 \n\t" 157 "movq %%mm5, %%mm7 \n\t"
158 "psrlq $8, %%mm2 \n\t" 158 "psrlq $8, %%mm2 \n\t"
159 "psrlq $8, %%mm3 \n\t" 159 "psrlq $8, %%mm3 \n\t"
160 "psrlq $8, %%mm6 \n\t" 160 "psrlq $8, %%mm6 \n\t"
161 "psrlq $8, %%mm7 \n\t" 161 "psrlq $8, %%mm7 \n\t"
162 "pand %2, %%mm0 \n\t" 162 "pand %2, %%mm0 \n\t"
163 "pand %2, %%mm1 \n\t" 163 "pand %2, %%mm1 \n\t"
164 "pand %2, %%mm4 \n\t" 164 "pand %2, %%mm4 \n\t"
165 "pand %2, %%mm5 \n\t" 165 "pand %2, %%mm5 \n\t"
166 "pand %3, %%mm2 \n\t" 166 "pand %3, %%mm2 \n\t"
167 "pand %3, %%mm3 \n\t" 167 "pand %3, %%mm3 \n\t"
168 "pand %3, %%mm6 \n\t" 168 "pand %3, %%mm6 \n\t"
169 "pand %3, %%mm7 \n\t" 169 "pand %3, %%mm7 \n\t"
170 "por %%mm2, %%mm0 \n\t" 170 "por %%mm2, %%mm0 \n\t"
171 "por %%mm3, %%mm1 \n\t" 171 "por %%mm3, %%mm1 \n\t"
172 "por %%mm6, %%mm4 \n\t" 172 "por %%mm6, %%mm4 \n\t"
173 "por %%mm7, %%mm5 \n\t" 173 "por %%mm7, %%mm5 \n\t"
174 174
175 "movq %%mm1, %%mm2 \n\t" 175 "movq %%mm1, %%mm2 \n\t"
176 "movq %%mm4, %%mm3 \n\t" 176 "movq %%mm4, %%mm3 \n\t"
177 "psllq $48, %%mm2 \n\t" 177 "psllq $48, %%mm2 \n\t"
178 "psllq $32, %%mm3 \n\t" 178 "psllq $32, %%mm3 \n\t"
179 "pand %4, %%mm2 \n\t" 179 "pand %4, %%mm2 \n\t"
180 "pand %5, %%mm3 \n\t" 180 "pand %5, %%mm3 \n\t"
181 "por %%mm2, %%mm0 \n\t" 181 "por %%mm2, %%mm0 \n\t"
182 "psrlq $16, %%mm1 \n\t" 182 "psrlq $16, %%mm1 \n\t"
183 "psrlq $32, %%mm4 \n\t" 183 "psrlq $32, %%mm4 \n\t"
184 "psllq $16, %%mm5 \n\t" 184 "psllq $16, %%mm5 \n\t"
185 "por %%mm3, %%mm1 \n\t" 185 "por %%mm3, %%mm1 \n\t"
186 "pand %6, %%mm5 \n\t" 186 "pand %6, %%mm5 \n\t"
187 "por %%mm5, %%mm4 \n\t" 187 "por %%mm5, %%mm4 \n\t"
188 188
189 MOVNTQ" %%mm0, %0 \n\t" 189 MOVNTQ" %%mm0, %0 \n\t"
190 MOVNTQ" %%mm1, 8%0 \n\t" 190 MOVNTQ" %%mm1, 8%0 \n\t"
191 MOVNTQ" %%mm4, 16%0" 191 MOVNTQ" %%mm4, 16%0"
192 :"=m"(*dest) 192 :"=m"(*dest)
193 :"m"(*s),"m"(mask24l), 193 :"m"(*s),"m"(mask24l),
194 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 194 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
195 :"memory"); 195 :"memory");
196 dest += 24; 196 dest += 24;
197 s += 32; 197 s += 32;
198 } 198 }
199 __asm__ volatile(SFENCE:::"memory"); 199 __asm__ volatile(SFENCE:::"memory");
200 __asm__ volatile(EMMS:::"memory"); 200 __asm__ volatile(EMMS:::"memory");
235 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); 235 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
236 mm_end = end - 15; 236 mm_end = end - 15;
237 while (s<mm_end) 237 while (s<mm_end)
238 { 238 {
239 __asm__ volatile( 239 __asm__ volatile(
240 PREFETCH" 32%1 \n\t" 240 PREFETCH" 32%1 \n\t"
241 "movq %1, %%mm0 \n\t" 241 "movq %1, %%mm0 \n\t"
242 "movq 8%1, %%mm2 \n\t" 242 "movq 8%1, %%mm2 \n\t"
243 "movq %%mm0, %%mm1 \n\t" 243 "movq %%mm0, %%mm1 \n\t"
244 "movq %%mm2, %%mm3 \n\t" 244 "movq %%mm2, %%mm3 \n\t"
245 "pand %%mm4, %%mm0 \n\t" 245 "pand %%mm4, %%mm0 \n\t"
246 "pand %%mm4, %%mm2 \n\t" 246 "pand %%mm4, %%mm2 \n\t"
247 "paddw %%mm1, %%mm0 \n\t" 247 "paddw %%mm1, %%mm0 \n\t"
248 "paddw %%mm3, %%mm2 \n\t" 248 "paddw %%mm3, %%mm2 \n\t"
249 MOVNTQ" %%mm0, %0 \n\t" 249 MOVNTQ" %%mm0, %0 \n\t"
250 MOVNTQ" %%mm2, 8%0" 250 MOVNTQ" %%mm2, 8%0"
251 :"=m"(*d) 251 :"=m"(*d)
252 :"m"(*s) 252 :"m"(*s)
253 ); 253 );
254 d+=16; 254 d+=16;
255 s+=16; 255 s+=16;
256 } 256 }
257 __asm__ volatile(SFENCE:::"memory"); 257 __asm__ volatile(SFENCE:::"memory");
285 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); 285 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
286 mm_end = end - 15; 286 mm_end = end - 15;
287 while (s<mm_end) 287 while (s<mm_end)
288 { 288 {
289 __asm__ volatile( 289 __asm__ volatile(
290 PREFETCH" 32%1 \n\t" 290 PREFETCH" 32%1 \n\t"
291 "movq %1, %%mm0 \n\t" 291 "movq %1, %%mm0 \n\t"
292 "movq 8%1, %%mm2 \n\t" 292 "movq 8%1, %%mm2 \n\t"
293 "movq %%mm0, %%mm1 \n\t" 293 "movq %%mm0, %%mm1 \n\t"
294 "movq %%mm2, %%mm3 \n\t" 294 "movq %%mm2, %%mm3 \n\t"
295 "psrlq $1, %%mm0 \n\t" 295 "psrlq $1, %%mm0 \n\t"
296 "psrlq $1, %%mm2 \n\t" 296 "psrlq $1, %%mm2 \n\t"
297 "pand %%mm7, %%mm0 \n\t" 297 "pand %%mm7, %%mm0 \n\t"
298 "pand %%mm7, %%mm2 \n\t" 298 "pand %%mm7, %%mm2 \n\t"
299 "pand %%mm6, %%mm1 \n\t" 299 "pand %%mm6, %%mm1 \n\t"
300 "pand %%mm6, %%mm3 \n\t" 300 "pand %%mm6, %%mm3 \n\t"
301 "por %%mm1, %%mm0 \n\t" 301 "por %%mm1, %%mm0 \n\t"
302 "por %%mm3, %%mm2 \n\t" 302 "por %%mm3, %%mm2 \n\t"
303 MOVNTQ" %%mm0, %0 \n\t" 303 MOVNTQ" %%mm0, %0 \n\t"
304 MOVNTQ" %%mm2, 8%0" 304 MOVNTQ" %%mm2, 8%0"
305 :"=m"(*d) 305 :"=m"(*d)
306 :"m"(*s) 306 :"m"(*s)
307 ); 307 );
308 d+=16; 308 d+=16;
309 s+=16; 309 s+=16;
310 } 310 }
311 __asm__ volatile(SFENCE:::"memory"); 311 __asm__ volatile(SFENCE:::"memory");
337 end = s + src_size; 337 end = s + src_size;
338 #if HAVE_MMX 338 #if HAVE_MMX
339 mm_end = end - 15; 339 mm_end = end - 15;
340 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) 340 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
341 __asm__ volatile( 341 __asm__ volatile(
342 "movq %3, %%mm5 \n\t" 342 "movq %3, %%mm5 \n\t"
343 "movq %4, %%mm6 \n\t" 343 "movq %4, %%mm6 \n\t"
344 "movq %5, %%mm7 \n\t" 344 "movq %5, %%mm7 \n\t"
345 "jmp 2f \n\t" 345 "jmp 2f \n\t"
346 ASMALIGN(4) 346 ASMALIGN(4)
347 "1: \n\t" 347 "1: \n\t"
348 PREFETCH" 32(%1) \n\t" 348 PREFETCH" 32(%1) \n\t"
349 "movd (%1), %%mm0 \n\t" 349 "movd (%1), %%mm0 \n\t"
350 "movd 4(%1), %%mm3 \n\t" 350 "movd 4(%1), %%mm3 \n\t"
351 "punpckldq 8(%1), %%mm0 \n\t" 351 "punpckldq 8(%1), %%mm0 \n\t"
352 "punpckldq 12(%1), %%mm3 \n\t" 352 "punpckldq 12(%1), %%mm3 \n\t"
353 "movq %%mm0, %%mm1 \n\t" 353 "movq %%mm0, %%mm1 \n\t"
354 "movq %%mm3, %%mm4 \n\t" 354 "movq %%mm3, %%mm4 \n\t"
355 "pand %%mm6, %%mm0 \n\t" 355 "pand %%mm6, %%mm0 \n\t"
356 "pand %%mm6, %%mm3 \n\t" 356 "pand %%mm6, %%mm3 \n\t"
357 "pmaddwd %%mm7, %%mm0 \n\t" 357 "pmaddwd %%mm7, %%mm0 \n\t"
358 "pmaddwd %%mm7, %%mm3 \n\t" 358 "pmaddwd %%mm7, %%mm3 \n\t"
359 "pand %%mm5, %%mm1 \n\t" 359 "pand %%mm5, %%mm1 \n\t"
360 "pand %%mm5, %%mm4 \n\t" 360 "pand %%mm5, %%mm4 \n\t"
361 "por %%mm1, %%mm0 \n\t" 361 "por %%mm1, %%mm0 \n\t"
362 "por %%mm4, %%mm3 \n\t" 362 "por %%mm4, %%mm3 \n\t"
363 "psrld $5, %%mm0 \n\t" 363 "psrld $5, %%mm0 \n\t"
364 "pslld $11, %%mm3 \n\t" 364 "pslld $11, %%mm3 \n\t"
365 "por %%mm3, %%mm0 \n\t" 365 "por %%mm3, %%mm0 \n\t"
366 MOVNTQ" %%mm0, (%0) \n\t" 366 MOVNTQ" %%mm0, (%0) \n\t"
367 "add $16, %1 \n\t" 367 "add $16, %1 \n\t"
368 "add $8, %0 \n\t" 368 "add $8, %0 \n\t"
369 "2: \n\t" 369 "2: \n\t"
370 "cmp %2, %1 \n\t" 370 "cmp %2, %1 \n\t"
371 " jb 1b \n\t" 371 " jb 1b \n\t"
372 : "+r" (d), "+r"(s) 372 : "+r" (d), "+r"(s)
373 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) 373 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
374 ); 374 );
375 #else 375 #else
376 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 376 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
377 __asm__ volatile( 377 __asm__ volatile(
378 "movq %0, %%mm7 \n\t" 378 "movq %0, %%mm7 \n\t"
379 "movq %1, %%mm6 \n\t" 379 "movq %1, %%mm6 \n\t"
380 ::"m"(red_16mask),"m"(green_16mask)); 380 ::"m"(red_16mask),"m"(green_16mask));
381 while (s < mm_end) 381 while (s < mm_end)
382 { 382 {
383 __asm__ volatile( 383 __asm__ volatile(
384 PREFETCH" 32%1 \n\t" 384 PREFETCH" 32%1 \n\t"
385 "movd %1, %%mm0 \n\t" 385 "movd %1, %%mm0 \n\t"
386 "movd 4%1, %%mm3 \n\t" 386 "movd 4%1, %%mm3 \n\t"
387 "punpckldq 8%1, %%mm0 \n\t" 387 "punpckldq 8%1, %%mm0 \n\t"
388 "punpckldq 12%1, %%mm3 \n\t" 388 "punpckldq 12%1, %%mm3 \n\t"
389 "movq %%mm0, %%mm1 \n\t" 389 "movq %%mm0, %%mm1 \n\t"
390 "movq %%mm0, %%mm2 \n\t" 390 "movq %%mm0, %%mm2 \n\t"
391 "movq %%mm3, %%mm4 \n\t" 391 "movq %%mm3, %%mm4 \n\t"
392 "movq %%mm3, %%mm5 \n\t" 392 "movq %%mm3, %%mm5 \n\t"
393 "psrlq $3, %%mm0 \n\t" 393 "psrlq $3, %%mm0 \n\t"
394 "psrlq $3, %%mm3 \n\t" 394 "psrlq $3, %%mm3 \n\t"
395 "pand %2, %%mm0 \n\t" 395 "pand %2, %%mm0 \n\t"
396 "pand %2, %%mm3 \n\t" 396 "pand %2, %%mm3 \n\t"
397 "psrlq $5, %%mm1 \n\t" 397 "psrlq $5, %%mm1 \n\t"
398 "psrlq $5, %%mm4 \n\t" 398 "psrlq $5, %%mm4 \n\t"
399 "pand %%mm6, %%mm1 \n\t" 399 "pand %%mm6, %%mm1 \n\t"
400 "pand %%mm6, %%mm4 \n\t" 400 "pand %%mm6, %%mm4 \n\t"
401 "psrlq $8, %%mm2 \n\t" 401 "psrlq $8, %%mm2 \n\t"
402 "psrlq $8, %%mm5 \n\t" 402 "psrlq $8, %%mm5 \n\t"
403 "pand %%mm7, %%mm2 \n\t" 403 "pand %%mm7, %%mm2 \n\t"
404 "pand %%mm7, %%mm5 \n\t" 404 "pand %%mm7, %%mm5 \n\t"
405 "por %%mm1, %%mm0 \n\t" 405 "por %%mm1, %%mm0 \n\t"
406 "por %%mm4, %%mm3 \n\t" 406 "por %%mm4, %%mm3 \n\t"
407 "por %%mm2, %%mm0 \n\t" 407 "por %%mm2, %%mm0 \n\t"
408 "por %%mm5, %%mm3 \n\t" 408 "por %%mm5, %%mm3 \n\t"
409 "psllq $16, %%mm3 \n\t" 409 "psllq $16, %%mm3 \n\t"
410 "por %%mm3, %%mm0 \n\t" 410 "por %%mm3, %%mm0 \n\t"
411 MOVNTQ" %%mm0, %0 \n\t" 411 MOVNTQ" %%mm0, %0 \n\t"
412 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 412 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
413 d += 4; 413 d += 4;
414 s += 16; 414 s += 16;
415 } 415 }
416 #endif 416 #endif
417 __asm__ volatile(SFENCE:::"memory"); 417 __asm__ volatile(SFENCE:::"memory");
441 ::"m"(red_16mask),"m"(green_16mask)); 441 ::"m"(red_16mask),"m"(green_16mask));
442 mm_end = end - 15; 442 mm_end = end - 15;
443 while (s < mm_end) 443 while (s < mm_end)
444 { 444 {
445 __asm__ volatile( 445 __asm__ volatile(
446 PREFETCH" 32%1 \n\t" 446 PREFETCH" 32%1 \n\t"
447 "movd %1, %%mm0 \n\t" 447 "movd %1, %%mm0 \n\t"
448 "movd 4%1, %%mm3 \n\t" 448 "movd 4%1, %%mm3 \n\t"
449 "punpckldq 8%1, %%mm0 \n\t" 449 "punpckldq 8%1, %%mm0 \n\t"
450 "punpckldq 12%1, %%mm3 \n\t" 450 "punpckldq 12%1, %%mm3 \n\t"
451 "movq %%mm0, %%mm1 \n\t" 451 "movq %%mm0, %%mm1 \n\t"
452 "movq %%mm0, %%mm2 \n\t" 452 "movq %%mm0, %%mm2 \n\t"
453 "movq %%mm3, %%mm4 \n\t" 453 "movq %%mm3, %%mm4 \n\t"
454 "movq %%mm3, %%mm5 \n\t" 454 "movq %%mm3, %%mm5 \n\t"
455 "psllq $8, %%mm0 \n\t" 455 "psllq $8, %%mm0 \n\t"
456 "psllq $8, %%mm3 \n\t" 456 "psllq $8, %%mm3 \n\t"
457 "pand %%mm7, %%mm0 \n\t" 457 "pand %%mm7, %%mm0 \n\t"
458 "pand %%mm7, %%mm3 \n\t" 458 "pand %%mm7, %%mm3 \n\t"
459 "psrlq $5, %%mm1 \n\t" 459 "psrlq $5, %%mm1 \n\t"
460 "psrlq $5, %%mm4 \n\t" 460 "psrlq $5, %%mm4 \n\t"
461 "pand %%mm6, %%mm1 \n\t" 461 "pand %%mm6, %%mm1 \n\t"
462 "pand %%mm6, %%mm4 \n\t" 462 "pand %%mm6, %%mm4 \n\t"
463 "psrlq $19, %%mm2 \n\t" 463 "psrlq $19, %%mm2 \n\t"
464 "psrlq $19, %%mm5 \n\t" 464 "psrlq $19, %%mm5 \n\t"
465 "pand %2, %%mm2 \n\t" 465 "pand %2, %%mm2 \n\t"
466 "pand %2, %%mm5 \n\t" 466 "pand %2, %%mm5 \n\t"
467 "por %%mm1, %%mm0 \n\t" 467 "por %%mm1, %%mm0 \n\t"
468 "por %%mm4, %%mm3 \n\t" 468 "por %%mm4, %%mm3 \n\t"
469 "por %%mm2, %%mm0 \n\t" 469 "por %%mm2, %%mm0 \n\t"
470 "por %%mm5, %%mm3 \n\t" 470 "por %%mm5, %%mm3 \n\t"
471 "psllq $16, %%mm3 \n\t" 471 "psllq $16, %%mm3 \n\t"
472 "por %%mm3, %%mm0 \n\t" 472 "por %%mm3, %%mm0 \n\t"
473 MOVNTQ" %%mm0, %0 \n\t" 473 MOVNTQ" %%mm0, %0 \n\t"
474 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 474 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
475 d += 4; 475 d += 4;
476 s += 16; 476 s += 16;
477 } 477 }
478 __asm__ volatile(SFENCE:::"memory"); 478 __asm__ volatile(SFENCE:::"memory");
479 __asm__ volatile(EMMS:::"memory"); 479 __asm__ volatile(EMMS:::"memory");
496 end = s + src_size; 496 end = s + src_size;
497 #if HAVE_MMX 497 #if HAVE_MMX
498 mm_end = end - 15; 498 mm_end = end - 15;
499 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) 499 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
500 __asm__ volatile( 500 __asm__ volatile(
501 "movq %3, %%mm5 \n\t" 501 "movq %3, %%mm5 \n\t"
502 "movq %4, %%mm6 \n\t" 502 "movq %4, %%mm6 \n\t"
503 "movq %5, %%mm7 \n\t" 503 "movq %5, %%mm7 \n\t"
504 "jmp 2f \n\t" 504 "jmp 2f \n\t"
505 ASMALIGN(4) 505 ASMALIGN(4)
506 "1: \n\t" 506 "1: \n\t"
507 PREFETCH" 32(%1) \n\t" 507 PREFETCH" 32(%1) \n\t"
508 "movd (%1), %%mm0 \n\t" 508 "movd (%1), %%mm0 \n\t"
509 "movd 4(%1), %%mm3 \n\t" 509 "movd 4(%1), %%mm3 \n\t"
510 "punpckldq 8(%1), %%mm0 \n\t" 510 "punpckldq 8(%1), %%mm0 \n\t"
511 "punpckldq 12(%1), %%mm3 \n\t" 511 "punpckldq 12(%1), %%mm3 \n\t"
512 "movq %%mm0, %%mm1 \n\t" 512 "movq %%mm0, %%mm1 \n\t"
513 "movq %%mm3, %%mm4 \n\t" 513 "movq %%mm3, %%mm4 \n\t"
514 "pand %%mm6, %%mm0 \n\t" 514 "pand %%mm6, %%mm0 \n\t"
515 "pand %%mm6, %%mm3 \n\t" 515 "pand %%mm6, %%mm3 \n\t"
516 "pmaddwd %%mm7, %%mm0 \n\t" 516 "pmaddwd %%mm7, %%mm0 \n\t"
517 "pmaddwd %%mm7, %%mm3 \n\t" 517 "pmaddwd %%mm7, %%mm3 \n\t"
518 "pand %%mm5, %%mm1 \n\t" 518 "pand %%mm5, %%mm1 \n\t"
519 "pand %%mm5, %%mm4 \n\t" 519 "pand %%mm5, %%mm4 \n\t"
520 "por %%mm1, %%mm0 \n\t" 520 "por %%mm1, %%mm0 \n\t"
521 "por %%mm4, %%mm3 \n\t" 521 "por %%mm4, %%mm3 \n\t"
522 "psrld $6, %%mm0 \n\t" 522 "psrld $6, %%mm0 \n\t"
523 "pslld $10, %%mm3 \n\t" 523 "pslld $10, %%mm3 \n\t"
524 "por %%mm3, %%mm0 \n\t" 524 "por %%mm3, %%mm0 \n\t"
525 MOVNTQ" %%mm0, (%0) \n\t" 525 MOVNTQ" %%mm0, (%0) \n\t"
526 "add $16, %1 \n\t" 526 "add $16, %1 \n\t"
527 "add $8, %0 \n\t" 527 "add $8, %0 \n\t"
528 "2: \n\t" 528 "2: \n\t"
529 "cmp %2, %1 \n\t" 529 "cmp %2, %1 \n\t"
530 " jb 1b \n\t" 530 " jb 1b \n\t"
531 : "+r" (d), "+r"(s) 531 : "+r" (d), "+r"(s)
532 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) 532 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
533 ); 533 );
534 #else 534 #else
535 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 535 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
536 __asm__ volatile( 536 __asm__ volatile(
537 "movq %0, %%mm7 \n\t" 537 "movq %0, %%mm7 \n\t"
538 "movq %1, %%mm6 \n\t" 538 "movq %1, %%mm6 \n\t"
539 ::"m"(red_15mask),"m"(green_15mask)); 539 ::"m"(red_15mask),"m"(green_15mask));
540 while (s < mm_end) 540 while (s < mm_end)
541 { 541 {
542 __asm__ volatile( 542 __asm__ volatile(
543 PREFETCH" 32%1 \n\t" 543 PREFETCH" 32%1 \n\t"
544 "movd %1, %%mm0 \n\t" 544 "movd %1, %%mm0 \n\t"
545 "movd 4%1, %%mm3 \n\t" 545 "movd 4%1, %%mm3 \n\t"
546 "punpckldq 8%1, %%mm0 \n\t" 546 "punpckldq 8%1, %%mm0 \n\t"
547 "punpckldq 12%1, %%mm3 \n\t" 547 "punpckldq 12%1, %%mm3 \n\t"
548 "movq %%mm0, %%mm1 \n\t" 548 "movq %%mm0, %%mm1 \n\t"
549 "movq %%mm0, %%mm2 \n\t" 549 "movq %%mm0, %%mm2 \n\t"
550 "movq %%mm3, %%mm4 \n\t" 550 "movq %%mm3, %%mm4 \n\t"
551 "movq %%mm3, %%mm5 \n\t" 551 "movq %%mm3, %%mm5 \n\t"
552 "psrlq $3, %%mm0 \n\t" 552 "psrlq $3, %%mm0 \n\t"
553 "psrlq $3, %%mm3 \n\t" 553 "psrlq $3, %%mm3 \n\t"
554 "pand %2, %%mm0 \n\t" 554 "pand %2, %%mm0 \n\t"
555 "pand %2, %%mm3 \n\t" 555 "pand %2, %%mm3 \n\t"
556 "psrlq $6, %%mm1 \n\t" 556 "psrlq $6, %%mm1 \n\t"
557 "psrlq $6, %%mm4 \n\t" 557 "psrlq $6, %%mm4 \n\t"
558 "pand %%mm6, %%mm1 \n\t" 558 "pand %%mm6, %%mm1 \n\t"
559 "pand %%mm6, %%mm4 \n\t" 559 "pand %%mm6, %%mm4 \n\t"
560 "psrlq $9, %%mm2 \n\t" 560 "psrlq $9, %%mm2 \n\t"
561 "psrlq $9, %%mm5 \n\t" 561 "psrlq $9, %%mm5 \n\t"
562 "pand %%mm7, %%mm2 \n\t" 562 "pand %%mm7, %%mm2 \n\t"
563 "pand %%mm7, %%mm5 \n\t" 563 "pand %%mm7, %%mm5 \n\t"
564 "por %%mm1, %%mm0 \n\t" 564 "por %%mm1, %%mm0 \n\t"
565 "por %%mm4, %%mm3 \n\t" 565 "por %%mm4, %%mm3 \n\t"
566 "por %%mm2, %%mm0 \n\t" 566 "por %%mm2, %%mm0 \n\t"
567 "por %%mm5, %%mm3 \n\t" 567 "por %%mm5, %%mm3 \n\t"
568 "psllq $16, %%mm3 \n\t" 568 "psllq $16, %%mm3 \n\t"
569 "por %%mm3, %%mm0 \n\t" 569 "por %%mm3, %%mm0 \n\t"
570 MOVNTQ" %%mm0, %0 \n\t" 570 MOVNTQ" %%mm0, %0 \n\t"
571 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 571 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
572 d += 4; 572 d += 4;
573 s += 16; 573 s += 16;
574 } 574 }
575 #endif 575 #endif
576 __asm__ volatile(SFENCE:::"memory"); 576 __asm__ volatile(SFENCE:::"memory");
600 ::"m"(red_15mask),"m"(green_15mask)); 600 ::"m"(red_15mask),"m"(green_15mask));
601 mm_end = end - 15; 601 mm_end = end - 15;
602 while (s < mm_end) 602 while (s < mm_end)
603 { 603 {
604 __asm__ volatile( 604 __asm__ volatile(
605 PREFETCH" 32%1 \n\t" 605 PREFETCH" 32%1 \n\t"
606 "movd %1, %%mm0 \n\t" 606 "movd %1, %%mm0 \n\t"
607 "movd 4%1, %%mm3 \n\t" 607 "movd 4%1, %%mm3 \n\t"
608 "punpckldq 8%1, %%mm0 \n\t" 608 "punpckldq 8%1, %%mm0 \n\t"
609 "punpckldq 12%1, %%mm3 \n\t" 609 "punpckldq 12%1, %%mm3 \n\t"
610 "movq %%mm0, %%mm1 \n\t" 610 "movq %%mm0, %%mm1 \n\t"
611 "movq %%mm0, %%mm2 \n\t" 611 "movq %%mm0, %%mm2 \n\t"
612 "movq %%mm3, %%mm4 \n\t" 612 "movq %%mm3, %%mm4 \n\t"
613 "movq %%mm3, %%mm5 \n\t" 613 "movq %%mm3, %%mm5 \n\t"
614 "psllq $7, %%mm0 \n\t" 614 "psllq $7, %%mm0 \n\t"
615 "psllq $7, %%mm3 \n\t" 615 "psllq $7, %%mm3 \n\t"
616 "pand %%mm7, %%mm0 \n\t" 616 "pand %%mm7, %%mm0 \n\t"
617 "pand %%mm7, %%mm3 \n\t" 617 "pand %%mm7, %%mm3 \n\t"
618 "psrlq $6, %%mm1 \n\t" 618 "psrlq $6, %%mm1 \n\t"
619 "psrlq $6, %%mm4 \n\t" 619 "psrlq $6, %%mm4 \n\t"
620 "pand %%mm6, %%mm1 \n\t" 620 "pand %%mm6, %%mm1 \n\t"
621 "pand %%mm6, %%mm4 \n\t" 621 "pand %%mm6, %%mm4 \n\t"
622 "psrlq $19, %%mm2 \n\t" 622 "psrlq $19, %%mm2 \n\t"
623 "psrlq $19, %%mm5 \n\t" 623 "psrlq $19, %%mm5 \n\t"
624 "pand %2, %%mm2 \n\t" 624 "pand %2, %%mm2 \n\t"
625 "pand %2, %%mm5 \n\t" 625 "pand %2, %%mm5 \n\t"
626 "por %%mm1, %%mm0 \n\t" 626 "por %%mm1, %%mm0 \n\t"
627 "por %%mm4, %%mm3 \n\t" 627 "por %%mm4, %%mm3 \n\t"
628 "por %%mm2, %%mm0 \n\t" 628 "por %%mm2, %%mm0 \n\t"
629 "por %%mm5, %%mm3 \n\t" 629 "por %%mm5, %%mm3 \n\t"
630 "psllq $16, %%mm3 \n\t" 630 "psllq $16, %%mm3 \n\t"
631 "por %%mm3, %%mm0 \n\t" 631 "por %%mm3, %%mm0 \n\t"
632 MOVNTQ" %%mm0, %0 \n\t" 632 MOVNTQ" %%mm0, %0 \n\t"
633 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 633 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
634 d += 4; 634 d += 4;
635 s += 16; 635 s += 16;
636 } 636 }
637 __asm__ volatile(SFENCE:::"memory"); 637 __asm__ volatile(SFENCE:::"memory");
638 __asm__ volatile(EMMS:::"memory"); 638 __asm__ volatile(EMMS:::"memory");
661 ::"m"(red_16mask),"m"(green_16mask)); 661 ::"m"(red_16mask),"m"(green_16mask));
662 mm_end = end - 11; 662 mm_end = end - 11;
663 while (s < mm_end) 663 while (s < mm_end)
664 { 664 {
665 __asm__ volatile( 665 __asm__ volatile(
666 PREFETCH" 32%1 \n\t" 666 PREFETCH" 32%1 \n\t"
667 "movd %1, %%mm0 \n\t" 667 "movd %1, %%mm0 \n\t"
668 "movd 3%1, %%mm3 \n\t" 668 "movd 3%1, %%mm3 \n\t"
669 "punpckldq 6%1, %%mm0 \n\t" 669 "punpckldq 6%1, %%mm0 \n\t"
670 "punpckldq 9%1, %%mm3 \n\t" 670 "punpckldq 9%1, %%mm3 \n\t"
671 "movq %%mm0, %%mm1 \n\t" 671 "movq %%mm0, %%mm1 \n\t"
672 "movq %%mm0, %%mm2 \n\t" 672 "movq %%mm0, %%mm2 \n\t"
673 "movq %%mm3, %%mm4 \n\t" 673 "movq %%mm3, %%mm4 \n\t"
674 "movq %%mm3, %%mm5 \n\t" 674 "movq %%mm3, %%mm5 \n\t"
675 "psrlq $3, %%mm0 \n\t" 675 "psrlq $3, %%mm0 \n\t"
676 "psrlq $3, %%mm3 \n\t" 676 "psrlq $3, %%mm3 \n\t"
677 "pand %2, %%mm0 \n\t" 677 "pand %2, %%mm0 \n\t"
678 "pand %2, %%mm3 \n\t" 678 "pand %2, %%mm3 \n\t"
679 "psrlq $5, %%mm1 \n\t" 679 "psrlq $5, %%mm1 \n\t"
680 "psrlq $5, %%mm4 \n\t" 680 "psrlq $5, %%mm4 \n\t"
681 "pand %%mm6, %%mm1 \n\t" 681 "pand %%mm6, %%mm1 \n\t"
682 "pand %%mm6, %%mm4 \n\t" 682 "pand %%mm6, %%mm4 \n\t"
683 "psrlq $8, %%mm2 \n\t" 683 "psrlq $8, %%mm2 \n\t"
684 "psrlq $8, %%mm5 \n\t" 684 "psrlq $8, %%mm5 \n\t"
685 "pand %%mm7, %%mm2 \n\t" 685 "pand %%mm7, %%mm2 \n\t"
686 "pand %%mm7, %%mm5 \n\t" 686 "pand %%mm7, %%mm5 \n\t"
687 "por %%mm1, %%mm0 \n\t" 687 "por %%mm1, %%mm0 \n\t"
688 "por %%mm4, %%mm3 \n\t" 688 "por %%mm4, %%mm3 \n\t"
689 "por %%mm2, %%mm0 \n\t" 689 "por %%mm2, %%mm0 \n\t"
690 "por %%mm5, %%mm3 \n\t" 690 "por %%mm5, %%mm3 \n\t"
691 "psllq $16, %%mm3 \n\t" 691 "psllq $16, %%mm3 \n\t"
692 "por %%mm3, %%mm0 \n\t" 692 "por %%mm3, %%mm0 \n\t"
693 MOVNTQ" %%mm0, %0 \n\t" 693 MOVNTQ" %%mm0, %0 \n\t"
694 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 694 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
695 d += 4; 695 d += 4;
696 s += 12; 696 s += 12;
697 } 697 }
698 __asm__ volatile(SFENCE:::"memory"); 698 __asm__ volatile(SFENCE:::"memory");
699 __asm__ volatile(EMMS:::"memory"); 699 __asm__ volatile(EMMS:::"memory");
724 ::"m"(red_16mask),"m"(green_16mask)); 724 ::"m"(red_16mask),"m"(green_16mask));
725 mm_end = end - 15; 725 mm_end = end - 15;
726 while (s < mm_end) 726 while (s < mm_end)
727 { 727 {
728 __asm__ volatile( 728 __asm__ volatile(
729 PREFETCH" 32%1 \n\t" 729 PREFETCH" 32%1 \n\t"
730 "movd %1, %%mm0 \n\t" 730 "movd %1, %%mm0 \n\t"
731 "movd 3%1, %%mm3 \n\t" 731 "movd 3%1, %%mm3 \n\t"
732 "punpckldq 6%1, %%mm0 \n\t" 732 "punpckldq 6%1, %%mm0 \n\t"
733 "punpckldq 9%1, %%mm3 \n\t" 733 "punpckldq 9%1, %%mm3 \n\t"
734 "movq %%mm0, %%mm1 \n\t" 734 "movq %%mm0, %%mm1 \n\t"
735 "movq %%mm0, %%mm2 \n\t" 735 "movq %%mm0, %%mm2 \n\t"
736 "movq %%mm3, %%mm4 \n\t" 736 "movq %%mm3, %%mm4 \n\t"
737 "movq %%mm3, %%mm5 \n\t" 737 "movq %%mm3, %%mm5 \n\t"
738 "psllq $8, %%mm0 \n\t" 738 "psllq $8, %%mm0 \n\t"
739 "psllq $8, %%mm3 \n\t" 739 "psllq $8, %%mm3 \n\t"
740 "pand %%mm7, %%mm0 \n\t" 740 "pand %%mm7, %%mm0 \n\t"
741 "pand %%mm7, %%mm3 \n\t" 741 "pand %%mm7, %%mm3 \n\t"
742 "psrlq $5, %%mm1 \n\t" 742 "psrlq $5, %%mm1 \n\t"
743 "psrlq $5, %%mm4 \n\t" 743 "psrlq $5, %%mm4 \n\t"
744 "pand %%mm6, %%mm1 \n\t" 744 "pand %%mm6, %%mm1 \n\t"
745 "pand %%mm6, %%mm4 \n\t" 745 "pand %%mm6, %%mm4 \n\t"
746 "psrlq $19, %%mm2 \n\t" 746 "psrlq $19, %%mm2 \n\t"
747 "psrlq $19, %%mm5 \n\t" 747 "psrlq $19, %%mm5 \n\t"
748 "pand %2, %%mm2 \n\t" 748 "pand %2, %%mm2 \n\t"
749 "pand %2, %%mm5 \n\t" 749 "pand %2, %%mm5 \n\t"
750 "por %%mm1, %%mm0 \n\t" 750 "por %%mm1, %%mm0 \n\t"
751 "por %%mm4, %%mm3 \n\t" 751 "por %%mm4, %%mm3 \n\t"
752 "por %%mm2, %%mm0 \n\t" 752 "por %%mm2, %%mm0 \n\t"
753 "por %%mm5, %%mm3 \n\t" 753 "por %%mm5, %%mm3 \n\t"
754 "psllq $16, %%mm3 \n\t" 754 "psllq $16, %%mm3 \n\t"
755 "por %%mm3, %%mm0 \n\t" 755 "por %%mm3, %%mm0 \n\t"
756 MOVNTQ" %%mm0, %0 \n\t" 756 MOVNTQ" %%mm0, %0 \n\t"
757 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 757 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
758 d += 4; 758 d += 4;
759 s += 12; 759 s += 12;
760 } 760 }
761 __asm__ volatile(SFENCE:::"memory"); 761 __asm__ volatile(SFENCE:::"memory");
762 __asm__ volatile(EMMS:::"memory"); 762 __asm__ volatile(EMMS:::"memory");
787 ::"m"(red_15mask),"m"(green_15mask)); 787 ::"m"(red_15mask),"m"(green_15mask));
788 mm_end = end - 11; 788 mm_end = end - 11;
789 while (s < mm_end) 789 while (s < mm_end)
790 { 790 {
791 __asm__ volatile( 791 __asm__ volatile(
792 PREFETCH" 32%1 \n\t" 792 PREFETCH" 32%1 \n\t"
793 "movd %1, %%mm0 \n\t" 793 "movd %1, %%mm0 \n\t"
794 "movd 3%1, %%mm3 \n\t" 794 "movd 3%1, %%mm3 \n\t"
795 "punpckldq 6%1, %%mm0 \n\t" 795 "punpckldq 6%1, %%mm0 \n\t"
796 "punpckldq 9%1, %%mm3 \n\t" 796 "punpckldq 9%1, %%mm3 \n\t"
797 "movq %%mm0, %%mm1 \n\t" 797 "movq %%mm0, %%mm1 \n\t"
798 "movq %%mm0, %%mm2 \n\t" 798 "movq %%mm0, %%mm2 \n\t"
799 "movq %%mm3, %%mm4 \n\t" 799 "movq %%mm3, %%mm4 \n\t"
800 "movq %%mm3, %%mm5 \n\t" 800 "movq %%mm3, %%mm5 \n\t"
801 "psrlq $3, %%mm0 \n\t" 801 "psrlq $3, %%mm0 \n\t"
802 "psrlq $3, %%mm3 \n\t" 802 "psrlq $3, %%mm3 \n\t"
803 "pand %2, %%mm0 \n\t" 803 "pand %2, %%mm0 \n\t"
804 "pand %2, %%mm3 \n\t" 804 "pand %2, %%mm3 \n\t"
805 "psrlq $6, %%mm1 \n\t" 805 "psrlq $6, %%mm1 \n\t"
806 "psrlq $6, %%mm4 \n\t" 806 "psrlq $6, %%mm4 \n\t"
807 "pand %%mm6, %%mm1 \n\t" 807 "pand %%mm6, %%mm1 \n\t"
808 "pand %%mm6, %%mm4 \n\t" 808 "pand %%mm6, %%mm4 \n\t"
809 "psrlq $9, %%mm2 \n\t" 809 "psrlq $9, %%mm2 \n\t"
810 "psrlq $9, %%mm5 \n\t" 810 "psrlq $9, %%mm5 \n\t"
811 "pand %%mm7, %%mm2 \n\t" 811 "pand %%mm7, %%mm2 \n\t"
812 "pand %%mm7, %%mm5 \n\t" 812 "pand %%mm7, %%mm5 \n\t"
813 "por %%mm1, %%mm0 \n\t" 813 "por %%mm1, %%mm0 \n\t"
814 "por %%mm4, %%mm3 \n\t" 814 "por %%mm4, %%mm3 \n\t"
815 "por %%mm2, %%mm0 \n\t" 815 "por %%mm2, %%mm0 \n\t"
816 "por %%mm5, %%mm3 \n\t" 816 "por %%mm5, %%mm3 \n\t"
817 "psllq $16, %%mm3 \n\t" 817 "psllq $16, %%mm3 \n\t"
818 "por %%mm3, %%mm0 \n\t" 818 "por %%mm3, %%mm0 \n\t"
819 MOVNTQ" %%mm0, %0 \n\t" 819 MOVNTQ" %%mm0, %0 \n\t"
820 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 820 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
821 d += 4; 821 d += 4;
822 s += 12; 822 s += 12;
823 } 823 }
824 __asm__ volatile(SFENCE:::"memory"); 824 __asm__ volatile(SFENCE:::"memory");
825 __asm__ volatile(EMMS:::"memory"); 825 __asm__ volatile(EMMS:::"memory");
850 ::"m"(red_15mask),"m"(green_15mask)); 850 ::"m"(red_15mask),"m"(green_15mask));
851 mm_end = end - 15; 851 mm_end = end - 15;
852 while (s < mm_end) 852 while (s < mm_end)
853 { 853 {
854 __asm__ volatile( 854 __asm__ volatile(
855 PREFETCH" 32%1 \n\t" 855 PREFETCH" 32%1 \n\t"
856 "movd %1, %%mm0 \n\t" 856 "movd %1, %%mm0 \n\t"
857 "movd 3%1, %%mm3 \n\t" 857 "movd 3%1, %%mm3 \n\t"
858 "punpckldq 6%1, %%mm0 \n\t" 858 "punpckldq 6%1, %%mm0 \n\t"
859 "punpckldq 9%1, %%mm3 \n\t" 859 "punpckldq 9%1, %%mm3 \n\t"
860 "movq %%mm0, %%mm1 \n\t" 860 "movq %%mm0, %%mm1 \n\t"
861 "movq %%mm0, %%mm2 \n\t" 861 "movq %%mm0, %%mm2 \n\t"
862 "movq %%mm3, %%mm4 \n\t" 862 "movq %%mm3, %%mm4 \n\t"
863 "movq %%mm3, %%mm5 \n\t" 863 "movq %%mm3, %%mm5 \n\t"
864 "psllq $7, %%mm0 \n\t" 864 "psllq $7, %%mm0 \n\t"
865 "psllq $7, %%mm3 \n\t" 865 "psllq $7, %%mm3 \n\t"
866 "pand %%mm7, %%mm0 \n\t" 866 "pand %%mm7, %%mm0 \n\t"
867 "pand %%mm7, %%mm3 \n\t" 867 "pand %%mm7, %%mm3 \n\t"
868 "psrlq $6, %%mm1 \n\t" 868 "psrlq $6, %%mm1 \n\t"
869 "psrlq $6, %%mm4 \n\t" 869 "psrlq $6, %%mm4 \n\t"
870 "pand %%mm6, %%mm1 \n\t" 870 "pand %%mm6, %%mm1 \n\t"
871 "pand %%mm6, %%mm4 \n\t" 871 "pand %%mm6, %%mm4 \n\t"
872 "psrlq $19, %%mm2 \n\t" 872 "psrlq $19, %%mm2 \n\t"
873 "psrlq $19, %%mm5 \n\t" 873 "psrlq $19, %%mm5 \n\t"
874 "pand %2, %%mm2 \n\t" 874 "pand %2, %%mm2 \n\t"
875 "pand %2, %%mm5 \n\t" 875 "pand %2, %%mm5 \n\t"
876 "por %%mm1, %%mm0 \n\t" 876 "por %%mm1, %%mm0 \n\t"
877 "por %%mm4, %%mm3 \n\t" 877 "por %%mm4, %%mm3 \n\t"
878 "por %%mm2, %%mm0 \n\t" 878 "por %%mm2, %%mm0 \n\t"
879 "por %%mm5, %%mm3 \n\t" 879 "por %%mm5, %%mm3 \n\t"
880 "psllq $16, %%mm3 \n\t" 880 "psllq $16, %%mm3 \n\t"
881 "por %%mm3, %%mm0 \n\t" 881 "por %%mm3, %%mm0 \n\t"
882 MOVNTQ" %%mm0, %0 \n\t" 882 MOVNTQ" %%mm0, %0 \n\t"
883 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 883 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
884 d += 4; 884 d += 4;
885 s += 12; 885 s += 12;
886 } 886 }
887 __asm__ volatile(SFENCE:::"memory"); 887 __asm__ volatile(SFENCE:::"memory");
888 __asm__ volatile(EMMS:::"memory"); 888 __asm__ volatile(EMMS:::"memory");
930 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 930 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
931 mm_end = end - 7; 931 mm_end = end - 7;
932 while (s < mm_end) 932 while (s < mm_end)
933 { 933 {
934 __asm__ volatile( 934 __asm__ volatile(
935 PREFETCH" 32%1 \n\t" 935 PREFETCH" 32%1 \n\t"
936 "movq %1, %%mm0 \n\t" 936 "movq %1, %%mm0 \n\t"
937 "movq %1, %%mm1 \n\t" 937 "movq %1, %%mm1 \n\t"
938 "movq %1, %%mm2 \n\t" 938 "movq %1, %%mm2 \n\t"
939 "pand %2, %%mm0 \n\t" 939 "pand %2, %%mm0 \n\t"
940 "pand %3, %%mm1 \n\t" 940 "pand %3, %%mm1 \n\t"
941 "pand %4, %%mm2 \n\t" 941 "pand %4, %%mm2 \n\t"
942 "psllq $3, %%mm0 \n\t" 942 "psllq $3, %%mm0 \n\t"
943 "psrlq $2, %%mm1 \n\t" 943 "psrlq $2, %%mm1 \n\t"
944 "psrlq $7, %%mm2 \n\t" 944 "psrlq $7, %%mm2 \n\t"
945 "movq %%mm0, %%mm3 \n\t" 945 "movq %%mm0, %%mm3 \n\t"
946 "movq %%mm1, %%mm4 \n\t" 946 "movq %%mm1, %%mm4 \n\t"
947 "movq %%mm2, %%mm5 \n\t" 947 "movq %%mm2, %%mm5 \n\t"
948 "punpcklwd %5, %%mm0 \n\t" 948 "punpcklwd %5, %%mm0 \n\t"
949 "punpcklwd %5, %%mm1 \n\t" 949 "punpcklwd %5, %%mm1 \n\t"
950 "punpcklwd %5, %%mm2 \n\t" 950 "punpcklwd %5, %%mm2 \n\t"
951 "punpckhwd %5, %%mm3 \n\t" 951 "punpckhwd %5, %%mm3 \n\t"
952 "punpckhwd %5, %%mm4 \n\t" 952 "punpckhwd %5, %%mm4 \n\t"
953 "punpckhwd %5, %%mm5 \n\t" 953 "punpckhwd %5, %%mm5 \n\t"
954 "psllq $8, %%mm1 \n\t" 954 "psllq $8, %%mm1 \n\t"
955 "psllq $16, %%mm2 \n\t" 955 "psllq $16, %%mm2 \n\t"
956 "por %%mm1, %%mm0 \n\t" 956 "por %%mm1, %%mm0 \n\t"
957 "por %%mm2, %%mm0 \n\t" 957 "por %%mm2, %%mm0 \n\t"
958 "psllq $8, %%mm4 \n\t" 958 "psllq $8, %%mm4 \n\t"
959 "psllq $16, %%mm5 \n\t" 959 "psllq $16, %%mm5 \n\t"
960 "por %%mm4, %%mm3 \n\t" 960 "por %%mm4, %%mm3 \n\t"
961 "por %%mm5, %%mm3 \n\t" 961 "por %%mm5, %%mm3 \n\t"
962 962
963 "movq %%mm0, %%mm6 \n\t" 963 "movq %%mm0, %%mm6 \n\t"
964 "movq %%mm3, %%mm7 \n\t" 964 "movq %%mm3, %%mm7 \n\t"
965 965
966 "movq 8%1, %%mm0 \n\t" 966 "movq 8%1, %%mm0 \n\t"
967 "movq 8%1, %%mm1 \n\t" 967 "movq 8%1, %%mm1 \n\t"
968 "movq 8%1, %%mm2 \n\t" 968 "movq 8%1, %%mm2 \n\t"
969 "pand %2, %%mm0 \n\t" 969 "pand %2, %%mm0 \n\t"
970 "pand %3, %%mm1 \n\t" 970 "pand %3, %%mm1 \n\t"
971 "pand %4, %%mm2 \n\t" 971 "pand %4, %%mm2 \n\t"
972 "psllq $3, %%mm0 \n\t" 972 "psllq $3, %%mm0 \n\t"
973 "psrlq $2, %%mm1 \n\t" 973 "psrlq $2, %%mm1 \n\t"
974 "psrlq $7, %%mm2 \n\t" 974 "psrlq $7, %%mm2 \n\t"
975 "movq %%mm0, %%mm3 \n\t" 975 "movq %%mm0, %%mm3 \n\t"
976 "movq %%mm1, %%mm4 \n\t" 976 "movq %%mm1, %%mm4 \n\t"
977 "movq %%mm2, %%mm5 \n\t" 977 "movq %%mm2, %%mm5 \n\t"
978 "punpcklwd %5, %%mm0 \n\t" 978 "punpcklwd %5, %%mm0 \n\t"
979 "punpcklwd %5, %%mm1 \n\t" 979 "punpcklwd %5, %%mm1 \n\t"
980 "punpcklwd %5, %%mm2 \n\t" 980 "punpcklwd %5, %%mm2 \n\t"
981 "punpckhwd %5, %%mm3 \n\t" 981 "punpckhwd %5, %%mm3 \n\t"
982 "punpckhwd %5, %%mm4 \n\t" 982 "punpckhwd %5, %%mm4 \n\t"
983 "punpckhwd %5, %%mm5 \n\t" 983 "punpckhwd %5, %%mm5 \n\t"
984 "psllq $8, %%mm1 \n\t" 984 "psllq $8, %%mm1 \n\t"
985 "psllq $16, %%mm2 \n\t" 985 "psllq $16, %%mm2 \n\t"
986 "por %%mm1, %%mm0 \n\t" 986 "por %%mm1, %%mm0 \n\t"
987 "por %%mm2, %%mm0 \n\t" 987 "por %%mm2, %%mm0 \n\t"
988 "psllq $8, %%mm4 \n\t" 988 "psllq $8, %%mm4 \n\t"
989 "psllq $16, %%mm5 \n\t" 989 "psllq $16, %%mm5 \n\t"
990 "por %%mm4, %%mm3 \n\t" 990 "por %%mm4, %%mm3 \n\t"
991 "por %%mm5, %%mm3 \n\t" 991 "por %%mm5, %%mm3 \n\t"
992 992
993 :"=m"(*d) 993 :"=m"(*d)
994 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) 994 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
995 :"memory"); 995 :"memory");
996 /* borrowed 32 to 24 */ 996 /* borrowed 32 to 24 */
997 __asm__ volatile( 997 __asm__ volatile(
998 "movq %%mm0, %%mm4 \n\t" 998 "movq %%mm0, %%mm4 \n\t"
999 "movq %%mm3, %%mm5 \n\t" 999 "movq %%mm3, %%mm5 \n\t"
1000 "movq %%mm6, %%mm0 \n\t" 1000 "movq %%mm6, %%mm0 \n\t"
1001 "movq %%mm7, %%mm1 \n\t" 1001 "movq %%mm7, %%mm1 \n\t"
1002 1002
1003 "movq %%mm4, %%mm6 \n\t" 1003 "movq %%mm4, %%mm6 \n\t"
1004 "movq %%mm5, %%mm7 \n\t" 1004 "movq %%mm5, %%mm7 \n\t"
1005 "movq %%mm0, %%mm2 \n\t" 1005 "movq %%mm0, %%mm2 \n\t"
1006 "movq %%mm1, %%mm3 \n\t" 1006 "movq %%mm1, %%mm3 \n\t"
1007 1007
1008 "psrlq $8, %%mm2 \n\t" 1008 "psrlq $8, %%mm2 \n\t"
1009 "psrlq $8, %%mm3 \n\t" 1009 "psrlq $8, %%mm3 \n\t"
1010 "psrlq $8, %%mm6 \n\t" 1010 "psrlq $8, %%mm6 \n\t"
1011 "psrlq $8, %%mm7 \n\t" 1011 "psrlq $8, %%mm7 \n\t"
1012 "pand %2, %%mm0 \n\t" 1012 "pand %2, %%mm0 \n\t"
1013 "pand %2, %%mm1 \n\t" 1013 "pand %2, %%mm1 \n\t"
1014 "pand %2, %%mm4 \n\t" 1014 "pand %2, %%mm4 \n\t"
1015 "pand %2, %%mm5 \n\t" 1015 "pand %2, %%mm5 \n\t"
1016 "pand %3, %%mm2 \n\t" 1016 "pand %3, %%mm2 \n\t"
1017 "pand %3, %%mm3 \n\t" 1017 "pand %3, %%mm3 \n\t"
1018 "pand %3, %%mm6 \n\t" 1018 "pand %3, %%mm6 \n\t"
1019 "pand %3, %%mm7 \n\t" 1019 "pand %3, %%mm7 \n\t"
1020 "por %%mm2, %%mm0 \n\t" 1020 "por %%mm2, %%mm0 \n\t"
1021 "por %%mm3, %%mm1 \n\t" 1021 "por %%mm3, %%mm1 \n\t"
1022 "por %%mm6, %%mm4 \n\t" 1022 "por %%mm6, %%mm4 \n\t"
1023 "por %%mm7, %%mm5 \n\t" 1023 "por %%mm7, %%mm5 \n\t"
1024 1024
1025 "movq %%mm1, %%mm2 \n\t" 1025 "movq %%mm1, %%mm2 \n\t"
1026 "movq %%mm4, %%mm3 \n\t" 1026 "movq %%mm4, %%mm3 \n\t"
1027 "psllq $48, %%mm2 \n\t" 1027 "psllq $48, %%mm2 \n\t"
1028 "psllq $32, %%mm3 \n\t" 1028 "psllq $32, %%mm3 \n\t"
1029 "pand %4, %%mm2 \n\t" 1029 "pand %4, %%mm2 \n\t"
1030 "pand %5, %%mm3 \n\t" 1030 "pand %5, %%mm3 \n\t"
1031 "por %%mm2, %%mm0 \n\t" 1031 "por %%mm2, %%mm0 \n\t"
1032 "psrlq $16, %%mm1 \n\t" 1032 "psrlq $16, %%mm1 \n\t"
1033 "psrlq $32, %%mm4 \n\t" 1033 "psrlq $32, %%mm4 \n\t"
1034 "psllq $16, %%mm5 \n\t" 1034 "psllq $16, %%mm5 \n\t"
1035 "por %%mm3, %%mm1 \n\t" 1035 "por %%mm3, %%mm1 \n\t"
1036 "pand %6, %%mm5 \n\t" 1036 "pand %6, %%mm5 \n\t"
1037 "por %%mm5, %%mm4 \n\t" 1037 "por %%mm5, %%mm4 \n\t"
1038 1038
1039 MOVNTQ" %%mm0, %0 \n\t" 1039 MOVNTQ" %%mm0, %0 \n\t"
1040 MOVNTQ" %%mm1, 8%0 \n\t" 1040 MOVNTQ" %%mm1, 8%0 \n\t"
1041 MOVNTQ" %%mm4, 16%0" 1041 MOVNTQ" %%mm4, 16%0"
1042 1042
1043 :"=m"(*d) 1043 :"=m"(*d)
1044 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 1044 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1045 :"memory"); 1045 :"memory");
1046 d += 24; 1046 d += 24;
1047 s += 8; 1047 s += 8;
1048 } 1048 }
1049 __asm__ volatile(SFENCE:::"memory"); 1049 __asm__ volatile(SFENCE:::"memory");
1050 __asm__ volatile(EMMS:::"memory"); 1050 __asm__ volatile(EMMS:::"memory");
1072 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 1072 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1073 mm_end = end - 7; 1073 mm_end = end - 7;
1074 while (s < mm_end) 1074 while (s < mm_end)
1075 { 1075 {
1076 __asm__ volatile( 1076 __asm__ volatile(
1077 PREFETCH" 32%1 \n\t" 1077 PREFETCH" 32%1 \n\t"
1078 "movq %1, %%mm0 \n\t" 1078 "movq %1, %%mm0 \n\t"
1079 "movq %1, %%mm1 \n\t" 1079 "movq %1, %%mm1 \n\t"
1080 "movq %1, %%mm2 \n\t" 1080 "movq %1, %%mm2 \n\t"
1081 "pand %2, %%mm0 \n\t" 1081 "pand %2, %%mm0 \n\t"
1082 "pand %3, %%mm1 \n\t" 1082 "pand %3, %%mm1 \n\t"
1083 "pand %4, %%mm2 \n\t" 1083 "pand %4, %%mm2 \n\t"
1084 "psllq $3, %%mm0 \n\t" 1084 "psllq $3, %%mm0 \n\t"
1085 "psrlq $3, %%mm1 \n\t" 1085 "psrlq $3, %%mm1 \n\t"
1086 "psrlq $8, %%mm2 \n\t" 1086 "psrlq $8, %%mm2 \n\t"
1087 "movq %%mm0, %%mm3 \n\t" 1087 "movq %%mm0, %%mm3 \n\t"
1088 "movq %%mm1, %%mm4 \n\t" 1088 "movq %%mm1, %%mm4 \n\t"
1089 "movq %%mm2, %%mm5 \n\t" 1089 "movq %%mm2, %%mm5 \n\t"
1090 "punpcklwd %5, %%mm0 \n\t" 1090 "punpcklwd %5, %%mm0 \n\t"
1091 "punpcklwd %5, %%mm1 \n\t" 1091 "punpcklwd %5, %%mm1 \n\t"
1092 "punpcklwd %5, %%mm2 \n\t" 1092 "punpcklwd %5, %%mm2 \n\t"
1093 "punpckhwd %5, %%mm3 \n\t" 1093 "punpckhwd %5, %%mm3 \n\t"
1094 "punpckhwd %5, %%mm4 \n\t" 1094 "punpckhwd %5, %%mm4 \n\t"
1095 "punpckhwd %5, %%mm5 \n\t" 1095 "punpckhwd %5, %%mm5 \n\t"
1096 "psllq $8, %%mm1 \n\t" 1096 "psllq $8, %%mm1 \n\t"
1097 "psllq $16, %%mm2 \n\t" 1097 "psllq $16, %%mm2 \n\t"
1098 "por %%mm1, %%mm0 \n\t" 1098 "por %%mm1, %%mm0 \n\t"
1099 "por %%mm2, %%mm0 \n\t" 1099 "por %%mm2, %%mm0 \n\t"
1100 "psllq $8, %%mm4 \n\t" 1100 "psllq $8, %%mm4 \n\t"
1101 "psllq $16, %%mm5 \n\t" 1101 "psllq $16, %%mm5 \n\t"
1102 "por %%mm4, %%mm3 \n\t" 1102 "por %%mm4, %%mm3 \n\t"
1103 "por %%mm5, %%mm3 \n\t" 1103 "por %%mm5, %%mm3 \n\t"
1104 1104
1105 "movq %%mm0, %%mm6 \n\t" 1105 "movq %%mm0, %%mm6 \n\t"
1106 "movq %%mm3, %%mm7 \n\t" 1106 "movq %%mm3, %%mm7 \n\t"
1107 1107
1108 "movq 8%1, %%mm0 \n\t" 1108 "movq 8%1, %%mm0 \n\t"
1109 "movq 8%1, %%mm1 \n\t" 1109 "movq 8%1, %%mm1 \n\t"
1110 "movq 8%1, %%mm2 \n\t" 1110 "movq 8%1, %%mm2 \n\t"
1111 "pand %2, %%mm0 \n\t" 1111 "pand %2, %%mm0 \n\t"
1112 "pand %3, %%mm1 \n\t" 1112 "pand %3, %%mm1 \n\t"
1113 "pand %4, %%mm2 \n\t" 1113 "pand %4, %%mm2 \n\t"
1114 "psllq $3, %%mm0 \n\t" 1114 "psllq $3, %%mm0 \n\t"
1115 "psrlq $3, %%mm1 \n\t" 1115 "psrlq $3, %%mm1 \n\t"
1116 "psrlq $8, %%mm2 \n\t" 1116 "psrlq $8, %%mm2 \n\t"
1117 "movq %%mm0, %%mm3 \n\t" 1117 "movq %%mm0, %%mm3 \n\t"
1118 "movq %%mm1, %%mm4 \n\t" 1118 "movq %%mm1, %%mm4 \n\t"
1119 "movq %%mm2, %%mm5 \n\t" 1119 "movq %%mm2, %%mm5 \n\t"
1120 "punpcklwd %5, %%mm0 \n\t" 1120 "punpcklwd %5, %%mm0 \n\t"
1121 "punpcklwd %5, %%mm1 \n\t" 1121 "punpcklwd %5, %%mm1 \n\t"
1122 "punpcklwd %5, %%mm2 \n\t" 1122 "punpcklwd %5, %%mm2 \n\t"
1123 "punpckhwd %5, %%mm3 \n\t" 1123 "punpckhwd %5, %%mm3 \n\t"
1124 "punpckhwd %5, %%mm4 \n\t" 1124 "punpckhwd %5, %%mm4 \n\t"
1125 "punpckhwd %5, %%mm5 \n\t" 1125 "punpckhwd %5, %%mm5 \n\t"
1126 "psllq $8, %%mm1 \n\t" 1126 "psllq $8, %%mm1 \n\t"
1127 "psllq $16, %%mm2 \n\t" 1127 "psllq $16, %%mm2 \n\t"
1128 "por %%mm1, %%mm0 \n\t" 1128 "por %%mm1, %%mm0 \n\t"
1129 "por %%mm2, %%mm0 \n\t" 1129 "por %%mm2, %%mm0 \n\t"
1130 "psllq $8, %%mm4 \n\t" 1130 "psllq $8, %%mm4 \n\t"
1131 "psllq $16, %%mm5 \n\t" 1131 "psllq $16, %%mm5 \n\t"
1132 "por %%mm4, %%mm3 \n\t" 1132 "por %%mm4, %%mm3 \n\t"
1133 "por %%mm5, %%mm3 \n\t" 1133 "por %%mm5, %%mm3 \n\t"
1134 :"=m"(*d) 1134 :"=m"(*d)
1135 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) 1135 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1136 :"memory"); 1136 :"memory");
1137 /* borrowed 32 to 24 */ 1137 /* borrowed 32 to 24 */
1138 __asm__ volatile( 1138 __asm__ volatile(
1139 "movq %%mm0, %%mm4 \n\t" 1139 "movq %%mm0, %%mm4 \n\t"
1140 "movq %%mm3, %%mm5 \n\t" 1140 "movq %%mm3, %%mm5 \n\t"
1141 "movq %%mm6, %%mm0 \n\t" 1141 "movq %%mm6, %%mm0 \n\t"
1142 "movq %%mm7, %%mm1 \n\t" 1142 "movq %%mm7, %%mm1 \n\t"
1143 1143
1144 "movq %%mm4, %%mm6 \n\t" 1144 "movq %%mm4, %%mm6 \n\t"
1145 "movq %%mm5, %%mm7 \n\t" 1145 "movq %%mm5, %%mm7 \n\t"
1146 "movq %%mm0, %%mm2 \n\t" 1146 "movq %%mm0, %%mm2 \n\t"
1147 "movq %%mm1, %%mm3 \n\t" 1147 "movq %%mm1, %%mm3 \n\t"
1148 1148
1149 "psrlq $8, %%mm2 \n\t" 1149 "psrlq $8, %%mm2 \n\t"
1150 "psrlq $8, %%mm3 \n\t" 1150 "psrlq $8, %%mm3 \n\t"
1151 "psrlq $8, %%mm6 \n\t" 1151 "psrlq $8, %%mm6 \n\t"
1152 "psrlq $8, %%mm7 \n\t" 1152 "psrlq $8, %%mm7 \n\t"
1153 "pand %2, %%mm0 \n\t" 1153 "pand %2, %%mm0 \n\t"
1154 "pand %2, %%mm1 \n\t" 1154 "pand %2, %%mm1 \n\t"
1155 "pand %2, %%mm4 \n\t" 1155 "pand %2, %%mm4 \n\t"
1156 "pand %2, %%mm5 \n\t" 1156 "pand %2, %%mm5 \n\t"
1157 "pand %3, %%mm2 \n\t" 1157 "pand %3, %%mm2 \n\t"
1158 "pand %3, %%mm3 \n\t" 1158 "pand %3, %%mm3 \n\t"
1159 "pand %3, %%mm6 \n\t" 1159 "pand %3, %%mm6 \n\t"
1160 "pand %3, %%mm7 \n\t" 1160 "pand %3, %%mm7 \n\t"
1161 "por %%mm2, %%mm0 \n\t" 1161 "por %%mm2, %%mm0 \n\t"
1162 "por %%mm3, %%mm1 \n\t" 1162 "por %%mm3, %%mm1 \n\t"
1163 "por %%mm6, %%mm4 \n\t" 1163 "por %%mm6, %%mm4 \n\t"
1164 "por %%mm7, %%mm5 \n\t" 1164 "por %%mm7, %%mm5 \n\t"
1165 1165
1166 "movq %%mm1, %%mm2 \n\t" 1166 "movq %%mm1, %%mm2 \n\t"
1167 "movq %%mm4, %%mm3 \n\t" 1167 "movq %%mm4, %%mm3 \n\t"
1168 "psllq $48, %%mm2 \n\t" 1168 "psllq $48, %%mm2 \n\t"
1169 "psllq $32, %%mm3 \n\t" 1169 "psllq $32, %%mm3 \n\t"
1170 "pand %4, %%mm2 \n\t" 1170 "pand %4, %%mm2 \n\t"
1171 "pand %5, %%mm3 \n\t" 1171 "pand %5, %%mm3 \n\t"
1172 "por %%mm2, %%mm0 \n\t" 1172 "por %%mm2, %%mm0 \n\t"
1173 "psrlq $16, %%mm1 \n\t" 1173 "psrlq $16, %%mm1 \n\t"
1174 "psrlq $32, %%mm4 \n\t" 1174 "psrlq $32, %%mm4 \n\t"
1175 "psllq $16, %%mm5 \n\t" 1175 "psllq $16, %%mm5 \n\t"
1176 "por %%mm3, %%mm1 \n\t" 1176 "por %%mm3, %%mm1 \n\t"
1177 "pand %6, %%mm5 \n\t" 1177 "pand %6, %%mm5 \n\t"
1178 "por %%mm5, %%mm4 \n\t" 1178 "por %%mm5, %%mm4 \n\t"
1179 1179
1180 MOVNTQ" %%mm0, %0 \n\t" 1180 MOVNTQ" %%mm0, %0 \n\t"
1181 MOVNTQ" %%mm1, 8%0 \n\t" 1181 MOVNTQ" %%mm1, 8%0 \n\t"
1182 MOVNTQ" %%mm4, 16%0" 1182 MOVNTQ" %%mm4, 16%0"
1183 1183
1184 :"=m"(*d) 1184 :"=m"(*d)
1185 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 1185 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1186 :"memory"); 1186 :"memory");
1187 d += 24; 1187 d += 24;
1188 s += 8; 1188 s += 8;
1189 } 1189 }
1190 __asm__ volatile(SFENCE:::"memory"); 1190 __asm__ volatile(SFENCE:::"memory");
1191 __asm__ volatile(EMMS:::"memory"); 1191 __asm__ volatile(EMMS:::"memory");
1234 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); 1234 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1235 mm_end = end - 3; 1235 mm_end = end - 3;
1236 while (s < mm_end) 1236 while (s < mm_end)
1237 { 1237 {
1238 __asm__ volatile( 1238 __asm__ volatile(
1239 PREFETCH" 32%1 \n\t" 1239 PREFETCH" 32%1 \n\t"
1240 "movq %1, %%mm0 \n\t" 1240 "movq %1, %%mm0 \n\t"
1241 "movq %1, %%mm1 \n\t" 1241 "movq %1, %%mm1 \n\t"
1242 "movq %1, %%mm2 \n\t" 1242 "movq %1, %%mm2 \n\t"
1243 "pand %2, %%mm0 \n\t" 1243 "pand %2, %%mm0 \n\t"
1244 "pand %3, %%mm1 \n\t" 1244 "pand %3, %%mm1 \n\t"
1245 "pand %4, %%mm2 \n\t" 1245 "pand %4, %%mm2 \n\t"
1246 "psllq $3, %%mm0 \n\t" 1246 "psllq $3, %%mm0 \n\t"
1247 "psrlq $2, %%mm1 \n\t" 1247 "psrlq $2, %%mm1 \n\t"
1248 "psrlq $7, %%mm2 \n\t" 1248 "psrlq $7, %%mm2 \n\t"
1249 PACK_RGB32 1249 PACK_RGB32
1250 :"=m"(*d) 1250 :"=m"(*d)
1251 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) 1251 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1252 :"memory"); 1252 :"memory");
1253 d += 16; 1253 d += 16;
1254 s += 4; 1254 s += 4;
1255 } 1255 }
1256 __asm__ volatile(SFENCE:::"memory"); 1256 __asm__ volatile(SFENCE:::"memory");
1257 __asm__ volatile(EMMS:::"memory"); 1257 __asm__ volatile(EMMS:::"memory");
1289 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); 1289 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1290 mm_end = end - 3; 1290 mm_end = end - 3;
1291 while (s < mm_end) 1291 while (s < mm_end)
1292 { 1292 {
1293 __asm__ volatile( 1293 __asm__ volatile(
1294 PREFETCH" 32%1 \n\t" 1294 PREFETCH" 32%1 \n\t"
1295 "movq %1, %%mm0 \n\t" 1295 "movq %1, %%mm0 \n\t"
1296 "movq %1, %%mm1 \n\t" 1296 "movq %1, %%mm1 \n\t"
1297 "movq %1, %%mm2 \n\t" 1297 "movq %1, %%mm2 \n\t"
1298 "pand %2, %%mm0 \n\t" 1298 "pand %2, %%mm0 \n\t"
1299 "pand %3, %%mm1 \n\t" 1299 "pand %3, %%mm1 \n\t"
1300 "pand %4, %%mm2 \n\t" 1300 "pand %4, %%mm2 \n\t"
1301 "psllq $3, %%mm0 \n\t" 1301 "psllq $3, %%mm0 \n\t"
1302 "psrlq $3, %%mm1 \n\t" 1302 "psrlq $3, %%mm1 \n\t"
1303 "psrlq $8, %%mm2 \n\t" 1303 "psrlq $8, %%mm2 \n\t"
1304 PACK_RGB32 1304 PACK_RGB32
1305 :"=m"(*d) 1305 :"=m"(*d)
1306 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) 1306 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1307 :"memory"); 1307 :"memory");
1308 d += 16; 1308 d += 16;
1309 s += 4; 1309 s += 4;
1310 } 1310 }
1311 __asm__ volatile(SFENCE:::"memory"); 1311 __asm__ volatile(SFENCE:::"memory");
1312 __asm__ volatile(EMMS:::"memory"); 1312 __asm__ volatile(EMMS:::"memory");
1334 x86_reg idx = 15 - src_size; 1334 x86_reg idx = 15 - src_size;
1335 const uint8_t *s = src-idx; 1335 const uint8_t *s = src-idx;
1336 uint8_t *d = dst-idx; 1336 uint8_t *d = dst-idx;
1337 #if HAVE_MMX 1337 #if HAVE_MMX
1338 __asm__ volatile( 1338 __asm__ volatile(
1339 "test %0, %0 \n\t" 1339 "test %0, %0 \n\t"
1340 "jns 2f \n\t" 1340 "jns 2f \n\t"
1341 PREFETCH" (%1, %0) \n\t" 1341 PREFETCH" (%1, %0) \n\t"
1342 "movq %3, %%mm7 \n\t" 1342 "movq %3, %%mm7 \n\t"
1343 "pxor %4, %%mm7 \n\t" 1343 "pxor %4, %%mm7 \n\t"
1344 "movq %%mm7, %%mm6 \n\t" 1344 "movq %%mm7, %%mm6 \n\t"
1345 "pxor %5, %%mm7 \n\t" 1345 "pxor %5, %%mm7 \n\t"
1346 ASMALIGN(4) 1346 ASMALIGN(4)
1347 "1: \n\t" 1347 "1: \n\t"
1348 PREFETCH" 32(%1, %0) \n\t" 1348 PREFETCH" 32(%1, %0) \n\t"
1349 "movq (%1, %0), %%mm0 \n\t" 1349 "movq (%1, %0), %%mm0 \n\t"
1350 "movq 8(%1, %0), %%mm1 \n\t" 1350 "movq 8(%1, %0), %%mm1 \n\t"
1351 # if HAVE_MMX2 1351 # if HAVE_MMX2
1352 "pshufw $177, %%mm0, %%mm3 \n\t" 1352 "pshufw $177, %%mm0, %%mm3 \n\t"
1353 "pshufw $177, %%mm1, %%mm5 \n\t" 1353 "pshufw $177, %%mm1, %%mm5 \n\t"
1354 "pand %%mm7, %%mm0 \n\t" 1354 "pand %%mm7, %%mm0 \n\t"
1355 "pand %%mm6, %%mm3 \n\t" 1355 "pand %%mm6, %%mm3 \n\t"
1356 "pand %%mm7, %%mm1 \n\t" 1356 "pand %%mm7, %%mm1 \n\t"
1357 "pand %%mm6, %%mm5 \n\t" 1357 "pand %%mm6, %%mm5 \n\t"
1358 "por %%mm3, %%mm0 \n\t" 1358 "por %%mm3, %%mm0 \n\t"
1359 "por %%mm5, %%mm1 \n\t" 1359 "por %%mm5, %%mm1 \n\t"
1360 # else 1360 # else
1361 "movq %%mm0, %%mm2 \n\t" 1361 "movq %%mm0, %%mm2 \n\t"
1362 "movq %%mm1, %%mm4 \n\t" 1362 "movq %%mm1, %%mm4 \n\t"
1363 "pand %%mm7, %%mm0 \n\t" 1363 "pand %%mm7, %%mm0 \n\t"
1364 "pand %%mm6, %%mm2 \n\t" 1364 "pand %%mm6, %%mm2 \n\t"
1365 "pand %%mm7, %%mm1 \n\t" 1365 "pand %%mm7, %%mm1 \n\t"
1366 "pand %%mm6, %%mm4 \n\t" 1366 "pand %%mm6, %%mm4 \n\t"
1367 "movq %%mm2, %%mm3 \n\t" 1367 "movq %%mm2, %%mm3 \n\t"
1368 "movq %%mm4, %%mm5 \n\t" 1368 "movq %%mm4, %%mm5 \n\t"
1369 "pslld $16, %%mm2 \n\t" 1369 "pslld $16, %%mm2 \n\t"
1370 "psrld $16, %%mm3 \n\t" 1370 "psrld $16, %%mm3 \n\t"
1371 "pslld $16, %%mm4 \n\t" 1371 "pslld $16, %%mm4 \n\t"
1372 "psrld $16, %%mm5 \n\t" 1372 "psrld $16, %%mm5 \n\t"
1373 "por %%mm2, %%mm0 \n\t" 1373 "por %%mm2, %%mm0 \n\t"
1374 "por %%mm4, %%mm1 \n\t" 1374 "por %%mm4, %%mm1 \n\t"
1375 "por %%mm3, %%mm0 \n\t" 1375 "por %%mm3, %%mm0 \n\t"
1376 "por %%mm5, %%mm1 \n\t" 1376 "por %%mm5, %%mm1 \n\t"
1377 # endif 1377 # endif
1378 MOVNTQ" %%mm0, (%2, %0) \n\t" 1378 MOVNTQ" %%mm0, (%2, %0) \n\t"
1379 MOVNTQ" %%mm1, 8(%2, %0) \n\t" 1379 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1380 "add $16, %0 \n\t" 1380 "add $16, %0 \n\t"
1381 "js 1b \n\t" 1381 "js 1b \n\t"
1382 SFENCE" \n\t" 1382 SFENCE" \n\t"
1383 EMMS" \n\t" 1383 EMMS" \n\t"
1384 "2: \n\t" 1384 "2: \n\t"
1385 : "+&r"(idx) 1385 : "+&r"(idx)
1386 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) 1386 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1387 : "memory"); 1387 : "memory");
1388 #endif 1388 #endif
1389 for (; idx<15; idx+=4) { 1389 for (; idx<15; idx+=4) {
1390 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; 1390 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1391 v &= 0xff00ff; 1391 v &= 0xff00ff;
1392 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); 1392 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1397 { 1397 {
1398 unsigned i; 1398 unsigned i;
1399 #if HAVE_MMX 1399 #if HAVE_MMX
1400 x86_reg mmx_size= 23 - src_size; 1400 x86_reg mmx_size= 23 - src_size;
1401 __asm__ volatile ( 1401 __asm__ volatile (
1402 "test %%"REG_a", %%"REG_a" \n\t" 1402 "test %%"REG_a", %%"REG_a" \n\t"
1403 "jns 2f \n\t" 1403 "jns 2f \n\t"
1404 "movq "MANGLE(mask24r)", %%mm5 \n\t" 1404 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1405 "movq "MANGLE(mask24g)", %%mm6 \n\t" 1405 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1406 "movq "MANGLE(mask24b)", %%mm7 \n\t" 1406 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1407 ASMALIGN(4) 1407 ASMALIGN(4)
1408 "1: \n\t" 1408 "1: \n\t"
1409 PREFETCH" 32(%1, %%"REG_a") \n\t" 1409 PREFETCH" 32(%1, %%"REG_a") \n\t"
1410 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 1410 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1411 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG 1411 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1412 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B 1412 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1413 "psllq $16, %%mm0 \n\t" // 00 BGR BGR 1413 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1414 "pand %%mm5, %%mm0 \n\t" 1414 "pand %%mm5, %%mm0 \n\t"
1415 "pand %%mm6, %%mm1 \n\t" 1415 "pand %%mm6, %%mm1 \n\t"
1416 "pand %%mm7, %%mm2 \n\t" 1416 "pand %%mm7, %%mm2 \n\t"
1417 "por %%mm0, %%mm1 \n\t" 1417 "por %%mm0, %%mm1 \n\t"
1418 "por %%mm2, %%mm1 \n\t" 1418 "por %%mm2, %%mm1 \n\t"
1419 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 1419 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1420 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG 1420 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1421 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B 1421 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1422 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR 1422 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1423 "pand %%mm7, %%mm0 \n\t" 1423 "pand %%mm7, %%mm0 \n\t"
1424 "pand %%mm5, %%mm1 \n\t" 1424 "pand %%mm5, %%mm1 \n\t"
1425 "pand %%mm6, %%mm2 \n\t" 1425 "pand %%mm6, %%mm2 \n\t"
1426 "por %%mm0, %%mm1 \n\t" 1426 "por %%mm0, %%mm1 \n\t"
1427 "por %%mm2, %%mm1 \n\t" 1427 "por %%mm2, %%mm1 \n\t"
1428 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B 1428 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1429 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R 1429 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1430 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR 1430 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1431 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG 1431 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1432 "pand %%mm6, %%mm0 \n\t" 1432 "pand %%mm6, %%mm0 \n\t"
1433 "pand %%mm7, %%mm1 \n\t" 1433 "pand %%mm7, %%mm1 \n\t"
1434 "pand %%mm5, %%mm2 \n\t" 1434 "pand %%mm5, %%mm2 \n\t"
1435 "por %%mm0, %%mm1 \n\t" 1435 "por %%mm0, %%mm1 \n\t"
1436 "por %%mm2, %%mm1 \n\t" 1436 "por %%mm2, %%mm1 \n\t"
1437 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" 1437 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1438 "add $24, %%"REG_a" \n\t" 1438 "add $24, %%"REG_a" \n\t"
1439 " js 1b \n\t" 1439 " js 1b \n\t"
1440 "2: \n\t" 1440 "2: \n\t"
1441 : "+a" (mmx_size) 1441 : "+a" (mmx_size)
1442 : "r" (src-mmx_size), "r"(dst-mmx_size) 1442 : "r" (src-mmx_size), "r"(dst-mmx_size)
1443 ); 1443 );
1444 1444
1445 __asm__ volatile(SFENCE:::"memory"); 1445 __asm__ volatile(SFENCE:::"memory");
1446 __asm__ volatile(EMMS:::"memory"); 1446 __asm__ volatile(EMMS:::"memory");
1447 1447
1472 for (y=0; y<height; y++) 1472 for (y=0; y<height; y++)
1473 { 1473 {
1474 #if HAVE_MMX 1474 #if HAVE_MMX
1475 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) 1475 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1476 __asm__ volatile( 1476 __asm__ volatile(
1477 "xor %%"REG_a", %%"REG_a" \n\t" 1477 "xor %%"REG_a", %%"REG_a" \n\t"
1478 ASMALIGN(4) 1478 ASMALIGN(4)
1479 "1: \n\t" 1479 "1: \n\t"
1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" 1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1481 PREFETCH" 32(%2, %%"REG_a") \n\t" 1481 PREFETCH" 32(%2, %%"REG_a") \n\t"
1482 PREFETCH" 32(%3, %%"REG_a") \n\t" 1482 PREFETCH" 32(%3, %%"REG_a") \n\t"
1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) 1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1484 "movq %%mm0, %%mm2 \n\t" // U(0) 1484 "movq %%mm0, %%mm2 \n\t" // U(0)
1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) 1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1488 1488
1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) 1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) 1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1491 "movq %%mm3, %%mm4 \n\t" // Y(0) 1491 "movq %%mm3, %%mm4 \n\t" // Y(0)
1492 "movq %%mm5, %%mm6 \n\t" // Y(8) 1492 "movq %%mm5, %%mm6 \n\t" // Y(8)
1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) 1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) 1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) 1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) 1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1497 1497
1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" 1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" 1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" 1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" 1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1502 1502
1503 "add $8, %%"REG_a" \n\t" 1503 "add $8, %%"REG_a" \n\t"
1504 "cmp %4, %%"REG_a" \n\t" 1504 "cmp %4, %%"REG_a" \n\t"
1505 " jb 1b \n\t" 1505 " jb 1b \n\t"
1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1507 : "%"REG_a 1507 : "%"REG_a
1508 ); 1508 );
1509 #else 1509 #else
1510 1510
1511 #if ARCH_ALPHA && HAVE_MVI 1511 #if ARCH_ALPHA && HAVE_MVI
1512 #define pl2yuy2(n) \ 1512 #define pl2yuy2(n) \
1595 } 1595 }
1596 ysrc += lumStride; 1596 ysrc += lumStride;
1597 dst += dstStride; 1597 dst += dstStride;
1598 } 1598 }
1599 #if HAVE_MMX 1599 #if HAVE_MMX
1600 __asm__( EMMS" \n\t" 1600 __asm__(EMMS" \n\t"
1601 SFENCE" \n\t" 1601 SFENCE" \n\t"
1602 :::"memory"); 1602 :::"memory");
1603 #endif 1603 #endif
1604 } 1604 }
1605 1605
1606 /** 1606 /**
1607 * Height should be a multiple of 2 and width should be a multiple of 16. 1607 * Height should be a multiple of 2 and width should be a multiple of 16.
1624 for (y=0; y<height; y++) 1624 for (y=0; y<height; y++)
1625 { 1625 {
1626 #if HAVE_MMX 1626 #if HAVE_MMX
1627 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) 1627 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1628 __asm__ volatile( 1628 __asm__ volatile(
1629 "xor %%"REG_a", %%"REG_a" \n\t" 1629 "xor %%"REG_a", %%"REG_a" \n\t"
1630 ASMALIGN(4) 1630 ASMALIGN(4)
1631 "1: \n\t" 1631 "1: \n\t"
1632 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" 1632 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1633 PREFETCH" 32(%2, %%"REG_a") \n\t" 1633 PREFETCH" 32(%2, %%"REG_a") \n\t"
1634 PREFETCH" 32(%3, %%"REG_a") \n\t" 1634 PREFETCH" 32(%3, %%"REG_a") \n\t"
1635 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) 1635 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1636 "movq %%mm0, %%mm2 \n\t" // U(0) 1636 "movq %%mm0, %%mm2 \n\t" // U(0)
1637 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) 1637 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1638 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1638 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1639 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 1639 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1640 1640
1641 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) 1641 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1642 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) 1642 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1643 "movq %%mm0, %%mm4 \n\t" // Y(0) 1643 "movq %%mm0, %%mm4 \n\t" // Y(0)
1644 "movq %%mm2, %%mm6 \n\t" // Y(8) 1644 "movq %%mm2, %%mm6 \n\t" // Y(8)
1645 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) 1645 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1646 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) 1646 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1647 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) 1647 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1648 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) 1648 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1649 1649
1650 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" 1650 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1651 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" 1651 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1652 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" 1652 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1653 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" 1653 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1654 1654
1655 "add $8, %%"REG_a" \n\t" 1655 "add $8, %%"REG_a" \n\t"
1656 "cmp %4, %%"REG_a" \n\t" 1656 "cmp %4, %%"REG_a" \n\t"
1657 " jb 1b \n\t" 1657 " jb 1b \n\t"
1658 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 1658 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1659 : "%"REG_a 1659 : "%"REG_a
1660 ); 1660 );
1661 #else 1661 #else
1662 //FIXME adapt the Alpha ASM code from yv12->yuy2 1662 //FIXME adapt the Alpha ASM code from yv12->yuy2
1663 1663
1664 #if HAVE_FAST_64BIT 1664 #if HAVE_FAST_64BIT
1701 } 1701 }
1702 ysrc += lumStride; 1702 ysrc += lumStride;
1703 dst += dstStride; 1703 dst += dstStride;
1704 } 1704 }
1705 #if HAVE_MMX 1705 #if HAVE_MMX
1706 __asm__( EMMS" \n\t" 1706 __asm__(EMMS" \n\t"
1707 SFENCE" \n\t" 1707 SFENCE" \n\t"
1708 :::"memory"); 1708 :::"memory");
1709 #endif 1709 #endif
1710 } 1710 }
1711 1711
1712 /** 1712 /**
1713 * Height should be a multiple of 2 and width should be a multiple of 16 1713 * Height should be a multiple of 2 and width should be a multiple of 16
1753 const x86_reg chromWidth= width>>1; 1753 const x86_reg chromWidth= width>>1;
1754 for (y=0; y<height; y+=2) 1754 for (y=0; y<height; y+=2)
1755 { 1755 {
1756 #if HAVE_MMX 1756 #if HAVE_MMX
1757 __asm__ volatile( 1757 __asm__ volatile(
1758 "xor %%"REG_a", %%"REG_a" \n\t" 1758 "xor %%"REG_a", %%"REG_a" \n\t"
1759 "pcmpeqw %%mm7, %%mm7 \n\t" 1759 "pcmpeqw %%mm7, %%mm7 \n\t"
1760 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 1760 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1761 ASMALIGN(4) 1761 ASMALIGN(4)
1762 "1: \n\t" 1762 "1: \n\t"
1763 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 1763 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1764 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 1764 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1765 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) 1765 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1766 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) 1766 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1767 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) 1767 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1768 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) 1768 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1769 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) 1769 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1770 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) 1770 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1771 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) 1771 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1772 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1772 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1773 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) 1773 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1774 1774
1775 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" 1775 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1776 1776
1777 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) 1777 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1778 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) 1778 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1779 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) 1779 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1780 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) 1780 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1781 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) 1781 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1782 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) 1782 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1783 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) 1783 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1784 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) 1784 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1785 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) 1785 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1786 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) 1786 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1787 1787
1788 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" 1788 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1789 1789
1790 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) 1790 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1791 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) 1791 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1792 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) 1792 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1793 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) 1793 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1794 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) 1794 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1795 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) 1795 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1796 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) 1796 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1797 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) 1797 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1798 1798
1799 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" 1799 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1800 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" 1800 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1801 1801
1802 "add $8, %%"REG_a" \n\t" 1802 "add $8, %%"REG_a" \n\t"
1803 "cmp %4, %%"REG_a" \n\t" 1803 "cmp %4, %%"REG_a" \n\t"
1804 " jb 1b \n\t" 1804 " jb 1b \n\t"
1805 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 1805 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1806 : "memory", "%"REG_a 1806 : "memory", "%"REG_a
1807 ); 1807 );
1808 1808
1809 ydst += lumStride; 1809 ydst += lumStride;
1810 src += srcStride; 1810 src += srcStride;
1811 1811
1812 __asm__ volatile( 1812 __asm__ volatile(
1813 "xor %%"REG_a", %%"REG_a" \n\t" 1813 "xor %%"REG_a", %%"REG_a" \n\t"
1814 ASMALIGN(4) 1814 ASMALIGN(4)
1815 "1: \n\t" 1815 "1: \n\t"
1816 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 1816 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1817 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 1817 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1818 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) 1818 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1819 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) 1819 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1820 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) 1820 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1821 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) 1821 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1822 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) 1822 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1823 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) 1823 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1824 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) 1824 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1825 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) 1825 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1826 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) 1826 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1827 1827
1828 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" 1828 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1829 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" 1829 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1830 1830
1831 "add $8, %%"REG_a" \n\t" 1831 "add $8, %%"REG_a" \n\t"
1832 "cmp %4, %%"REG_a" \n\t" 1832 "cmp %4, %%"REG_a" \n\t"
1833 " jb 1b \n\t" 1833 " jb 1b \n\t"
1834 1834
1835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 1835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1836 : "memory", "%"REG_a 1836 : "memory", "%"REG_a
1837 ); 1837 );
1838 #else 1838 #else
1839 long i; 1839 long i;
1840 for (i=0; i<chromWidth; i++) 1840 for (i=0; i<chromWidth; i++)
1841 { 1841 {
1857 vdst += chromStride; 1857 vdst += chromStride;
1858 ydst += lumStride; 1858 ydst += lumStride;
1859 src += srcStride; 1859 src += srcStride;
1860 } 1860 }
1861 #if HAVE_MMX 1861 #if HAVE_MMX
1862 __asm__ volatile( EMMS" \n\t" 1862 __asm__ volatile(EMMS" \n\t"
1863 SFENCE" \n\t" 1863 SFENCE" \n\t"
1864 :::"memory"); 1864 :::"memory");
1865 #endif 1865 #endif
1866 } 1866 }
1867 1867
1868 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, 1868 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1869 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 1869 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1886 dst[2*x+1]= (3*src[x] + src[x+1])>>2; 1886 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1887 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; 1887 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1888 } 1888 }
1889 dst[2*srcWidth-1]= src[srcWidth-1]; 1889 dst[2*srcWidth-1]= src[srcWidth-1];
1890 1890
1891 dst+= dstStride; 1891 dst+= dstStride;
1892 1892
1893 for (y=1; y<srcHeight; y++){ 1893 for (y=1; y<srcHeight; y++){
1894 #if HAVE_MMX2 || HAVE_AMD3DNOW 1894 #if HAVE_MMX2 || HAVE_AMD3DNOW
1895 const x86_reg mmxSize= srcWidth&~15; 1895 const x86_reg mmxSize= srcWidth&~15;
1896 __asm__ volatile( 1896 __asm__ volatile(
1897 "mov %4, %%"REG_a" \n\t" 1897 "mov %4, %%"REG_a" \n\t"
1898 "1: \n\t" 1898 "1: \n\t"
1899 "movq (%0, %%"REG_a"), %%mm0 \n\t" 1899 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1900 "movq (%1, %%"REG_a"), %%mm1 \n\t" 1900 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1901 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" 1901 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1902 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" 1902 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1903 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" 1903 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1904 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" 1904 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1905 PAVGB" %%mm0, %%mm5 \n\t" 1905 PAVGB" %%mm0, %%mm5 \n\t"
1906 PAVGB" %%mm0, %%mm3 \n\t" 1906 PAVGB" %%mm0, %%mm3 \n\t"
1907 PAVGB" %%mm0, %%mm5 \n\t" 1907 PAVGB" %%mm0, %%mm5 \n\t"
1908 PAVGB" %%mm0, %%mm3 \n\t" 1908 PAVGB" %%mm0, %%mm3 \n\t"
1909 PAVGB" %%mm1, %%mm4 \n\t" 1909 PAVGB" %%mm1, %%mm4 \n\t"
1910 PAVGB" %%mm1, %%mm2 \n\t" 1910 PAVGB" %%mm1, %%mm2 \n\t"
1911 PAVGB" %%mm1, %%mm4 \n\t" 1911 PAVGB" %%mm1, %%mm4 \n\t"
1912 PAVGB" %%mm1, %%mm2 \n\t" 1912 PAVGB" %%mm1, %%mm2 \n\t"
1913 "movq %%mm5, %%mm7 \n\t" 1913 "movq %%mm5, %%mm7 \n\t"
1914 "movq %%mm4, %%mm6 \n\t" 1914 "movq %%mm4, %%mm6 \n\t"
1915 "punpcklbw %%mm3, %%mm5 \n\t" 1915 "punpcklbw %%mm3, %%mm5 \n\t"
1916 "punpckhbw %%mm3, %%mm7 \n\t" 1916 "punpckhbw %%mm3, %%mm7 \n\t"
1917 "punpcklbw %%mm2, %%mm4 \n\t" 1917 "punpcklbw %%mm2, %%mm4 \n\t"
1918 "punpckhbw %%mm2, %%mm6 \n\t" 1918 "punpckhbw %%mm2, %%mm6 \n\t"
1919 #if 1 1919 #if 1
1920 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" 1920 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1921 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" 1921 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1922 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" 1922 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1923 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" 1923 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1924 #else 1924 #else
1925 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" 1925 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1926 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t" 1926 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1927 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" 1927 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1928 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t" 1928 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1929 #endif 1929 #endif
1930 "add $8, %%"REG_a" \n\t" 1930 "add $8, %%"REG_a" \n\t"
1931 " js 1b \n\t" 1931 " js 1b \n\t"
1932 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), 1932 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1933 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), 1933 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1934 "g" (-mmxSize) 1934 "g" (-mmxSize)
1935 : "%"REG_a 1935 : "%"REG_a
1936 1936
1937 ); 1937 );
1938 #else 1938 #else
1939 const x86_reg mmxSize=1; 1939 const x86_reg mmxSize=1;
1940 #endif 1940 #endif
1969 dst[2*x+1]= src[x]; 1969 dst[2*x+1]= src[x];
1970 } 1970 }
1971 #endif 1971 #endif
1972 1972
1973 #if HAVE_MMX 1973 #if HAVE_MMX
1974 __asm__ volatile( EMMS" \n\t" 1974 __asm__ volatile(EMMS" \n\t"
1975 SFENCE" \n\t" 1975 SFENCE" \n\t"
1976 :::"memory"); 1976 :::"memory");
1977 #endif 1977 #endif
1978 } 1978 }
1979 1979
1980 /** 1980 /**
1981 * Height should be a multiple of 2 and width should be a multiple of 16. 1981 * Height should be a multiple of 2 and width should be a multiple of 16.
1991 const x86_reg chromWidth= width>>1; 1991 const x86_reg chromWidth= width>>1;
1992 for (y=0; y<height; y+=2) 1992 for (y=0; y<height; y+=2)
1993 { 1993 {
1994 #if HAVE_MMX 1994 #if HAVE_MMX
1995 __asm__ volatile( 1995 __asm__ volatile(
1996 "xor %%"REG_a", %%"REG_a" \n\t" 1996 "xor %%"REG_a", %%"REG_a" \n\t"
1997 "pcmpeqw %%mm7, %%mm7 \n\t" 1997 "pcmpeqw %%mm7, %%mm7 \n\t"
1998 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 1998 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1999 ASMALIGN(4) 1999 ASMALIGN(4)
2000 "1: \n\t" 2000 "1: \n\t"
2001 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 2001 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2002 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) 2002 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2003 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) 2003 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2004 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) 2004 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2005 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) 2005 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2006 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) 2006 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2007 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) 2007 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2008 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) 2008 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2009 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) 2009 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2010 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 2010 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2011 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) 2011 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2012 2012
2013 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" 2013 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2014 2014
2015 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) 2015 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2016 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) 2016 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2017 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) 2017 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2018 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) 2018 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2019 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) 2019 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2020 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) 2020 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2021 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) 2021 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2022 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) 2022 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2023 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) 2023 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2024 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) 2024 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2025 2025
2026 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" 2026 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
2027 2027
2028 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) 2028 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2029 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) 2029 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2030 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) 2030 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2031 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) 2031 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2032 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) 2032 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2033 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) 2033 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2034 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) 2034 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2035 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) 2035 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2036 2036
2037 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" 2037 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2038 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" 2038 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2039 2039
2040 "add $8, %%"REG_a" \n\t" 2040 "add $8, %%"REG_a" \n\t"
2041 "cmp %4, %%"REG_a" \n\t" 2041 "cmp %4, %%"REG_a" \n\t"
2042 " jb 1b \n\t" 2042 " jb 1b \n\t"
2043 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 2043 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2044 : "memory", "%"REG_a 2044 : "memory", "%"REG_a
2045 ); 2045 );
2046 2046
2047 ydst += lumStride; 2047 ydst += lumStride;
2048 src += srcStride; 2048 src += srcStride;
2049 2049
2050 __asm__ volatile( 2050 __asm__ volatile(
2051 "xor %%"REG_a", %%"REG_a" \n\t" 2051 "xor %%"REG_a", %%"REG_a" \n\t"
2052 ASMALIGN(4) 2052 ASMALIGN(4)
2053 "1: \n\t" 2053 "1: \n\t"
2054 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 2054 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2055 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 2055 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2056 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) 2056 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2057 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) 2057 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2058 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) 2058 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2059 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) 2059 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2060 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) 2060 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2061 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) 2061 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2062 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) 2062 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2063 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) 2063 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2064 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) 2064 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2065 2065
2066 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" 2066 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2067 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" 2067 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2068 2068
2069 "add $8, %%"REG_a" \n\t" 2069 "add $8, %%"REG_a" \n\t"
2070 "cmp %4, %%"REG_a" \n\t" 2070 "cmp %4, %%"REG_a" \n\t"
2071 " jb 1b \n\t" 2071 " jb 1b \n\t"
2072 2072
2073 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 2073 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2074 : "memory", "%"REG_a 2074 : "memory", "%"REG_a
2075 ); 2075 );
2076 #else 2076 #else
2077 long i; 2077 long i;
2078 for (i=0; i<chromWidth; i++) 2078 for (i=0; i<chromWidth; i++)
2079 { 2079 {
2095 vdst += chromStride; 2095 vdst += chromStride;
2096 ydst += lumStride; 2096 ydst += lumStride;
2097 src += srcStride; 2097 src += srcStride;
2098 } 2098 }
2099 #if HAVE_MMX 2099 #if HAVE_MMX
2100 __asm__ volatile( EMMS" \n\t" 2100 __asm__ volatile(EMMS" \n\t"
2101 SFENCE" \n\t" 2101 SFENCE" \n\t"
2102 :::"memory"); 2102 :::"memory");
2103 #endif 2103 #endif
2104 } 2104 }
2105 2105
2106 /** 2106 /**
2107 * Height should be a multiple of 2 and width should be a multiple of 2. 2107 * Height should be a multiple of 2 and width should be a multiple of 2.
2121 { 2121 {
2122 long i; 2122 long i;
2123 for (i=0; i<2; i++) 2123 for (i=0; i<2; i++)
2124 { 2124 {
2125 __asm__ volatile( 2125 __asm__ volatile(
2126 "mov %2, %%"REG_a" \n\t" 2126 "mov %2, %%"REG_a" \n\t"
2127 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" 2127 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2128 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" 2128 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2129 "pxor %%mm7, %%mm7 \n\t" 2129 "pxor %%mm7, %%mm7 \n\t"
2130 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" 2130 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2131 ASMALIGN(4) 2131 ASMALIGN(4)
2132 "1: \n\t" 2132 "1: \n\t"
2133 PREFETCH" 64(%0, %%"REG_d") \n\t" 2133 PREFETCH" 64(%0, %%"REG_d") \n\t"
2134 "movd (%0, %%"REG_d"), %%mm0 \n\t" 2134 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2135 "movd 3(%0, %%"REG_d"), %%mm1 \n\t" 2135 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2136 "punpcklbw %%mm7, %%mm0 \n\t" 2136 "punpcklbw %%mm7, %%mm0 \n\t"
2137 "punpcklbw %%mm7, %%mm1 \n\t" 2137 "punpcklbw %%mm7, %%mm1 \n\t"
2138 "movd 6(%0, %%"REG_d"), %%mm2 \n\t" 2138 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2139 "movd 9(%0, %%"REG_d"), %%mm3 \n\t" 2139 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2140 "punpcklbw %%mm7, %%mm2 \n\t" 2140 "punpcklbw %%mm7, %%mm2 \n\t"
2141 "punpcklbw %%mm7, %%mm3 \n\t" 2141 "punpcklbw %%mm7, %%mm3 \n\t"
2142 "pmaddwd %%mm6, %%mm0 \n\t" 2142 "pmaddwd %%mm6, %%mm0 \n\t"
2143 "pmaddwd %%mm6, %%mm1 \n\t" 2143 "pmaddwd %%mm6, %%mm1 \n\t"
2144 "pmaddwd %%mm6, %%mm2 \n\t" 2144 "pmaddwd %%mm6, %%mm2 \n\t"
2145 "pmaddwd %%mm6, %%mm3 \n\t" 2145 "pmaddwd %%mm6, %%mm3 \n\t"
2146 #ifndef FAST_BGR2YV12 2146 #ifndef FAST_BGR2YV12
2147 "psrad $8, %%mm0 \n\t" 2147 "psrad $8, %%mm0 \n\t"
2148 "psrad $8, %%mm1 \n\t" 2148 "psrad $8, %%mm1 \n\t"
2149 "psrad $8, %%mm2 \n\t" 2149 "psrad $8, %%mm2 \n\t"
2150 "psrad $8, %%mm3 \n\t" 2150 "psrad $8, %%mm3 \n\t"
2151 #endif 2151 #endif
2152 "packssdw %%mm1, %%mm0 \n\t" 2152 "packssdw %%mm1, %%mm0 \n\t"
2153 "packssdw %%mm3, %%mm2 \n\t" 2153 "packssdw %%mm3, %%mm2 \n\t"
2154 "pmaddwd %%mm5, %%mm0 \n\t" 2154 "pmaddwd %%mm5, %%mm0 \n\t"
2155 "pmaddwd %%mm5, %%mm2 \n\t" 2155 "pmaddwd %%mm5, %%mm2 \n\t"
2156 "packssdw %%mm2, %%mm0 \n\t" 2156 "packssdw %%mm2, %%mm0 \n\t"
2157 "psraw $7, %%mm0 \n\t" 2157 "psraw $7, %%mm0 \n\t"
2158 2158
2159 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" 2159 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2160 "movd 15(%0, %%"REG_d"), %%mm1 \n\t" 2160 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2161 "punpcklbw %%mm7, %%mm4 \n\t" 2161 "punpcklbw %%mm7, %%mm4 \n\t"
2162 "punpcklbw %%mm7, %%mm1 \n\t" 2162 "punpcklbw %%mm7, %%mm1 \n\t"
2163 "movd 18(%0, %%"REG_d"), %%mm2 \n\t" 2163 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2164 "movd 21(%0, %%"REG_d"), %%mm3 \n\t" 2164 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2165 "punpcklbw %%mm7, %%mm2 \n\t" 2165 "punpcklbw %%mm7, %%mm2 \n\t"
2166 "punpcklbw %%mm7, %%mm3 \n\t" 2166 "punpcklbw %%mm7, %%mm3 \n\t"
2167 "pmaddwd %%mm6, %%mm4 \n\t" 2167 "pmaddwd %%mm6, %%mm4 \n\t"
2168 "pmaddwd %%mm6, %%mm1 \n\t" 2168 "pmaddwd %%mm6, %%mm1 \n\t"
2169 "pmaddwd %%mm6, %%mm2 \n\t" 2169 "pmaddwd %%mm6, %%mm2 \n\t"
2170 "pmaddwd %%mm6, %%mm3 \n\t" 2170 "pmaddwd %%mm6, %%mm3 \n\t"
2171 #ifndef FAST_BGR2YV12 2171 #ifndef FAST_BGR2YV12
2172 "psrad $8, %%mm4 \n\t" 2172 "psrad $8, %%mm4 \n\t"
2173 "psrad $8, %%mm1 \n\t" 2173 "psrad $8, %%mm1 \n\t"
2174 "psrad $8, %%mm2 \n\t" 2174 "psrad $8, %%mm2 \n\t"
2175 "psrad $8, %%mm3 \n\t" 2175 "psrad $8, %%mm3 \n\t"
2176 #endif 2176 #endif
2177 "packssdw %%mm1, %%mm4 \n\t" 2177 "packssdw %%mm1, %%mm4 \n\t"
2178 "packssdw %%mm3, %%mm2 \n\t" 2178 "packssdw %%mm3, %%mm2 \n\t"
2179 "pmaddwd %%mm5, %%mm4 \n\t" 2179 "pmaddwd %%mm5, %%mm4 \n\t"
2180 "pmaddwd %%mm5, %%mm2 \n\t" 2180 "pmaddwd %%mm5, %%mm2 \n\t"
2181 "add $24, %%"REG_d" \n\t" 2181 "add $24, %%"REG_d" \n\t"
2182 "packssdw %%mm2, %%mm4 \n\t" 2182 "packssdw %%mm2, %%mm4 \n\t"
2183 "psraw $7, %%mm4 \n\t" 2183 "psraw $7, %%mm4 \n\t"
2184 2184
2185 "packuswb %%mm4, %%mm0 \n\t" 2185 "packuswb %%mm4, %%mm0 \n\t"
2186 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" 2186 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2187 2187
2188 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" 2188 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2189 "add $8, %%"REG_a" \n\t" 2189 "add $8, %%"REG_a" \n\t"
2190 " js 1b \n\t" 2190 " js 1b \n\t"
2191 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) 2191 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2192 : "%"REG_a, "%"REG_d 2192 : "%"REG_a, "%"REG_d
2193 ); 2193 );
2194 ydst += lumStride; 2194 ydst += lumStride;
2195 src += srcStride; 2195 src += srcStride;
2196 } 2196 }
2197 src -= srcStride*2; 2197 src -= srcStride*2;
2198 __asm__ volatile( 2198 __asm__ volatile(
2199 "mov %4, %%"REG_a" \n\t" 2199 "mov %4, %%"REG_a" \n\t"
2200 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" 2200 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2201 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" 2201 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2202 "pxor %%mm7, %%mm7 \n\t" 2202 "pxor %%mm7, %%mm7 \n\t"
2203 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" 2203 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2204 "add %%"REG_d", %%"REG_d" \n\t" 2204 "add %%"REG_d", %%"REG_d" \n\t"
2205 ASMALIGN(4) 2205 ASMALIGN(4)
2206 "1: \n\t" 2206 "1: \n\t"
2207 PREFETCH" 64(%0, %%"REG_d") \n\t" 2207 PREFETCH" 64(%0, %%"REG_d") \n\t"
2208 PREFETCH" 64(%1, %%"REG_d") \n\t" 2208 PREFETCH" 64(%1, %%"REG_d") \n\t"
2209 #if HAVE_MMX2 || HAVE_AMD3DNOW 2209 #if HAVE_MMX2 || HAVE_AMD3DNOW
2210 "movq (%0, %%"REG_d"), %%mm0 \n\t" 2210 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2211 "movq (%1, %%"REG_d"), %%mm1 \n\t" 2211 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2212 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" 2212 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2213 "movq 6(%1, %%"REG_d"), %%mm3 \n\t" 2213 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2214 PAVGB" %%mm1, %%mm0 \n\t" 2214 PAVGB" %%mm1, %%mm0 \n\t"
2215 PAVGB" %%mm3, %%mm2 \n\t" 2215 PAVGB" %%mm3, %%mm2 \n\t"
2216 "movq %%mm0, %%mm1 \n\t" 2216 "movq %%mm0, %%mm1 \n\t"
2217 "movq %%mm2, %%mm3 \n\t" 2217 "movq %%mm2, %%mm3 \n\t"
2218 "psrlq $24, %%mm0 \n\t" 2218 "psrlq $24, %%mm0 \n\t"
2219 "psrlq $24, %%mm2 \n\t" 2219 "psrlq $24, %%mm2 \n\t"
2220 PAVGB" %%mm1, %%mm0 \n\t" 2220 PAVGB" %%mm1, %%mm0 \n\t"
2221 PAVGB" %%mm3, %%mm2 \n\t" 2221 PAVGB" %%mm3, %%mm2 \n\t"
2222 "punpcklbw %%mm7, %%mm0 \n\t" 2222 "punpcklbw %%mm7, %%mm0 \n\t"
2223 "punpcklbw %%mm7, %%mm2 \n\t" 2223 "punpcklbw %%mm7, %%mm2 \n\t"
2224 #else 2224 #else
2225 "movd (%0, %%"REG_d"), %%mm0 \n\t" 2225 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2226 "movd (%1, %%"REG_d"), %%mm1 \n\t" 2226 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2227 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" 2227 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2228 "movd 3(%1, %%"REG_d"), %%mm3 \n\t" 2228 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2229 "punpcklbw %%mm7, %%mm0 \n\t" 2229 "punpcklbw %%mm7, %%mm0 \n\t"
2230 "punpcklbw %%mm7, %%mm1 \n\t" 2230 "punpcklbw %%mm7, %%mm1 \n\t"
2231 "punpcklbw %%mm7, %%mm2 \n\t" 2231 "punpcklbw %%mm7, %%mm2 \n\t"
2232 "punpcklbw %%mm7, %%mm3 \n\t" 2232 "punpcklbw %%mm7, %%mm3 \n\t"
2233 "paddw %%mm1, %%mm0 \n\t" 2233 "paddw %%mm1, %%mm0 \n\t"
2234 "paddw %%mm3, %%mm2 \n\t" 2234 "paddw %%mm3, %%mm2 \n\t"
2235 "paddw %%mm2, %%mm0 \n\t" 2235 "paddw %%mm2, %%mm0 \n\t"
2236 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" 2236 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2237 "movd 6(%1, %%"REG_d"), %%mm1 \n\t" 2237 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2238 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" 2238 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2239 "movd 9(%1, %%"REG_d"), %%mm3 \n\t" 2239 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2240 "punpcklbw %%mm7, %%mm4 \n\t" 2240 "punpcklbw %%mm7, %%mm4 \n\t"
2241 "punpcklbw %%mm7, %%mm1 \n\t" 2241 "punpcklbw %%mm7, %%mm1 \n\t"
2242 "punpcklbw %%mm7, %%mm2 \n\t" 2242 "punpcklbw %%mm7, %%mm2 \n\t"
2243 "punpcklbw %%mm7, %%mm3 \n\t" 2243 "punpcklbw %%mm7, %%mm3 \n\t"
2244 "paddw %%mm1, %%mm4 \n\t" 2244 "paddw %%mm1, %%mm4 \n\t"
2245 "paddw %%mm3, %%mm2 \n\t" 2245 "paddw %%mm3, %%mm2 \n\t"
2246 "paddw %%mm4, %%mm2 \n\t" 2246 "paddw %%mm4, %%mm2 \n\t"
2247 "psrlw $2, %%mm0 \n\t" 2247 "psrlw $2, %%mm0 \n\t"
2248 "psrlw $2, %%mm2 \n\t" 2248 "psrlw $2, %%mm2 \n\t"
2249 #endif 2249 #endif
2250 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" 2250 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2251 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" 2251 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2252 2252
2253 "pmaddwd %%mm0, %%mm1 \n\t" 2253 "pmaddwd %%mm0, %%mm1 \n\t"
2254 "pmaddwd %%mm2, %%mm3 \n\t" 2254 "pmaddwd %%mm2, %%mm3 \n\t"
2255 "pmaddwd %%mm6, %%mm0 \n\t" 2255 "pmaddwd %%mm6, %%mm0 \n\t"
2256 "pmaddwd %%mm6, %%mm2 \n\t" 2256 "pmaddwd %%mm6, %%mm2 \n\t"
2257 #ifndef FAST_BGR2YV12 2257 #ifndef FAST_BGR2YV12
2258 "psrad $8, %%mm0 \n\t" 2258 "psrad $8, %%mm0 \n\t"
2259 "psrad $8, %%mm1 \n\t" 2259 "psrad $8, %%mm1 \n\t"
2260 "psrad $8, %%mm2 \n\t" 2260 "psrad $8, %%mm2 \n\t"
2261 "psrad $8, %%mm3 \n\t" 2261 "psrad $8, %%mm3 \n\t"
2262 #endif 2262 #endif
2263 "packssdw %%mm2, %%mm0 \n\t" 2263 "packssdw %%mm2, %%mm0 \n\t"
2264 "packssdw %%mm3, %%mm1 \n\t" 2264 "packssdw %%mm3, %%mm1 \n\t"
2265 "pmaddwd %%mm5, %%mm0 \n\t" 2265 "pmaddwd %%mm5, %%mm0 \n\t"
2266 "pmaddwd %%mm5, %%mm1 \n\t" 2266 "pmaddwd %%mm5, %%mm1 \n\t"
2267 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 2267 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2268 "psraw $7, %%mm0 \n\t" 2268 "psraw $7, %%mm0 \n\t"
2269 2269
2270 #if HAVE_MMX2 || HAVE_AMD3DNOW 2270 #if HAVE_MMX2 || HAVE_AMD3DNOW
2271 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" 2271 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2272 "movq 12(%1, %%"REG_d"), %%mm1 \n\t" 2272 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2273 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" 2273 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2274 "movq 18(%1, %%"REG_d"), %%mm3 \n\t" 2274 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2275 PAVGB" %%mm1, %%mm4 \n\t" 2275 PAVGB" %%mm1, %%mm4 \n\t"
2276 PAVGB" %%mm3, %%mm2 \n\t" 2276 PAVGB" %%mm3, %%mm2 \n\t"
2277 "movq %%mm4, %%mm1 \n\t" 2277 "movq %%mm4, %%mm1 \n\t"
2278 "movq %%mm2, %%mm3 \n\t" 2278 "movq %%mm2, %%mm3 \n\t"
2279 "psrlq $24, %%mm4 \n\t" 2279 "psrlq $24, %%mm4 \n\t"
2280 "psrlq $24, %%mm2 \n\t" 2280 "psrlq $24, %%mm2 \n\t"
2281 PAVGB" %%mm1, %%mm4 \n\t" 2281 PAVGB" %%mm1, %%mm4 \n\t"
2282 PAVGB" %%mm3, %%mm2 \n\t" 2282 PAVGB" %%mm3, %%mm2 \n\t"
2283 "punpcklbw %%mm7, %%mm4 \n\t" 2283 "punpcklbw %%mm7, %%mm4 \n\t"
2284 "punpcklbw %%mm7, %%mm2 \n\t" 2284 "punpcklbw %%mm7, %%mm2 \n\t"
2285 #else 2285 #else
2286 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" 2286 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2287 "movd 12(%1, %%"REG_d"), %%mm1 \n\t" 2287 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2288 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" 2288 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2289 "movd 15(%1, %%"REG_d"), %%mm3 \n\t" 2289 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2290 "punpcklbw %%mm7, %%mm4 \n\t" 2290 "punpcklbw %%mm7, %%mm4 \n\t"
2291 "punpcklbw %%mm7, %%mm1 \n\t" 2291 "punpcklbw %%mm7, %%mm1 \n\t"
2292 "punpcklbw %%mm7, %%mm2 \n\t" 2292 "punpcklbw %%mm7, %%mm2 \n\t"
2293 "punpcklbw %%mm7, %%mm3 \n\t" 2293 "punpcklbw %%mm7, %%mm3 \n\t"
2294 "paddw %%mm1, %%mm4 \n\t" 2294 "paddw %%mm1, %%mm4 \n\t"
2295 "paddw %%mm3, %%mm2 \n\t" 2295 "paddw %%mm3, %%mm2 \n\t"
2296 "paddw %%mm2, %%mm4 \n\t" 2296 "paddw %%mm2, %%mm4 \n\t"
2297 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" 2297 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2298 "movd 18(%1, %%"REG_d"), %%mm1 \n\t" 2298 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2299 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" 2299 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2300 "movd 21(%1, %%"REG_d"), %%mm3 \n\t" 2300 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2301 "punpcklbw %%mm7, %%mm5 \n\t" 2301 "punpcklbw %%mm7, %%mm5 \n\t"
2302 "punpcklbw %%mm7, %%mm1 \n\t" 2302 "punpcklbw %%mm7, %%mm1 \n\t"
2303 "punpcklbw %%mm7, %%mm2 \n\t" 2303 "punpcklbw %%mm7, %%mm2 \n\t"
2304 "punpcklbw %%mm7, %%mm3 \n\t" 2304 "punpcklbw %%mm7, %%mm3 \n\t"
2305 "paddw %%mm1, %%mm5 \n\t" 2305 "paddw %%mm1, %%mm5 \n\t"
2306 "paddw %%mm3, %%mm2 \n\t" 2306 "paddw %%mm3, %%mm2 \n\t"
2307 "paddw %%mm5, %%mm2 \n\t" 2307 "paddw %%mm5, %%mm2 \n\t"
2308 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" 2308 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2309 "psrlw $2, %%mm4 \n\t" 2309 "psrlw $2, %%mm4 \n\t"
2310 "psrlw $2, %%mm2 \n\t" 2310 "psrlw $2, %%mm2 \n\t"
2311 #endif 2311 #endif
2312 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" 2312 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2313 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" 2313 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2314 2314
2315 "pmaddwd %%mm4, %%mm1 \n\t" 2315 "pmaddwd %%mm4, %%mm1 \n\t"
2316 "pmaddwd %%mm2, %%mm3 \n\t" 2316 "pmaddwd %%mm2, %%mm3 \n\t"
2317 "pmaddwd %%mm6, %%mm4 \n\t" 2317 "pmaddwd %%mm6, %%mm4 \n\t"
2318 "pmaddwd %%mm6, %%mm2 \n\t" 2318 "pmaddwd %%mm6, %%mm2 \n\t"
2319 #ifndef FAST_BGR2YV12 2319 #ifndef FAST_BGR2YV12
2320 "psrad $8, %%mm4 \n\t" 2320 "psrad $8, %%mm4 \n\t"
2321 "psrad $8, %%mm1 \n\t" 2321 "psrad $8, %%mm1 \n\t"
2322 "psrad $8, %%mm2 \n\t" 2322 "psrad $8, %%mm2 \n\t"
2323 "psrad $8, %%mm3 \n\t" 2323 "psrad $8, %%mm3 \n\t"
2324 #endif 2324 #endif
2325 "packssdw %%mm2, %%mm4 \n\t" 2325 "packssdw %%mm2, %%mm4 \n\t"
2326 "packssdw %%mm3, %%mm1 \n\t" 2326 "packssdw %%mm3, %%mm1 \n\t"
2327 "pmaddwd %%mm5, %%mm4 \n\t" 2327 "pmaddwd %%mm5, %%mm4 \n\t"
2328 "pmaddwd %%mm5, %%mm1 \n\t" 2328 "pmaddwd %%mm5, %%mm1 \n\t"
2329 "add $24, %%"REG_d" \n\t" 2329 "add $24, %%"REG_d" \n\t"
2330 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 2330 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2331 "psraw $7, %%mm4 \n\t" 2331 "psraw $7, %%mm4 \n\t"
2332 2332
2333 "movq %%mm0, %%mm1 \n\t" 2333 "movq %%mm0, %%mm1 \n\t"
2334 "punpckldq %%mm4, %%mm0 \n\t" 2334 "punpckldq %%mm4, %%mm0 \n\t"
2335 "punpckhdq %%mm4, %%mm1 \n\t" 2335 "punpckhdq %%mm4, %%mm1 \n\t"
2336 "packsswb %%mm1, %%mm0 \n\t" 2336 "packsswb %%mm1, %%mm0 \n\t"
2337 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" 2337 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2338 "movd %%mm0, (%2, %%"REG_a") \n\t" 2338 "movd %%mm0, (%2, %%"REG_a") \n\t"
2339 "punpckhdq %%mm0, %%mm0 \n\t" 2339 "punpckhdq %%mm0, %%mm0 \n\t"
2340 "movd %%mm0, (%3, %%"REG_a") \n\t" 2340 "movd %%mm0, (%3, %%"REG_a") \n\t"
2341 "add $4, %%"REG_a" \n\t" 2341 "add $4, %%"REG_a" \n\t"
2342 " js 1b \n\t" 2342 " js 1b \n\t"
2343 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) 2343 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2344 : "%"REG_a, "%"REG_d 2344 : "%"REG_a, "%"REG_d
2345 ); 2345 );
2346 2346
2347 udst += chromStride; 2347 udst += chromStride;
2348 vdst += chromStride; 2348 vdst += chromStride;
2349 src += srcStride*2; 2349 src += srcStride*2;
2350 } 2350 }
2351 2351
2352 __asm__ volatile( EMMS" \n\t" 2352 __asm__ volatile(EMMS" \n\t"
2353 SFENCE" \n\t" 2353 SFENCE" \n\t"
2354 :::"memory"); 2354 :::"memory");
2355 #else 2355 #else
2356 y=0; 2356 y=0;
2357 #endif 2357 #endif
2358 for (; y<height; y+=2) 2358 for (; y<height; y+=2)
2359 { 2359 {
2416 long w; 2416 long w;
2417 2417
2418 #if HAVE_MMX 2418 #if HAVE_MMX
2419 #if HAVE_SSE2 2419 #if HAVE_SSE2
2420 __asm__( 2420 __asm__(
2421 "xor %%"REG_a", %%"REG_a" \n\t" 2421 "xor %%"REG_a", %%"REG_a" \n\t"
2422 "1: \n\t" 2422 "1: \n\t"
2423 PREFETCH" 64(%1, %%"REG_a") \n\t" 2423 PREFETCH" 64(%1, %%"REG_a") \n\t"
2424 PREFETCH" 64(%2, %%"REG_a") \n\t" 2424 PREFETCH" 64(%2, %%"REG_a") \n\t"
2425 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" 2425 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2426 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" 2426 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2427 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" 2427 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2428 "punpcklbw %%xmm2, %%xmm0 \n\t" 2428 "punpcklbw %%xmm2, %%xmm0 \n\t"
2429 "punpckhbw %%xmm2, %%xmm1 \n\t" 2429 "punpckhbw %%xmm2, %%xmm1 \n\t"
2430 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" 2430 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2431 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" 2431 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2432 "add $16, %%"REG_a" \n\t" 2432 "add $16, %%"REG_a" \n\t"
2433 "cmp %3, %%"REG_a" \n\t" 2433 "cmp %3, %%"REG_a" \n\t"
2434 " jb 1b \n\t" 2434 " jb 1b \n\t"
2435 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) 2435 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2436 : "memory", "%"REG_a"" 2436 : "memory", "%"REG_a""
2437 ); 2437 );
2438 #else 2438 #else
2439 __asm__( 2439 __asm__(
2440 "xor %%"REG_a", %%"REG_a" \n\t" 2440 "xor %%"REG_a", %%"REG_a" \n\t"
2441 "1: \n\t" 2441 "1: \n\t"
2442 PREFETCH" 64(%1, %%"REG_a") \n\t" 2442 PREFETCH" 64(%1, %%"REG_a") \n\t"
2443 PREFETCH" 64(%2, %%"REG_a") \n\t" 2443 PREFETCH" 64(%2, %%"REG_a") \n\t"
2444 "movq (%1, %%"REG_a"), %%mm0 \n\t" 2444 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2445 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" 2445 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2446 "movq %%mm0, %%mm1 \n\t" 2446 "movq %%mm0, %%mm1 \n\t"
2447 "movq %%mm2, %%mm3 \n\t" 2447 "movq %%mm2, %%mm3 \n\t"
2448 "movq (%2, %%"REG_a"), %%mm4 \n\t" 2448 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2449 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" 2449 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2450 "punpcklbw %%mm4, %%mm0 \n\t" 2450 "punpcklbw %%mm4, %%mm0 \n\t"
2451 "punpckhbw %%mm4, %%mm1 \n\t" 2451 "punpckhbw %%mm4, %%mm1 \n\t"
2452 "punpcklbw %%mm5, %%mm2 \n\t" 2452 "punpcklbw %%mm5, %%mm2 \n\t"
2453 "punpckhbw %%mm5, %%mm3 \n\t" 2453 "punpckhbw %%mm5, %%mm3 \n\t"
2454 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" 2454 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2455 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" 2455 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2456 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" 2456 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2457 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" 2457 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2458 "add $16, %%"REG_a" \n\t" 2458 "add $16, %%"REG_a" \n\t"
2459 "cmp %3, %%"REG_a" \n\t" 2459 "cmp %3, %%"REG_a" \n\t"
2460 " jb 1b \n\t" 2460 " jb 1b \n\t"
2461 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) 2461 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2462 : "memory", "%"REG_a 2462 : "memory", "%"REG_a
2463 ); 2463 );
2464 #endif 2464 #endif
2465 for (w= (width&(~15)); w < width; w++) 2465 for (w= (width&(~15)); w < width; w++)
2466 { 2466 {
2467 dest[2*w+0] = src1[w]; 2467 dest[2*w+0] = src1[w];
2478 src1 += src1Stride; 2478 src1 += src1Stride;
2479 src2 += src2Stride; 2479 src2 += src2Stride;
2480 } 2480 }
2481 #if HAVE_MMX 2481 #if HAVE_MMX
2482 __asm__( 2482 __asm__(
2483 EMMS" \n\t" 2483 EMMS" \n\t"
2484 SFENCE" \n\t" 2484 SFENCE" \n\t"
2485 ::: "memory" 2485 ::: "memory"
2486 ); 2486 );
2487 #endif 2487 #endif
2488 } 2488 }
2489 2489
/*
 * vu9_to_vu12: stretch two byte planes to twice their size in both directions.
 *
 * Plane src1 is expanded into dst1 and plane src2 into dst2, each in its own
 * pass. For every output row y in [0, h), with h = height/2, the source row
 * (y >> 1) is read, so each source row feeds two consecutive output rows.
 * Within a row, each of the first w = width/2 source bytes is written twice
 * (d[2*x] and d[2*x+1]), so an output row is 2*w bytes long.
 *
 * width, height, srcStride1/2 and dstStride1/2 are used below; their
 * declarations belong to the part of the signature that is not shown in this
 * excerpt. PREFETCH, MOVNTQ, EMMS and SFENCE are macros defined elsewhere in
 * the file.
 */
2490 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, 2490 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2491 uint8_t *dst1, uint8_t *dst2, 2491 uint8_t *dst1, uint8_t *dst2,
2496 x86_reg y; 2496 x86_reg y;
2497 long x,w,h; 2497 long x,w,h;
2498 w=width/2; h=height/2; 2498 w=width/2; h=height/2;
2499 #if HAVE_MMX 2499 #if HAVE_MMX
/* Prefetch hint for the second row of each source plane. */
2500 __asm__ volatile( 2500 __asm__ volatile(
2501 PREFETCH" %0 \n\t" 2501 PREFETCH" %0 \n\t"
2502 PREFETCH" %1 \n\t" 2502 PREFETCH" %1 \n\t"
2503 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); 2503 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2504 #endif 2504 #endif
/* Pass 1: src1 -> dst1. */
2505 for (y=0;y<h;y++){ 2505 for (y=0;y<h;y++){
2506 const uint8_t* s1=src1+srcStride1*(y>>1); 2506 const uint8_t* s1=src1+srcStride1*(y>>1);
2507 uint8_t* d=dst1+dstStride1*y; 2507 uint8_t* d=dst1+dstStride1*y;
2508 x=0; 2508 x=0;
2509 #if HAVE_MMX 2509 #if HAVE_MMX
/*
 * MMX path: 32 source bytes in, 64 output bytes out per iteration.
 * Unpacking a register against a copy of itself (punpcklbw/punpckhbw mmN, mmN)
 * doubles every byte, matching the scalar tail loop below.
 */
2510 for (;x<w-31;x+=32) 2510 for (;x<w-31;x+=32)
2511 { 2511 {
2512 __asm__ volatile( 2512 __asm__ volatile(
2513 PREFETCH" 32%1 \n\t" 2513 PREFETCH" 32%1 \n\t"
2514 "movq %1, %%mm0 \n\t" 2514 "movq %1, %%mm0 \n\t"
2515 "movq 8%1, %%mm2 \n\t" 2515 "movq 8%1, %%mm2 \n\t"
2516 "movq 16%1, %%mm4 \n\t" 2516 "movq 16%1, %%mm4 \n\t"
2517 "movq 24%1, %%mm6 \n\t" 2517 "movq 24%1, %%mm6 \n\t"
2518 "movq %%mm0, %%mm1 \n\t" 2518 "movq %%mm0, %%mm1 \n\t"
2519 "movq %%mm2, %%mm3 \n\t" 2519 "movq %%mm2, %%mm3 \n\t"
2520 "movq %%mm4, %%mm5 \n\t" 2520 "movq %%mm4, %%mm5 \n\t"
2521 "movq %%mm6, %%mm7 \n\t" 2521 "movq %%mm6, %%mm7 \n\t"
2522 "punpcklbw %%mm0, %%mm0 \n\t" 2522 "punpcklbw %%mm0, %%mm0 \n\t"
2523 "punpckhbw %%mm1, %%mm1 \n\t" 2523 "punpckhbw %%mm1, %%mm1 \n\t"
2524 "punpcklbw %%mm2, %%mm2 \n\t" 2524 "punpcklbw %%mm2, %%mm2 \n\t"
2525 "punpckhbw %%mm3, %%mm3 \n\t" 2525 "punpckhbw %%mm3, %%mm3 \n\t"
2526 "punpcklbw %%mm4, %%mm4 \n\t" 2526 "punpcklbw %%mm4, %%mm4 \n\t"
2527 "punpckhbw %%mm5, %%mm5 \n\t" 2527 "punpckhbw %%mm5, %%mm5 \n\t"
2528 "punpcklbw %%mm6, %%mm6 \n\t" 2528 "punpcklbw %%mm6, %%mm6 \n\t"
2529 "punpckhbw %%mm7, %%mm7 \n\t" 2529 "punpckhbw %%mm7, %%mm7 \n\t"
2530 MOVNTQ" %%mm0, %0 \n\t" 2530 MOVNTQ" %%mm0, %0 \n\t"
2531 MOVNTQ" %%mm1, 8%0 \n\t" 2531 MOVNTQ" %%mm1, 8%0 \n\t"
2532 MOVNTQ" %%mm2, 16%0 \n\t" 2532 MOVNTQ" %%mm2, 16%0 \n\t"
2533 MOVNTQ" %%mm3, 24%0 \n\t" 2533 MOVNTQ" %%mm3, 24%0 \n\t"
2534 MOVNTQ" %%mm4, 32%0 \n\t" 2534 MOVNTQ" %%mm4, 32%0 \n\t"
2535 MOVNTQ" %%mm5, 40%0 \n\t" 2535 MOVNTQ" %%mm5, 40%0 \n\t"
2536 MOVNTQ" %%mm6, 48%0 \n\t" 2536 MOVNTQ" %%mm6, 48%0 \n\t"
2537 MOVNTQ" %%mm7, 56%0" 2537 MOVNTQ" %%mm7, 56%0"
2538 :"=m"(d[2*x]) 2538 :"=m"(d[2*x])
2539 :"m"(s1[x]) 2539 :"m"(s1[x])
2540 :"memory"); 2540 :"memory");
2541 } 2541 }
2542 #endif 2542 #endif
/* Scalar tail (and the whole row when HAVE_MMX is 0): write each byte twice. */
2543 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; 2543 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2544 } 2544 }
/* Pass 2: src2 -> dst2, same procedure as pass 1. */
2545 for (y=0;y<h;y++){ 2545 for (y=0;y<h;y++){
2546 const uint8_t* s2=src2+srcStride2*(y>>1); 2546 const uint8_t* s2=src2+srcStride2*(y>>1);
2547 uint8_t* d=dst2+dstStride2*y; 2547 uint8_t* d=dst2+dstStride2*y;
2548 x=0; 2548 x=0;
2549 #if HAVE_MMX 2549 #if HAVE_MMX
2550 for (;x<w-31;x+=32) 2550 for (;x<w-31;x+=32)
2551 { 2551 {
2552 __asm__ volatile( 2552 __asm__ volatile(
2553 PREFETCH" 32%1 \n\t" 2553 PREFETCH" 32%1 \n\t"
2554 "movq %1, %%mm0 \n\t" 2554 "movq %1, %%mm0 \n\t"
2555 "movq 8%1, %%mm2 \n\t" 2555 "movq 8%1, %%mm2 \n\t"
2556 "movq 16%1, %%mm4 \n\t" 2556 "movq 16%1, %%mm4 \n\t"
2557 "movq 24%1, %%mm6 \n\t" 2557 "movq 24%1, %%mm6 \n\t"
2558 "movq %%mm0, %%mm1 \n\t" 2558 "movq %%mm0, %%mm1 \n\t"
2559 "movq %%mm2, %%mm3 \n\t" 2559 "movq %%mm2, %%mm3 \n\t"
2560 "movq %%mm4, %%mm5 \n\t" 2560 "movq %%mm4, %%mm5 \n\t"
2561 "movq %%mm6, %%mm7 \n\t" 2561 "movq %%mm6, %%mm7 \n\t"
2562 "punpcklbw %%mm0, %%mm0 \n\t" 2562 "punpcklbw %%mm0, %%mm0 \n\t"
2563 "punpckhbw %%mm1, %%mm1 \n\t" 2563 "punpckhbw %%mm1, %%mm1 \n\t"
2564 "punpcklbw %%mm2, %%mm2 \n\t" 2564 "punpcklbw %%mm2, %%mm2 \n\t"
2565 "punpckhbw %%mm3, %%mm3 \n\t" 2565 "punpckhbw %%mm3, %%mm3 \n\t"
2566 "punpcklbw %%mm4, %%mm4 \n\t" 2566 "punpcklbw %%mm4, %%mm4 \n\t"
2567 "punpckhbw %%mm5, %%mm5 \n\t" 2567 "punpckhbw %%mm5, %%mm5 \n\t"
2568 "punpcklbw %%mm6, %%mm6 \n\t" 2568 "punpcklbw %%mm6, %%mm6 \n\t"
2569 "punpckhbw %%mm7, %%mm7 \n\t" 2569 "punpckhbw %%mm7, %%mm7 \n\t"
2570 MOVNTQ" %%mm0, %0 \n\t" 2570 MOVNTQ" %%mm0, %0 \n\t"
2571 MOVNTQ" %%mm1, 8%0 \n\t" 2571 MOVNTQ" %%mm1, 8%0 \n\t"
2572 MOVNTQ" %%mm2, 16%0 \n\t" 2572 MOVNTQ" %%mm2, 16%0 \n\t"
2573 MOVNTQ" %%mm3, 24%0 \n\t" 2573 MOVNTQ" %%mm3, 24%0 \n\t"
2574 MOVNTQ" %%mm4, 32%0 \n\t" 2574 MOVNTQ" %%mm4, 32%0 \n\t"
2575 MOVNTQ" %%mm5, 40%0 \n\t" 2575 MOVNTQ" %%mm5, 40%0 \n\t"
2576 MOVNTQ" %%mm6, 48%0 \n\t" 2576 MOVNTQ" %%mm6, 48%0 \n\t"
2577 MOVNTQ" %%mm7, 56%0" 2577 MOVNTQ" %%mm7, 56%0"
2578 :"=m"(d[2*x]) 2578 :"=m"(d[2*x])
2579 :"m"(s2[x]) 2579 :"m"(s2[x])
2580 :"memory"); 2580 :"memory");
2581 } 2581 }
2582 #endif 2582 #endif
2583 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; 2583 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2584 } 2584 }
2585 #if HAVE_MMX 2585 #if HAVE_MMX
/*
 * Issued once after both passes. NOTE(review): presumably EMMS leaves MMX
 * state and SFENCE orders the MOVNTQ stores -- confirm against the macro
 * definitions earlier in the file.
 */
2586 __asm__( 2586 __asm__(
2587 EMMS" \n\t" 2587 EMMS" \n\t"
2588 SFENCE" \n\t" 2588 SFENCE" \n\t"
2589 ::: "memory" 2589 ::: "memory"
2590 ); 2590 );
2591 #endif 2591 #endif
2592 } 2592 }
2593 2593
/*
 * yvu9_to_yuy2: interleave three byte planes into one packed Y U Y V stream.
 *
 * For every output row y in [0, h), with h = height:
 *   - luma is read from row y of src1 (yp),
 *   - chroma is read from row (y >> 2) of src2 (up) and src3 (vp), so each
 *     chroma row is reused for four consecutive output rows.
 * For every x in [0, w), with w = width/2, eight bytes are written at d[8*x]:
 *   Y[4x] U[x] Y[4x+1] V[x] Y[4x+2] U[x] Y[4x+3] V[x]
 * so one U/V pair is shared by four horizontally adjacent luma samples.
 *
 * NOTE(review): with w = width/2 each row consumes 4*w = 2*width luma bytes
 * and emits 8*w = 4*width output bytes -- confirm what callers pass as width.
 *
 * dst, width, height, srcStride1/2/3 and dstStride are used below; their
 * declarations belong to the part of the signature that is not shown in this
 * excerpt. PREFETCH, MOVNTQ, EMMS and SFENCE are macros defined elsewhere in
 * the file.
 */
2594 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, 2594 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2599 { 2599 {
2600 x86_reg x; 2600 x86_reg x;
2601 long y,w,h; 2601 long y,w,h;
2602 w=width/2; h=height; 2602 w=width/2; h=height;
2603 for (y=0;y<h;y++){ 2603 for (y=0;y<h;y++){
2604 const uint8_t* yp=src1+srcStride1*y; 2604 const uint8_t* yp=src1+srcStride1*y;
2605 const uint8_t* up=src2+srcStride2*(y>>2); 2605 const uint8_t* up=src2+srcStride2*(y>>2);
2606 const uint8_t* vp=src3+srcStride3*(y>>2); 2606 const uint8_t* vp=src3+srcStride3*(y>>2);
2607 uint8_t* d=dst+dstStride*y; 2607 uint8_t* d=dst+dstStride*y;
2608 x=0; 2608 x=0;
2609 #if HAVE_MMX 2609 #if HAVE_MMX
/*
 * MMX path: one iteration handles 8 chroma samples, i.e. 32 luma bytes in and
 * 64 packed bytes out. Operand %0 is x, used as the index register with scale
 * 1 for up/vp, 4 for yp and 8 for d; x itself is advanced by the C for-loop.
 */
2610 for (;x<w-7;x+=8) 2610 for (;x<w-7;x+=8)
2611 { 2611 {
2612 __asm__ volatile( 2612 __asm__ volatile(
2613 PREFETCH" 32(%1, %0) \n\t" 2613 PREFETCH" 32(%1, %0) \n\t"
2614 PREFETCH" 32(%2, %0) \n\t" 2614 PREFETCH" 32(%2, %0) \n\t"
2615 PREFETCH" 32(%3, %0) \n\t" 2615 PREFETCH" 32(%3, %0) \n\t"
2616 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ 2616 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2617 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ 2617 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2618 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ 2618 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2619 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ 2619 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2620 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ 2620 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2621 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ 2621 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2622 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ 2622 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2623 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ 2623 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2624 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ 2624 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2625 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ 2625 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2626 2626
2627 "movq %%mm1, %%mm6 \n\t" 2627 "movq %%mm1, %%mm6 \n\t"
2628 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ 2628 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2629 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ 2629 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2630 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ 2630 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2631 MOVNTQ" %%mm0, (%4, %0, 8) \n\t" 2631 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2632 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" 2632 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2633 2633
2634 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ 2634 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2635 "movq 8(%1, %0, 4), %%mm0 \n\t" 2635 "movq 8(%1, %0, 4), %%mm0 \n\t"
2636 "movq %%mm0, %%mm3 \n\t" 2636 "movq %%mm0, %%mm3 \n\t"
2637 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ 2637 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2638 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ 2638 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2639 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" 2639 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2640 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" 2640 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2641 2641
2642 "movq %%mm4, %%mm6 \n\t" 2642 "movq %%mm4, %%mm6 \n\t"
2643 "movq 16(%1, %0, 4), %%mm0 \n\t" 2643 "movq 16(%1, %0, 4), %%mm0 \n\t"
2644 "movq %%mm0, %%mm3 \n\t" 2644 "movq %%mm0, %%mm3 \n\t"
2645 "punpcklbw %%mm5, %%mm4 \n\t" 2645 "punpcklbw %%mm5, %%mm4 \n\t"
2646 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ 2646 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2647 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ 2647 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2648 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" 2648 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2649 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" 2649 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2650 2650
2651 "punpckhbw %%mm5, %%mm6 \n\t" 2651 "punpckhbw %%mm5, %%mm6 \n\t"
2652 "movq 24(%1, %0, 4), %%mm0 \n\t" 2652 "movq 24(%1, %0, 4), %%mm0 \n\t"
2653 "movq %%mm0, %%mm3 \n\t" 2653 "movq %%mm0, %%mm3 \n\t"
2654 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ 2654 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2655 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ 2655 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2656 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" 2656 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2657 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" 2657 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2658 2658
2659 : "+r" (x) 2659 : "+r" (x)
2660 : "r"(yp), "r" (up), "r"(vp), "r"(d) 2660 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2661 :"memory"); 2661 :"memory");
2662 } 2662 }
2663 #endif 2663 #endif
/* Scalar tail (and the whole row when HAVE_MMX is 0): one U/V pair per four Y. */
2664 for (; x<w; x++) 2664 for (; x<w; x++)
2665 { 2665 {
2666 const long x2 = x<<2; 2666 const long x2 = x<<2;
2667 d[8*x+0] = yp[x2]; 2667 d[8*x+0] = yp[x2];
2668 d[8*x+1] = up[x]; 2668 d[8*x+1] = up[x];
2669 d[8*x+2] = yp[x2+1]; 2669 d[8*x+2] = yp[x2+1];
2670 d[8*x+3] = vp[x]; 2670 d[8*x+3] = vp[x];
2671 d[8*x+4] = yp[x2+2]; 2671 d[8*x+4] = yp[x2+2];
2672 d[8*x+5] = up[x]; 2672 d[8*x+5] = up[x];
2673 d[8*x+6] = yp[x2+3]; 2673 d[8*x+6] = yp[x2+3];
2674 d[8*x+7] = vp[x]; 2674 d[8*x+7] = vp[x];
2675 } 2675 }
2676 } 2676 }
2677 #if HAVE_MMX 2677 #if HAVE_MMX
/*
 * Issued once after all rows. NOTE(review): presumably EMMS leaves MMX state
 * and SFENCE orders the MOVNTQ stores -- confirm against the macro
 * definitions earlier in the file.
 */
2678 __asm__( 2678 __asm__(
2679 EMMS" \n\t" 2679 EMMS" \n\t"
2680 SFENCE" \n\t" 2680 SFENCE" \n\t"
2681 ::: "memory" 2681 ::: "memory"
2682 ); 2682 );
2683 #endif 2683 #endif
2684 } 2684 }
2685 2685
2686 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count) 2686 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2945 src += srcStride; 2945 src += srcStride;
2946 ydst+= lumStride; 2946 ydst+= lumStride;
2947 } 2947 }
2948 #if HAVE_MMX 2948 #if HAVE_MMX
2949 __asm__( 2949 __asm__(
2950 EMMS" \n\t" 2950 EMMS" \n\t"
2951 SFENCE" \n\t" 2951 SFENCE" \n\t"
2952 ::: "memory" 2952 ::: "memory"
2953 ); 2953 );
2954 #endif 2954 #endif
2955 } 2955 }
2956 2956
2957 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 2957 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2970 udst+= chromStride; 2970 udst+= chromStride;
2971 vdst+= chromStride; 2971 vdst+= chromStride;
2972 } 2972 }
2973 #if HAVE_MMX 2973 #if HAVE_MMX
2974 __asm__( 2974 __asm__(
2975 EMMS" \n\t" 2975 EMMS" \n\t"
2976 SFENCE" \n\t" 2976 SFENCE" \n\t"
2977 ::: "memory" 2977 ::: "memory"
2978 ); 2978 );
2979 #endif 2979 #endif
2980 } 2980 }
2981 2981
2982 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 2982 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2997 src += srcStride; 2997 src += srcStride;
2998 ydst+= lumStride; 2998 ydst+= lumStride;
2999 } 2999 }
3000 #if HAVE_MMX 3000 #if HAVE_MMX
3001 __asm__( 3001 __asm__(
3002 EMMS" \n\t" 3002 EMMS" \n\t"
3003 SFENCE" \n\t" 3003 SFENCE" \n\t"
3004 ::: "memory" 3004 ::: "memory"
3005 ); 3005 );
3006 #endif 3006 #endif
3007 } 3007 }
3008 3008
3009 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 3009 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
3022 udst+= chromStride; 3022 udst+= chromStride;
3023 vdst+= chromStride; 3023 vdst+= chromStride;
3024 } 3024 }
3025 #if HAVE_MMX 3025 #if HAVE_MMX
3026 __asm__( 3026 __asm__(
3027 EMMS" \n\t" 3027 EMMS" \n\t"
3028 SFENCE" \n\t" 3028 SFENCE" \n\t"
3029 ::: "memory" 3029 ::: "memory"
3030 ); 3030 );
3031 #endif 3031 #endif
3032 } 3032 }
3033 3033
3034 static inline void RENAME(rgb2rgb_init)(void){ 3034 static inline void RENAME(rgb2rgb_init)(void){