Mercurial > mplayer.hg
comparison libswscale/rgb2rgb_template.c @ 29480:a4d8dee13834
Indent libswscale:
- Use 4 spaces throughout for indentation;
- Fix inconsistent indentation;
- Indent function calls and declarations, aligning arguments that span multiple lines
to the column after the opening parenthesis;
- Align asm code to the column 4 spaces after the call to __asm__();
- Align cases in switch statements to the same column as "switch".
author | ramiro |
---|---|
date | Sun, 16 Aug 2009 00:32:04 +0000 |
parents | 0673fad0546f |
children | c080f1f5c07e |
comparison
equal
deleted
inserted
replaced
29479:55f33b0748c9 | 29480:a4d8dee13834 |
---|---|
75 { | 75 { |
76 uint8_t *dest = dst; | 76 uint8_t *dest = dst; |
77 const uint8_t *s = src; | 77 const uint8_t *s = src; |
78 const uint8_t *end; | 78 const uint8_t *end; |
79 #if HAVE_MMX | 79 #if HAVE_MMX |
80 const uint8_t *mm_end; | 80 const uint8_t *mm_end; |
81 #endif | 81 #endif |
82 end = s + src_size; | 82 end = s + src_size; |
83 #if HAVE_MMX | 83 #if HAVE_MMX |
84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | 84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
85 mm_end = end - 23; | 85 mm_end = end - 23; |
86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); | 86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); |
87 while (s < mm_end) | 87 while (s < mm_end) |
88 { | 88 { |
89 __asm__ volatile( | 89 __asm__ volatile( |
90 PREFETCH" 32%1 \n\t" | 90 PREFETCH" 32%1 \n\t" |
91 "movd %1, %%mm0 \n\t" | 91 "movd %1, %%mm0 \n\t" |
92 "punpckldq 3%1, %%mm0 \n\t" | 92 "punpckldq 3%1, %%mm0 \n\t" |
93 "movd 6%1, %%mm1 \n\t" | 93 "movd 6%1, %%mm1 \n\t" |
94 "punpckldq 9%1, %%mm1 \n\t" | 94 "punpckldq 9%1, %%mm1 \n\t" |
105 MOVNTQ" %%mm2, 16%0 \n\t" | 105 MOVNTQ" %%mm2, 16%0 \n\t" |
106 MOVNTQ" %%mm3, 24%0" | 106 MOVNTQ" %%mm3, 24%0" |
107 :"=m"(*dest) | 107 :"=m"(*dest) |
108 :"m"(*s) | 108 :"m"(*s) |
109 :"memory"); | 109 :"memory"); |
110 dest += 32; | 110 dest += 32; |
111 s += 24; | 111 s += 24; |
112 } | 112 } |
113 __asm__ volatile(SFENCE:::"memory"); | 113 __asm__ volatile(SFENCE:::"memory"); |
114 __asm__ volatile(EMMS:::"memory"); | 114 __asm__ volatile(EMMS:::"memory"); |
115 #endif | 115 #endif |
116 while (s < end) | 116 while (s < end) |
117 { | 117 { |
118 #if HAVE_BIGENDIAN | 118 #if HAVE_BIGENDIAN |
119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ | 119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ |
144 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | 144 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
145 mm_end = end - 31; | 145 mm_end = end - 31; |
146 while (s < mm_end) | 146 while (s < mm_end) |
147 { | 147 { |
148 __asm__ volatile( | 148 __asm__ volatile( |
149 PREFETCH" 32%1 \n\t" | 149 PREFETCH" 32%1 \n\t" |
150 "movq %1, %%mm0 \n\t" | 150 "movq %1, %%mm0 \n\t" |
151 "movq 8%1, %%mm1 \n\t" | 151 "movq 8%1, %%mm1 \n\t" |
152 "movq 16%1, %%mm4 \n\t" | 152 "movq 16%1, %%mm4 \n\t" |
153 "movq 24%1, %%mm5 \n\t" | 153 "movq 24%1, %%mm5 \n\t" |
154 "movq %%mm0, %%mm2 \n\t" | 154 "movq %%mm0, %%mm2 \n\t" |
155 "movq %%mm1, %%mm3 \n\t" | 155 "movq %%mm1, %%mm3 \n\t" |
156 "movq %%mm4, %%mm6 \n\t" | 156 "movq %%mm4, %%mm6 \n\t" |
157 "movq %%mm5, %%mm7 \n\t" | 157 "movq %%mm5, %%mm7 \n\t" |
158 "psrlq $8, %%mm2 \n\t" | 158 "psrlq $8, %%mm2 \n\t" |
159 "psrlq $8, %%mm3 \n\t" | 159 "psrlq $8, %%mm3 \n\t" |
160 "psrlq $8, %%mm6 \n\t" | 160 "psrlq $8, %%mm6 \n\t" |
161 "psrlq $8, %%mm7 \n\t" | 161 "psrlq $8, %%mm7 \n\t" |
162 "pand %2, %%mm0 \n\t" | 162 "pand %2, %%mm0 \n\t" |
163 "pand %2, %%mm1 \n\t" | 163 "pand %2, %%mm1 \n\t" |
164 "pand %2, %%mm4 \n\t" | 164 "pand %2, %%mm4 \n\t" |
165 "pand %2, %%mm5 \n\t" | 165 "pand %2, %%mm5 \n\t" |
166 "pand %3, %%mm2 \n\t" | 166 "pand %3, %%mm2 \n\t" |
167 "pand %3, %%mm3 \n\t" | 167 "pand %3, %%mm3 \n\t" |
168 "pand %3, %%mm6 \n\t" | 168 "pand %3, %%mm6 \n\t" |
169 "pand %3, %%mm7 \n\t" | 169 "pand %3, %%mm7 \n\t" |
170 "por %%mm2, %%mm0 \n\t" | 170 "por %%mm2, %%mm0 \n\t" |
171 "por %%mm3, %%mm1 \n\t" | 171 "por %%mm3, %%mm1 \n\t" |
172 "por %%mm6, %%mm4 \n\t" | 172 "por %%mm6, %%mm4 \n\t" |
173 "por %%mm7, %%mm5 \n\t" | 173 "por %%mm7, %%mm5 \n\t" |
174 | 174 |
175 "movq %%mm1, %%mm2 \n\t" | 175 "movq %%mm1, %%mm2 \n\t" |
176 "movq %%mm4, %%mm3 \n\t" | 176 "movq %%mm4, %%mm3 \n\t" |
177 "psllq $48, %%mm2 \n\t" | 177 "psllq $48, %%mm2 \n\t" |
178 "psllq $32, %%mm3 \n\t" | 178 "psllq $32, %%mm3 \n\t" |
179 "pand %4, %%mm2 \n\t" | 179 "pand %4, %%mm2 \n\t" |
180 "pand %5, %%mm3 \n\t" | 180 "pand %5, %%mm3 \n\t" |
181 "por %%mm2, %%mm0 \n\t" | 181 "por %%mm2, %%mm0 \n\t" |
182 "psrlq $16, %%mm1 \n\t" | 182 "psrlq $16, %%mm1 \n\t" |
183 "psrlq $32, %%mm4 \n\t" | 183 "psrlq $32, %%mm4 \n\t" |
184 "psllq $16, %%mm5 \n\t" | 184 "psllq $16, %%mm5 \n\t" |
185 "por %%mm3, %%mm1 \n\t" | 185 "por %%mm3, %%mm1 \n\t" |
186 "pand %6, %%mm5 \n\t" | 186 "pand %6, %%mm5 \n\t" |
187 "por %%mm5, %%mm4 \n\t" | 187 "por %%mm5, %%mm4 \n\t" |
188 | 188 |
189 MOVNTQ" %%mm0, %0 \n\t" | 189 MOVNTQ" %%mm0, %0 \n\t" |
190 MOVNTQ" %%mm1, 8%0 \n\t" | 190 MOVNTQ" %%mm1, 8%0 \n\t" |
191 MOVNTQ" %%mm4, 16%0" | 191 MOVNTQ" %%mm4, 16%0" |
192 :"=m"(*dest) | 192 :"=m"(*dest) |
193 :"m"(*s),"m"(mask24l), | 193 :"m"(*s),"m"(mask24l), |
194 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | 194 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
195 :"memory"); | 195 :"memory"); |
196 dest += 24; | 196 dest += 24; |
197 s += 32; | 197 s += 32; |
198 } | 198 } |
199 __asm__ volatile(SFENCE:::"memory"); | 199 __asm__ volatile(SFENCE:::"memory"); |
200 __asm__ volatile(EMMS:::"memory"); | 200 __asm__ volatile(EMMS:::"memory"); |
235 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); | 235 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); |
236 mm_end = end - 15; | 236 mm_end = end - 15; |
237 while (s<mm_end) | 237 while (s<mm_end) |
238 { | 238 { |
239 __asm__ volatile( | 239 __asm__ volatile( |
240 PREFETCH" 32%1 \n\t" | 240 PREFETCH" 32%1 \n\t" |
241 "movq %1, %%mm0 \n\t" | 241 "movq %1, %%mm0 \n\t" |
242 "movq 8%1, %%mm2 \n\t" | 242 "movq 8%1, %%mm2 \n\t" |
243 "movq %%mm0, %%mm1 \n\t" | 243 "movq %%mm0, %%mm1 \n\t" |
244 "movq %%mm2, %%mm3 \n\t" | 244 "movq %%mm2, %%mm3 \n\t" |
245 "pand %%mm4, %%mm0 \n\t" | 245 "pand %%mm4, %%mm0 \n\t" |
246 "pand %%mm4, %%mm2 \n\t" | 246 "pand %%mm4, %%mm2 \n\t" |
247 "paddw %%mm1, %%mm0 \n\t" | 247 "paddw %%mm1, %%mm0 \n\t" |
248 "paddw %%mm3, %%mm2 \n\t" | 248 "paddw %%mm3, %%mm2 \n\t" |
249 MOVNTQ" %%mm0, %0 \n\t" | 249 MOVNTQ" %%mm0, %0 \n\t" |
250 MOVNTQ" %%mm2, 8%0" | 250 MOVNTQ" %%mm2, 8%0" |
251 :"=m"(*d) | 251 :"=m"(*d) |
252 :"m"(*s) | 252 :"m"(*s) |
253 ); | 253 ); |
254 d+=16; | 254 d+=16; |
255 s+=16; | 255 s+=16; |
256 } | 256 } |
257 __asm__ volatile(SFENCE:::"memory"); | 257 __asm__ volatile(SFENCE:::"memory"); |
285 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); | 285 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); |
286 mm_end = end - 15; | 286 mm_end = end - 15; |
287 while (s<mm_end) | 287 while (s<mm_end) |
288 { | 288 { |
289 __asm__ volatile( | 289 __asm__ volatile( |
290 PREFETCH" 32%1 \n\t" | 290 PREFETCH" 32%1 \n\t" |
291 "movq %1, %%mm0 \n\t" | 291 "movq %1, %%mm0 \n\t" |
292 "movq 8%1, %%mm2 \n\t" | 292 "movq 8%1, %%mm2 \n\t" |
293 "movq %%mm0, %%mm1 \n\t" | 293 "movq %%mm0, %%mm1 \n\t" |
294 "movq %%mm2, %%mm3 \n\t" | 294 "movq %%mm2, %%mm3 \n\t" |
295 "psrlq $1, %%mm0 \n\t" | 295 "psrlq $1, %%mm0 \n\t" |
296 "psrlq $1, %%mm2 \n\t" | 296 "psrlq $1, %%mm2 \n\t" |
297 "pand %%mm7, %%mm0 \n\t" | 297 "pand %%mm7, %%mm0 \n\t" |
298 "pand %%mm7, %%mm2 \n\t" | 298 "pand %%mm7, %%mm2 \n\t" |
299 "pand %%mm6, %%mm1 \n\t" | 299 "pand %%mm6, %%mm1 \n\t" |
300 "pand %%mm6, %%mm3 \n\t" | 300 "pand %%mm6, %%mm3 \n\t" |
301 "por %%mm1, %%mm0 \n\t" | 301 "por %%mm1, %%mm0 \n\t" |
302 "por %%mm3, %%mm2 \n\t" | 302 "por %%mm3, %%mm2 \n\t" |
303 MOVNTQ" %%mm0, %0 \n\t" | 303 MOVNTQ" %%mm0, %0 \n\t" |
304 MOVNTQ" %%mm2, 8%0" | 304 MOVNTQ" %%mm2, 8%0" |
305 :"=m"(*d) | 305 :"=m"(*d) |
306 :"m"(*s) | 306 :"m"(*s) |
307 ); | 307 ); |
308 d+=16; | 308 d+=16; |
309 s+=16; | 309 s+=16; |
310 } | 310 } |
311 __asm__ volatile(SFENCE:::"memory"); | 311 __asm__ volatile(SFENCE:::"memory"); |
337 end = s + src_size; | 337 end = s + src_size; |
338 #if HAVE_MMX | 338 #if HAVE_MMX |
339 mm_end = end - 15; | 339 mm_end = end - 15; |
340 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) | 340 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) |
341 __asm__ volatile( | 341 __asm__ volatile( |
342 "movq %3, %%mm5 \n\t" | 342 "movq %3, %%mm5 \n\t" |
343 "movq %4, %%mm6 \n\t" | 343 "movq %4, %%mm6 \n\t" |
344 "movq %5, %%mm7 \n\t" | 344 "movq %5, %%mm7 \n\t" |
345 "jmp 2f \n\t" | 345 "jmp 2f \n\t" |
346 ASMALIGN(4) | 346 ASMALIGN(4) |
347 "1: \n\t" | 347 "1: \n\t" |
348 PREFETCH" 32(%1) \n\t" | 348 PREFETCH" 32(%1) \n\t" |
349 "movd (%1), %%mm0 \n\t" | 349 "movd (%1), %%mm0 \n\t" |
350 "movd 4(%1), %%mm3 \n\t" | 350 "movd 4(%1), %%mm3 \n\t" |
351 "punpckldq 8(%1), %%mm0 \n\t" | 351 "punpckldq 8(%1), %%mm0 \n\t" |
352 "punpckldq 12(%1), %%mm3 \n\t" | 352 "punpckldq 12(%1), %%mm3 \n\t" |
353 "movq %%mm0, %%mm1 \n\t" | 353 "movq %%mm0, %%mm1 \n\t" |
354 "movq %%mm3, %%mm4 \n\t" | 354 "movq %%mm3, %%mm4 \n\t" |
355 "pand %%mm6, %%mm0 \n\t" | 355 "pand %%mm6, %%mm0 \n\t" |
356 "pand %%mm6, %%mm3 \n\t" | 356 "pand %%mm6, %%mm3 \n\t" |
357 "pmaddwd %%mm7, %%mm0 \n\t" | 357 "pmaddwd %%mm7, %%mm0 \n\t" |
358 "pmaddwd %%mm7, %%mm3 \n\t" | 358 "pmaddwd %%mm7, %%mm3 \n\t" |
359 "pand %%mm5, %%mm1 \n\t" | 359 "pand %%mm5, %%mm1 \n\t" |
360 "pand %%mm5, %%mm4 \n\t" | 360 "pand %%mm5, %%mm4 \n\t" |
361 "por %%mm1, %%mm0 \n\t" | 361 "por %%mm1, %%mm0 \n\t" |
362 "por %%mm4, %%mm3 \n\t" | 362 "por %%mm4, %%mm3 \n\t" |
363 "psrld $5, %%mm0 \n\t" | 363 "psrld $5, %%mm0 \n\t" |
364 "pslld $11, %%mm3 \n\t" | 364 "pslld $11, %%mm3 \n\t" |
365 "por %%mm3, %%mm0 \n\t" | 365 "por %%mm3, %%mm0 \n\t" |
366 MOVNTQ" %%mm0, (%0) \n\t" | 366 MOVNTQ" %%mm0, (%0) \n\t" |
367 "add $16, %1 \n\t" | 367 "add $16, %1 \n\t" |
368 "add $8, %0 \n\t" | 368 "add $8, %0 \n\t" |
369 "2: \n\t" | 369 "2: \n\t" |
370 "cmp %2, %1 \n\t" | 370 "cmp %2, %1 \n\t" |
371 " jb 1b \n\t" | 371 " jb 1b \n\t" |
372 : "+r" (d), "+r"(s) | 372 : "+r" (d), "+r"(s) |
373 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | 373 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) |
374 ); | 374 ); |
375 #else | 375 #else |
376 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | 376 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
377 __asm__ volatile( | 377 __asm__ volatile( |
378 "movq %0, %%mm7 \n\t" | 378 "movq %0, %%mm7 \n\t" |
379 "movq %1, %%mm6 \n\t" | 379 "movq %1, %%mm6 \n\t" |
380 ::"m"(red_16mask),"m"(green_16mask)); | 380 ::"m"(red_16mask),"m"(green_16mask)); |
381 while (s < mm_end) | 381 while (s < mm_end) |
382 { | 382 { |
383 __asm__ volatile( | 383 __asm__ volatile( |
384 PREFETCH" 32%1 \n\t" | 384 PREFETCH" 32%1 \n\t" |
385 "movd %1, %%mm0 \n\t" | 385 "movd %1, %%mm0 \n\t" |
386 "movd 4%1, %%mm3 \n\t" | 386 "movd 4%1, %%mm3 \n\t" |
387 "punpckldq 8%1, %%mm0 \n\t" | 387 "punpckldq 8%1, %%mm0 \n\t" |
388 "punpckldq 12%1, %%mm3 \n\t" | 388 "punpckldq 12%1, %%mm3 \n\t" |
389 "movq %%mm0, %%mm1 \n\t" | 389 "movq %%mm0, %%mm1 \n\t" |
390 "movq %%mm0, %%mm2 \n\t" | 390 "movq %%mm0, %%mm2 \n\t" |
391 "movq %%mm3, %%mm4 \n\t" | 391 "movq %%mm3, %%mm4 \n\t" |
392 "movq %%mm3, %%mm5 \n\t" | 392 "movq %%mm3, %%mm5 \n\t" |
393 "psrlq $3, %%mm0 \n\t" | 393 "psrlq $3, %%mm0 \n\t" |
394 "psrlq $3, %%mm3 \n\t" | 394 "psrlq $3, %%mm3 \n\t" |
395 "pand %2, %%mm0 \n\t" | 395 "pand %2, %%mm0 \n\t" |
396 "pand %2, %%mm3 \n\t" | 396 "pand %2, %%mm3 \n\t" |
397 "psrlq $5, %%mm1 \n\t" | 397 "psrlq $5, %%mm1 \n\t" |
398 "psrlq $5, %%mm4 \n\t" | 398 "psrlq $5, %%mm4 \n\t" |
399 "pand %%mm6, %%mm1 \n\t" | 399 "pand %%mm6, %%mm1 \n\t" |
400 "pand %%mm6, %%mm4 \n\t" | 400 "pand %%mm6, %%mm4 \n\t" |
401 "psrlq $8, %%mm2 \n\t" | 401 "psrlq $8, %%mm2 \n\t" |
402 "psrlq $8, %%mm5 \n\t" | 402 "psrlq $8, %%mm5 \n\t" |
403 "pand %%mm7, %%mm2 \n\t" | 403 "pand %%mm7, %%mm2 \n\t" |
404 "pand %%mm7, %%mm5 \n\t" | 404 "pand %%mm7, %%mm5 \n\t" |
405 "por %%mm1, %%mm0 \n\t" | 405 "por %%mm1, %%mm0 \n\t" |
406 "por %%mm4, %%mm3 \n\t" | 406 "por %%mm4, %%mm3 \n\t" |
407 "por %%mm2, %%mm0 \n\t" | 407 "por %%mm2, %%mm0 \n\t" |
408 "por %%mm5, %%mm3 \n\t" | 408 "por %%mm5, %%mm3 \n\t" |
409 "psllq $16, %%mm3 \n\t" | 409 "psllq $16, %%mm3 \n\t" |
410 "por %%mm3, %%mm0 \n\t" | 410 "por %%mm3, %%mm0 \n\t" |
411 MOVNTQ" %%mm0, %0 \n\t" | 411 MOVNTQ" %%mm0, %0 \n\t" |
412 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 412 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
413 d += 4; | 413 d += 4; |
414 s += 16; | 414 s += 16; |
415 } | 415 } |
416 #endif | 416 #endif |
417 __asm__ volatile(SFENCE:::"memory"); | 417 __asm__ volatile(SFENCE:::"memory"); |
441 ::"m"(red_16mask),"m"(green_16mask)); | 441 ::"m"(red_16mask),"m"(green_16mask)); |
442 mm_end = end - 15; | 442 mm_end = end - 15; |
443 while (s < mm_end) | 443 while (s < mm_end) |
444 { | 444 { |
445 __asm__ volatile( | 445 __asm__ volatile( |
446 PREFETCH" 32%1 \n\t" | 446 PREFETCH" 32%1 \n\t" |
447 "movd %1, %%mm0 \n\t" | 447 "movd %1, %%mm0 \n\t" |
448 "movd 4%1, %%mm3 \n\t" | 448 "movd 4%1, %%mm3 \n\t" |
449 "punpckldq 8%1, %%mm0 \n\t" | 449 "punpckldq 8%1, %%mm0 \n\t" |
450 "punpckldq 12%1, %%mm3 \n\t" | 450 "punpckldq 12%1, %%mm3 \n\t" |
451 "movq %%mm0, %%mm1 \n\t" | 451 "movq %%mm0, %%mm1 \n\t" |
452 "movq %%mm0, %%mm2 \n\t" | 452 "movq %%mm0, %%mm2 \n\t" |
453 "movq %%mm3, %%mm4 \n\t" | 453 "movq %%mm3, %%mm4 \n\t" |
454 "movq %%mm3, %%mm5 \n\t" | 454 "movq %%mm3, %%mm5 \n\t" |
455 "psllq $8, %%mm0 \n\t" | 455 "psllq $8, %%mm0 \n\t" |
456 "psllq $8, %%mm3 \n\t" | 456 "psllq $8, %%mm3 \n\t" |
457 "pand %%mm7, %%mm0 \n\t" | 457 "pand %%mm7, %%mm0 \n\t" |
458 "pand %%mm7, %%mm3 \n\t" | 458 "pand %%mm7, %%mm3 \n\t" |
459 "psrlq $5, %%mm1 \n\t" | 459 "psrlq $5, %%mm1 \n\t" |
460 "psrlq $5, %%mm4 \n\t" | 460 "psrlq $5, %%mm4 \n\t" |
461 "pand %%mm6, %%mm1 \n\t" | 461 "pand %%mm6, %%mm1 \n\t" |
462 "pand %%mm6, %%mm4 \n\t" | 462 "pand %%mm6, %%mm4 \n\t" |
463 "psrlq $19, %%mm2 \n\t" | 463 "psrlq $19, %%mm2 \n\t" |
464 "psrlq $19, %%mm5 \n\t" | 464 "psrlq $19, %%mm5 \n\t" |
465 "pand %2, %%mm2 \n\t" | 465 "pand %2, %%mm2 \n\t" |
466 "pand %2, %%mm5 \n\t" | 466 "pand %2, %%mm5 \n\t" |
467 "por %%mm1, %%mm0 \n\t" | 467 "por %%mm1, %%mm0 \n\t" |
468 "por %%mm4, %%mm3 \n\t" | 468 "por %%mm4, %%mm3 \n\t" |
469 "por %%mm2, %%mm0 \n\t" | 469 "por %%mm2, %%mm0 \n\t" |
470 "por %%mm5, %%mm3 \n\t" | 470 "por %%mm5, %%mm3 \n\t" |
471 "psllq $16, %%mm3 \n\t" | 471 "psllq $16, %%mm3 \n\t" |
472 "por %%mm3, %%mm0 \n\t" | 472 "por %%mm3, %%mm0 \n\t" |
473 MOVNTQ" %%mm0, %0 \n\t" | 473 MOVNTQ" %%mm0, %0 \n\t" |
474 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 474 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
475 d += 4; | 475 d += 4; |
476 s += 16; | 476 s += 16; |
477 } | 477 } |
478 __asm__ volatile(SFENCE:::"memory"); | 478 __asm__ volatile(SFENCE:::"memory"); |
479 __asm__ volatile(EMMS:::"memory"); | 479 __asm__ volatile(EMMS:::"memory"); |
496 end = s + src_size; | 496 end = s + src_size; |
497 #if HAVE_MMX | 497 #if HAVE_MMX |
498 mm_end = end - 15; | 498 mm_end = end - 15; |
499 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) | 499 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) |
500 __asm__ volatile( | 500 __asm__ volatile( |
501 "movq %3, %%mm5 \n\t" | 501 "movq %3, %%mm5 \n\t" |
502 "movq %4, %%mm6 \n\t" | 502 "movq %4, %%mm6 \n\t" |
503 "movq %5, %%mm7 \n\t" | 503 "movq %5, %%mm7 \n\t" |
504 "jmp 2f \n\t" | 504 "jmp 2f \n\t" |
505 ASMALIGN(4) | 505 ASMALIGN(4) |
506 "1: \n\t" | 506 "1: \n\t" |
507 PREFETCH" 32(%1) \n\t" | 507 PREFETCH" 32(%1) \n\t" |
508 "movd (%1), %%mm0 \n\t" | 508 "movd (%1), %%mm0 \n\t" |
509 "movd 4(%1), %%mm3 \n\t" | 509 "movd 4(%1), %%mm3 \n\t" |
510 "punpckldq 8(%1), %%mm0 \n\t" | 510 "punpckldq 8(%1), %%mm0 \n\t" |
511 "punpckldq 12(%1), %%mm3 \n\t" | 511 "punpckldq 12(%1), %%mm3 \n\t" |
512 "movq %%mm0, %%mm1 \n\t" | 512 "movq %%mm0, %%mm1 \n\t" |
513 "movq %%mm3, %%mm4 \n\t" | 513 "movq %%mm3, %%mm4 \n\t" |
514 "pand %%mm6, %%mm0 \n\t" | 514 "pand %%mm6, %%mm0 \n\t" |
515 "pand %%mm6, %%mm3 \n\t" | 515 "pand %%mm6, %%mm3 \n\t" |
516 "pmaddwd %%mm7, %%mm0 \n\t" | 516 "pmaddwd %%mm7, %%mm0 \n\t" |
517 "pmaddwd %%mm7, %%mm3 \n\t" | 517 "pmaddwd %%mm7, %%mm3 \n\t" |
518 "pand %%mm5, %%mm1 \n\t" | 518 "pand %%mm5, %%mm1 \n\t" |
519 "pand %%mm5, %%mm4 \n\t" | 519 "pand %%mm5, %%mm4 \n\t" |
520 "por %%mm1, %%mm0 \n\t" | 520 "por %%mm1, %%mm0 \n\t" |
521 "por %%mm4, %%mm3 \n\t" | 521 "por %%mm4, %%mm3 \n\t" |
522 "psrld $6, %%mm0 \n\t" | 522 "psrld $6, %%mm0 \n\t" |
523 "pslld $10, %%mm3 \n\t" | 523 "pslld $10, %%mm3 \n\t" |
524 "por %%mm3, %%mm0 \n\t" | 524 "por %%mm3, %%mm0 \n\t" |
525 MOVNTQ" %%mm0, (%0) \n\t" | 525 MOVNTQ" %%mm0, (%0) \n\t" |
526 "add $16, %1 \n\t" | 526 "add $16, %1 \n\t" |
527 "add $8, %0 \n\t" | 527 "add $8, %0 \n\t" |
528 "2: \n\t" | 528 "2: \n\t" |
529 "cmp %2, %1 \n\t" | 529 "cmp %2, %1 \n\t" |
530 " jb 1b \n\t" | 530 " jb 1b \n\t" |
531 : "+r" (d), "+r"(s) | 531 : "+r" (d), "+r"(s) |
532 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | 532 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) |
533 ); | 533 ); |
534 #else | 534 #else |
535 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | 535 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
536 __asm__ volatile( | 536 __asm__ volatile( |
537 "movq %0, %%mm7 \n\t" | 537 "movq %0, %%mm7 \n\t" |
538 "movq %1, %%mm6 \n\t" | 538 "movq %1, %%mm6 \n\t" |
539 ::"m"(red_15mask),"m"(green_15mask)); | 539 ::"m"(red_15mask),"m"(green_15mask)); |
540 while (s < mm_end) | 540 while (s < mm_end) |
541 { | 541 { |
542 __asm__ volatile( | 542 __asm__ volatile( |
543 PREFETCH" 32%1 \n\t" | 543 PREFETCH" 32%1 \n\t" |
544 "movd %1, %%mm0 \n\t" | 544 "movd %1, %%mm0 \n\t" |
545 "movd 4%1, %%mm3 \n\t" | 545 "movd 4%1, %%mm3 \n\t" |
546 "punpckldq 8%1, %%mm0 \n\t" | 546 "punpckldq 8%1, %%mm0 \n\t" |
547 "punpckldq 12%1, %%mm3 \n\t" | 547 "punpckldq 12%1, %%mm3 \n\t" |
548 "movq %%mm0, %%mm1 \n\t" | 548 "movq %%mm0, %%mm1 \n\t" |
549 "movq %%mm0, %%mm2 \n\t" | 549 "movq %%mm0, %%mm2 \n\t" |
550 "movq %%mm3, %%mm4 \n\t" | 550 "movq %%mm3, %%mm4 \n\t" |
551 "movq %%mm3, %%mm5 \n\t" | 551 "movq %%mm3, %%mm5 \n\t" |
552 "psrlq $3, %%mm0 \n\t" | 552 "psrlq $3, %%mm0 \n\t" |
553 "psrlq $3, %%mm3 \n\t" | 553 "psrlq $3, %%mm3 \n\t" |
554 "pand %2, %%mm0 \n\t" | 554 "pand %2, %%mm0 \n\t" |
555 "pand %2, %%mm3 \n\t" | 555 "pand %2, %%mm3 \n\t" |
556 "psrlq $6, %%mm1 \n\t" | 556 "psrlq $6, %%mm1 \n\t" |
557 "psrlq $6, %%mm4 \n\t" | 557 "psrlq $6, %%mm4 \n\t" |
558 "pand %%mm6, %%mm1 \n\t" | 558 "pand %%mm6, %%mm1 \n\t" |
559 "pand %%mm6, %%mm4 \n\t" | 559 "pand %%mm6, %%mm4 \n\t" |
560 "psrlq $9, %%mm2 \n\t" | 560 "psrlq $9, %%mm2 \n\t" |
561 "psrlq $9, %%mm5 \n\t" | 561 "psrlq $9, %%mm5 \n\t" |
562 "pand %%mm7, %%mm2 \n\t" | 562 "pand %%mm7, %%mm2 \n\t" |
563 "pand %%mm7, %%mm5 \n\t" | 563 "pand %%mm7, %%mm5 \n\t" |
564 "por %%mm1, %%mm0 \n\t" | 564 "por %%mm1, %%mm0 \n\t" |
565 "por %%mm4, %%mm3 \n\t" | 565 "por %%mm4, %%mm3 \n\t" |
566 "por %%mm2, %%mm0 \n\t" | 566 "por %%mm2, %%mm0 \n\t" |
567 "por %%mm5, %%mm3 \n\t" | 567 "por %%mm5, %%mm3 \n\t" |
568 "psllq $16, %%mm3 \n\t" | 568 "psllq $16, %%mm3 \n\t" |
569 "por %%mm3, %%mm0 \n\t" | 569 "por %%mm3, %%mm0 \n\t" |
570 MOVNTQ" %%mm0, %0 \n\t" | 570 MOVNTQ" %%mm0, %0 \n\t" |
571 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 571 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
572 d += 4; | 572 d += 4; |
573 s += 16; | 573 s += 16; |
574 } | 574 } |
575 #endif | 575 #endif |
576 __asm__ volatile(SFENCE:::"memory"); | 576 __asm__ volatile(SFENCE:::"memory"); |
600 ::"m"(red_15mask),"m"(green_15mask)); | 600 ::"m"(red_15mask),"m"(green_15mask)); |
601 mm_end = end - 15; | 601 mm_end = end - 15; |
602 while (s < mm_end) | 602 while (s < mm_end) |
603 { | 603 { |
604 __asm__ volatile( | 604 __asm__ volatile( |
605 PREFETCH" 32%1 \n\t" | 605 PREFETCH" 32%1 \n\t" |
606 "movd %1, %%mm0 \n\t" | 606 "movd %1, %%mm0 \n\t" |
607 "movd 4%1, %%mm3 \n\t" | 607 "movd 4%1, %%mm3 \n\t" |
608 "punpckldq 8%1, %%mm0 \n\t" | 608 "punpckldq 8%1, %%mm0 \n\t" |
609 "punpckldq 12%1, %%mm3 \n\t" | 609 "punpckldq 12%1, %%mm3 \n\t" |
610 "movq %%mm0, %%mm1 \n\t" | 610 "movq %%mm0, %%mm1 \n\t" |
611 "movq %%mm0, %%mm2 \n\t" | 611 "movq %%mm0, %%mm2 \n\t" |
612 "movq %%mm3, %%mm4 \n\t" | 612 "movq %%mm3, %%mm4 \n\t" |
613 "movq %%mm3, %%mm5 \n\t" | 613 "movq %%mm3, %%mm5 \n\t" |
614 "psllq $7, %%mm0 \n\t" | 614 "psllq $7, %%mm0 \n\t" |
615 "psllq $7, %%mm3 \n\t" | 615 "psllq $7, %%mm3 \n\t" |
616 "pand %%mm7, %%mm0 \n\t" | 616 "pand %%mm7, %%mm0 \n\t" |
617 "pand %%mm7, %%mm3 \n\t" | 617 "pand %%mm7, %%mm3 \n\t" |
618 "psrlq $6, %%mm1 \n\t" | 618 "psrlq $6, %%mm1 \n\t" |
619 "psrlq $6, %%mm4 \n\t" | 619 "psrlq $6, %%mm4 \n\t" |
620 "pand %%mm6, %%mm1 \n\t" | 620 "pand %%mm6, %%mm1 \n\t" |
621 "pand %%mm6, %%mm4 \n\t" | 621 "pand %%mm6, %%mm4 \n\t" |
622 "psrlq $19, %%mm2 \n\t" | 622 "psrlq $19, %%mm2 \n\t" |
623 "psrlq $19, %%mm5 \n\t" | 623 "psrlq $19, %%mm5 \n\t" |
624 "pand %2, %%mm2 \n\t" | 624 "pand %2, %%mm2 \n\t" |
625 "pand %2, %%mm5 \n\t" | 625 "pand %2, %%mm5 \n\t" |
626 "por %%mm1, %%mm0 \n\t" | 626 "por %%mm1, %%mm0 \n\t" |
627 "por %%mm4, %%mm3 \n\t" | 627 "por %%mm4, %%mm3 \n\t" |
628 "por %%mm2, %%mm0 \n\t" | 628 "por %%mm2, %%mm0 \n\t" |
629 "por %%mm5, %%mm3 \n\t" | 629 "por %%mm5, %%mm3 \n\t" |
630 "psllq $16, %%mm3 \n\t" | 630 "psllq $16, %%mm3 \n\t" |
631 "por %%mm3, %%mm0 \n\t" | 631 "por %%mm3, %%mm0 \n\t" |
632 MOVNTQ" %%mm0, %0 \n\t" | 632 MOVNTQ" %%mm0, %0 \n\t" |
633 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 633 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
634 d += 4; | 634 d += 4; |
635 s += 16; | 635 s += 16; |
636 } | 636 } |
637 __asm__ volatile(SFENCE:::"memory"); | 637 __asm__ volatile(SFENCE:::"memory"); |
638 __asm__ volatile(EMMS:::"memory"); | 638 __asm__ volatile(EMMS:::"memory"); |
661 ::"m"(red_16mask),"m"(green_16mask)); | 661 ::"m"(red_16mask),"m"(green_16mask)); |
662 mm_end = end - 11; | 662 mm_end = end - 11; |
663 while (s < mm_end) | 663 while (s < mm_end) |
664 { | 664 { |
665 __asm__ volatile( | 665 __asm__ volatile( |
666 PREFETCH" 32%1 \n\t" | 666 PREFETCH" 32%1 \n\t" |
667 "movd %1, %%mm0 \n\t" | 667 "movd %1, %%mm0 \n\t" |
668 "movd 3%1, %%mm3 \n\t" | 668 "movd 3%1, %%mm3 \n\t" |
669 "punpckldq 6%1, %%mm0 \n\t" | 669 "punpckldq 6%1, %%mm0 \n\t" |
670 "punpckldq 9%1, %%mm3 \n\t" | 670 "punpckldq 9%1, %%mm3 \n\t" |
671 "movq %%mm0, %%mm1 \n\t" | 671 "movq %%mm0, %%mm1 \n\t" |
672 "movq %%mm0, %%mm2 \n\t" | 672 "movq %%mm0, %%mm2 \n\t" |
673 "movq %%mm3, %%mm4 \n\t" | 673 "movq %%mm3, %%mm4 \n\t" |
674 "movq %%mm3, %%mm5 \n\t" | 674 "movq %%mm3, %%mm5 \n\t" |
675 "psrlq $3, %%mm0 \n\t" | 675 "psrlq $3, %%mm0 \n\t" |
676 "psrlq $3, %%mm3 \n\t" | 676 "psrlq $3, %%mm3 \n\t" |
677 "pand %2, %%mm0 \n\t" | 677 "pand %2, %%mm0 \n\t" |
678 "pand %2, %%mm3 \n\t" | 678 "pand %2, %%mm3 \n\t" |
679 "psrlq $5, %%mm1 \n\t" | 679 "psrlq $5, %%mm1 \n\t" |
680 "psrlq $5, %%mm4 \n\t" | 680 "psrlq $5, %%mm4 \n\t" |
681 "pand %%mm6, %%mm1 \n\t" | 681 "pand %%mm6, %%mm1 \n\t" |
682 "pand %%mm6, %%mm4 \n\t" | 682 "pand %%mm6, %%mm4 \n\t" |
683 "psrlq $8, %%mm2 \n\t" | 683 "psrlq $8, %%mm2 \n\t" |
684 "psrlq $8, %%mm5 \n\t" | 684 "psrlq $8, %%mm5 \n\t" |
685 "pand %%mm7, %%mm2 \n\t" | 685 "pand %%mm7, %%mm2 \n\t" |
686 "pand %%mm7, %%mm5 \n\t" | 686 "pand %%mm7, %%mm5 \n\t" |
687 "por %%mm1, %%mm0 \n\t" | 687 "por %%mm1, %%mm0 \n\t" |
688 "por %%mm4, %%mm3 \n\t" | 688 "por %%mm4, %%mm3 \n\t" |
689 "por %%mm2, %%mm0 \n\t" | 689 "por %%mm2, %%mm0 \n\t" |
690 "por %%mm5, %%mm3 \n\t" | 690 "por %%mm5, %%mm3 \n\t" |
691 "psllq $16, %%mm3 \n\t" | 691 "psllq $16, %%mm3 \n\t" |
692 "por %%mm3, %%mm0 \n\t" | 692 "por %%mm3, %%mm0 \n\t" |
693 MOVNTQ" %%mm0, %0 \n\t" | 693 MOVNTQ" %%mm0, %0 \n\t" |
694 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 694 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
695 d += 4; | 695 d += 4; |
696 s += 12; | 696 s += 12; |
697 } | 697 } |
698 __asm__ volatile(SFENCE:::"memory"); | 698 __asm__ volatile(SFENCE:::"memory"); |
699 __asm__ volatile(EMMS:::"memory"); | 699 __asm__ volatile(EMMS:::"memory"); |
724 ::"m"(red_16mask),"m"(green_16mask)); | 724 ::"m"(red_16mask),"m"(green_16mask)); |
725 mm_end = end - 15; | 725 mm_end = end - 15; |
726 while (s < mm_end) | 726 while (s < mm_end) |
727 { | 727 { |
728 __asm__ volatile( | 728 __asm__ volatile( |
729 PREFETCH" 32%1 \n\t" | 729 PREFETCH" 32%1 \n\t" |
730 "movd %1, %%mm0 \n\t" | 730 "movd %1, %%mm0 \n\t" |
731 "movd 3%1, %%mm3 \n\t" | 731 "movd 3%1, %%mm3 \n\t" |
732 "punpckldq 6%1, %%mm0 \n\t" | 732 "punpckldq 6%1, %%mm0 \n\t" |
733 "punpckldq 9%1, %%mm3 \n\t" | 733 "punpckldq 9%1, %%mm3 \n\t" |
734 "movq %%mm0, %%mm1 \n\t" | 734 "movq %%mm0, %%mm1 \n\t" |
735 "movq %%mm0, %%mm2 \n\t" | 735 "movq %%mm0, %%mm2 \n\t" |
736 "movq %%mm3, %%mm4 \n\t" | 736 "movq %%mm3, %%mm4 \n\t" |
737 "movq %%mm3, %%mm5 \n\t" | 737 "movq %%mm3, %%mm5 \n\t" |
738 "psllq $8, %%mm0 \n\t" | 738 "psllq $8, %%mm0 \n\t" |
739 "psllq $8, %%mm3 \n\t" | 739 "psllq $8, %%mm3 \n\t" |
740 "pand %%mm7, %%mm0 \n\t" | 740 "pand %%mm7, %%mm0 \n\t" |
741 "pand %%mm7, %%mm3 \n\t" | 741 "pand %%mm7, %%mm3 \n\t" |
742 "psrlq $5, %%mm1 \n\t" | 742 "psrlq $5, %%mm1 \n\t" |
743 "psrlq $5, %%mm4 \n\t" | 743 "psrlq $5, %%mm4 \n\t" |
744 "pand %%mm6, %%mm1 \n\t" | 744 "pand %%mm6, %%mm1 \n\t" |
745 "pand %%mm6, %%mm4 \n\t" | 745 "pand %%mm6, %%mm4 \n\t" |
746 "psrlq $19, %%mm2 \n\t" | 746 "psrlq $19, %%mm2 \n\t" |
747 "psrlq $19, %%mm5 \n\t" | 747 "psrlq $19, %%mm5 \n\t" |
748 "pand %2, %%mm2 \n\t" | 748 "pand %2, %%mm2 \n\t" |
749 "pand %2, %%mm5 \n\t" | 749 "pand %2, %%mm5 \n\t" |
750 "por %%mm1, %%mm0 \n\t" | 750 "por %%mm1, %%mm0 \n\t" |
751 "por %%mm4, %%mm3 \n\t" | 751 "por %%mm4, %%mm3 \n\t" |
752 "por %%mm2, %%mm0 \n\t" | 752 "por %%mm2, %%mm0 \n\t" |
753 "por %%mm5, %%mm3 \n\t" | 753 "por %%mm5, %%mm3 \n\t" |
754 "psllq $16, %%mm3 \n\t" | 754 "psllq $16, %%mm3 \n\t" |
755 "por %%mm3, %%mm0 \n\t" | 755 "por %%mm3, %%mm0 \n\t" |
756 MOVNTQ" %%mm0, %0 \n\t" | 756 MOVNTQ" %%mm0, %0 \n\t" |
757 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 757 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
758 d += 4; | 758 d += 4; |
759 s += 12; | 759 s += 12; |
760 } | 760 } |
761 __asm__ volatile(SFENCE:::"memory"); | 761 __asm__ volatile(SFENCE:::"memory"); |
762 __asm__ volatile(EMMS:::"memory"); | 762 __asm__ volatile(EMMS:::"memory"); |
787 ::"m"(red_15mask),"m"(green_15mask)); | 787 ::"m"(red_15mask),"m"(green_15mask)); |
788 mm_end = end - 11; | 788 mm_end = end - 11; |
789 while (s < mm_end) | 789 while (s < mm_end) |
790 { | 790 { |
791 __asm__ volatile( | 791 __asm__ volatile( |
792 PREFETCH" 32%1 \n\t" | 792 PREFETCH" 32%1 \n\t" |
793 "movd %1, %%mm0 \n\t" | 793 "movd %1, %%mm0 \n\t" |
794 "movd 3%1, %%mm3 \n\t" | 794 "movd 3%1, %%mm3 \n\t" |
795 "punpckldq 6%1, %%mm0 \n\t" | 795 "punpckldq 6%1, %%mm0 \n\t" |
796 "punpckldq 9%1, %%mm3 \n\t" | 796 "punpckldq 9%1, %%mm3 \n\t" |
797 "movq %%mm0, %%mm1 \n\t" | 797 "movq %%mm0, %%mm1 \n\t" |
798 "movq %%mm0, %%mm2 \n\t" | 798 "movq %%mm0, %%mm2 \n\t" |
799 "movq %%mm3, %%mm4 \n\t" | 799 "movq %%mm3, %%mm4 \n\t" |
800 "movq %%mm3, %%mm5 \n\t" | 800 "movq %%mm3, %%mm5 \n\t" |
801 "psrlq $3, %%mm0 \n\t" | 801 "psrlq $3, %%mm0 \n\t" |
802 "psrlq $3, %%mm3 \n\t" | 802 "psrlq $3, %%mm3 \n\t" |
803 "pand %2, %%mm0 \n\t" | 803 "pand %2, %%mm0 \n\t" |
804 "pand %2, %%mm3 \n\t" | 804 "pand %2, %%mm3 \n\t" |
805 "psrlq $6, %%mm1 \n\t" | 805 "psrlq $6, %%mm1 \n\t" |
806 "psrlq $6, %%mm4 \n\t" | 806 "psrlq $6, %%mm4 \n\t" |
807 "pand %%mm6, %%mm1 \n\t" | 807 "pand %%mm6, %%mm1 \n\t" |
808 "pand %%mm6, %%mm4 \n\t" | 808 "pand %%mm6, %%mm4 \n\t" |
809 "psrlq $9, %%mm2 \n\t" | 809 "psrlq $9, %%mm2 \n\t" |
810 "psrlq $9, %%mm5 \n\t" | 810 "psrlq $9, %%mm5 \n\t" |
811 "pand %%mm7, %%mm2 \n\t" | 811 "pand %%mm7, %%mm2 \n\t" |
812 "pand %%mm7, %%mm5 \n\t" | 812 "pand %%mm7, %%mm5 \n\t" |
813 "por %%mm1, %%mm0 \n\t" | 813 "por %%mm1, %%mm0 \n\t" |
814 "por %%mm4, %%mm3 \n\t" | 814 "por %%mm4, %%mm3 \n\t" |
815 "por %%mm2, %%mm0 \n\t" | 815 "por %%mm2, %%mm0 \n\t" |
816 "por %%mm5, %%mm3 \n\t" | 816 "por %%mm5, %%mm3 \n\t" |
817 "psllq $16, %%mm3 \n\t" | 817 "psllq $16, %%mm3 \n\t" |
818 "por %%mm3, %%mm0 \n\t" | 818 "por %%mm3, %%mm0 \n\t" |
819 MOVNTQ" %%mm0, %0 \n\t" | 819 MOVNTQ" %%mm0, %0 \n\t" |
820 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 820 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
821 d += 4; | 821 d += 4; |
822 s += 12; | 822 s += 12; |
823 } | 823 } |
824 __asm__ volatile(SFENCE:::"memory"); | 824 __asm__ volatile(SFENCE:::"memory"); |
825 __asm__ volatile(EMMS:::"memory"); | 825 __asm__ volatile(EMMS:::"memory"); |
850 ::"m"(red_15mask),"m"(green_15mask)); | 850 ::"m"(red_15mask),"m"(green_15mask)); |
851 mm_end = end - 15; | 851 mm_end = end - 15; |
852 while (s < mm_end) | 852 while (s < mm_end) |
853 { | 853 { |
854 __asm__ volatile( | 854 __asm__ volatile( |
855 PREFETCH" 32%1 \n\t" | 855 PREFETCH" 32%1 \n\t" |
856 "movd %1, %%mm0 \n\t" | 856 "movd %1, %%mm0 \n\t" |
857 "movd 3%1, %%mm3 \n\t" | 857 "movd 3%1, %%mm3 \n\t" |
858 "punpckldq 6%1, %%mm0 \n\t" | 858 "punpckldq 6%1, %%mm0 \n\t" |
859 "punpckldq 9%1, %%mm3 \n\t" | 859 "punpckldq 9%1, %%mm3 \n\t" |
860 "movq %%mm0, %%mm1 \n\t" | 860 "movq %%mm0, %%mm1 \n\t" |
861 "movq %%mm0, %%mm2 \n\t" | 861 "movq %%mm0, %%mm2 \n\t" |
862 "movq %%mm3, %%mm4 \n\t" | 862 "movq %%mm3, %%mm4 \n\t" |
863 "movq %%mm3, %%mm5 \n\t" | 863 "movq %%mm3, %%mm5 \n\t" |
864 "psllq $7, %%mm0 \n\t" | 864 "psllq $7, %%mm0 \n\t" |
865 "psllq $7, %%mm3 \n\t" | 865 "psllq $7, %%mm3 \n\t" |
866 "pand %%mm7, %%mm0 \n\t" | 866 "pand %%mm7, %%mm0 \n\t" |
867 "pand %%mm7, %%mm3 \n\t" | 867 "pand %%mm7, %%mm3 \n\t" |
868 "psrlq $6, %%mm1 \n\t" | 868 "psrlq $6, %%mm1 \n\t" |
869 "psrlq $6, %%mm4 \n\t" | 869 "psrlq $6, %%mm4 \n\t" |
870 "pand %%mm6, %%mm1 \n\t" | 870 "pand %%mm6, %%mm1 \n\t" |
871 "pand %%mm6, %%mm4 \n\t" | 871 "pand %%mm6, %%mm4 \n\t" |
872 "psrlq $19, %%mm2 \n\t" | 872 "psrlq $19, %%mm2 \n\t" |
873 "psrlq $19, %%mm5 \n\t" | 873 "psrlq $19, %%mm5 \n\t" |
874 "pand %2, %%mm2 \n\t" | 874 "pand %2, %%mm2 \n\t" |
875 "pand %2, %%mm5 \n\t" | 875 "pand %2, %%mm5 \n\t" |
876 "por %%mm1, %%mm0 \n\t" | 876 "por %%mm1, %%mm0 \n\t" |
877 "por %%mm4, %%mm3 \n\t" | 877 "por %%mm4, %%mm3 \n\t" |
878 "por %%mm2, %%mm0 \n\t" | 878 "por %%mm2, %%mm0 \n\t" |
879 "por %%mm5, %%mm3 \n\t" | 879 "por %%mm5, %%mm3 \n\t" |
880 "psllq $16, %%mm3 \n\t" | 880 "psllq $16, %%mm3 \n\t" |
881 "por %%mm3, %%mm0 \n\t" | 881 "por %%mm3, %%mm0 \n\t" |
882 MOVNTQ" %%mm0, %0 \n\t" | 882 MOVNTQ" %%mm0, %0 \n\t" |
883 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 883 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
884 d += 4; | 884 d += 4; |
885 s += 12; | 885 s += 12; |
886 } | 886 } |
887 __asm__ volatile(SFENCE:::"memory"); | 887 __asm__ volatile(SFENCE:::"memory"); |
888 __asm__ volatile(EMMS:::"memory"); | 888 __asm__ volatile(EMMS:::"memory"); |
930 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | 930 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
931 mm_end = end - 7; | 931 mm_end = end - 7; |
932 while (s < mm_end) | 932 while (s < mm_end) |
933 { | 933 { |
934 __asm__ volatile( | 934 __asm__ volatile( |
935 PREFETCH" 32%1 \n\t" | 935 PREFETCH" 32%1 \n\t" |
936 "movq %1, %%mm0 \n\t" | 936 "movq %1, %%mm0 \n\t" |
937 "movq %1, %%mm1 \n\t" | 937 "movq %1, %%mm1 \n\t" |
938 "movq %1, %%mm2 \n\t" | 938 "movq %1, %%mm2 \n\t" |
939 "pand %2, %%mm0 \n\t" | 939 "pand %2, %%mm0 \n\t" |
940 "pand %3, %%mm1 \n\t" | 940 "pand %3, %%mm1 \n\t" |
941 "pand %4, %%mm2 \n\t" | 941 "pand %4, %%mm2 \n\t" |
942 "psllq $3, %%mm0 \n\t" | 942 "psllq $3, %%mm0 \n\t" |
943 "psrlq $2, %%mm1 \n\t" | 943 "psrlq $2, %%mm1 \n\t" |
944 "psrlq $7, %%mm2 \n\t" | 944 "psrlq $7, %%mm2 \n\t" |
945 "movq %%mm0, %%mm3 \n\t" | 945 "movq %%mm0, %%mm3 \n\t" |
946 "movq %%mm1, %%mm4 \n\t" | 946 "movq %%mm1, %%mm4 \n\t" |
947 "movq %%mm2, %%mm5 \n\t" | 947 "movq %%mm2, %%mm5 \n\t" |
948 "punpcklwd %5, %%mm0 \n\t" | 948 "punpcklwd %5, %%mm0 \n\t" |
949 "punpcklwd %5, %%mm1 \n\t" | 949 "punpcklwd %5, %%mm1 \n\t" |
950 "punpcklwd %5, %%mm2 \n\t" | 950 "punpcklwd %5, %%mm2 \n\t" |
951 "punpckhwd %5, %%mm3 \n\t" | 951 "punpckhwd %5, %%mm3 \n\t" |
952 "punpckhwd %5, %%mm4 \n\t" | 952 "punpckhwd %5, %%mm4 \n\t" |
953 "punpckhwd %5, %%mm5 \n\t" | 953 "punpckhwd %5, %%mm5 \n\t" |
954 "psllq $8, %%mm1 \n\t" | 954 "psllq $8, %%mm1 \n\t" |
955 "psllq $16, %%mm2 \n\t" | 955 "psllq $16, %%mm2 \n\t" |
956 "por %%mm1, %%mm0 \n\t" | 956 "por %%mm1, %%mm0 \n\t" |
957 "por %%mm2, %%mm0 \n\t" | 957 "por %%mm2, %%mm0 \n\t" |
958 "psllq $8, %%mm4 \n\t" | 958 "psllq $8, %%mm4 \n\t" |
959 "psllq $16, %%mm5 \n\t" | 959 "psllq $16, %%mm5 \n\t" |
960 "por %%mm4, %%mm3 \n\t" | 960 "por %%mm4, %%mm3 \n\t" |
961 "por %%mm5, %%mm3 \n\t" | 961 "por %%mm5, %%mm3 \n\t" |
962 | 962 |
963 "movq %%mm0, %%mm6 \n\t" | 963 "movq %%mm0, %%mm6 \n\t" |
964 "movq %%mm3, %%mm7 \n\t" | 964 "movq %%mm3, %%mm7 \n\t" |
965 | 965 |
966 "movq 8%1, %%mm0 \n\t" | 966 "movq 8%1, %%mm0 \n\t" |
967 "movq 8%1, %%mm1 \n\t" | 967 "movq 8%1, %%mm1 \n\t" |
968 "movq 8%1, %%mm2 \n\t" | 968 "movq 8%1, %%mm2 \n\t" |
969 "pand %2, %%mm0 \n\t" | 969 "pand %2, %%mm0 \n\t" |
970 "pand %3, %%mm1 \n\t" | 970 "pand %3, %%mm1 \n\t" |
971 "pand %4, %%mm2 \n\t" | 971 "pand %4, %%mm2 \n\t" |
972 "psllq $3, %%mm0 \n\t" | 972 "psllq $3, %%mm0 \n\t" |
973 "psrlq $2, %%mm1 \n\t" | 973 "psrlq $2, %%mm1 \n\t" |
974 "psrlq $7, %%mm2 \n\t" | 974 "psrlq $7, %%mm2 \n\t" |
975 "movq %%mm0, %%mm3 \n\t" | 975 "movq %%mm0, %%mm3 \n\t" |
976 "movq %%mm1, %%mm4 \n\t" | 976 "movq %%mm1, %%mm4 \n\t" |
977 "movq %%mm2, %%mm5 \n\t" | 977 "movq %%mm2, %%mm5 \n\t" |
978 "punpcklwd %5, %%mm0 \n\t" | 978 "punpcklwd %5, %%mm0 \n\t" |
979 "punpcklwd %5, %%mm1 \n\t" | 979 "punpcklwd %5, %%mm1 \n\t" |
980 "punpcklwd %5, %%mm2 \n\t" | 980 "punpcklwd %5, %%mm2 \n\t" |
981 "punpckhwd %5, %%mm3 \n\t" | 981 "punpckhwd %5, %%mm3 \n\t" |
982 "punpckhwd %5, %%mm4 \n\t" | 982 "punpckhwd %5, %%mm4 \n\t" |
983 "punpckhwd %5, %%mm5 \n\t" | 983 "punpckhwd %5, %%mm5 \n\t" |
984 "psllq $8, %%mm1 \n\t" | 984 "psllq $8, %%mm1 \n\t" |
985 "psllq $16, %%mm2 \n\t" | 985 "psllq $16, %%mm2 \n\t" |
986 "por %%mm1, %%mm0 \n\t" | 986 "por %%mm1, %%mm0 \n\t" |
987 "por %%mm2, %%mm0 \n\t" | 987 "por %%mm2, %%mm0 \n\t" |
988 "psllq $8, %%mm4 \n\t" | 988 "psllq $8, %%mm4 \n\t" |
989 "psllq $16, %%mm5 \n\t" | 989 "psllq $16, %%mm5 \n\t" |
990 "por %%mm4, %%mm3 \n\t" | 990 "por %%mm4, %%mm3 \n\t" |
991 "por %%mm5, %%mm3 \n\t" | 991 "por %%mm5, %%mm3 \n\t" |
992 | 992 |
993 :"=m"(*d) | 993 :"=m"(*d) |
994 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) | 994 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) |
995 :"memory"); | 995 :"memory"); |
996 /* borrowed 32 to 24 */ | 996 /* borrowed 32 to 24 */ |
997 __asm__ volatile( | 997 __asm__ volatile( |
998 "movq %%mm0, %%mm4 \n\t" | 998 "movq %%mm0, %%mm4 \n\t" |
999 "movq %%mm3, %%mm5 \n\t" | 999 "movq %%mm3, %%mm5 \n\t" |
1000 "movq %%mm6, %%mm0 \n\t" | 1000 "movq %%mm6, %%mm0 \n\t" |
1001 "movq %%mm7, %%mm1 \n\t" | 1001 "movq %%mm7, %%mm1 \n\t" |
1002 | 1002 |
1003 "movq %%mm4, %%mm6 \n\t" | 1003 "movq %%mm4, %%mm6 \n\t" |
1004 "movq %%mm5, %%mm7 \n\t" | 1004 "movq %%mm5, %%mm7 \n\t" |
1005 "movq %%mm0, %%mm2 \n\t" | 1005 "movq %%mm0, %%mm2 \n\t" |
1006 "movq %%mm1, %%mm3 \n\t" | 1006 "movq %%mm1, %%mm3 \n\t" |
1007 | 1007 |
1008 "psrlq $8, %%mm2 \n\t" | 1008 "psrlq $8, %%mm2 \n\t" |
1009 "psrlq $8, %%mm3 \n\t" | 1009 "psrlq $8, %%mm3 \n\t" |
1010 "psrlq $8, %%mm6 \n\t" | 1010 "psrlq $8, %%mm6 \n\t" |
1011 "psrlq $8, %%mm7 \n\t" | 1011 "psrlq $8, %%mm7 \n\t" |
1012 "pand %2, %%mm0 \n\t" | 1012 "pand %2, %%mm0 \n\t" |
1013 "pand %2, %%mm1 \n\t" | 1013 "pand %2, %%mm1 \n\t" |
1014 "pand %2, %%mm4 \n\t" | 1014 "pand %2, %%mm4 \n\t" |
1015 "pand %2, %%mm5 \n\t" | 1015 "pand %2, %%mm5 \n\t" |
1016 "pand %3, %%mm2 \n\t" | 1016 "pand %3, %%mm2 \n\t" |
1017 "pand %3, %%mm3 \n\t" | 1017 "pand %3, %%mm3 \n\t" |
1018 "pand %3, %%mm6 \n\t" | 1018 "pand %3, %%mm6 \n\t" |
1019 "pand %3, %%mm7 \n\t" | 1019 "pand %3, %%mm7 \n\t" |
1020 "por %%mm2, %%mm0 \n\t" | 1020 "por %%mm2, %%mm0 \n\t" |
1021 "por %%mm3, %%mm1 \n\t" | 1021 "por %%mm3, %%mm1 \n\t" |
1022 "por %%mm6, %%mm4 \n\t" | 1022 "por %%mm6, %%mm4 \n\t" |
1023 "por %%mm7, %%mm5 \n\t" | 1023 "por %%mm7, %%mm5 \n\t" |
1024 | 1024 |
1025 "movq %%mm1, %%mm2 \n\t" | 1025 "movq %%mm1, %%mm2 \n\t" |
1026 "movq %%mm4, %%mm3 \n\t" | 1026 "movq %%mm4, %%mm3 \n\t" |
1027 "psllq $48, %%mm2 \n\t" | 1027 "psllq $48, %%mm2 \n\t" |
1028 "psllq $32, %%mm3 \n\t" | 1028 "psllq $32, %%mm3 \n\t" |
1029 "pand %4, %%mm2 \n\t" | 1029 "pand %4, %%mm2 \n\t" |
1030 "pand %5, %%mm3 \n\t" | 1030 "pand %5, %%mm3 \n\t" |
1031 "por %%mm2, %%mm0 \n\t" | 1031 "por %%mm2, %%mm0 \n\t" |
1032 "psrlq $16, %%mm1 \n\t" | 1032 "psrlq $16, %%mm1 \n\t" |
1033 "psrlq $32, %%mm4 \n\t" | 1033 "psrlq $32, %%mm4 \n\t" |
1034 "psllq $16, %%mm5 \n\t" | 1034 "psllq $16, %%mm5 \n\t" |
1035 "por %%mm3, %%mm1 \n\t" | 1035 "por %%mm3, %%mm1 \n\t" |
1036 "pand %6, %%mm5 \n\t" | 1036 "pand %6, %%mm5 \n\t" |
1037 "por %%mm5, %%mm4 \n\t" | 1037 "por %%mm5, %%mm4 \n\t" |
1038 | 1038 |
1039 MOVNTQ" %%mm0, %0 \n\t" | 1039 MOVNTQ" %%mm0, %0 \n\t" |
1040 MOVNTQ" %%mm1, 8%0 \n\t" | 1040 MOVNTQ" %%mm1, 8%0 \n\t" |
1041 MOVNTQ" %%mm4, 16%0" | 1041 MOVNTQ" %%mm4, 16%0" |
1042 | 1042 |
1043 :"=m"(*d) | 1043 :"=m"(*d) |
1044 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | 1044 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
1045 :"memory"); | 1045 :"memory"); |
1046 d += 24; | 1046 d += 24; |
1047 s += 8; | 1047 s += 8; |
1048 } | 1048 } |
1049 __asm__ volatile(SFENCE:::"memory"); | 1049 __asm__ volatile(SFENCE:::"memory"); |
1050 __asm__ volatile(EMMS:::"memory"); | 1050 __asm__ volatile(EMMS:::"memory"); |
1072 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | 1072 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
1073 mm_end = end - 7; | 1073 mm_end = end - 7; |
1074 while (s < mm_end) | 1074 while (s < mm_end) |
1075 { | 1075 { |
1076 __asm__ volatile( | 1076 __asm__ volatile( |
1077 PREFETCH" 32%1 \n\t" | 1077 PREFETCH" 32%1 \n\t" |
1078 "movq %1, %%mm0 \n\t" | 1078 "movq %1, %%mm0 \n\t" |
1079 "movq %1, %%mm1 \n\t" | 1079 "movq %1, %%mm1 \n\t" |
1080 "movq %1, %%mm2 \n\t" | 1080 "movq %1, %%mm2 \n\t" |
1081 "pand %2, %%mm0 \n\t" | 1081 "pand %2, %%mm0 \n\t" |
1082 "pand %3, %%mm1 \n\t" | 1082 "pand %3, %%mm1 \n\t" |
1083 "pand %4, %%mm2 \n\t" | 1083 "pand %4, %%mm2 \n\t" |
1084 "psllq $3, %%mm0 \n\t" | 1084 "psllq $3, %%mm0 \n\t" |
1085 "psrlq $3, %%mm1 \n\t" | 1085 "psrlq $3, %%mm1 \n\t" |
1086 "psrlq $8, %%mm2 \n\t" | 1086 "psrlq $8, %%mm2 \n\t" |
1087 "movq %%mm0, %%mm3 \n\t" | 1087 "movq %%mm0, %%mm3 \n\t" |
1088 "movq %%mm1, %%mm4 \n\t" | 1088 "movq %%mm1, %%mm4 \n\t" |
1089 "movq %%mm2, %%mm5 \n\t" | 1089 "movq %%mm2, %%mm5 \n\t" |
1090 "punpcklwd %5, %%mm0 \n\t" | 1090 "punpcklwd %5, %%mm0 \n\t" |
1091 "punpcklwd %5, %%mm1 \n\t" | 1091 "punpcklwd %5, %%mm1 \n\t" |
1092 "punpcklwd %5, %%mm2 \n\t" | 1092 "punpcklwd %5, %%mm2 \n\t" |
1093 "punpckhwd %5, %%mm3 \n\t" | 1093 "punpckhwd %5, %%mm3 \n\t" |
1094 "punpckhwd %5, %%mm4 \n\t" | 1094 "punpckhwd %5, %%mm4 \n\t" |
1095 "punpckhwd %5, %%mm5 \n\t" | 1095 "punpckhwd %5, %%mm5 \n\t" |
1096 "psllq $8, %%mm1 \n\t" | 1096 "psllq $8, %%mm1 \n\t" |
1097 "psllq $16, %%mm2 \n\t" | 1097 "psllq $16, %%mm2 \n\t" |
1098 "por %%mm1, %%mm0 \n\t" | 1098 "por %%mm1, %%mm0 \n\t" |
1099 "por %%mm2, %%mm0 \n\t" | 1099 "por %%mm2, %%mm0 \n\t" |
1100 "psllq $8, %%mm4 \n\t" | 1100 "psllq $8, %%mm4 \n\t" |
1101 "psllq $16, %%mm5 \n\t" | 1101 "psllq $16, %%mm5 \n\t" |
1102 "por %%mm4, %%mm3 \n\t" | 1102 "por %%mm4, %%mm3 \n\t" |
1103 "por %%mm5, %%mm3 \n\t" | 1103 "por %%mm5, %%mm3 \n\t" |
1104 | 1104 |
1105 "movq %%mm0, %%mm6 \n\t" | 1105 "movq %%mm0, %%mm6 \n\t" |
1106 "movq %%mm3, %%mm7 \n\t" | 1106 "movq %%mm3, %%mm7 \n\t" |
1107 | 1107 |
1108 "movq 8%1, %%mm0 \n\t" | 1108 "movq 8%1, %%mm0 \n\t" |
1109 "movq 8%1, %%mm1 \n\t" | 1109 "movq 8%1, %%mm1 \n\t" |
1110 "movq 8%1, %%mm2 \n\t" | 1110 "movq 8%1, %%mm2 \n\t" |
1111 "pand %2, %%mm0 \n\t" | 1111 "pand %2, %%mm0 \n\t" |
1112 "pand %3, %%mm1 \n\t" | 1112 "pand %3, %%mm1 \n\t" |
1113 "pand %4, %%mm2 \n\t" | 1113 "pand %4, %%mm2 \n\t" |
1114 "psllq $3, %%mm0 \n\t" | 1114 "psllq $3, %%mm0 \n\t" |
1115 "psrlq $3, %%mm1 \n\t" | 1115 "psrlq $3, %%mm1 \n\t" |
1116 "psrlq $8, %%mm2 \n\t" | 1116 "psrlq $8, %%mm2 \n\t" |
1117 "movq %%mm0, %%mm3 \n\t" | 1117 "movq %%mm0, %%mm3 \n\t" |
1118 "movq %%mm1, %%mm4 \n\t" | 1118 "movq %%mm1, %%mm4 \n\t" |
1119 "movq %%mm2, %%mm5 \n\t" | 1119 "movq %%mm2, %%mm5 \n\t" |
1120 "punpcklwd %5, %%mm0 \n\t" | 1120 "punpcklwd %5, %%mm0 \n\t" |
1121 "punpcklwd %5, %%mm1 \n\t" | 1121 "punpcklwd %5, %%mm1 \n\t" |
1122 "punpcklwd %5, %%mm2 \n\t" | 1122 "punpcklwd %5, %%mm2 \n\t" |
1123 "punpckhwd %5, %%mm3 \n\t" | 1123 "punpckhwd %5, %%mm3 \n\t" |
1124 "punpckhwd %5, %%mm4 \n\t" | 1124 "punpckhwd %5, %%mm4 \n\t" |
1125 "punpckhwd %5, %%mm5 \n\t" | 1125 "punpckhwd %5, %%mm5 \n\t" |
1126 "psllq $8, %%mm1 \n\t" | 1126 "psllq $8, %%mm1 \n\t" |
1127 "psllq $16, %%mm2 \n\t" | 1127 "psllq $16, %%mm2 \n\t" |
1128 "por %%mm1, %%mm0 \n\t" | 1128 "por %%mm1, %%mm0 \n\t" |
1129 "por %%mm2, %%mm0 \n\t" | 1129 "por %%mm2, %%mm0 \n\t" |
1130 "psllq $8, %%mm4 \n\t" | 1130 "psllq $8, %%mm4 \n\t" |
1131 "psllq $16, %%mm5 \n\t" | 1131 "psllq $16, %%mm5 \n\t" |
1132 "por %%mm4, %%mm3 \n\t" | 1132 "por %%mm4, %%mm3 \n\t" |
1133 "por %%mm5, %%mm3 \n\t" | 1133 "por %%mm5, %%mm3 \n\t" |
1134 :"=m"(*d) | 1134 :"=m"(*d) |
1135 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) | 1135 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) |
1136 :"memory"); | 1136 :"memory"); |
1137 /* borrowed 32 to 24 */ | 1137 /* borrowed 32 to 24 */ |
1138 __asm__ volatile( | 1138 __asm__ volatile( |
1139 "movq %%mm0, %%mm4 \n\t" | 1139 "movq %%mm0, %%mm4 \n\t" |
1140 "movq %%mm3, %%mm5 \n\t" | 1140 "movq %%mm3, %%mm5 \n\t" |
1141 "movq %%mm6, %%mm0 \n\t" | 1141 "movq %%mm6, %%mm0 \n\t" |
1142 "movq %%mm7, %%mm1 \n\t" | 1142 "movq %%mm7, %%mm1 \n\t" |
1143 | 1143 |
1144 "movq %%mm4, %%mm6 \n\t" | 1144 "movq %%mm4, %%mm6 \n\t" |
1145 "movq %%mm5, %%mm7 \n\t" | 1145 "movq %%mm5, %%mm7 \n\t" |
1146 "movq %%mm0, %%mm2 \n\t" | 1146 "movq %%mm0, %%mm2 \n\t" |
1147 "movq %%mm1, %%mm3 \n\t" | 1147 "movq %%mm1, %%mm3 \n\t" |
1148 | 1148 |
1149 "psrlq $8, %%mm2 \n\t" | 1149 "psrlq $8, %%mm2 \n\t" |
1150 "psrlq $8, %%mm3 \n\t" | 1150 "psrlq $8, %%mm3 \n\t" |
1151 "psrlq $8, %%mm6 \n\t" | 1151 "psrlq $8, %%mm6 \n\t" |
1152 "psrlq $8, %%mm7 \n\t" | 1152 "psrlq $8, %%mm7 \n\t" |
1153 "pand %2, %%mm0 \n\t" | 1153 "pand %2, %%mm0 \n\t" |
1154 "pand %2, %%mm1 \n\t" | 1154 "pand %2, %%mm1 \n\t" |
1155 "pand %2, %%mm4 \n\t" | 1155 "pand %2, %%mm4 \n\t" |
1156 "pand %2, %%mm5 \n\t" | 1156 "pand %2, %%mm5 \n\t" |
1157 "pand %3, %%mm2 \n\t" | 1157 "pand %3, %%mm2 \n\t" |
1158 "pand %3, %%mm3 \n\t" | 1158 "pand %3, %%mm3 \n\t" |
1159 "pand %3, %%mm6 \n\t" | 1159 "pand %3, %%mm6 \n\t" |
1160 "pand %3, %%mm7 \n\t" | 1160 "pand %3, %%mm7 \n\t" |
1161 "por %%mm2, %%mm0 \n\t" | 1161 "por %%mm2, %%mm0 \n\t" |
1162 "por %%mm3, %%mm1 \n\t" | 1162 "por %%mm3, %%mm1 \n\t" |
1163 "por %%mm6, %%mm4 \n\t" | 1163 "por %%mm6, %%mm4 \n\t" |
1164 "por %%mm7, %%mm5 \n\t" | 1164 "por %%mm7, %%mm5 \n\t" |
1165 | 1165 |
1166 "movq %%mm1, %%mm2 \n\t" | 1166 "movq %%mm1, %%mm2 \n\t" |
1167 "movq %%mm4, %%mm3 \n\t" | 1167 "movq %%mm4, %%mm3 \n\t" |
1168 "psllq $48, %%mm2 \n\t" | 1168 "psllq $48, %%mm2 \n\t" |
1169 "psllq $32, %%mm3 \n\t" | 1169 "psllq $32, %%mm3 \n\t" |
1170 "pand %4, %%mm2 \n\t" | 1170 "pand %4, %%mm2 \n\t" |
1171 "pand %5, %%mm3 \n\t" | 1171 "pand %5, %%mm3 \n\t" |
1172 "por %%mm2, %%mm0 \n\t" | 1172 "por %%mm2, %%mm0 \n\t" |
1173 "psrlq $16, %%mm1 \n\t" | 1173 "psrlq $16, %%mm1 \n\t" |
1174 "psrlq $32, %%mm4 \n\t" | 1174 "psrlq $32, %%mm4 \n\t" |
1175 "psllq $16, %%mm5 \n\t" | 1175 "psllq $16, %%mm5 \n\t" |
1176 "por %%mm3, %%mm1 \n\t" | 1176 "por %%mm3, %%mm1 \n\t" |
1177 "pand %6, %%mm5 \n\t" | 1177 "pand %6, %%mm5 \n\t" |
1178 "por %%mm5, %%mm4 \n\t" | 1178 "por %%mm5, %%mm4 \n\t" |
1179 | 1179 |
1180 MOVNTQ" %%mm0, %0 \n\t" | 1180 MOVNTQ" %%mm0, %0 \n\t" |
1181 MOVNTQ" %%mm1, 8%0 \n\t" | 1181 MOVNTQ" %%mm1, 8%0 \n\t" |
1182 MOVNTQ" %%mm4, 16%0" | 1182 MOVNTQ" %%mm4, 16%0" |
1183 | 1183 |
1184 :"=m"(*d) | 1184 :"=m"(*d) |
1185 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | 1185 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
1186 :"memory"); | 1186 :"memory"); |
1187 d += 24; | 1187 d += 24; |
1188 s += 8; | 1188 s += 8; |
1189 } | 1189 } |
1190 __asm__ volatile(SFENCE:::"memory"); | 1190 __asm__ volatile(SFENCE:::"memory"); |
1191 __asm__ volatile(EMMS:::"memory"); | 1191 __asm__ volatile(EMMS:::"memory"); |
1234 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); | 1234 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); |
1235 mm_end = end - 3; | 1235 mm_end = end - 3; |
1236 while (s < mm_end) | 1236 while (s < mm_end) |
1237 { | 1237 { |
1238 __asm__ volatile( | 1238 __asm__ volatile( |
1239 PREFETCH" 32%1 \n\t" | 1239 PREFETCH" 32%1 \n\t" |
1240 "movq %1, %%mm0 \n\t" | 1240 "movq %1, %%mm0 \n\t" |
1241 "movq %1, %%mm1 \n\t" | 1241 "movq %1, %%mm1 \n\t" |
1242 "movq %1, %%mm2 \n\t" | 1242 "movq %1, %%mm2 \n\t" |
1243 "pand %2, %%mm0 \n\t" | 1243 "pand %2, %%mm0 \n\t" |
1244 "pand %3, %%mm1 \n\t" | 1244 "pand %3, %%mm1 \n\t" |
1245 "pand %4, %%mm2 \n\t" | 1245 "pand %4, %%mm2 \n\t" |
1246 "psllq $3, %%mm0 \n\t" | 1246 "psllq $3, %%mm0 \n\t" |
1247 "psrlq $2, %%mm1 \n\t" | 1247 "psrlq $2, %%mm1 \n\t" |
1248 "psrlq $7, %%mm2 \n\t" | 1248 "psrlq $7, %%mm2 \n\t" |
1249 PACK_RGB32 | 1249 PACK_RGB32 |
1250 :"=m"(*d) | 1250 :"=m"(*d) |
1251 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) | 1251 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) |
1252 :"memory"); | 1252 :"memory"); |
1253 d += 16; | 1253 d += 16; |
1254 s += 4; | 1254 s += 4; |
1255 } | 1255 } |
1256 __asm__ volatile(SFENCE:::"memory"); | 1256 __asm__ volatile(SFENCE:::"memory"); |
1257 __asm__ volatile(EMMS:::"memory"); | 1257 __asm__ volatile(EMMS:::"memory"); |
1289 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); | 1289 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); |
1290 mm_end = end - 3; | 1290 mm_end = end - 3; |
1291 while (s < mm_end) | 1291 while (s < mm_end) |
1292 { | 1292 { |
1293 __asm__ volatile( | 1293 __asm__ volatile( |
1294 PREFETCH" 32%1 \n\t" | 1294 PREFETCH" 32%1 \n\t" |
1295 "movq %1, %%mm0 \n\t" | 1295 "movq %1, %%mm0 \n\t" |
1296 "movq %1, %%mm1 \n\t" | 1296 "movq %1, %%mm1 \n\t" |
1297 "movq %1, %%mm2 \n\t" | 1297 "movq %1, %%mm2 \n\t" |
1298 "pand %2, %%mm0 \n\t" | 1298 "pand %2, %%mm0 \n\t" |
1299 "pand %3, %%mm1 \n\t" | 1299 "pand %3, %%mm1 \n\t" |
1300 "pand %4, %%mm2 \n\t" | 1300 "pand %4, %%mm2 \n\t" |
1301 "psllq $3, %%mm0 \n\t" | 1301 "psllq $3, %%mm0 \n\t" |
1302 "psrlq $3, %%mm1 \n\t" | 1302 "psrlq $3, %%mm1 \n\t" |
1303 "psrlq $8, %%mm2 \n\t" | 1303 "psrlq $8, %%mm2 \n\t" |
1304 PACK_RGB32 | 1304 PACK_RGB32 |
1305 :"=m"(*d) | 1305 :"=m"(*d) |
1306 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) | 1306 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) |
1307 :"memory"); | 1307 :"memory"); |
1308 d += 16; | 1308 d += 16; |
1309 s += 4; | 1309 s += 4; |
1310 } | 1310 } |
1311 __asm__ volatile(SFENCE:::"memory"); | 1311 __asm__ volatile(SFENCE:::"memory"); |
1312 __asm__ volatile(EMMS:::"memory"); | 1312 __asm__ volatile(EMMS:::"memory"); |
1334 x86_reg idx = 15 - src_size; | 1334 x86_reg idx = 15 - src_size; |
1335 const uint8_t *s = src-idx; | 1335 const uint8_t *s = src-idx; |
1336 uint8_t *d = dst-idx; | 1336 uint8_t *d = dst-idx; |
1337 #if HAVE_MMX | 1337 #if HAVE_MMX |
1338 __asm__ volatile( | 1338 __asm__ volatile( |
1339 "test %0, %0 \n\t" | 1339 "test %0, %0 \n\t" |
1340 "jns 2f \n\t" | 1340 "jns 2f \n\t" |
1341 PREFETCH" (%1, %0) \n\t" | 1341 PREFETCH" (%1, %0) \n\t" |
1342 "movq %3, %%mm7 \n\t" | 1342 "movq %3, %%mm7 \n\t" |
1343 "pxor %4, %%mm7 \n\t" | 1343 "pxor %4, %%mm7 \n\t" |
1344 "movq %%mm7, %%mm6 \n\t" | 1344 "movq %%mm7, %%mm6 \n\t" |
1345 "pxor %5, %%mm7 \n\t" | 1345 "pxor %5, %%mm7 \n\t" |
1346 ASMALIGN(4) | 1346 ASMALIGN(4) |
1347 "1: \n\t" | 1347 "1: \n\t" |
1348 PREFETCH" 32(%1, %0) \n\t" | 1348 PREFETCH" 32(%1, %0) \n\t" |
1349 "movq (%1, %0), %%mm0 \n\t" | 1349 "movq (%1, %0), %%mm0 \n\t" |
1350 "movq 8(%1, %0), %%mm1 \n\t" | 1350 "movq 8(%1, %0), %%mm1 \n\t" |
1351 # if HAVE_MMX2 | 1351 # if HAVE_MMX2 |
1352 "pshufw $177, %%mm0, %%mm3 \n\t" | 1352 "pshufw $177, %%mm0, %%mm3 \n\t" |
1353 "pshufw $177, %%mm1, %%mm5 \n\t" | 1353 "pshufw $177, %%mm1, %%mm5 \n\t" |
1354 "pand %%mm7, %%mm0 \n\t" | 1354 "pand %%mm7, %%mm0 \n\t" |
1355 "pand %%mm6, %%mm3 \n\t" | 1355 "pand %%mm6, %%mm3 \n\t" |
1356 "pand %%mm7, %%mm1 \n\t" | 1356 "pand %%mm7, %%mm1 \n\t" |
1357 "pand %%mm6, %%mm5 \n\t" | 1357 "pand %%mm6, %%mm5 \n\t" |
1358 "por %%mm3, %%mm0 \n\t" | 1358 "por %%mm3, %%mm0 \n\t" |
1359 "por %%mm5, %%mm1 \n\t" | 1359 "por %%mm5, %%mm1 \n\t" |
1360 # else | 1360 # else |
1361 "movq %%mm0, %%mm2 \n\t" | 1361 "movq %%mm0, %%mm2 \n\t" |
1362 "movq %%mm1, %%mm4 \n\t" | 1362 "movq %%mm1, %%mm4 \n\t" |
1363 "pand %%mm7, %%mm0 \n\t" | 1363 "pand %%mm7, %%mm0 \n\t" |
1364 "pand %%mm6, %%mm2 \n\t" | 1364 "pand %%mm6, %%mm2 \n\t" |
1365 "pand %%mm7, %%mm1 \n\t" | 1365 "pand %%mm7, %%mm1 \n\t" |
1366 "pand %%mm6, %%mm4 \n\t" | 1366 "pand %%mm6, %%mm4 \n\t" |
1367 "movq %%mm2, %%mm3 \n\t" | 1367 "movq %%mm2, %%mm3 \n\t" |
1368 "movq %%mm4, %%mm5 \n\t" | 1368 "movq %%mm4, %%mm5 \n\t" |
1369 "pslld $16, %%mm2 \n\t" | 1369 "pslld $16, %%mm2 \n\t" |
1370 "psrld $16, %%mm3 \n\t" | 1370 "psrld $16, %%mm3 \n\t" |
1371 "pslld $16, %%mm4 \n\t" | 1371 "pslld $16, %%mm4 \n\t" |
1372 "psrld $16, %%mm5 \n\t" | 1372 "psrld $16, %%mm5 \n\t" |
1373 "por %%mm2, %%mm0 \n\t" | 1373 "por %%mm2, %%mm0 \n\t" |
1374 "por %%mm4, %%mm1 \n\t" | 1374 "por %%mm4, %%mm1 \n\t" |
1375 "por %%mm3, %%mm0 \n\t" | 1375 "por %%mm3, %%mm0 \n\t" |
1376 "por %%mm5, %%mm1 \n\t" | 1376 "por %%mm5, %%mm1 \n\t" |
1377 # endif | 1377 # endif |
1378 MOVNTQ" %%mm0, (%2, %0) \n\t" | 1378 MOVNTQ" %%mm0, (%2, %0) \n\t" |
1379 MOVNTQ" %%mm1, 8(%2, %0) \n\t" | 1379 MOVNTQ" %%mm1, 8(%2, %0) \n\t" |
1380 "add $16, %0 \n\t" | 1380 "add $16, %0 \n\t" |
1381 "js 1b \n\t" | 1381 "js 1b \n\t" |
1382 SFENCE" \n\t" | 1382 SFENCE" \n\t" |
1383 EMMS" \n\t" | 1383 EMMS" \n\t" |
1384 "2: \n\t" | 1384 "2: \n\t" |
1385 : "+&r"(idx) | 1385 : "+&r"(idx) |
1386 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) | 1386 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) |
1387 : "memory"); | 1387 : "memory"); |
1388 #endif | 1388 #endif |
1389 for (; idx<15; idx+=4) { | 1389 for (; idx<15; idx+=4) { |
1390 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; | 1390 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; |
1391 v &= 0xff00ff; | 1391 v &= 0xff00ff; |
1392 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); | 1392 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); |
1397 { | 1397 { |
1398 unsigned i; | 1398 unsigned i; |
1399 #if HAVE_MMX | 1399 #if HAVE_MMX |
1400 x86_reg mmx_size= 23 - src_size; | 1400 x86_reg mmx_size= 23 - src_size; |
1401 __asm__ volatile ( | 1401 __asm__ volatile ( |
1402 "test %%"REG_a", %%"REG_a" \n\t" | 1402 "test %%"REG_a", %%"REG_a" \n\t" |
1403 "jns 2f \n\t" | 1403 "jns 2f \n\t" |
1404 "movq "MANGLE(mask24r)", %%mm5 \n\t" | 1404 "movq "MANGLE(mask24r)", %%mm5 \n\t" |
1405 "movq "MANGLE(mask24g)", %%mm6 \n\t" | 1405 "movq "MANGLE(mask24g)", %%mm6 \n\t" |
1406 "movq "MANGLE(mask24b)", %%mm7 \n\t" | 1406 "movq "MANGLE(mask24b)", %%mm7 \n\t" |
1407 ASMALIGN(4) | 1407 ASMALIGN(4) |
1408 "1: \n\t" | 1408 "1: \n\t" |
1409 PREFETCH" 32(%1, %%"REG_a") \n\t" | 1409 PREFETCH" 32(%1, %%"REG_a") \n\t" |
1410 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG | 1410 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG |
1411 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG | 1411 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG |
1412 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B | 1412 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B |
1413 "psllq $16, %%mm0 \n\t" // 00 BGR BGR | 1413 "psllq $16, %%mm0 \n\t" // 00 BGR BGR |
1414 "pand %%mm5, %%mm0 \n\t" | 1414 "pand %%mm5, %%mm0 \n\t" |
1415 "pand %%mm6, %%mm1 \n\t" | 1415 "pand %%mm6, %%mm1 \n\t" |
1416 "pand %%mm7, %%mm2 \n\t" | 1416 "pand %%mm7, %%mm2 \n\t" |
1417 "por %%mm0, %%mm1 \n\t" | 1417 "por %%mm0, %%mm1 \n\t" |
1418 "por %%mm2, %%mm1 \n\t" | 1418 "por %%mm2, %%mm1 \n\t" |
1419 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG | 1419 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG |
1420 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG | 1420 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG |
1421 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B | 1421 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B |
1422 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR | 1422 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR |
1423 "pand %%mm7, %%mm0 \n\t" | 1423 "pand %%mm7, %%mm0 \n\t" |
1424 "pand %%mm5, %%mm1 \n\t" | 1424 "pand %%mm5, %%mm1 \n\t" |
1425 "pand %%mm6, %%mm2 \n\t" | 1425 "pand %%mm6, %%mm2 \n\t" |
1426 "por %%mm0, %%mm1 \n\t" | 1426 "por %%mm0, %%mm1 \n\t" |
1427 "por %%mm2, %%mm1 \n\t" | 1427 "por %%mm2, %%mm1 \n\t" |
1428 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B | 1428 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B |
1429 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R | 1429 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R |
1430 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR | 1430 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR |
1431 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG | 1431 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG |
1432 "pand %%mm6, %%mm0 \n\t" | 1432 "pand %%mm6, %%mm0 \n\t" |
1433 "pand %%mm7, %%mm1 \n\t" | 1433 "pand %%mm7, %%mm1 \n\t" |
1434 "pand %%mm5, %%mm2 \n\t" | 1434 "pand %%mm5, %%mm2 \n\t" |
1435 "por %%mm0, %%mm1 \n\t" | 1435 "por %%mm0, %%mm1 \n\t" |
1436 "por %%mm2, %%mm1 \n\t" | 1436 "por %%mm2, %%mm1 \n\t" |
1437 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" | 1437 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" |
1438 "add $24, %%"REG_a" \n\t" | 1438 "add $24, %%"REG_a" \n\t" |
1439 " js 1b \n\t" | 1439 " js 1b \n\t" |
1440 "2: \n\t" | 1440 "2: \n\t" |
1441 : "+a" (mmx_size) | 1441 : "+a" (mmx_size) |
1442 : "r" (src-mmx_size), "r"(dst-mmx_size) | 1442 : "r" (src-mmx_size), "r"(dst-mmx_size) |
1443 ); | 1443 ); |
1444 | 1444 |
1445 __asm__ volatile(SFENCE:::"memory"); | 1445 __asm__ volatile(SFENCE:::"memory"); |
1446 __asm__ volatile(EMMS:::"memory"); | 1446 __asm__ volatile(EMMS:::"memory"); |
1447 | 1447 |
1472 for (y=0; y<height; y++) | 1472 for (y=0; y<height; y++) |
1473 { | 1473 { |
1474 #if HAVE_MMX | 1474 #if HAVE_MMX |
1475 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | 1475 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) |
1476 __asm__ volatile( | 1476 __asm__ volatile( |
1477 "xor %%"REG_a", %%"REG_a" \n\t" | 1477 "xor %%"REG_a", %%"REG_a" \n\t" |
1478 ASMALIGN(4) | 1478 ASMALIGN(4) |
1479 "1: \n\t" | 1479 "1: \n\t" |
1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" | 1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
1481 PREFETCH" 32(%2, %%"REG_a") \n\t" | 1481 PREFETCH" 32(%2, %%"REG_a") \n\t" |
1482 PREFETCH" 32(%3, %%"REG_a") \n\t" | 1482 PREFETCH" 32(%3, %%"REG_a") \n\t" |
1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) | 1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
1484 "movq %%mm0, %%mm2 \n\t" // U(0) | 1484 "movq %%mm0, %%mm2 \n\t" // U(0) |
1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) | 1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | 1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | 1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) |
1488 | 1488 |
1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) | 1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) | 1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
1491 "movq %%mm3, %%mm4 \n\t" // Y(0) | 1491 "movq %%mm3, %%mm4 \n\t" // Y(0) |
1492 "movq %%mm5, %%mm6 \n\t" // Y(8) | 1492 "movq %%mm5, %%mm6 \n\t" // Y(8) |
1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | 1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) |
1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | 1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) |
1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | 1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) |
1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | 1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) |
1497 | 1497 |
1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" | 1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" |
1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" | 1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" |
1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" | 1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" |
1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" | 1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" |
1502 | 1502 |
1503 "add $8, %%"REG_a" \n\t" | 1503 "add $8, %%"REG_a" \n\t" |
1504 "cmp %4, %%"REG_a" \n\t" | 1504 "cmp %4, %%"REG_a" \n\t" |
1505 " jb 1b \n\t" | 1505 " jb 1b \n\t" |
1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | 1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
1507 : "%"REG_a | 1507 : "%"REG_a |
1508 ); | 1508 ); |
1509 #else | 1509 #else |
1510 | 1510 |
1511 #if ARCH_ALPHA && HAVE_MVI | 1511 #if ARCH_ALPHA && HAVE_MVI |
1512 #define pl2yuy2(n) \ | 1512 #define pl2yuy2(n) \ |
1595 } | 1595 } |
1596 ysrc += lumStride; | 1596 ysrc += lumStride; |
1597 dst += dstStride; | 1597 dst += dstStride; |
1598 } | 1598 } |
1599 #if HAVE_MMX | 1599 #if HAVE_MMX |
1600 __asm__( EMMS" \n\t" | 1600 __asm__(EMMS" \n\t" |
1601 SFENCE" \n\t" | 1601 SFENCE" \n\t" |
1602 :::"memory"); | 1602 :::"memory"); |
1603 #endif | 1603 #endif |
1604 } | 1604 } |
1605 | 1605 |
1606 /** | 1606 /** |
1607 * Height should be a multiple of 2 and width should be a multiple of 16. | 1607 * Height should be a multiple of 2 and width should be a multiple of 16. |
1624 for (y=0; y<height; y++) | 1624 for (y=0; y<height; y++) |
1625 { | 1625 { |
1626 #if HAVE_MMX | 1626 #if HAVE_MMX |
1627 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | 1627 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) |
1628 __asm__ volatile( | 1628 __asm__ volatile( |
1629 "xor %%"REG_a", %%"REG_a" \n\t" | 1629 "xor %%"REG_a", %%"REG_a" \n\t" |
1630 ASMALIGN(4) | 1630 ASMALIGN(4) |
1631 "1: \n\t" | 1631 "1: \n\t" |
1632 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" | 1632 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
1633 PREFETCH" 32(%2, %%"REG_a") \n\t" | 1633 PREFETCH" 32(%2, %%"REG_a") \n\t" |
1634 PREFETCH" 32(%3, %%"REG_a") \n\t" | 1634 PREFETCH" 32(%3, %%"REG_a") \n\t" |
1635 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) | 1635 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
1636 "movq %%mm0, %%mm2 \n\t" // U(0) | 1636 "movq %%mm0, %%mm2 \n\t" // U(0) |
1637 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) | 1637 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
1638 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | 1638 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1639 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | 1639 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) |
1640 | 1640 |
1641 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) | 1641 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
1642 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) | 1642 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
1643 "movq %%mm0, %%mm4 \n\t" // Y(0) | 1643 "movq %%mm0, %%mm4 \n\t" // Y(0) |
1644 "movq %%mm2, %%mm6 \n\t" // Y(8) | 1644 "movq %%mm2, %%mm6 \n\t" // Y(8) |
1645 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) | 1645 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) |
1646 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) | 1646 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) |
1647 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) | 1647 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) |
1648 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) | 1648 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) |
1649 | 1649 |
1650 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" | 1650 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" |
1651 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" | 1651 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" |
1652 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" | 1652 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" |
1653 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" | 1653 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" |
1654 | 1654 |
1655 "add $8, %%"REG_a" \n\t" | 1655 "add $8, %%"REG_a" \n\t" |
1656 "cmp %4, %%"REG_a" \n\t" | 1656 "cmp %4, %%"REG_a" \n\t" |
1657 " jb 1b \n\t" | 1657 " jb 1b \n\t" |
1658 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | 1658 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
1659 : "%"REG_a | 1659 : "%"REG_a |
1660 ); | 1660 ); |
1661 #else | 1661 #else |
1662 //FIXME adapt the Alpha ASM code from yv12->yuy2 | 1662 //FIXME adapt the Alpha ASM code from yv12->yuy2 |
1663 | 1663 |
1664 #if HAVE_FAST_64BIT | 1664 #if HAVE_FAST_64BIT |
1701 } | 1701 } |
1702 ysrc += lumStride; | 1702 ysrc += lumStride; |
1703 dst += dstStride; | 1703 dst += dstStride; |
1704 } | 1704 } |
1705 #if HAVE_MMX | 1705 #if HAVE_MMX |
1706 __asm__( EMMS" \n\t" | 1706 __asm__(EMMS" \n\t" |
1707 SFENCE" \n\t" | 1707 SFENCE" \n\t" |
1708 :::"memory"); | 1708 :::"memory"); |
1709 #endif | 1709 #endif |
1710 } | 1710 } |
1711 | 1711 |
1712 /** | 1712 /** |
1713 * Height should be a multiple of 2 and width should be a multiple of 16 | 1713 * Height should be a multiple of 2 and width should be a multiple of 16 |
1753 const x86_reg chromWidth= width>>1; | 1753 const x86_reg chromWidth= width>>1; |
1754 for (y=0; y<height; y+=2) | 1754 for (y=0; y<height; y+=2) |
1755 { | 1755 { |
1756 #if HAVE_MMX | 1756 #if HAVE_MMX |
1757 __asm__ volatile( | 1757 __asm__ volatile( |
1758 "xor %%"REG_a", %%"REG_a" \n\t" | 1758 "xor %%"REG_a", %%"REG_a" \n\t" |
1759 "pcmpeqw %%mm7, %%mm7 \n\t" | 1759 "pcmpeqw %%mm7, %%mm7 \n\t" |
1760 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | 1760 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... |
1761 ASMALIGN(4) | 1761 ASMALIGN(4) |
1762 "1: \n\t" | 1762 "1: \n\t" |
1763 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | 1763 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
1764 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | 1764 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1765 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | 1765 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) |
1766 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | 1766 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) |
1767 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | 1767 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) |
1768 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | 1768 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) |
1769 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | 1769 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) |
1770 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | 1770 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) |
1771 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | 1771 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) |
1772 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | 1772 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1773 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | 1773 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) |
1774 | 1774 |
1775 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" | 1775 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" |
1776 | 1776 |
1777 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) | 1777 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) |
1778 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) | 1778 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) |
1779 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | 1779 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) |
1780 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | 1780 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) |
1781 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | 1781 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) |
1782 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | 1782 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) |
1783 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | 1783 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) |
1784 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | 1784 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) |
1785 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | 1785 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) |
1786 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | 1786 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) |
1787 | 1787 |
1788 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" | 1788 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" |
1789 | 1789 |
1790 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | 1790 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) |
1791 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | 1791 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) |
1792 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | 1792 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) |
1793 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | 1793 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) |
1794 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | 1794 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) |
1795 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | 1795 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) |
1796 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | 1796 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) |
1797 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | 1797 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) |
1798 | 1798 |
1799 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" | 1799 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" |
1800 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" | 1800 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" |
1801 | 1801 |
1802 "add $8, %%"REG_a" \n\t" | 1802 "add $8, %%"REG_a" \n\t" |
1803 "cmp %4, %%"REG_a" \n\t" | 1803 "cmp %4, %%"REG_a" \n\t" |
1804 " jb 1b \n\t" | 1804 " jb 1b \n\t" |
1805 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | 1805 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
1806 : "memory", "%"REG_a | 1806 : "memory", "%"REG_a |
1807 ); | 1807 ); |
1808 | 1808 |
1809 ydst += lumStride; | 1809 ydst += lumStride; |
1810 src += srcStride; | 1810 src += srcStride; |
1811 | 1811 |
1812 __asm__ volatile( | 1812 __asm__ volatile( |
1813 "xor %%"REG_a", %%"REG_a" \n\t" | 1813 "xor %%"REG_a", %%"REG_a" \n\t" |
1814 ASMALIGN(4) | 1814 ASMALIGN(4) |
1815 "1: \n\t" | 1815 "1: \n\t" |
1816 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | 1816 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
1817 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | 1817 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1818 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | 1818 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) |
1819 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | 1819 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) |
1820 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | 1820 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) |
1821 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | 1821 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
1822 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | 1822 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) |
1823 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | 1823 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) |
1824 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | 1824 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) |
1825 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | 1825 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) |
1826 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | 1826 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) |
1827 | 1827 |
1828 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" | 1828 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" |
1829 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" | 1829 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" |
1830 | 1830 |
1831 "add $8, %%"REG_a" \n\t" | 1831 "add $8, %%"REG_a" \n\t" |
1832 "cmp %4, %%"REG_a" \n\t" | 1832 "cmp %4, %%"REG_a" \n\t" |
1833 " jb 1b \n\t" | 1833 " jb 1b \n\t" |
1834 | 1834 |
1835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | 1835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
1836 : "memory", "%"REG_a | 1836 : "memory", "%"REG_a |
1837 ); | 1837 ); |
1838 #else | 1838 #else |
1839 long i; | 1839 long i; |
1840 for (i=0; i<chromWidth; i++) | 1840 for (i=0; i<chromWidth; i++) |
1841 { | 1841 { |
1857 vdst += chromStride; | 1857 vdst += chromStride; |
1858 ydst += lumStride; | 1858 ydst += lumStride; |
1859 src += srcStride; | 1859 src += srcStride; |
1860 } | 1860 } |
1861 #if HAVE_MMX | 1861 #if HAVE_MMX |
1862 __asm__ volatile( EMMS" \n\t" | 1862 __asm__ volatile(EMMS" \n\t" |
1863 SFENCE" \n\t" | 1863 SFENCE" \n\t" |
1864 :::"memory"); | 1864 :::"memory"); |
1865 #endif | 1865 #endif |
1866 } | 1866 } |
1867 | 1867 |
1868 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, | 1868 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, |
1869 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | 1869 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
1886 dst[2*x+1]= (3*src[x] + src[x+1])>>2; | 1886 dst[2*x+1]= (3*src[x] + src[x+1])>>2; |
1887 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | 1887 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; |
1888 } | 1888 } |
1889 dst[2*srcWidth-1]= src[srcWidth-1]; | 1889 dst[2*srcWidth-1]= src[srcWidth-1]; |
1890 | 1890 |
1891 dst+= dstStride; | 1891 dst+= dstStride; |
1892 | 1892 |
1893 for (y=1; y<srcHeight; y++){ | 1893 for (y=1; y<srcHeight; y++){ |
1894 #if HAVE_MMX2 || HAVE_AMD3DNOW | 1894 #if HAVE_MMX2 || HAVE_AMD3DNOW |
1895 const x86_reg mmxSize= srcWidth&~15; | 1895 const x86_reg mmxSize= srcWidth&~15; |
1896 __asm__ volatile( | 1896 __asm__ volatile( |
1897 "mov %4, %%"REG_a" \n\t" | 1897 "mov %4, %%"REG_a" \n\t" |
1898 "1: \n\t" | 1898 "1: \n\t" |
1899 "movq (%0, %%"REG_a"), %%mm0 \n\t" | 1899 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
1900 "movq (%1, %%"REG_a"), %%mm1 \n\t" | 1900 "movq (%1, %%"REG_a"), %%mm1 \n\t" |
1901 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" | 1901 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" |
1902 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" | 1902 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" |
1903 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" | 1903 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" |
1904 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" | 1904 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" |
1905 PAVGB" %%mm0, %%mm5 \n\t" | 1905 PAVGB" %%mm0, %%mm5 \n\t" |
1906 PAVGB" %%mm0, %%mm3 \n\t" | 1906 PAVGB" %%mm0, %%mm3 \n\t" |
1907 PAVGB" %%mm0, %%mm5 \n\t" | 1907 PAVGB" %%mm0, %%mm5 \n\t" |
1908 PAVGB" %%mm0, %%mm3 \n\t" | 1908 PAVGB" %%mm0, %%mm3 \n\t" |
1909 PAVGB" %%mm1, %%mm4 \n\t" | 1909 PAVGB" %%mm1, %%mm4 \n\t" |
1910 PAVGB" %%mm1, %%mm2 \n\t" | 1910 PAVGB" %%mm1, %%mm2 \n\t" |
1911 PAVGB" %%mm1, %%mm4 \n\t" | 1911 PAVGB" %%mm1, %%mm4 \n\t" |
1912 PAVGB" %%mm1, %%mm2 \n\t" | 1912 PAVGB" %%mm1, %%mm2 \n\t" |
1913 "movq %%mm5, %%mm7 \n\t" | 1913 "movq %%mm5, %%mm7 \n\t" |
1914 "movq %%mm4, %%mm6 \n\t" | 1914 "movq %%mm4, %%mm6 \n\t" |
1915 "punpcklbw %%mm3, %%mm5 \n\t" | 1915 "punpcklbw %%mm3, %%mm5 \n\t" |
1916 "punpckhbw %%mm3, %%mm7 \n\t" | 1916 "punpckhbw %%mm3, %%mm7 \n\t" |
1917 "punpcklbw %%mm2, %%mm4 \n\t" | 1917 "punpcklbw %%mm2, %%mm4 \n\t" |
1918 "punpckhbw %%mm2, %%mm6 \n\t" | 1918 "punpckhbw %%mm2, %%mm6 \n\t" |
1919 #if 1 | 1919 #if 1 |
1920 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" | 1920 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" |
1921 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" | 1921 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" |
1922 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" | 1922 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" |
1923 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" | 1923 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" |
1924 #else | 1924 #else |
1925 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" | 1925 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" |
1926 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t" | 1926 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t" |
1927 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" | 1927 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" |
1928 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t" | 1928 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t" |
1929 #endif | 1929 #endif |
1930 "add $8, %%"REG_a" \n\t" | 1930 "add $8, %%"REG_a" \n\t" |
1931 " js 1b \n\t" | 1931 " js 1b \n\t" |
1932 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), | 1932 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), |
1933 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), | 1933 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), |
1934 "g" (-mmxSize) | 1934 "g" (-mmxSize) |
1935 : "%"REG_a | 1935 : "%"REG_a |
1936 | 1936 |
1937 ); | 1937 ); |
1938 #else | 1938 #else |
1939 const x86_reg mmxSize=1; | 1939 const x86_reg mmxSize=1; |
1940 #endif | 1940 #endif |
1969 dst[2*x+1]= src[x]; | 1969 dst[2*x+1]= src[x]; |
1970 } | 1970 } |
1971 #endif | 1971 #endif |
1972 | 1972 |
1973 #if HAVE_MMX | 1973 #if HAVE_MMX |
1974 __asm__ volatile( EMMS" \n\t" | 1974 __asm__ volatile(EMMS" \n\t" |
1975 SFENCE" \n\t" | 1975 SFENCE" \n\t" |
1976 :::"memory"); | 1976 :::"memory"); |
1977 #endif | 1977 #endif |
1978 } | 1978 } |
1979 | 1979 |
1980 /** | 1980 /** |
1981 * Height should be a multiple of 2 and width should be a multiple of 16. | 1981 * Height should be a multiple of 2 and width should be a multiple of 16. |
1991 const x86_reg chromWidth= width>>1; | 1991 const x86_reg chromWidth= width>>1; |
1992 for (y=0; y<height; y+=2) | 1992 for (y=0; y<height; y+=2) |
1993 { | 1993 { |
1994 #if HAVE_MMX | 1994 #if HAVE_MMX |
1995 __asm__ volatile( | 1995 __asm__ volatile( |
1996 "xor %%"REG_a", %%"REG_a" \n\t" | 1996 "xor %%"REG_a", %%"REG_a" \n\t" |
1997 "pcmpeqw %%mm7, %%mm7 \n\t" | 1997 "pcmpeqw %%mm7, %%mm7 \n\t" |
1998 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | 1998 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... |
1999 ASMALIGN(4) | 1999 ASMALIGN(4) |
2000 "1: \n\t" | 2000 "1: \n\t" |
2001 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | 2001 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
2002 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) | 2002 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) |
2003 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) | 2003 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) |
2004 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | 2004 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) |
2005 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | 2005 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) |
2006 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | 2006 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) |
2007 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | 2007 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) |
2008 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | 2008 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) |
2009 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | 2009 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) |
2010 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | 2010 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
2011 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | 2011 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) |
2012 | 2012 |
2013 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" | 2013 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" |
2014 | 2014 |
2015 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) | 2015 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) |
2016 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) | 2016 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) |
2017 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | 2017 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) |
2018 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | 2018 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) |
2019 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | 2019 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) |
2020 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | 2020 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) |
2021 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | 2021 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) |
2022 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | 2022 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) |
2023 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | 2023 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) |
2024 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | 2024 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) |
2025 | 2025 |
2026 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" | 2026 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" |
2027 | 2027 |
2028 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | 2028 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) |
2029 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | 2029 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) |
2030 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | 2030 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) |
2031 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | 2031 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) |
2032 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | 2032 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) |
2033 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | 2033 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) |
2034 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | 2034 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) |
2035 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | 2035 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) |
2036 | 2036 |
2037 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" | 2037 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" |
2038 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" | 2038 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" |
2039 | 2039 |
2040 "add $8, %%"REG_a" \n\t" | 2040 "add $8, %%"REG_a" \n\t" |
2041 "cmp %4, %%"REG_a" \n\t" | 2041 "cmp %4, %%"REG_a" \n\t" |
2042 " jb 1b \n\t" | 2042 " jb 1b \n\t" |
2043 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | 2043 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2044 : "memory", "%"REG_a | 2044 : "memory", "%"REG_a |
2045 ); | 2045 ); |
2046 | 2046 |
2047 ydst += lumStride; | 2047 ydst += lumStride; |
2048 src += srcStride; | 2048 src += srcStride; |
2049 | 2049 |
2050 __asm__ volatile( | 2050 __asm__ volatile( |
2051 "xor %%"REG_a", %%"REG_a" \n\t" | 2051 "xor %%"REG_a", %%"REG_a" \n\t" |
2052 ASMALIGN(4) | 2052 ASMALIGN(4) |
2053 "1: \n\t" | 2053 "1: \n\t" |
2054 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | 2054 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
2055 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) | 2055 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) |
2056 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) | 2056 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) |
2057 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8) | 2057 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8) |
2058 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12) | 2058 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12) |
2059 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | 2059 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
2060 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | 2060 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) |
2061 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | 2061 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) |
2062 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | 2062 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) |
2063 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | 2063 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) |
2064 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | 2064 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) |
2065 | 2065 |
2066 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" | 2066 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" |
2067 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" | 2067 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" |
2068 | 2068 |
2069 "add $8, %%"REG_a" \n\t" | 2069 "add $8, %%"REG_a" \n\t" |
2070 "cmp %4, %%"REG_a" \n\t" | 2070 "cmp %4, %%"REG_a" \n\t" |
2071 " jb 1b \n\t" | 2071 " jb 1b \n\t" |
2072 | 2072 |
2073 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | 2073 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2074 : "memory", "%"REG_a | 2074 : "memory", "%"REG_a |
2075 ); | 2075 ); |
2076 #else | 2076 #else |
2077 long i; | 2077 long i; |
2078 for (i=0; i<chromWidth; i++) | 2078 for (i=0; i<chromWidth; i++) |
2079 { | 2079 { |
2095 vdst += chromStride; | 2095 vdst += chromStride; |
2096 ydst += lumStride; | 2096 ydst += lumStride; |
2097 src += srcStride; | 2097 src += srcStride; |
2098 } | 2098 } |
2099 #if HAVE_MMX | 2099 #if HAVE_MMX |
2100 __asm__ volatile( EMMS" \n\t" | 2100 __asm__ volatile(EMMS" \n\t" |
2101 SFENCE" \n\t" | 2101 SFENCE" \n\t" |
2102 :::"memory"); | 2102 :::"memory"); |
2103 #endif | 2103 #endif |
2104 } | 2104 } |
2105 | 2105 |
2106 /** | 2106 /** |
2107 * Height should be a multiple of 2 and width should be a multiple of 2. | 2107 * Height should be a multiple of 2 and width should be a multiple of 2. |
2121 { | 2121 { |
2122 long i; | 2122 long i; |
2123 for (i=0; i<2; i++) | 2123 for (i=0; i<2; i++) |
2124 { | 2124 { |
2125 __asm__ volatile( | 2125 __asm__ volatile( |
2126 "mov %2, %%"REG_a" \n\t" | 2126 "mov %2, %%"REG_a" \n\t" |
2127 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" | 2127 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" |
2128 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | 2128 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" |
2129 "pxor %%mm7, %%mm7 \n\t" | 2129 "pxor %%mm7, %%mm7 \n\t" |
2130 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | 2130 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" |
2131 ASMALIGN(4) | 2131 ASMALIGN(4) |
2132 "1: \n\t" | 2132 "1: \n\t" |
2133 PREFETCH" 64(%0, %%"REG_d") \n\t" | 2133 PREFETCH" 64(%0, %%"REG_d") \n\t" |
2134 "movd (%0, %%"REG_d"), %%mm0 \n\t" | 2134 "movd (%0, %%"REG_d"), %%mm0 \n\t" |
2135 "movd 3(%0, %%"REG_d"), %%mm1 \n\t" | 2135 "movd 3(%0, %%"REG_d"), %%mm1 \n\t" |
2136 "punpcklbw %%mm7, %%mm0 \n\t" | 2136 "punpcklbw %%mm7, %%mm0 \n\t" |
2137 "punpcklbw %%mm7, %%mm1 \n\t" | 2137 "punpcklbw %%mm7, %%mm1 \n\t" |
2138 "movd 6(%0, %%"REG_d"), %%mm2 \n\t" | 2138 "movd 6(%0, %%"REG_d"), %%mm2 \n\t" |
2139 "movd 9(%0, %%"REG_d"), %%mm3 \n\t" | 2139 "movd 9(%0, %%"REG_d"), %%mm3 \n\t" |
2140 "punpcklbw %%mm7, %%mm2 \n\t" | 2140 "punpcklbw %%mm7, %%mm2 \n\t" |
2141 "punpcklbw %%mm7, %%mm3 \n\t" | 2141 "punpcklbw %%mm7, %%mm3 \n\t" |
2142 "pmaddwd %%mm6, %%mm0 \n\t" | 2142 "pmaddwd %%mm6, %%mm0 \n\t" |
2143 "pmaddwd %%mm6, %%mm1 \n\t" | 2143 "pmaddwd %%mm6, %%mm1 \n\t" |
2144 "pmaddwd %%mm6, %%mm2 \n\t" | 2144 "pmaddwd %%mm6, %%mm2 \n\t" |
2145 "pmaddwd %%mm6, %%mm3 \n\t" | 2145 "pmaddwd %%mm6, %%mm3 \n\t" |
2146 #ifndef FAST_BGR2YV12 | 2146 #ifndef FAST_BGR2YV12 |
2147 "psrad $8, %%mm0 \n\t" | 2147 "psrad $8, %%mm0 \n\t" |
2148 "psrad $8, %%mm1 \n\t" | 2148 "psrad $8, %%mm1 \n\t" |
2149 "psrad $8, %%mm2 \n\t" | 2149 "psrad $8, %%mm2 \n\t" |
2150 "psrad $8, %%mm3 \n\t" | 2150 "psrad $8, %%mm3 \n\t" |
2151 #endif | 2151 #endif |
2152 "packssdw %%mm1, %%mm0 \n\t" | 2152 "packssdw %%mm1, %%mm0 \n\t" |
2153 "packssdw %%mm3, %%mm2 \n\t" | 2153 "packssdw %%mm3, %%mm2 \n\t" |
2154 "pmaddwd %%mm5, %%mm0 \n\t" | 2154 "pmaddwd %%mm5, %%mm0 \n\t" |
2155 "pmaddwd %%mm5, %%mm2 \n\t" | 2155 "pmaddwd %%mm5, %%mm2 \n\t" |
2156 "packssdw %%mm2, %%mm0 \n\t" | 2156 "packssdw %%mm2, %%mm0 \n\t" |
2157 "psraw $7, %%mm0 \n\t" | 2157 "psraw $7, %%mm0 \n\t" |
2158 | 2158 |
2159 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" | 2159 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
2160 "movd 15(%0, %%"REG_d"), %%mm1 \n\t" | 2160 "movd 15(%0, %%"REG_d"), %%mm1 \n\t" |
2161 "punpcklbw %%mm7, %%mm4 \n\t" | 2161 "punpcklbw %%mm7, %%mm4 \n\t" |
2162 "punpcklbw %%mm7, %%mm1 \n\t" | 2162 "punpcklbw %%mm7, %%mm1 \n\t" |
2163 "movd 18(%0, %%"REG_d"), %%mm2 \n\t" | 2163 "movd 18(%0, %%"REG_d"), %%mm2 \n\t" |
2164 "movd 21(%0, %%"REG_d"), %%mm3 \n\t" | 2164 "movd 21(%0, %%"REG_d"), %%mm3 \n\t" |
2165 "punpcklbw %%mm7, %%mm2 \n\t" | 2165 "punpcklbw %%mm7, %%mm2 \n\t" |
2166 "punpcklbw %%mm7, %%mm3 \n\t" | 2166 "punpcklbw %%mm7, %%mm3 \n\t" |
2167 "pmaddwd %%mm6, %%mm4 \n\t" | 2167 "pmaddwd %%mm6, %%mm4 \n\t" |
2168 "pmaddwd %%mm6, %%mm1 \n\t" | 2168 "pmaddwd %%mm6, %%mm1 \n\t" |
2169 "pmaddwd %%mm6, %%mm2 \n\t" | 2169 "pmaddwd %%mm6, %%mm2 \n\t" |
2170 "pmaddwd %%mm6, %%mm3 \n\t" | 2170 "pmaddwd %%mm6, %%mm3 \n\t" |
2171 #ifndef FAST_BGR2YV12 | 2171 #ifndef FAST_BGR2YV12 |
2172 "psrad $8, %%mm4 \n\t" | 2172 "psrad $8, %%mm4 \n\t" |
2173 "psrad $8, %%mm1 \n\t" | 2173 "psrad $8, %%mm1 \n\t" |
2174 "psrad $8, %%mm2 \n\t" | 2174 "psrad $8, %%mm2 \n\t" |
2175 "psrad $8, %%mm3 \n\t" | 2175 "psrad $8, %%mm3 \n\t" |
2176 #endif | 2176 #endif |
2177 "packssdw %%mm1, %%mm4 \n\t" | 2177 "packssdw %%mm1, %%mm4 \n\t" |
2178 "packssdw %%mm3, %%mm2 \n\t" | 2178 "packssdw %%mm3, %%mm2 \n\t" |
2179 "pmaddwd %%mm5, %%mm4 \n\t" | 2179 "pmaddwd %%mm5, %%mm4 \n\t" |
2180 "pmaddwd %%mm5, %%mm2 \n\t" | 2180 "pmaddwd %%mm5, %%mm2 \n\t" |
2181 "add $24, %%"REG_d" \n\t" | 2181 "add $24, %%"REG_d" \n\t" |
2182 "packssdw %%mm2, %%mm4 \n\t" | 2182 "packssdw %%mm2, %%mm4 \n\t" |
2183 "psraw $7, %%mm4 \n\t" | 2183 "psraw $7, %%mm4 \n\t" |
2184 | 2184 |
2185 "packuswb %%mm4, %%mm0 \n\t" | 2185 "packuswb %%mm4, %%mm0 \n\t" |
2186 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" | 2186 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" |
2187 | 2187 |
2188 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" | 2188 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" |
2189 "add $8, %%"REG_a" \n\t" | 2189 "add $8, %%"REG_a" \n\t" |
2190 " js 1b \n\t" | 2190 " js 1b \n\t" |
2191 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) | 2191 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) |
2192 : "%"REG_a, "%"REG_d | 2192 : "%"REG_a, "%"REG_d |
2193 ); | 2193 ); |
2194 ydst += lumStride; | 2194 ydst += lumStride; |
2195 src += srcStride; | 2195 src += srcStride; |
2196 } | 2196 } |
2197 src -= srcStride*2; | 2197 src -= srcStride*2; |
2198 __asm__ volatile( | 2198 __asm__ volatile( |
2199 "mov %4, %%"REG_a" \n\t" | 2199 "mov %4, %%"REG_a" \n\t" |
2200 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | 2200 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" |
2201 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" | 2201 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" |
2202 "pxor %%mm7, %%mm7 \n\t" | 2202 "pxor %%mm7, %%mm7 \n\t" |
2203 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | 2203 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" |
2204 "add %%"REG_d", %%"REG_d" \n\t" | 2204 "add %%"REG_d", %%"REG_d" \n\t" |
2205 ASMALIGN(4) | 2205 ASMALIGN(4) |
2206 "1: \n\t" | 2206 "1: \n\t" |
2207 PREFETCH" 64(%0, %%"REG_d") \n\t" | 2207 PREFETCH" 64(%0, %%"REG_d") \n\t" |
2208 PREFETCH" 64(%1, %%"REG_d") \n\t" | 2208 PREFETCH" 64(%1, %%"REG_d") \n\t" |
2209 #if HAVE_MMX2 || HAVE_AMD3DNOW | 2209 #if HAVE_MMX2 || HAVE_AMD3DNOW |
2210 "movq (%0, %%"REG_d"), %%mm0 \n\t" | 2210 "movq (%0, %%"REG_d"), %%mm0 \n\t" |
2211 "movq (%1, %%"REG_d"), %%mm1 \n\t" | 2211 "movq (%1, %%"REG_d"), %%mm1 \n\t" |
2212 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" | 2212 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" |
2213 "movq 6(%1, %%"REG_d"), %%mm3 \n\t" | 2213 "movq 6(%1, %%"REG_d"), %%mm3 \n\t" |
2214 PAVGB" %%mm1, %%mm0 \n\t" | 2214 PAVGB" %%mm1, %%mm0 \n\t" |
2215 PAVGB" %%mm3, %%mm2 \n\t" | 2215 PAVGB" %%mm3, %%mm2 \n\t" |
2216 "movq %%mm0, %%mm1 \n\t" | 2216 "movq %%mm0, %%mm1 \n\t" |
2217 "movq %%mm2, %%mm3 \n\t" | 2217 "movq %%mm2, %%mm3 \n\t" |
2218 "psrlq $24, %%mm0 \n\t" | 2218 "psrlq $24, %%mm0 \n\t" |
2219 "psrlq $24, %%mm2 \n\t" | 2219 "psrlq $24, %%mm2 \n\t" |
2220 PAVGB" %%mm1, %%mm0 \n\t" | 2220 PAVGB" %%mm1, %%mm0 \n\t" |
2221 PAVGB" %%mm3, %%mm2 \n\t" | 2221 PAVGB" %%mm3, %%mm2 \n\t" |
2222 "punpcklbw %%mm7, %%mm0 \n\t" | 2222 "punpcklbw %%mm7, %%mm0 \n\t" |
2223 "punpcklbw %%mm7, %%mm2 \n\t" | 2223 "punpcklbw %%mm7, %%mm2 \n\t" |
2224 #else | 2224 #else |
2225 "movd (%0, %%"REG_d"), %%mm0 \n\t" | 2225 "movd (%0, %%"REG_d"), %%mm0 \n\t" |
2226 "movd (%1, %%"REG_d"), %%mm1 \n\t" | 2226 "movd (%1, %%"REG_d"), %%mm1 \n\t" |
2227 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" | 2227 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" |
2228 "movd 3(%1, %%"REG_d"), %%mm3 \n\t" | 2228 "movd 3(%1, %%"REG_d"), %%mm3 \n\t" |
2229 "punpcklbw %%mm7, %%mm0 \n\t" | 2229 "punpcklbw %%mm7, %%mm0 \n\t" |
2230 "punpcklbw %%mm7, %%mm1 \n\t" | 2230 "punpcklbw %%mm7, %%mm1 \n\t" |
2231 "punpcklbw %%mm7, %%mm2 \n\t" | 2231 "punpcklbw %%mm7, %%mm2 \n\t" |
2232 "punpcklbw %%mm7, %%mm3 \n\t" | 2232 "punpcklbw %%mm7, %%mm3 \n\t" |
2233 "paddw %%mm1, %%mm0 \n\t" | 2233 "paddw %%mm1, %%mm0 \n\t" |
2234 "paddw %%mm3, %%mm2 \n\t" | 2234 "paddw %%mm3, %%mm2 \n\t" |
2235 "paddw %%mm2, %%mm0 \n\t" | 2235 "paddw %%mm2, %%mm0 \n\t" |
2236 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" | 2236 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" |
2237 "movd 6(%1, %%"REG_d"), %%mm1 \n\t" | 2237 "movd 6(%1, %%"REG_d"), %%mm1 \n\t" |
2238 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" | 2238 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" |
2239 "movd 9(%1, %%"REG_d"), %%mm3 \n\t" | 2239 "movd 9(%1, %%"REG_d"), %%mm3 \n\t" |
2240 "punpcklbw %%mm7, %%mm4 \n\t" | 2240 "punpcklbw %%mm7, %%mm4 \n\t" |
2241 "punpcklbw %%mm7, %%mm1 \n\t" | 2241 "punpcklbw %%mm7, %%mm1 \n\t" |
2242 "punpcklbw %%mm7, %%mm2 \n\t" | 2242 "punpcklbw %%mm7, %%mm2 \n\t" |
2243 "punpcklbw %%mm7, %%mm3 \n\t" | 2243 "punpcklbw %%mm7, %%mm3 \n\t" |
2244 "paddw %%mm1, %%mm4 \n\t" | 2244 "paddw %%mm1, %%mm4 \n\t" |
2245 "paddw %%mm3, %%mm2 \n\t" | 2245 "paddw %%mm3, %%mm2 \n\t" |
2246 "paddw %%mm4, %%mm2 \n\t" | 2246 "paddw %%mm4, %%mm2 \n\t" |
2247 "psrlw $2, %%mm0 \n\t" | 2247 "psrlw $2, %%mm0 \n\t" |
2248 "psrlw $2, %%mm2 \n\t" | 2248 "psrlw $2, %%mm2 \n\t" |
2249 #endif | 2249 #endif |
2250 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" | 2250 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" |
2251 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | 2251 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" |
2252 | 2252 |
2253 "pmaddwd %%mm0, %%mm1 \n\t" | 2253 "pmaddwd %%mm0, %%mm1 \n\t" |
2254 "pmaddwd %%mm2, %%mm3 \n\t" | 2254 "pmaddwd %%mm2, %%mm3 \n\t" |
2255 "pmaddwd %%mm6, %%mm0 \n\t" | 2255 "pmaddwd %%mm6, %%mm0 \n\t" |
2256 "pmaddwd %%mm6, %%mm2 \n\t" | 2256 "pmaddwd %%mm6, %%mm2 \n\t" |
2257 #ifndef FAST_BGR2YV12 | 2257 #ifndef FAST_BGR2YV12 |
2258 "psrad $8, %%mm0 \n\t" | 2258 "psrad $8, %%mm0 \n\t" |
2259 "psrad $8, %%mm1 \n\t" | 2259 "psrad $8, %%mm1 \n\t" |
2260 "psrad $8, %%mm2 \n\t" | 2260 "psrad $8, %%mm2 \n\t" |
2261 "psrad $8, %%mm3 \n\t" | 2261 "psrad $8, %%mm3 \n\t" |
2262 #endif | 2262 #endif |
2263 "packssdw %%mm2, %%mm0 \n\t" | 2263 "packssdw %%mm2, %%mm0 \n\t" |
2264 "packssdw %%mm3, %%mm1 \n\t" | 2264 "packssdw %%mm3, %%mm1 \n\t" |
2265 "pmaddwd %%mm5, %%mm0 \n\t" | 2265 "pmaddwd %%mm5, %%mm0 \n\t" |
2266 "pmaddwd %%mm5, %%mm1 \n\t" | 2266 "pmaddwd %%mm5, %%mm1 \n\t" |
2267 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | 2267 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 |
2268 "psraw $7, %%mm0 \n\t" | 2268 "psraw $7, %%mm0 \n\t" |
2269 | 2269 |
2270 #if HAVE_MMX2 || HAVE_AMD3DNOW | 2270 #if HAVE_MMX2 || HAVE_AMD3DNOW |
2271 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" | 2271 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" |
2272 "movq 12(%1, %%"REG_d"), %%mm1 \n\t" | 2272 "movq 12(%1, %%"REG_d"), %%mm1 \n\t" |
2273 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" | 2273 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" |
2274 "movq 18(%1, %%"REG_d"), %%mm3 \n\t" | 2274 "movq 18(%1, %%"REG_d"), %%mm3 \n\t" |
2275 PAVGB" %%mm1, %%mm4 \n\t" | 2275 PAVGB" %%mm1, %%mm4 \n\t" |
2276 PAVGB" %%mm3, %%mm2 \n\t" | 2276 PAVGB" %%mm3, %%mm2 \n\t" |
2277 "movq %%mm4, %%mm1 \n\t" | 2277 "movq %%mm4, %%mm1 \n\t" |
2278 "movq %%mm2, %%mm3 \n\t" | 2278 "movq %%mm2, %%mm3 \n\t" |
2279 "psrlq $24, %%mm4 \n\t" | 2279 "psrlq $24, %%mm4 \n\t" |
2280 "psrlq $24, %%mm2 \n\t" | 2280 "psrlq $24, %%mm2 \n\t" |
2281 PAVGB" %%mm1, %%mm4 \n\t" | 2281 PAVGB" %%mm1, %%mm4 \n\t" |
2282 PAVGB" %%mm3, %%mm2 \n\t" | 2282 PAVGB" %%mm3, %%mm2 \n\t" |
2283 "punpcklbw %%mm7, %%mm4 \n\t" | 2283 "punpcklbw %%mm7, %%mm4 \n\t" |
2284 "punpcklbw %%mm7, %%mm2 \n\t" | 2284 "punpcklbw %%mm7, %%mm2 \n\t" |
2285 #else | 2285 #else |
2286 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" | 2286 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
2287 "movd 12(%1, %%"REG_d"), %%mm1 \n\t" | 2287 "movd 12(%1, %%"REG_d"), %%mm1 \n\t" |
2288 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" | 2288 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" |
2289 "movd 15(%1, %%"REG_d"), %%mm3 \n\t" | 2289 "movd 15(%1, %%"REG_d"), %%mm3 \n\t" |
2290 "punpcklbw %%mm7, %%mm4 \n\t" | 2290 "punpcklbw %%mm7, %%mm4 \n\t" |
2291 "punpcklbw %%mm7, %%mm1 \n\t" | 2291 "punpcklbw %%mm7, %%mm1 \n\t" |
2292 "punpcklbw %%mm7, %%mm2 \n\t" | 2292 "punpcklbw %%mm7, %%mm2 \n\t" |
2293 "punpcklbw %%mm7, %%mm3 \n\t" | 2293 "punpcklbw %%mm7, %%mm3 \n\t" |
2294 "paddw %%mm1, %%mm4 \n\t" | 2294 "paddw %%mm1, %%mm4 \n\t" |
2295 "paddw %%mm3, %%mm2 \n\t" | 2295 "paddw %%mm3, %%mm2 \n\t" |
2296 "paddw %%mm2, %%mm4 \n\t" | 2296 "paddw %%mm2, %%mm4 \n\t" |
2297 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" | 2297 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" |
2298 "movd 18(%1, %%"REG_d"), %%mm1 \n\t" | 2298 "movd 18(%1, %%"REG_d"), %%mm1 \n\t" |
2299 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" | 2299 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" |
2300 "movd 21(%1, %%"REG_d"), %%mm3 \n\t" | 2300 "movd 21(%1, %%"REG_d"), %%mm3 \n\t" |
2301 "punpcklbw %%mm7, %%mm5 \n\t" | 2301 "punpcklbw %%mm7, %%mm5 \n\t" |
2302 "punpcklbw %%mm7, %%mm1 \n\t" | 2302 "punpcklbw %%mm7, %%mm1 \n\t" |
2303 "punpcklbw %%mm7, %%mm2 \n\t" | 2303 "punpcklbw %%mm7, %%mm2 \n\t" |
2304 "punpcklbw %%mm7, %%mm3 \n\t" | 2304 "punpcklbw %%mm7, %%mm3 \n\t" |
2305 "paddw %%mm1, %%mm5 \n\t" | 2305 "paddw %%mm1, %%mm5 \n\t" |
2306 "paddw %%mm3, %%mm2 \n\t" | 2306 "paddw %%mm3, %%mm2 \n\t" |
2307 "paddw %%mm5, %%mm2 \n\t" | 2307 "paddw %%mm5, %%mm2 \n\t" |
2308 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | 2308 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" |
2309 "psrlw $2, %%mm4 \n\t" | 2309 "psrlw $2, %%mm4 \n\t" |
2310 "psrlw $2, %%mm2 \n\t" | 2310 "psrlw $2, %%mm2 \n\t" |
2311 #endif | 2311 #endif |
2312 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" | 2312 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" |
2313 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | 2313 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" |
2314 | 2314 |
2315 "pmaddwd %%mm4, %%mm1 \n\t" | 2315 "pmaddwd %%mm4, %%mm1 \n\t" |
2316 "pmaddwd %%mm2, %%mm3 \n\t" | 2316 "pmaddwd %%mm2, %%mm3 \n\t" |
2317 "pmaddwd %%mm6, %%mm4 \n\t" | 2317 "pmaddwd %%mm6, %%mm4 \n\t" |
2318 "pmaddwd %%mm6, %%mm2 \n\t" | 2318 "pmaddwd %%mm6, %%mm2 \n\t" |
2319 #ifndef FAST_BGR2YV12 | 2319 #ifndef FAST_BGR2YV12 |
2320 "psrad $8, %%mm4 \n\t" | 2320 "psrad $8, %%mm4 \n\t" |
2321 "psrad $8, %%mm1 \n\t" | 2321 "psrad $8, %%mm1 \n\t" |
2322 "psrad $8, %%mm2 \n\t" | 2322 "psrad $8, %%mm2 \n\t" |
2323 "psrad $8, %%mm3 \n\t" | 2323 "psrad $8, %%mm3 \n\t" |
2324 #endif | 2324 #endif |
2325 "packssdw %%mm2, %%mm4 \n\t" | 2325 "packssdw %%mm2, %%mm4 \n\t" |
2326 "packssdw %%mm3, %%mm1 \n\t" | 2326 "packssdw %%mm3, %%mm1 \n\t" |
2327 "pmaddwd %%mm5, %%mm4 \n\t" | 2327 "pmaddwd %%mm5, %%mm4 \n\t" |
2328 "pmaddwd %%mm5, %%mm1 \n\t" | 2328 "pmaddwd %%mm5, %%mm1 \n\t" |
2329 "add $24, %%"REG_d" \n\t" | 2329 "add $24, %%"REG_d" \n\t" |
2330 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | 2330 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2331 "psraw $7, %%mm4 \n\t" | 2331 "psraw $7, %%mm4 \n\t" |
2332 | 2332 |
2333 "movq %%mm0, %%mm1 \n\t" | 2333 "movq %%mm0, %%mm1 \n\t" |
2334 "punpckldq %%mm4, %%mm0 \n\t" | 2334 "punpckldq %%mm4, %%mm0 \n\t" |
2335 "punpckhdq %%mm4, %%mm1 \n\t" | 2335 "punpckhdq %%mm4, %%mm1 \n\t" |
2336 "packsswb %%mm1, %%mm0 \n\t" | 2336 "packsswb %%mm1, %%mm0 \n\t" |
2337 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" | 2337 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" |
2338 "movd %%mm0, (%2, %%"REG_a") \n\t" | 2338 "movd %%mm0, (%2, %%"REG_a") \n\t" |
2339 "punpckhdq %%mm0, %%mm0 \n\t" | 2339 "punpckhdq %%mm0, %%mm0 \n\t" |
2340 "movd %%mm0, (%3, %%"REG_a") \n\t" | 2340 "movd %%mm0, (%3, %%"REG_a") \n\t" |
2341 "add $4, %%"REG_a" \n\t" | 2341 "add $4, %%"REG_a" \n\t" |
2342 " js 1b \n\t" | 2342 " js 1b \n\t" |
2343 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) | 2343 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) |
2344 : "%"REG_a, "%"REG_d | 2344 : "%"REG_a, "%"REG_d |
2345 ); | 2345 ); |
2346 | 2346 |
2347 udst += chromStride; | 2347 udst += chromStride; |
2348 vdst += chromStride; | 2348 vdst += chromStride; |
2349 src += srcStride*2; | 2349 src += srcStride*2; |
2350 } | 2350 } |
2351 | 2351 |
2352 __asm__ volatile( EMMS" \n\t" | 2352 __asm__ volatile(EMMS" \n\t" |
2353 SFENCE" \n\t" | 2353 SFENCE" \n\t" |
2354 :::"memory"); | 2354 :::"memory"); |
2355 #else | 2355 #else |
2356 y=0; | 2356 y=0; |
2357 #endif | 2357 #endif |
2358 for (; y<height; y+=2) | 2358 for (; y<height; y+=2) |
2359 { | 2359 { |
2416 long w; | 2416 long w; |
2417 | 2417 |
2418 #if HAVE_MMX | 2418 #if HAVE_MMX |
2419 #if HAVE_SSE2 | 2419 #if HAVE_SSE2 |
2420 __asm__( | 2420 __asm__( |
2421 "xor %%"REG_a", %%"REG_a" \n\t" | 2421 "xor %%"REG_a", %%"REG_a" \n\t" |
2422 "1: \n\t" | 2422 "1: \n\t" |
2423 PREFETCH" 64(%1, %%"REG_a") \n\t" | 2423 PREFETCH" 64(%1, %%"REG_a") \n\t" |
2424 PREFETCH" 64(%2, %%"REG_a") \n\t" | 2424 PREFETCH" 64(%2, %%"REG_a") \n\t" |
2425 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" | 2425 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" |
2426 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" | 2426 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" |
2427 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" | 2427 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" |
2428 "punpcklbw %%xmm2, %%xmm0 \n\t" | 2428 "punpcklbw %%xmm2, %%xmm0 \n\t" |
2429 "punpckhbw %%xmm2, %%xmm1 \n\t" | 2429 "punpckhbw %%xmm2, %%xmm1 \n\t" |
2430 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" | 2430 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" |
2431 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" | 2431 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" |
2432 "add $16, %%"REG_a" \n\t" | 2432 "add $16, %%"REG_a" \n\t" |
2433 "cmp %3, %%"REG_a" \n\t" | 2433 "cmp %3, %%"REG_a" \n\t" |
2434 " jb 1b \n\t" | 2434 " jb 1b \n\t" |
2435 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | 2435 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) |
2436 : "memory", "%"REG_a"" | 2436 : "memory", "%"REG_a"" |
2437 ); | 2437 ); |
2438 #else | 2438 #else |
2439 __asm__( | 2439 __asm__( |
2440 "xor %%"REG_a", %%"REG_a" \n\t" | 2440 "xor %%"REG_a", %%"REG_a" \n\t" |
2441 "1: \n\t" | 2441 "1: \n\t" |
2442 PREFETCH" 64(%1, %%"REG_a") \n\t" | 2442 PREFETCH" 64(%1, %%"REG_a") \n\t" |
2443 PREFETCH" 64(%2, %%"REG_a") \n\t" | 2443 PREFETCH" 64(%2, %%"REG_a") \n\t" |
2444 "movq (%1, %%"REG_a"), %%mm0 \n\t" | 2444 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
2445 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" | 2445 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" |
2446 "movq %%mm0, %%mm1 \n\t" | 2446 "movq %%mm0, %%mm1 \n\t" |
2447 "movq %%mm2, %%mm3 \n\t" | 2447 "movq %%mm2, %%mm3 \n\t" |
2448 "movq (%2, %%"REG_a"), %%mm4 \n\t" | 2448 "movq (%2, %%"REG_a"), %%mm4 \n\t" |
2449 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" | 2449 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" |
2450 "punpcklbw %%mm4, %%mm0 \n\t" | 2450 "punpcklbw %%mm4, %%mm0 \n\t" |
2451 "punpckhbw %%mm4, %%mm1 \n\t" | 2451 "punpckhbw %%mm4, %%mm1 \n\t" |
2452 "punpcklbw %%mm5, %%mm2 \n\t" | 2452 "punpcklbw %%mm5, %%mm2 \n\t" |
2453 "punpckhbw %%mm5, %%mm3 \n\t" | 2453 "punpckhbw %%mm5, %%mm3 \n\t" |
2454 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" | 2454 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" |
2455 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" | 2455 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" |
2456 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" | 2456 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" |
2457 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" | 2457 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" |
2458 "add $16, %%"REG_a" \n\t" | 2458 "add $16, %%"REG_a" \n\t" |
2459 "cmp %3, %%"REG_a" \n\t" | 2459 "cmp %3, %%"REG_a" \n\t" |
2460 " jb 1b \n\t" | 2460 " jb 1b \n\t" |
2461 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | 2461 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) |
2462 : "memory", "%"REG_a | 2462 : "memory", "%"REG_a |
2463 ); | 2463 ); |
2464 #endif | 2464 #endif |
2465 for (w= (width&(~15)); w < width; w++) | 2465 for (w= (width&(~15)); w < width; w++) |
2466 { | 2466 { |
2467 dest[2*w+0] = src1[w]; | 2467 dest[2*w+0] = src1[w]; |
2478 src1 += src1Stride; | 2478 src1 += src1Stride; |
2479 src2 += src2Stride; | 2479 src2 += src2Stride; |
2480 } | 2480 } |
2481 #if HAVE_MMX | 2481 #if HAVE_MMX |
2482 __asm__( | 2482 __asm__( |
2483 EMMS" \n\t" | 2483 EMMS" \n\t" |
2484 SFENCE" \n\t" | 2484 SFENCE" \n\t" |
2485 ::: "memory" | 2485 ::: "memory" |
2486 ); | 2486 ); |
2487 #endif | 2487 #endif |
2488 } | 2488 } |
2489 | 2489 |
2490 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, | 2490 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, |
2491 uint8_t *dst1, uint8_t *dst2, | 2491 uint8_t *dst1, uint8_t *dst2, |
2496 x86_reg y; | 2496 x86_reg y; |
2497 long x,w,h; | 2497 long x,w,h; |
2498 w=width/2; h=height/2; | 2498 w=width/2; h=height/2; |
2499 #if HAVE_MMX | 2499 #if HAVE_MMX |
2500 __asm__ volatile( | 2500 __asm__ volatile( |
2501 PREFETCH" %0 \n\t" | 2501 PREFETCH" %0 \n\t" |
2502 PREFETCH" %1 \n\t" | 2502 PREFETCH" %1 \n\t" |
2503 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); | 2503 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); |
2504 #endif | 2504 #endif |
2505 for (y=0;y<h;y++){ | 2505 for (y=0;y<h;y++){ |
2506 const uint8_t* s1=src1+srcStride1*(y>>1); | 2506 const uint8_t* s1=src1+srcStride1*(y>>1); |
2507 uint8_t* d=dst1+dstStride1*y; | 2507 uint8_t* d=dst1+dstStride1*y; |
2508 x=0; | 2508 x=0; |
2509 #if HAVE_MMX | 2509 #if HAVE_MMX |
2510 for (;x<w-31;x+=32) | 2510 for (;x<w-31;x+=32) |
2511 { | 2511 { |
2512 __asm__ volatile( | 2512 __asm__ volatile( |
2513 PREFETCH" 32%1 \n\t" | 2513 PREFETCH" 32%1 \n\t" |
2514 "movq %1, %%mm0 \n\t" | 2514 "movq %1, %%mm0 \n\t" |
2515 "movq 8%1, %%mm2 \n\t" | 2515 "movq 8%1, %%mm2 \n\t" |
2516 "movq 16%1, %%mm4 \n\t" | 2516 "movq 16%1, %%mm4 \n\t" |
2517 "movq 24%1, %%mm6 \n\t" | 2517 "movq 24%1, %%mm6 \n\t" |
2518 "movq %%mm0, %%mm1 \n\t" | 2518 "movq %%mm0, %%mm1 \n\t" |
2519 "movq %%mm2, %%mm3 \n\t" | 2519 "movq %%mm2, %%mm3 \n\t" |
2520 "movq %%mm4, %%mm5 \n\t" | 2520 "movq %%mm4, %%mm5 \n\t" |
2521 "movq %%mm6, %%mm7 \n\t" | 2521 "movq %%mm6, %%mm7 \n\t" |
2522 "punpcklbw %%mm0, %%mm0 \n\t" | 2522 "punpcklbw %%mm0, %%mm0 \n\t" |
2523 "punpckhbw %%mm1, %%mm1 \n\t" | 2523 "punpckhbw %%mm1, %%mm1 \n\t" |
2524 "punpcklbw %%mm2, %%mm2 \n\t" | 2524 "punpcklbw %%mm2, %%mm2 \n\t" |
2525 "punpckhbw %%mm3, %%mm3 \n\t" | 2525 "punpckhbw %%mm3, %%mm3 \n\t" |
2526 "punpcklbw %%mm4, %%mm4 \n\t" | 2526 "punpcklbw %%mm4, %%mm4 \n\t" |
2527 "punpckhbw %%mm5, %%mm5 \n\t" | 2527 "punpckhbw %%mm5, %%mm5 \n\t" |
2528 "punpcklbw %%mm6, %%mm6 \n\t" | 2528 "punpcklbw %%mm6, %%mm6 \n\t" |
2529 "punpckhbw %%mm7, %%mm7 \n\t" | 2529 "punpckhbw %%mm7, %%mm7 \n\t" |
2530 MOVNTQ" %%mm0, %0 \n\t" | 2530 MOVNTQ" %%mm0, %0 \n\t" |
2531 MOVNTQ" %%mm1, 8%0 \n\t" | 2531 MOVNTQ" %%mm1, 8%0 \n\t" |
2532 MOVNTQ" %%mm2, 16%0 \n\t" | 2532 MOVNTQ" %%mm2, 16%0 \n\t" |
2533 MOVNTQ" %%mm3, 24%0 \n\t" | 2533 MOVNTQ" %%mm3, 24%0 \n\t" |
2534 MOVNTQ" %%mm4, 32%0 \n\t" | 2534 MOVNTQ" %%mm4, 32%0 \n\t" |
2535 MOVNTQ" %%mm5, 40%0 \n\t" | 2535 MOVNTQ" %%mm5, 40%0 \n\t" |
2536 MOVNTQ" %%mm6, 48%0 \n\t" | 2536 MOVNTQ" %%mm6, 48%0 \n\t" |
2537 MOVNTQ" %%mm7, 56%0" | 2537 MOVNTQ" %%mm7, 56%0" |
2538 :"=m"(d[2*x]) | 2538 :"=m"(d[2*x]) |
2539 :"m"(s1[x]) | 2539 :"m"(s1[x]) |
2540 :"memory"); | 2540 :"memory"); |
2541 } | 2541 } |
2542 #endif | 2542 #endif |
2543 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; | 2543 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; |
2544 } | 2544 } |
2545 for (y=0;y<h;y++){ | 2545 for (y=0;y<h;y++){ |
2546 const uint8_t* s2=src2+srcStride2*(y>>1); | 2546 const uint8_t* s2=src2+srcStride2*(y>>1); |
2547 uint8_t* d=dst2+dstStride2*y; | 2547 uint8_t* d=dst2+dstStride2*y; |
2548 x=0; | 2548 x=0; |
2549 #if HAVE_MMX | 2549 #if HAVE_MMX |
2550 for (;x<w-31;x+=32) | 2550 for (;x<w-31;x+=32) |
2551 { | 2551 { |
2552 __asm__ volatile( | 2552 __asm__ volatile( |
2553 PREFETCH" 32%1 \n\t" | 2553 PREFETCH" 32%1 \n\t" |
2554 "movq %1, %%mm0 \n\t" | 2554 "movq %1, %%mm0 \n\t" |
2555 "movq 8%1, %%mm2 \n\t" | 2555 "movq 8%1, %%mm2 \n\t" |
2556 "movq 16%1, %%mm4 \n\t" | 2556 "movq 16%1, %%mm4 \n\t" |
2557 "movq 24%1, %%mm6 \n\t" | 2557 "movq 24%1, %%mm6 \n\t" |
2558 "movq %%mm0, %%mm1 \n\t" | 2558 "movq %%mm0, %%mm1 \n\t" |
2559 "movq %%mm2, %%mm3 \n\t" | 2559 "movq %%mm2, %%mm3 \n\t" |
2560 "movq %%mm4, %%mm5 \n\t" | 2560 "movq %%mm4, %%mm5 \n\t" |
2561 "movq %%mm6, %%mm7 \n\t" | 2561 "movq %%mm6, %%mm7 \n\t" |
2562 "punpcklbw %%mm0, %%mm0 \n\t" | 2562 "punpcklbw %%mm0, %%mm0 \n\t" |
2563 "punpckhbw %%mm1, %%mm1 \n\t" | 2563 "punpckhbw %%mm1, %%mm1 \n\t" |
2564 "punpcklbw %%mm2, %%mm2 \n\t" | 2564 "punpcklbw %%mm2, %%mm2 \n\t" |
2565 "punpckhbw %%mm3, %%mm3 \n\t" | 2565 "punpckhbw %%mm3, %%mm3 \n\t" |
2566 "punpcklbw %%mm4, %%mm4 \n\t" | 2566 "punpcklbw %%mm4, %%mm4 \n\t" |
2567 "punpckhbw %%mm5, %%mm5 \n\t" | 2567 "punpckhbw %%mm5, %%mm5 \n\t" |
2568 "punpcklbw %%mm6, %%mm6 \n\t" | 2568 "punpcklbw %%mm6, %%mm6 \n\t" |
2569 "punpckhbw %%mm7, %%mm7 \n\t" | 2569 "punpckhbw %%mm7, %%mm7 \n\t" |
2570 MOVNTQ" %%mm0, %0 \n\t" | 2570 MOVNTQ" %%mm0, %0 \n\t" |
2571 MOVNTQ" %%mm1, 8%0 \n\t" | 2571 MOVNTQ" %%mm1, 8%0 \n\t" |
2572 MOVNTQ" %%mm2, 16%0 \n\t" | 2572 MOVNTQ" %%mm2, 16%0 \n\t" |
2573 MOVNTQ" %%mm3, 24%0 \n\t" | 2573 MOVNTQ" %%mm3, 24%0 \n\t" |
2574 MOVNTQ" %%mm4, 32%0 \n\t" | 2574 MOVNTQ" %%mm4, 32%0 \n\t" |
2575 MOVNTQ" %%mm5, 40%0 \n\t" | 2575 MOVNTQ" %%mm5, 40%0 \n\t" |
2576 MOVNTQ" %%mm6, 48%0 \n\t" | 2576 MOVNTQ" %%mm6, 48%0 \n\t" |
2577 MOVNTQ" %%mm7, 56%0" | 2577 MOVNTQ" %%mm7, 56%0" |
2578 :"=m"(d[2*x]) | 2578 :"=m"(d[2*x]) |
2579 :"m"(s2[x]) | 2579 :"m"(s2[x]) |
2580 :"memory"); | 2580 :"memory"); |
2581 } | 2581 } |
2582 #endif | 2582 #endif |
2583 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; | 2583 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; |
2584 } | 2584 } |
2585 #if HAVE_MMX | 2585 #if HAVE_MMX |
2586 __asm__( | 2586 __asm__( |
2587 EMMS" \n\t" | 2587 EMMS" \n\t" |
2588 SFENCE" \n\t" | 2588 SFENCE" \n\t" |
2589 ::: "memory" | 2589 ::: "memory" |
2590 ); | 2590 ); |
2591 #endif | 2591 #endif |
2592 } | 2592 } |
2593 | 2593 |
2594 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, | 2594 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, |
2599 { | 2599 { |
2600 x86_reg x; | 2600 x86_reg x; |
2601 long y,w,h; | 2601 long y,w,h; |
2602 w=width/2; h=height; | 2602 w=width/2; h=height; |
2603 for (y=0;y<h;y++){ | 2603 for (y=0;y<h;y++){ |
2604 const uint8_t* yp=src1+srcStride1*y; | 2604 const uint8_t* yp=src1+srcStride1*y; |
2605 const uint8_t* up=src2+srcStride2*(y>>2); | 2605 const uint8_t* up=src2+srcStride2*(y>>2); |
2606 const uint8_t* vp=src3+srcStride3*(y>>2); | 2606 const uint8_t* vp=src3+srcStride3*(y>>2); |
2607 uint8_t* d=dst+dstStride*y; | 2607 uint8_t* d=dst+dstStride*y; |
2608 x=0; | 2608 x=0; |
2609 #if HAVE_MMX | 2609 #if HAVE_MMX |
2610 for (;x<w-7;x+=8) | 2610 for (;x<w-7;x+=8) |
2611 { | 2611 { |
2612 __asm__ volatile( | 2612 __asm__ volatile( |
2613 PREFETCH" 32(%1, %0) \n\t" | 2613 PREFETCH" 32(%1, %0) \n\t" |
2614 PREFETCH" 32(%2, %0) \n\t" | 2614 PREFETCH" 32(%2, %0) \n\t" |
2615 PREFETCH" 32(%3, %0) \n\t" | 2615 PREFETCH" 32(%3, %0) \n\t" |
2616 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | 2616 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ |
2617 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ | 2617 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ |
2618 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ | 2618 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ |
2619 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | 2619 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ |
2620 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ | 2620 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ |
2621 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ | 2621 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ |
2622 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ | 2622 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ |
2623 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ | 2623 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ |
2624 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ | 2624 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ |
2625 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ | 2625 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ |
2626 | 2626 |
2627 "movq %%mm1, %%mm6 \n\t" | 2627 "movq %%mm1, %%mm6 \n\t" |
2628 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ | 2628 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ |
2629 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ | 2629 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ |
2630 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ | 2630 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ |
2631 MOVNTQ" %%mm0, (%4, %0, 8) \n\t" | 2631 MOVNTQ" %%mm0, (%4, %0, 8) \n\t" |
2632 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" | 2632 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" |
2633 | 2633 |
2634 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ | 2634 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ |
2635 "movq 8(%1, %0, 4), %%mm0 \n\t" | 2635 "movq 8(%1, %0, 4), %%mm0 \n\t" |
2636 "movq %%mm0, %%mm3 \n\t" | 2636 "movq %%mm0, %%mm3 \n\t" |
2637 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ | 2637 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ |
2638 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ | 2638 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ |
2639 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" | 2639 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" |
2640 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" | 2640 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" |
2641 | 2641 |
2642 "movq %%mm4, %%mm6 \n\t" | 2642 "movq %%mm4, %%mm6 \n\t" |
2643 "movq 16(%1, %0, 4), %%mm0 \n\t" | 2643 "movq 16(%1, %0, 4), %%mm0 \n\t" |
2644 "movq %%mm0, %%mm3 \n\t" | 2644 "movq %%mm0, %%mm3 \n\t" |
2645 "punpcklbw %%mm5, %%mm4 \n\t" | 2645 "punpcklbw %%mm5, %%mm4 \n\t" |
2646 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ | 2646 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ |
2647 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ | 2647 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ |
2648 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" | 2648 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" |
2649 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" | 2649 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" |
2650 | 2650 |
2651 "punpckhbw %%mm5, %%mm6 \n\t" | 2651 "punpckhbw %%mm5, %%mm6 \n\t" |
2652 "movq 24(%1, %0, 4), %%mm0 \n\t" | 2652 "movq 24(%1, %0, 4), %%mm0 \n\t" |
2653 "movq %%mm0, %%mm3 \n\t" | 2653 "movq %%mm0, %%mm3 \n\t" |
2654 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ | 2654 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ |
2655 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ | 2655 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ |
2656 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" | 2656 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" |
2657 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" | 2657 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" |
2658 | 2658 |
2659 : "+r" (x) | 2659 : "+r" (x) |
2660 : "r"(yp), "r" (up), "r"(vp), "r"(d) | 2660 : "r"(yp), "r" (up), "r"(vp), "r"(d) |
2661 :"memory"); | 2661 :"memory"); |
2662 } | 2662 } |
2663 #endif | 2663 #endif |
2664 for (; x<w; x++) | 2664 for (; x<w; x++) |
2665 { | 2665 { |
2666 const long x2 = x<<2; | 2666 const long x2 = x<<2; |
2667 d[8*x+0] = yp[x2]; | 2667 d[8*x+0] = yp[x2]; |
2668 d[8*x+1] = up[x]; | 2668 d[8*x+1] = up[x]; |
2669 d[8*x+2] = yp[x2+1]; | 2669 d[8*x+2] = yp[x2+1]; |
2670 d[8*x+3] = vp[x]; | 2670 d[8*x+3] = vp[x]; |
2671 d[8*x+4] = yp[x2+2]; | 2671 d[8*x+4] = yp[x2+2]; |
2672 d[8*x+5] = up[x]; | 2672 d[8*x+5] = up[x]; |
2673 d[8*x+6] = yp[x2+3]; | 2673 d[8*x+6] = yp[x2+3]; |
2674 d[8*x+7] = vp[x]; | 2674 d[8*x+7] = vp[x]; |
2675 } | 2675 } |
2676 } | 2676 } |
2677 #if HAVE_MMX | 2677 #if HAVE_MMX |
2678 __asm__( | 2678 __asm__( |
2679 EMMS" \n\t" | 2679 EMMS" \n\t" |
2680 SFENCE" \n\t" | 2680 SFENCE" \n\t" |
2681 ::: "memory" | 2681 ::: "memory" |
2682 ); | 2682 ); |
2683 #endif | 2683 #endif |
2684 } | 2684 } |
2685 | 2685 |
2686 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count) | 2686 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count) |
2945 src += srcStride; | 2945 src += srcStride; |
2946 ydst+= lumStride; | 2946 ydst+= lumStride; |
2947 } | 2947 } |
2948 #if HAVE_MMX | 2948 #if HAVE_MMX |
2949 __asm__( | 2949 __asm__( |
2950 EMMS" \n\t" | 2950 EMMS" \n\t" |
2951 SFENCE" \n\t" | 2951 SFENCE" \n\t" |
2952 ::: "memory" | 2952 ::: "memory" |
2953 ); | 2953 ); |
2954 #endif | 2954 #endif |
2955 } | 2955 } |
2956 | 2956 |
2957 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | 2957 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, |
2970 udst+= chromStride; | 2970 udst+= chromStride; |
2971 vdst+= chromStride; | 2971 vdst+= chromStride; |
2972 } | 2972 } |
2973 #if HAVE_MMX | 2973 #if HAVE_MMX |
2974 __asm__( | 2974 __asm__( |
2975 EMMS" \n\t" | 2975 EMMS" \n\t" |
2976 SFENCE" \n\t" | 2976 SFENCE" \n\t" |
2977 ::: "memory" | 2977 ::: "memory" |
2978 ); | 2978 ); |
2979 #endif | 2979 #endif |
2980 } | 2980 } |
2981 | 2981 |
2982 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | 2982 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, |
2997 src += srcStride; | 2997 src += srcStride; |
2998 ydst+= lumStride; | 2998 ydst+= lumStride; |
2999 } | 2999 } |
3000 #if HAVE_MMX | 3000 #if HAVE_MMX |
3001 __asm__( | 3001 __asm__( |
3002 EMMS" \n\t" | 3002 EMMS" \n\t" |
3003 SFENCE" \n\t" | 3003 SFENCE" \n\t" |
3004 ::: "memory" | 3004 ::: "memory" |
3005 ); | 3005 ); |
3006 #endif | 3006 #endif |
3007 } | 3007 } |
3008 | 3008 |
3009 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | 3009 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, |
3022 udst+= chromStride; | 3022 udst+= chromStride; |
3023 vdst+= chromStride; | 3023 vdst+= chromStride; |
3024 } | 3024 } |
3025 #if HAVE_MMX | 3025 #if HAVE_MMX |
3026 __asm__( | 3026 __asm__( |
3027 EMMS" \n\t" | 3027 EMMS" \n\t" |
3028 SFENCE" \n\t" | 3028 SFENCE" \n\t" |
3029 ::: "memory" | 3029 ::: "memory" |
3030 ); | 3030 ); |
3031 #endif | 3031 #endif |
3032 } | 3032 } |
3033 | 3033 |
3034 static inline void RENAME(rgb2rgb_init)(void){ | 3034 static inline void RENAME(rgb2rgb_init)(void){ |