Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 6570:52ecfeddf864
Changed the -pass option to -passwd to avoid clash with mencoder option.
author | bertrand |
---|---|
date | Tue, 25 Jun 2002 23:56:33 +0000 |
parents | e7635c03910f |
children | f98313dcd428 |
rev | line source |
---|---|
2694 | 1 /* |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
2 * |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
3 * rgb2rgb.c, Software RGB to RGB convertor |
2732 | 4 * pluralize by Software PAL8 to RGB convertor |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
7 * Written by Nick Kurshev. |
3132 | 8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL) |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
9 */ |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
10 |
6492 | 11 #include <stddef.h> |
12 #include <inttypes.h> /* for __WORDSIZE */ | |
13 | |
14 #ifndef __WORDSIZE | |
15 #warning You have misconfigured system and probably will lose performance! | |
16 #endif | |
17 | |
3132 | 18 #undef PREFETCH |
19 #undef MOVNTQ | |
20 #undef EMMS | |
21 #undef SFENCE | |
22 #undef MMREG_SIZE | |
23 #undef PREFETCHW | |
24 #undef PAVGB | |
2755 | 25 |
3132 | 26 #ifdef HAVE_SSE2 |
27 #define MMREG_SIZE 16 | |
28 #else | |
29 #define MMREG_SIZE 8 | |
2535 | 30 #endif |
2513 | 31 |
3132 | 32 #ifdef HAVE_3DNOW |
33 #define PREFETCH "prefetch" | |
34 #define PREFETCHW "prefetchw" | |
35 #define PAVGB "pavgusb" | |
36 #elif defined ( HAVE_MMX2 ) | |
37 #define PREFETCH "prefetchnta" | |
38 #define PREFETCHW "prefetcht0" | |
39 #define PAVGB "pavgb" | |
40 #else | |
41 #define PREFETCH "/nop" | |
42 #define PREFETCHW "/nop" | |
43 #endif | |
44 | |
45 #ifdef HAVE_3DNOW | |
46 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ | |
47 #define EMMS "femms" | |
48 #else | |
49 #define EMMS "emms" | |
50 #endif | |
51 | |
52 #ifdef HAVE_MMX2 | |
53 #define MOVNTQ "movntq" | |
54 #define SFENCE "sfence" | |
55 #else | |
56 #define MOVNTQ "movq" | |
57 #define SFENCE "/nop" | |
58 #endif | |
59 | |
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * The C path copies the three bytes of every source pixel unchanged and
 * appends a zero byte. The MMX path loads four overlapping dwords per
 * register pair and masks them with mask32 (defined outside this block;
 * presumably it clears the fourth byte -- confirm against its definition).
 *
 * @param src       packed 3-byte source pixels
 * @param dst       destination, receives 4 bytes per source pixel
 * @param src_size  size of the source in bytes (expected multiple of 3)
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    /*
     * Every iteration consumes 24 bytes but the last movd (offset 21)
     * touches byte 24 as well, so a chunk is only run when 25 readable
     * bytes remain. The chunk count is derived from the data length, not
     * from the numeric value of the end address, so short or unaligned
     * buffers are never overrun; the C loop below finishes the rest.
     */
    mm_end = s + (src_size ? ((src_size - 1) / 24) * 24 : 0);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: also handles whatever the MMX loop left over. */
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
2505 | 110 |
/**
 * Pack 4-byte pixels into 3-byte pixels by dropping every fourth byte.
 *
 * The C path copies bytes 0..2 of each source pixel and skips byte 3.
 * The MMX path converts 32 source bytes (8 pixels) into 24 destination
 * bytes per iteration using the mask24* constants defined outside this
 * block.
 *
 * @param src       4-byte source pixels
 * @param dst       destination, receives 3 bytes per source pixel
 * @param src_size  size of the source in bytes (expected multiple of 4)
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /*
     * Whole 32-byte chunks only, counted from the data length rather than
     * from the numeric value of the end address, so the loop can never
     * read beyond src + src_size.
     */
    mm_end = s + (src_size / 32) * 32;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: copy three bytes, skip the fourth. */
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
2506 | 187 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
188 /* |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
189 Original by Strepto/Astral |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
190 ported to gcc & bugfixed : A'rpi |
2564 | 191 MMX2, 3DNOW optimization by Nick Kurshev |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
192 32bit c version, and and&add trick by Michael Niedermayer |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
193 */ |
/**
 * Convert 15-bit pixels to 16-bit pixels.
 *
 * For every 16-bit input word x the result is
 * (x & 0x7FFF) + (x & 0x7FE0): bits 5..14 are added to themselves, i.e.
 * shifted left by one, while the low five bits stay in place.
 * The MMX path does the same with paddw, using mask15s (defined outside
 * this block; presumably 0x7FE0 replicated per word -- confirm).
 *
 * @param src       source pixels, 2 bytes each
 * @param dst       destination pixels, 2 bytes each
 * @param src_size  size of the source in bytes (expected multiple of 2)
 *
 * NOTE: the C path accesses the buffers through uint32_t / uint16_t
 * pointers, as the original did; callers are expected to pass suitably
 * aligned buffers on targets that require alignment.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    /*
     * Whole 16-byte chunks only, counted from the data length rather than
     * from the numeric value of the end address.
     */
    mm_end = s + (src_size / 16) * 16;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            /* 16 bytes are read and written through 1-byte operands */
            :"memory");
        d += 16;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /*
     * Two pixels at a time. The bound is computed from the number of
     * bytes that are actually left, so a trailing single pixel is never
     * processed as a 4-byte word.
     */
    mm_end = s + ((unsigned long)(end - s) / 4) * 4;
    while(s < mm_end)
    {
        register unsigned x = *((const uint32_t *)s);
        *((uint32_t *)d) = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
        d += 4;
        s += 4;
    }
    /* At most one pixel can remain. */
    if(s < end)
    {
        register unsigned short x = *((const uint16_t *)s);
        *((uint16_t *)d) = (x & 0x7FFF) + (x & 0x7FE0);
    }
}
2694 | 242 |
6484
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
/**
 * Swap the first and third byte of every packed 3-byte pixel.
 *
 * All src_size / 3 whole pixels are converted; trailing bytes that do not
 * form a complete pixel are left untouched. Both outer bytes are read
 * before anything is written, so the conversion also works when dst and
 * src point to the same buffer.
 *
 * @param src       packed 3-byte source pixels
 * @param dst       destination, receives 3 bytes per source pixel
 * @param src_size  size of the source in bytes
 */
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    unsigned num_pixels = src_size / 3;
    unsigned p;

    for (p = 0; p < num_pixels; p++)
    {
        const uint8_t *in = src + 3 * p;
        uint8_t *out = dst + 3 * p;
        uint8_t first = in[0];
        uint8_t third = in[2];

        out[1] = in[1];
        out[0] = third;
        out[2] = first;
    }
}
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
253 |
/**
 * Convert 4-byte pixels to 16-bit 5-6-5 pixels.
 *
 * In the C path the first source byte supplies the low 5 bits (b >> 3),
 * the second byte the middle 6 bits ((g & 0xFC) << 3) and the third byte
 * the top 5 bits ((r & 0xF8) << 8); the fourth byte is skipped.
 * The MMX path converts four pixels per iteration using red_16mask,
 * green_16mask and blue_16mask, which are defined outside this block.
 *
 * @param src       4-byte source pixels
 * @param dst       destination, written as uint16_t (one per pixel)
 * @param src_size  size of the source in bytes (expected multiple of 4)
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /*
     * Whole 16-byte chunks (four pixels) only, counted from the data
     * length rather than from the numeric value of the end address.
     */
    mm_end = s + (src_size / 16) * 16;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration, fourth byte ignored. */
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s++;
    }
}
317 | |
/**
 * Convert 4-byte pixels to 15-bit 5-5-5 pixels stored in 16 bits.
 *
 * In the C path the first source byte supplies the low 5 bits (b >> 3),
 * the second byte bits 5..9 ((g & 0xF8) << 2) and the third byte bits
 * 10..14 ((r & 0xF8) << 7); the fourth byte is skipped.
 * The MMX path converts four pixels per iteration using red_15mask,
 * green_15mask and blue_15mask, which are defined outside this block.
 *
 * @param src       4-byte source pixels
 * @param dst       destination, written as uint16_t (one per pixel)
 * @param src_size  size of the source in bytes (expected multiple of 4)
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /*
     * Whole 16-byte chunks (four pixels) only, counted from the data
     * length rather than from the numeric value of the end address.
     */
    mm_end = s + (src_size / 16) * 16;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration, fourth byte ignored. */
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s++;
    }
}
381 | |
/**
 * Convert packed 3-byte pixels to 16-bit 5-6-5 pixels.
 *
 * In the C path the first source byte supplies the low 5 bits (b >> 3),
 * the second byte the middle 6 bits ((g & 0xFC) << 3) and the third byte
 * the top 5 bits ((r & 0xF8) << 8).
 * The MMX path converts four pixels per iteration using red_16mask,
 * green_16mask and blue_16mask, which are defined outside this block.
 *
 * @param src       packed 3-byte source pixels
 * @param dst       destination, written as uint16_t (one per pixel)
 * @param src_size  size of the source in bytes (expected multiple of 3)
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /*
     * Every iteration consumes 12 bytes but the dword load at offset 9
     * touches byte 12 as well, so a chunk is only run when 13 readable
     * bytes remain. The chunk count is derived from the data length, not
     * from the numeric value of the end address; the C loop below
     * finishes the rest.
     */
    mm_end = s + (src_size ? ((src_size - 1) / 12) * 12 : 0);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
444 | |
/**
 * Convert packed 3-byte pixels to 15-bit 5-5-5 pixels stored in 16 bits.
 *
 * In the C path the first source byte supplies the low 5 bits (b >> 3),
 * the second byte bits 5..9 ((g & 0xF8) << 2) and the third byte bits
 * 10..14 ((r & 0xF8) << 7).
 * The MMX path converts four pixels per iteration using red_15mask,
 * green_15mask and blue_15mask, which are defined outside this block.
 *
 * @param src       packed 3-byte source pixels
 * @param dst       destination, written as uint16_t (one per pixel)
 * @param src_size  size of the source in bytes (expected multiple of 3)
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /*
     * Every iteration consumes 12 bytes but the dword load at offset 9
     * touches byte 12 as well, so a chunk is only run when 13 readable
     * bytes remain. The chunk count is derived from the data length, not
     * from the numeric value of the end address; the C loop below
     * finishes the rest.
     */
    mm_end = s + (src_size ? ((src_size - 1) / 12) * 12 : 0);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
507 | |
508 /* | |
509 This code uses a less accurate approximation: it simply | |
510 left-shifts the input | |
511 value and fills the low-order bits with | |
512 zeroes. This method improves PNG | |
513 compression, but it cannot reproduce white exactly, since it does not | |
514 generate an all-ones maximum value; the net effect is to darken the | |
515 image slightly. | |
516 | |
517 The better method should be "left bit replication": | |
518 | |
519 4 3 2 1 0 | |
520 --------- | |
521 1 1 0 1 1 | |
522 | |
523 7 6 5 4 3 2 1 0 | |
524 ---------------- | |
525 1 1 0 1 1 1 1 0 | |
526 |=======| |===| | |
527 | Leftmost Bits Repeated to Fill Open Bits | |
528 | | |
529 Original Bits | |
530 */ | |
/**
 * Expand 15-bit 5-5-5 pixels to packed 3-byte pixels.
 *
 * In the C path each 16-bit input word produces three bytes:
 * (bgr & 0x1F) << 3, (bgr & 0x3E0) >> 2 and (bgr & 0x7C00) >> 7, i.e. each
 * 5-bit field is left-aligned in its byte and the low three bits are zero.
 * The MMX path first widens eight pixels to 32 bits each (first asm
 * statement, using mask15b/mask15g/mask15r/mmx_null defined outside this
 * block) and then packs them to 24 bytes (second asm statement, using the
 * mask24* constants). The second statement relies on mm0, mm3, mm6 and mm7
 * still holding the values left there by the first one.
 *
 * @param src       source pixels, 2 bytes each
 * @param dst       destination, receives 3 bytes per source pixel
 * @param src_size  size of the source in bytes (expected multiple of 2)
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /*
     * Each iteration reads eight pixels (16 bytes) and writes 24 bytes.
     * Only whole groups of eight pixels are run here, counted from the
     * data length rather than from the numeric value of the end address,
     * so neither buffer is overrun; the C loop below finishes the rest.
     */
    mm_end = s + ((src_size / 2) / 8) * 8;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
672 | |
/**
 * Expands packed 16 bit pixels (5-6-5 bit fields) to 3 bytes per pixel.
 *
 * For every 16 bit input word the output receives, in this order, the low
 * 5 bit field, the middle 6 bit field and the high 5 bit field, each shifted
 * up to the top of its byte (the low bits of every output byte are zero).
 *
 * @param src      input pixels, read as native-endian uint16_t
 * @param dst      output buffer, needs 3 bytes per input pixel
 * @param src_size size of the input in bytes (an odd trailing byte is ignored)
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /*
     * One iteration consumes 8 pixels (16 bytes) and stores 24 bytes, so the
     * MMX loop may only cover whole groups of 8 pixels. Rounding the end
     * address instead of the pixel count let the loop run with fewer than
     * 8 pixels left and overrun both buffers.
     */
    mm_end = s + ((src_size/2) & ~7U);
    while(s < mm_end)
    {
        /* unpack two groups of 4 pixels to 32 bit per pixel in mm6/mm7 and mm0/mm3 */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24: squeeze the 8 unpacked pixels into 24 bytes */
        /* NOTE(review): relies on mm0/mm3/mm6/mm7 surviving between the two asm statements */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar loop: whole input without MMX, otherwise the last 0..7 pixels */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
2718 | 813 |
/**
 * Expands packed 16 bit pixels with three 5 bit fields to 4 bytes per pixel.
 *
 * For every 16 bit input word the output receives the low, middle and high
 * 5 bit field (each shifted to the top of its byte) followed by a zero byte.
 * Bit 15 of the input word is ignored.
 *
 * @param src      input pixels, read as native-endian uint16_t
 * @param dst      output buffer, needs 4 bytes per input pixel
 * @param src_size size of the input in bytes (an odd trailing byte is ignored)
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0, used by the unpack instructions in the loop below */
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    /*
     * One iteration consumes 4 pixels (8 bytes) and stores 16 bytes, so only
     * whole groups of 4 pixels may go through the MMX loop. Rounding the end
     * address (instead of the pixel count) allowed an iteration with fewer
     * than 4 pixels left, overrunning both buffers.
     */
    mm_end = s + ((src_size/2) & ~3U);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar loop: whole input without MMX, otherwise the last 0..3 pixels */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
    }
}
878 | |
/**
 * Expands packed 16 bit pixels (5-6-5 bit fields) to 4 bytes per pixel.
 *
 * For every 16 bit input word the output receives the low 5 bit field, the
 * middle 6 bit field and the high 5 bit field (each shifted to the top of its
 * byte) followed by a zero byte.
 *
 * @param src      input pixels, read as native-endian uint16_t
 * @param dst      output buffer, needs 4 bytes per input pixel
 * @param src_size size of the input in bytes (an odd trailing byte is ignored)
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src; /* keep the const qualifier of src */
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0, used by the unpack instructions in the loop below */
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    /*
     * One iteration consumes 4 pixels (8 bytes) and stores 16 bytes, so only
     * whole groups of 4 pixels may go through the MMX loop. Rounding the end
     * address (instead of the pixel count) allowed an iteration with fewer
     * than 4 pixels left, overrunning both buffers.
     */
    mm_end = s + ((src_size/2) & ~3U);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar loop: whole input without MMX, otherwise the last 0..3 pixels */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
    }
}
2694 | 943 |
/**
 * Swaps byte 0 and byte 2 of every 4 byte pixel (RGB32 <-> BGR32).
 *
 * The portable loop copies bytes 0..2 (swapped) and does not store byte 3 of
 * the destination pixel.
 * NOTE(review): the MMX loop builds each pixel from three masked copies
 * (mask32r/g/b, defined outside this block), so byte 3 presumably comes out
 * as zero there - confirm against the mask definitions.
 *
 * @param src      input pixels
 * @param dst      output buffer of at least src_size bytes
 * @param src_size size of the input in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i = 0;
    const unsigned num_pixels = src_size >> 2;
#ifdef HAVE_MMX
    /*
     * The asm loop is a do-while over 8 byte groups: it must not be entered
     * for an empty input and must stop before a trailing single pixel,
     * otherwise it reads and writes past the end of the buffers.
     */
    const unsigned mmx_bytes = src_size & ~7U;
    if(mmx_bytes)
    {
        /* TODO: unroll this loop */
        asm volatile (
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%0, %%eax) \n\t"
            "movq (%0, %%eax), %%mm0 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "pslld $16, %%mm0 \n\t"
            "psrld $16, %%mm1 \n\t"
            "pand "MANGLE(mask32r)", %%mm0 \n\t"
            "pand "MANGLE(mask32g)", %%mm2 \n\t"
            "pand "MANGLE(mask32b)", %%mm1 \n\t"
            "por %%mm0, %%mm2 \n\t"
            "por %%mm1, %%mm2 \n\t"
            MOVNTQ" %%mm2, (%1, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %2, %%eax \n\t"
            " jb 1b \n\t"
            :: "r" (src), "r"(dst), "r" (mmx_bytes)
            : "%eax", "memory" /* the loop stores through dst */
        );

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
        i = mmx_bytes >> 2; /* pixels already converted */
    }
#endif
    /* portable loop: everything without MMX, otherwise the last odd pixel */
    for(; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
    }
}
984 | |
/**
 * Swaps the first and third byte of every 3 byte pixel (RGB24 <-> BGR24).
 *
 * With MMX, whole groups of 24 bytes (8 pixels) are handled by the asm loop
 * and the remainder by the scalar loop at the end.
 * NOTE(review): the last movq of an asm iteration loads bytes 18..25 of its
 * 24 byte group, i.e. it reads 2 bytes beyond the group.
 *
 * @param src      input pixels
 * @param dst      output buffer of at least src_size bytes
 * @param src_size size of the input in bytes, expected to be a multiple of 3
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    /* negative while at least one whole 24 byte group is left to convert */
    int mmx_size= 23 - src_size;
    /*
     * The asm loop tests its counter only after an iteration, so it has to be
     * skipped entirely for inputs shorter than 24 bytes; otherwise it would
     * still store 24 bytes into dst.
     */
    if(mmx_size < 0)
    {
        asm volatile (
            "movq "MANGLE(mask24r)", %%mm5 \n\t"
            "movq "MANGLE(mask24g)", %%mm6 \n\t"
            "movq "MANGLE(mask24b)", %%mm7 \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax) \n\t"
            "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
            "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
            "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
            "psllq $16, %%mm0 \n\t" // 00 BGR BGR
            "pand %%mm5, %%mm0 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "por %%mm0, %%mm1 \n\t"
            "por %%mm2, %%mm1 \n\t"
            "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
            MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
            "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
            "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm5, %%mm1 \n\t"
            "pand %%mm6, %%mm2 \n\t"
            "por %%mm0, %%mm1 \n\t"
            "por %%mm2, %%mm1 \n\t"
            "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
            MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
            "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
            "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
            "pand %%mm6, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm5, %%mm2 \n\t"
            "por %%mm0, %%mm1 \n\t"
            "por %%mm2, %%mm1 \n\t"
            MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
            "addl $24, %%eax \n\t"
            " js 1b \n\t"
            : "+a" (mmx_size)
            : "r" (src-mmx_size), "r"(dst-mmx_size)
            : "memory" /* the loop stores through dst */
        );

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
    }

    if(mmx_size==23) return; //finished, size was a multiple of 24 bytes (8 pixels)

    /* point src/dst at the 23-mmx_size trailing bytes the asm loop left over */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2]; /* read first so that in-place conversion works */
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1051 | |
/**
 * Interleaves planar Y, U and V data into packed Y U Y V output lines.
 *
 * Each output line holds, per chroma sample, the bytes Y0 U Y1 V.
 * The MMX loop works on 8 chroma samples (16 pixels) per iteration; the
 * 64 bit C loop works on 2 chroma samples per iteration and the C loops store
 * native-endian words, so they produce this byte order on little-endian
 * machines.
 *
 * @param ysrc,usrc,vsrc   source planes
 * @param dst              packed output, 2 bytes per pixel
 * @param width            pixels per line (chroma width is width/2)
 * @param height           number of luma lines
 * @param lumStride        bytes between luma lines
 * @param chromStride      bytes between chroma lines
 * @param dstStride        bytes between output lines
 * @param vertLumPerChroma luma lines sharing one chroma line; the advance
 *                         test masks with (vertLumPerChroma-1), so it has to
 *                         be a power of two
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax", "memory" /* the loop stores through dst */
        );
#else
#if __WORDSIZE >= 64
        unsigned i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            /*
             * The V byte must be shifted as unsigned: as a plain int,
             * V >= 128 yields a negative value whose sign extension into
             * 64 bit corrupts the upper half of the combined word.
             */
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + ((uint32_t)vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + ((uint32_t)vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        unsigned i;
        uint32_t *idst = (uint32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            /* unsigned shift of V avoids overflowing a signed int */
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + ((uint32_t)vc[0] << 24);
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* step to the next chroma line after every vertLumPerChroma luma lines */
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1138 | |
2724 | 1139 /** |
1140 * | |
1141 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1142 * problem for anyone then tell me, and ill fix it) | |
1143 */ | |
5588 | 1144 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1145 unsigned int width, unsigned int height, | |
1146 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride) | |
1147 { | |
1148 //FIXME interpolate chroma | |
1149 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1150 } | |
1151 | |
1152 /** | |
1153 * | |
1154 * width should be a multiple of 16 | |
1155 */ | |
1156 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1157 unsigned int width, unsigned int height, | |
1158 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride) | |
1159 { | |
1160 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1161 } | |
1162 | |
/**
 * Splits packed Y U Y V lines into separate Y, U and V planes.
 *
 * Lines are handled in pairs: the first line of a pair supplies luma and
 * chroma, the second line supplies luma only (its chroma bytes are dropped).
 *
 * height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    const unsigned chromWidth= width>>1;
    unsigned y= 0;

    while(y<height)
    {
#ifdef HAVE_MMX
        /* first line of the pair: luma plus chroma, 16 pixels per iteration */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );

        ydst += lumStride;
        src += srcStride;

        /* second line of the pair: luma only; reuses the mask left in mm7 */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        const uint8_t *in;
        uint8_t *lum;
        unsigned x;

        /* first line of the pair: luma plus chroma */
        in = src;
        lum = ydst;
        for(x=0; x<chromWidth; x++)
        {
            lum[0] = in[0];
            udst[x] = in[1];
            lum[1] = in[2];
            vdst[x] = in[3];
            in += 4;
            lum += 2;
        }
        ydst += lumStride;
        src += srcStride;

        /* second line of the pair: luma only */
        in = src;
        lum = ydst;
        for(x=0; x<chromWidth; x++)
        {
            lum[0] = in[0];
            lum[1] = in[2];
            in += 4;
            lum += 2;
        }
#endif
        /* advance to the next pair of lines */
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
        y += 2;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
2801 | 1287 |
6484
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
/**
 * Preliminary YVU9 to YV12 converter: only the luma plane is copied.
 *
 * The luma plane is copied as one contiguous block of width*height bytes.
 * usrc, vsrc, udst, vdst, lumStride and chromStride are not used, so the
 * destination chroma planes are left untouched and the luma planes are
 * treated as tightly packed.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
    uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
{
    /* chroma conversion is not implemented yet, silence unused warnings */
    (void)usrc;
    (void)vsrc;
    (void)udst;
    (void)vdst;
    (void)lumStride;
    (void)chromStride;

    /* Y Plane */
    /* widen before multiplying so the byte count cannot wrap at 32 bit */
    memcpy(ydst, ysrc, (size_t)width*height);

    /* XXX: implement upscaling for U,V */
}
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1297 |
/**
 * Splits packed U Y V Y lines into separate Y, U and V planes.
 *
 * Lines are handled in pairs: the first line of a pair supplies luma and
 * chroma, the second line supplies luma only.
 *
 * height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored
 * FIXME write HQ version
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    const unsigned chromWidth= width>>1;
    unsigned y= 0;

    while(y<height)
    {
#ifdef HAVE_MMX
        /* first line of the pair: luma plus chroma, 16 pixels per iteration */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );

        ydst += lumStride;
        src += srcStride;

        /* second line of the pair: luma only (the high byte of every word) */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        const uint8_t *in;
        uint8_t *lum;
        unsigned x;

        /* first line of the pair: luma plus chroma */
        in = src;
        lum = ydst;
        for(x=0; x<chromWidth; x++)
        {
            udst[x] = in[0];
            lum[0] = in[1];
            vdst[x] = in[2];
            lum[1] = in[3];
            in += 4;
            lum += 2;
        }
        ydst += lumStride;
        src += srcStride;

        /* second line of the pair: luma only */
        in = src;
        lum = ydst;
        for(x=0; x<chromWidth; x++)
        {
            lum[0] = in[1];
            lum[1] = in[3];
            in += 4;
            lum += 2;
        }
#endif
        /* advance to the next pair of lines */
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
        y += 2;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1423 | |
3132 | 1424 /** |
1425 * | |
1426 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
1427 * problem for anyone then tell me, and ill fix it) | |
4622 | 1428 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version |
3132 | 1429 */ |
1430 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1431 unsigned int width, unsigned int height, | |
1432 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) | |
1433 { | |
6492 | 1434 unsigned y; |
1435 const unsigned chromWidth= width>>1; | |
4622 | 1436 #ifdef HAVE_MMX |
1437 for(y=0; y<height-2; y+=2) | |
1438 { | |
6492 | 1439 unsigned i; |
4622 | 1440 for(i=0; i<2; i++) |
1441 { | |
1442 asm volatile( | |
1443 "movl %2, %%eax \n\t" | |
4923 | 1444 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
1445 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4622 | 1446 "pxor %%mm7, %%mm7 \n\t" |
1447 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1448 ".balign 16 \n\t" | |
1449 "1: \n\t" | |
1450 PREFETCH" 64(%0, %%ebx) \n\t" | |
1451 "movd (%0, %%ebx), %%mm0 \n\t" | |
1452 "movd 3(%0, %%ebx), %%mm1 \n\t" | |
1453 "punpcklbw %%mm7, %%mm0 \n\t" | |
1454 "punpcklbw %%mm7, %%mm1 \n\t" | |
1455 "movd 6(%0, %%ebx), %%mm2 \n\t" | |
1456 "movd 9(%0, %%ebx), %%mm3 \n\t" | |
1457 "punpcklbw %%mm7, %%mm2 \n\t" | |
1458 "punpcklbw %%mm7, %%mm3 \n\t" | |
1459 "pmaddwd %%mm6, %%mm0 \n\t" | |
1460 "pmaddwd %%mm6, %%mm1 \n\t" | |
1461 "pmaddwd %%mm6, %%mm2 \n\t" | |
1462 "pmaddwd %%mm6, %%mm3 \n\t" | |
1463 #ifndef FAST_BGR2YV12 | |
1464 "psrad $8, %%mm0 \n\t" | |
1465 "psrad $8, %%mm1 \n\t" | |
1466 "psrad $8, %%mm2 \n\t" | |
1467 "psrad $8, %%mm3 \n\t" | |
1468 #endif | |
1469 "packssdw %%mm1, %%mm0 \n\t" | |
1470 "packssdw %%mm3, %%mm2 \n\t" | |
1471 "pmaddwd %%mm5, %%mm0 \n\t" | |
1472 "pmaddwd %%mm5, %%mm2 \n\t" | |
1473 "packssdw %%mm2, %%mm0 \n\t" | |
1474 "psraw $7, %%mm0 \n\t" | |
1475 | |
1476 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1477 "movd 15(%0, %%ebx), %%mm1 \n\t" | |
1478 "punpcklbw %%mm7, %%mm4 \n\t" | |
1479 "punpcklbw %%mm7, %%mm1 \n\t" | |
1480 "movd 18(%0, %%ebx), %%mm2 \n\t" | |
1481 "movd 21(%0, %%ebx), %%mm3 \n\t" | |
1482 "punpcklbw %%mm7, %%mm2 \n\t" | |
1483 "punpcklbw %%mm7, %%mm3 \n\t" | |
1484 "pmaddwd %%mm6, %%mm4 \n\t" | |
1485 "pmaddwd %%mm6, %%mm1 \n\t" | |
1486 "pmaddwd %%mm6, %%mm2 \n\t" | |
1487 "pmaddwd %%mm6, %%mm3 \n\t" | |
1488 #ifndef FAST_BGR2YV12 | |
1489 "psrad $8, %%mm4 \n\t" | |
1490 "psrad $8, %%mm1 \n\t" | |
1491 "psrad $8, %%mm2 \n\t" | |
1492 "psrad $8, %%mm3 \n\t" | |
1493 #endif | |
1494 "packssdw %%mm1, %%mm4 \n\t" | |
1495 "packssdw %%mm3, %%mm2 \n\t" | |
1496 "pmaddwd %%mm5, %%mm4 \n\t" | |
1497 "pmaddwd %%mm5, %%mm2 \n\t" | |
1498 "addl $24, %%ebx \n\t" | |
1499 "packssdw %%mm2, %%mm4 \n\t" | |
1500 "psraw $7, %%mm4 \n\t" | |
1501 | |
1502 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 1503 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
4622 | 1504 |
1505 MOVNTQ" %%mm0, (%1, %%eax) \n\t" | |
1506 "addl $8, %%eax \n\t" | |
1507 " js 1b \n\t" | |
1508 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) | |
1509 : "%eax", "%ebx" | |
1510 ); | |
1511 ydst += lumStride; | |
1512 src += srcStride; | |
1513 } | |
1514 src -= srcStride*2; | |
1515 asm volatile( | |
1516 "movl %4, %%eax \n\t" | |
4923 | 1517 "movq "MANGLE(w1111)", %%mm5 \n\t" |
1518 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4622 | 1519 "pxor %%mm7, %%mm7 \n\t" |
1520 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1521 "addl %%ebx, %%ebx \n\t" | |
1522 ".balign 16 \n\t" | |
1523 "1: \n\t" | |
1524 PREFETCH" 64(%0, %%ebx) \n\t" | |
1525 PREFETCH" 64(%1, %%ebx) \n\t" | |
1526 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1527 "movq (%0, %%ebx), %%mm0 \n\t" | |
1528 "movq (%1, %%ebx), %%mm1 \n\t" | |
1529 "movq 6(%0, %%ebx), %%mm2 \n\t" | |
1530 "movq 6(%1, %%ebx), %%mm3 \n\t" | |
1531 PAVGB" %%mm1, %%mm0 \n\t" | |
1532 PAVGB" %%mm3, %%mm2 \n\t" | |
1533 "movq %%mm0, %%mm1 \n\t" | |
1534 "movq %%mm2, %%mm3 \n\t" | |
1535 "psrlq $24, %%mm0 \n\t" | |
1536 "psrlq $24, %%mm2 \n\t" | |
1537 PAVGB" %%mm1, %%mm0 \n\t" | |
1538 PAVGB" %%mm3, %%mm2 \n\t" | |
1539 "punpcklbw %%mm7, %%mm0 \n\t" | |
1540 "punpcklbw %%mm7, %%mm2 \n\t" | |
1541 #else | |
1542 "movd (%0, %%ebx), %%mm0 \n\t" | |
1543 "movd (%1, %%ebx), %%mm1 \n\t" | |
1544 "movd 3(%0, %%ebx), %%mm2 \n\t" | |
1545 "movd 3(%1, %%ebx), %%mm3 \n\t" | |
1546 "punpcklbw %%mm7, %%mm0 \n\t" | |
1547 "punpcklbw %%mm7, %%mm1 \n\t" | |
1548 "punpcklbw %%mm7, %%mm2 \n\t" | |
1549 "punpcklbw %%mm7, %%mm3 \n\t" | |
1550 "paddw %%mm1, %%mm0 \n\t" | |
1551 "paddw %%mm3, %%mm2 \n\t" | |
1552 "paddw %%mm2, %%mm0 \n\t" | |
1553 "movd 6(%0, %%ebx), %%mm4 \n\t" | |
1554 "movd 6(%1, %%ebx), %%mm1 \n\t" | |
1555 "movd 9(%0, %%ebx), %%mm2 \n\t" | |
1556 "movd 9(%1, %%ebx), %%mm3 \n\t" | |
1557 "punpcklbw %%mm7, %%mm4 \n\t" | |
1558 "punpcklbw %%mm7, %%mm1 \n\t" | |
1559 "punpcklbw %%mm7, %%mm2 \n\t" | |
1560 "punpcklbw %%mm7, %%mm3 \n\t" | |
1561 "paddw %%mm1, %%mm4 \n\t" | |
1562 "paddw %%mm3, %%mm2 \n\t" | |
1563 "paddw %%mm4, %%mm2 \n\t" | |
1564 "psrlw $2, %%mm0 \n\t" | |
1565 "psrlw $2, %%mm2 \n\t" | |
1566 #endif | |
4923 | 1567 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1568 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 1569 |
1570 "pmaddwd %%mm0, %%mm1 \n\t" | |
1571 "pmaddwd %%mm2, %%mm3 \n\t" | |
1572 "pmaddwd %%mm6, %%mm0 \n\t" | |
1573 "pmaddwd %%mm6, %%mm2 \n\t" | |
1574 #ifndef FAST_BGR2YV12 | |
1575 "psrad $8, %%mm0 \n\t" | |
1576 "psrad $8, %%mm1 \n\t" | |
1577 "psrad $8, %%mm2 \n\t" | |
1578 "psrad $8, %%mm3 \n\t" | |
1579 #endif | |
1580 "packssdw %%mm2, %%mm0 \n\t" | |
1581 "packssdw %%mm3, %%mm1 \n\t" | |
1582 "pmaddwd %%mm5, %%mm0 \n\t" | |
1583 "pmaddwd %%mm5, %%mm1 \n\t" | |
1584 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1585 "psraw $7, %%mm0 \n\t" | |
1586 | |
1587 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1588 "movq 12(%0, %%ebx), %%mm4 \n\t" | |
1589 "movq 12(%1, %%ebx), %%mm1 \n\t" | |
1590 "movq 18(%0, %%ebx), %%mm2 \n\t" | |
1591 "movq 18(%1, %%ebx), %%mm3 \n\t" | |
1592 PAVGB" %%mm1, %%mm4 \n\t" | |
1593 PAVGB" %%mm3, %%mm2 \n\t" | |
1594 "movq %%mm4, %%mm1 \n\t" | |
1595 "movq %%mm2, %%mm3 \n\t" | |
1596 "psrlq $24, %%mm4 \n\t" | |
1597 "psrlq $24, %%mm2 \n\t" | |
1598 PAVGB" %%mm1, %%mm4 \n\t" | |
1599 PAVGB" %%mm3, %%mm2 \n\t" | |
1600 "punpcklbw %%mm7, %%mm4 \n\t" | |
1601 "punpcklbw %%mm7, %%mm2 \n\t" | |
1602 #else | |
1603 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1604 "movd 12(%1, %%ebx), %%mm1 \n\t" | |
1605 "movd 15(%0, %%ebx), %%mm2 \n\t" | |
1606 "movd 15(%1, %%ebx), %%mm3 \n\t" | |
1607 "punpcklbw %%mm7, %%mm4 \n\t" | |
1608 "punpcklbw %%mm7, %%mm1 \n\t" | |
1609 "punpcklbw %%mm7, %%mm2 \n\t" | |
1610 "punpcklbw %%mm7, %%mm3 \n\t" | |
1611 "paddw %%mm1, %%mm4 \n\t" | |
1612 "paddw %%mm3, %%mm2 \n\t" | |
1613 "paddw %%mm2, %%mm4 \n\t" | |
1614 "movd 18(%0, %%ebx), %%mm5 \n\t" | |
1615 "movd 18(%1, %%ebx), %%mm1 \n\t" | |
1616 "movd 21(%0, %%ebx), %%mm2 \n\t" | |
1617 "movd 21(%1, %%ebx), %%mm3 \n\t" | |
1618 "punpcklbw %%mm7, %%mm5 \n\t" | |
1619 "punpcklbw %%mm7, %%mm1 \n\t" | |
1620 "punpcklbw %%mm7, %%mm2 \n\t" | |
1621 "punpcklbw %%mm7, %%mm3 \n\t" | |
1622 "paddw %%mm1, %%mm5 \n\t" | |
1623 "paddw %%mm3, %%mm2 \n\t" | |
1624 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 1625 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4622 | 1626 "psrlw $2, %%mm4 \n\t" |
1627 "psrlw $2, %%mm2 \n\t" | |
1628 #endif | |
4923 | 1629 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1630 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 1631 |
1632 "pmaddwd %%mm4, %%mm1 \n\t" | |
1633 "pmaddwd %%mm2, %%mm3 \n\t" | |
1634 "pmaddwd %%mm6, %%mm4 \n\t" | |
1635 "pmaddwd %%mm6, %%mm2 \n\t" | |
1636 #ifndef FAST_BGR2YV12 | |
1637 "psrad $8, %%mm4 \n\t" | |
1638 "psrad $8, %%mm1 \n\t" | |
1639 "psrad $8, %%mm2 \n\t" | |
1640 "psrad $8, %%mm3 \n\t" | |
1641 #endif | |
1642 "packssdw %%mm2, %%mm4 \n\t" | |
1643 "packssdw %%mm3, %%mm1 \n\t" | |
1644 "pmaddwd %%mm5, %%mm4 \n\t" | |
1645 "pmaddwd %%mm5, %%mm1 \n\t" | |
1646 "addl $24, %%ebx \n\t" | |
1647 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |
1648 "psraw $7, %%mm4 \n\t" | |
1649 | |
1650 "movq %%mm0, %%mm1 \n\t" | |
1651 "punpckldq %%mm4, %%mm0 \n\t" | |
1652 "punpckhdq %%mm4, %%mm1 \n\t" | |
1653 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 1654 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
4622 | 1655 |
1656 "movd %%mm0, (%2, %%eax) \n\t" | |
1657 "punpckhdq %%mm0, %%mm0 \n\t" | |
1658 "movd %%mm0, (%3, %%eax) \n\t" | |
1659 "addl $4, %%eax \n\t" | |
1660 " js 1b \n\t" | |
1661 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width) | |
1662 : "%eax", "%ebx" | |
1663 ); | |
1664 | |
1665 udst += chromStride; | |
1666 vdst += chromStride; | |
1667 src += srcStride*2; | |
1668 } | |
1669 | |
1670 asm volatile( EMMS" \n\t" | |
1671 SFENCE" \n\t" | |
1672 :::"memory"); | |
1673 #else | |
1674 y=0; | |
1675 #endif | |
1676 for(; y<height; y+=2) | |
3132 | 1677 { |
6492 | 1678 unsigned i; |
3132 | 1679 for(i=0; i<chromWidth; i++) |
1680 { | |
1681 unsigned int b= src[6*i+0]; | |
1682 unsigned int g= src[6*i+1]; | |
1683 unsigned int r= src[6*i+2]; | |
2801 | 1684 |
3633 | 1685 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
1686 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
1687 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
3132 | 1688 |
1689 udst[i] = U; | |
1690 vdst[i] = V; | |
1691 ydst[2*i] = Y; | |
1692 | |
1693 b= src[6*i+3]; | |
1694 g= src[6*i+4]; | |
1695 r= src[6*i+5]; | |
1696 | |
3633 | 1697 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1698 ydst[2*i+1] = Y; |
1699 } | |
1700 ydst += lumStride; | |
1701 src += srcStride; | |
1702 | |
1703 for(i=0; i<chromWidth; i++) | |
1704 { | |
1705 unsigned int b= src[6*i+0]; | |
1706 unsigned int g= src[6*i+1]; | |
1707 unsigned int r= src[6*i+2]; | |
1708 | |
3633 | 1709 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1710 |
1711 ydst[2*i] = Y; | |
1712 | |
1713 b= src[6*i+3]; | |
1714 g= src[6*i+4]; | |
1715 r= src[6*i+5]; | |
1716 | |
3633 | 1717 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1718 ydst[2*i+1] = Y; |
1719 } | |
1720 udst += chromStride; | |
1721 vdst += chromStride; | |
1722 ydst += lumStride; | |
1723 src += srcStride; | |
1724 } | |
1725 } | |
5337 | 1726 |
1727 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, | |
6492 | 1728 unsigned width, unsigned height, unsigned src1Stride, |
1729 unsigned src2Stride, unsigned dstStride){ | |
1730 unsigned h; | |
5337 | 1731 |
1732 for(h=0; h < height; h++) | |
1733 { | |
6492 | 1734 unsigned w; |
5337 | 1735 |
1736 #ifdef HAVE_MMX | |
1737 #ifdef HAVE_SSE2 | |
1738 asm( | |
1739 "xorl %%eax, %%eax \n\t" | |
1740 "1: \n\t" | |
1741 PREFETCH" 64(%1, %%eax) \n\t" | |
1742 PREFETCH" 64(%2, %%eax) \n\t" | |
1743 "movdqa (%1, %%eax), %%xmm0 \n\t" | |
1744 "movdqa (%1, %%eax), %%xmm1 \n\t" | |
1745 "movdqa (%2, %%eax), %%xmm2 \n\t" | |
1746 "punpcklbw %%xmm2, %%xmm0 \n\t" | |
1747 "punpckhbw %%xmm2, %%xmm1 \n\t" | |
1748 "movntdq %%xmm0, (%0, %%eax, 2) \n\t" | |
1749 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t" | |
1750 "addl $16, %%eax \n\t" | |
1751 "cmpl %3, %%eax \n\t" | |
1752 " jb 1b \n\t" | |
1753 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |
1754 : "memory", "%eax" | |
1755 ); | |
1756 #else | |
1757 asm( | |
1758 "xorl %%eax, %%eax \n\t" | |
1759 "1: \n\t" | |
1760 PREFETCH" 64(%1, %%eax) \n\t" | |
1761 PREFETCH" 64(%2, %%eax) \n\t" | |
1762 "movq (%1, %%eax), %%mm0 \n\t" | |
1763 "movq 8(%1, %%eax), %%mm2 \n\t" | |
1764 "movq %%mm0, %%mm1 \n\t" | |
1765 "movq %%mm2, %%mm3 \n\t" | |
1766 "movq (%2, %%eax), %%mm4 \n\t" | |
1767 "movq 8(%2, %%eax), %%mm5 \n\t" | |
1768 "punpcklbw %%mm4, %%mm0 \n\t" | |
1769 "punpckhbw %%mm4, %%mm1 \n\t" | |
1770 "punpcklbw %%mm5, %%mm2 \n\t" | |
1771 "punpckhbw %%mm5, %%mm3 \n\t" | |
1772 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t" | |
1773 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t" | |
1774 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t" | |
1775 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t" | |
1776 "addl $16, %%eax \n\t" | |
1777 "cmpl %3, %%eax \n\t" | |
1778 " jb 1b \n\t" | |
1779 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |
1780 : "memory", "%eax" | |
1781 ); | |
1782 #endif | |
1783 for(w= (width&(~15)); w < width; w++) | |
1784 { | |
1785 dest[2*w+0] = src1[w]; | |
1786 dest[2*w+1] = src2[w]; | |
1787 } | |
1788 #else | |
1789 for(w=0; w < width; w++) | |
1790 { | |
1791 dest[2*w+0] = src1[w]; | |
1792 dest[2*w+1] = src2[w]; | |
1793 } | |
1794 #endif | |
1795 dest += dstStride; | |
1796 src1 += src1Stride; | |
1797 src2 += src2Stride; | |
1798 } | |
1799 #ifdef HAVE_MMX | |
1800 asm( | |
1801 EMMS" \n\t" | |
1802 SFENCE" \n\t" | |
1803 ::: "memory" | |
1804 ); | |
1805 #endif | |
1806 } |