Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 4126:5ee0a20cc791
started
author | alex |
---|---|
date | Sun, 13 Jan 2002 00:26:23 +0000 |
parents | e81bfc0826b1 |
children | e3a9fae516e4 |
rev | line source |
---|---|
2694 | 1 /* |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
2 * |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
3 * rgb2rgb.c, Software RGB to RGB convertor |
2732 | 4 * pluralize by Software PAL8 to RGB convertor |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
7 * Written by Nick Kurshev. |
3132 | 8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL) |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
9 */ |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
10 |
/*
 * Instruction-name strings used by the inline assembly in this template.
 * Everything is #undef'd first, then redefined according to the HAVE_*
 * feature macros that are set at this point.
 */
#undef MMREG_SIZE
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef MOVNTQ
#undef SFENCE
#undef EMMS

/* Register width in bytes used when sizing the vectorised loops. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch and byte-average mnemonics.  Without 3DNow!/MMX2 the prefetch
 * macros expand to "/nop", which is presumably meant to turn the whole
 * assembler line into a comment (NOTE(review): depends on the assembler
 * treating '/' as a comment character).  PAVGB is deliberately left
 * undefined in that case.
 */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#if defined(HAVE_3DNOW)
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* Non-temporal store plus store fence when MMX2 is available, plain movq otherwise. */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
52 | |
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * Each complete 3-byte source pixel is copied unchanged and followed by a
 * zero byte in the scalar path.
 *
 * @param src      source buffer of packed 3-byte pixels
 * @param dst      destination buffer; receives 4 bytes per complete source pixel
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size % 3 bytes) is ignored instead of being read past
 *                 the end of the buffer
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint8_t *dest = dst;

#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 holds mask32 (defined elsewhere) and is ANDed onto every output
       quadword; presumably it clears the 4th byte of each pixel - TODO confirm. */
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    /*
     * One pass converts 8 pixels: 24 source bytes -> 32 destination bytes.
     * The last load (4 bytes at offset 21) touches source byte 24, so the
     * vector loop only runs while at least 25 bytes remain.  The bound is
     * derived from the remaining length, not from the address of `end`.
     */
    while(end - s >= 25)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: whole pixels only, so neither buffer is overrun. */
    while(end - s >= 3)
    {
        dest[0] = s[0];
        dest[1] = s[1];
        dest[2] = s[2];
        dest[3] = 0;
        dest += 4;
        s += 3;
    }
}
2505 | 104 |
/**
 * Pack 4-byte pixels into 3-byte pixels by dropping the 4th byte of each.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer; receives 3 bytes per complete source pixel
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size % 4 bytes) is ignored instead of being read past
 *                 the end of the buffer
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint8_t *dest = dst;

#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /*
     * One pass reads 32 source bytes (8 pixels) and stores 24 bytes.
     * The loop bound is the remaining length rather than the address of
     * `end` rounded down, so a pass never reads beyond the source.
     * mask24l/h/hh/hhh/hhhh are defined elsewhere.
     */
    while(end - s >= 32)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: copy three bytes, skip the fourth; whole pixels only. */
    while(end - s >= 4)
    {
        dest[0] = s[0];
        dest[1] = s[1];
        dest[2] = s[2];
        dest += 3;
        s += 4;
    }
}
2506 | 181 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
182 /* |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
183 Original by Strepto/Astral |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
184 ported to gcc & bugfixed : A'rpi |
2564 | 185 MMX2, 3DNOW optimization by Nick Kurshev |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
186 32bit c version, and and&add trick by Michael Niedermayer |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
187 */ |
/**
 * Convert 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
 * (rrrrrggggggbbbbb) by shifting the red and green fields up by one bit.
 *
 * The scalar code uses the "and & add" trick: adding (x & 0x7FE0) to the
 * pixel doubles the red/green part, which equals a left shift by one.  The
 * top bit of every input pixel is masked off first so it cannot disturb the
 * result.
 *
 * @param src      source pixels, native-endian 16-bit words
 * @param dst      destination pixels, native-endian 16-bit words
 * @param src_size size of the source in bytes; a trailing odd byte is ignored
 *
 * Both buffers are accessed through uint32_t / uint16_t pointers, so they
 * must be suitably aligned on targets that require aligned access.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint8_t *d = dst;

#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    /* mask15s is defined elsewhere; it selects the bits that get added back in. */
    __asm __volatile(
        "movq %0, %%mm4\n\t"
        ::"m"(mask15s));
    /*
     * 16 bytes (8 pixels) per pass.  Only complete 16-byte groups are
     * handled here; the remainder falls through to the scalar code so that
     * nothing is read or written beyond src_size.
     * NOTE(review): the vector code adds the unmasked pixel, so it relies
     * on the top bit of each input pixel being 0, unlike the scalar code.
     */
    while(end - s >= 16)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            :"memory");
        s += 16;
        d += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Two pixels at a time.  Unsigned arithmetic: the sum can exceed INT_MAX. */
    while(end - s >= 4)
    {
        const uint32_t x = *(const uint32_t *)s;
        *(uint32_t *)d = (x & 0x7FFF7FFFu) + (x & 0x7FE07FE0u);
        s += 4;
        d += 4;
    }
    /* Odd pixel count: convert the one remaining 16-bit pixel. */
    if(end - s >= 2)
    {
        const unsigned x = *(const uint16_t *)s;
        *(uint16_t *)d = (uint16_t)((x & 0x7FFFu) + (x & 0x7FE0u));
    }
}
2694 | 247 |
/**
 * Convert 4-byte pixels (bytes: blue, green, red, unused) to 16-bit 5-6-5
 * pixels: (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, written as native-endian uint16_t
 *                 (one per complete source pixel)
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size % 4 bytes) is ignored
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;

#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* red_16mask / green_16mask / blue_16mask are defined elsewhere. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* 4 pixels (16 source bytes -> 8 destination bytes) per pass; the bound
       is the remaining length, so a pass never reads beyond the source. */
    while(end - s >= 16)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, shared by both builds.  Each pixel is 4 bytes wide: the
       4th byte is skipped so the next pixel starts at the right offset. */
    while(end - s >= 4)
    {
        const int b = s[0];
        const int g = s[1];
        const int r = s[2];
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s += 4;
    }
}
318 | |
/**
 * Convert 4-byte pixels (bytes: blue, green, red, unused) to 15-bit 5-5-5
 * pixels: (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, written as native-endian uint16_t
 *                 (one per complete source pixel)
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size % 4 bytes) is ignored
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;

#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* red_15mask / green_15mask / blue_15mask are defined elsewhere. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* 4 pixels (16 source bytes -> 8 destination bytes) per pass; the bound
       is the remaining length, so a pass never reads beyond the source. */
    while(end - s >= 16)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, shared by both builds.  Each pixel is 4 bytes wide: the
       4th byte is skipped so the next pixel starts at the right offset. */
    while(end - s >= 4)
    {
        const int b = s[0];
        const int g = s[1];
        const int r = s[2];
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s += 4;
    }
}
389 | |
/**
 * Convert packed 3-byte pixels (bytes: blue, green, red) to 16-bit 5-6-5
 * pixels: (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 *
 * @param src      source buffer of packed 3-byte pixels
 * @param dst      destination buffer, written as native-endian uint16_t
 *                 (one per complete source pixel)
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size % 3 bytes) is ignored
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;

#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* red_16mask / green_16mask / blue_16mask are defined elsewhere. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /*
     * 4 pixels (12 source bytes -> 8 destination bytes) per pass.  The last
     * load (4 bytes at offset 9) touches source byte 12, so at least 13
     * bytes must remain; the rest is left to the scalar loop.
     */
    while(end - s >= 13)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, shared by both builds; whole pixels only. */
    while(end - s >= 3)
    {
        const int b = s[0];
        const int g = s[1];
        const int r = s[2];
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s += 3;
    }
}
461 | |
/**
 * Convert packed 3-byte pixels (bytes: blue, green, red) to 15-bit 5-5-5
 * pixels: (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 *
 * @param src      source buffer of packed 3-byte pixels
 * @param dst      destination buffer, written as native-endian uint16_t
 *                 (one per complete source pixel)
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size % 3 bytes) is ignored
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;

#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* red_15mask / green_15mask / blue_15mask are defined elsewhere. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /*
     * 4 pixels (12 source bytes -> 8 destination bytes) per pass.  The last
     * load (4 bytes at offset 9) touches source byte 12, so at least 13
     * bytes must remain; the rest is left to the scalar loop.
     */
    while(end - s >= 13)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, shared by both builds; whole pixels only. */
    while(end - s >= 3)
    {
        const int b = s[0];
        const int g = s[1];
        const int r = s[2];
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s += 3;
    }
}
2694 | 533 |
/**
 * Swap the first and third byte of every 4-byte pixel (RGB32 <-> BGR32).
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer of 4-byte pixels
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size % 4 bytes) is ignored
 *
 * The scalar code leaves the 4th byte of each destination pixel untouched.
 * NOTE(review): what the MMX code stores there depends on mask32r/g/b,
 * which are defined elsewhere - confirm both paths agree.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i = 0;
#ifdef HAVE_MMX
    /* Bytes handled by the vector loop: whole 8-byte groups (2 pixels). */
    const unsigned mmx_bytes = src_size & ~7u;

    if(mmx_bytes)
    {
        /*
         * %eax is a BYTE offset into both buffers.  Every pass moves one
         * quadword, so the offset advances by 8 and is compared against the
         * byte count.  The loop is bottom-tested, hence the guard above for
         * inputs shorter than 8 bytes.
         */
        asm volatile (
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%0, %%eax) \n\t"
            "movq (%0, %%eax), %%mm0 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "pslld $16, %%mm0 \n\t"
            "psrld $16, %%mm1 \n\t"
            "pand mask32r, %%mm0 \n\t"
            "pand mask32g, %%mm2 \n\t"
            "pand mask32b, %%mm1 \n\t"
            "por %%mm0, %%mm2 \n\t"
            "por %%mm1, %%mm2 \n\t"
            MOVNTQ" %%mm2, (%1, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %2, %%eax \n\t"
            " jb 1b \n\t"
            :: "r" (src), "r"(dst), "r" (mmx_bytes)
            : "%eax", "memory"
        );
        i = mmx_bytes;
    }

    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: everything in the plain C build, the odd last pixel otherwise. */
    for(; src_size - i >= 4; i += 4)
    {
        dst[i + 0] = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
    }
}
573 | |
/**
 * Interleave separate Y, U and V planes into packed Y U Y V output.
 *
 * Every output group of 4 bytes is Y0 U Y1 V.  One chroma row is used for
 * two consecutive luma rows: the chroma pointers advance after every odd row.
 *
 * height should be a multiple of 2.  width is halved to get the number of
 * chroma samples per row, so an odd trailing column is not converted.  In
 * the MMX build whole groups of 16 pixels are handled by the vector loop and
 * the remainder by the scalar loop, so width need not be a multiple of 16.
 *
 * @param ysrc        luma plane
 * @param usrc        U plane
 * @param vsrc        V plane
 * @param dst         packed output
 * @param width       picture width in pixels
 * @param height      picture height in rows
 * @param lumStride   bytes between consecutive luma rows
 * @param chromStride bytes between consecutive chroma rows
 * @param dstStride   bytes between consecutive output rows
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    const unsigned chromWidth = width >> 1;
#ifdef HAVE_MMX
    /* Chroma samples per row handled by the vector loop (8 per pass). */
    const unsigned mmxChroma = chromWidth & ~7u;
#endif
    unsigned y;

    for(y = 0; y < height; y++)
    {
        unsigned i = 0;
#ifdef HAVE_MMX
        //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        /* The asm loop is bottom-tested, so skip it when there is no full group. */
        if(mmxChroma)
        {
            asm volatile(
                "xorl %%eax, %%eax \n\t"
                ".balign 16 \n\t"
                "1: \n\t"
                PREFETCH" 32(%1, %%eax, 2) \n\t"
                PREFETCH" 32(%2, %%eax) \n\t"
                PREFETCH" 32(%3, %%eax) \n\t"
                "movq (%2, %%eax), %%mm0 \n\t" // U(0)
                "movq %%mm0, %%mm2 \n\t" // U(0)
                "movq (%3, %%eax), %%mm1 \n\t" // V(0)
                "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
                "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

                "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
                "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
                "movq %%mm3, %%mm4 \n\t" // Y(0)
                "movq %%mm5, %%mm6 \n\t" // Y(8)
                "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
                "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
                "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
                "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

                MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
                MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
                MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
                MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

                "addl $8, %%eax \n\t"
                "cmpl %4, %%eax \n\t"
                " jb 1b \n\t"
                ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (mmxChroma)
                : "%eax", "memory"
            );
            i = mmxChroma;
        }
#endif
        /* Scalar path: the whole row in the plain C build, the remainder otherwise. */
        for(; i < chromWidth; i++)
        {
            dst[4*i+0] = ysrc[2*i+0];
            dst[4*i+1] = usrc[i];
            dst[4*i+2] = ysrc[2*i+1];
            dst[4*i+3] = vsrc[i];
        }
        /* Two luma rows share one chroma row. */
        if(y&1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
646 | |
2724 | 647 /** |
648 * | |
649 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
650 * problem for anyone then tell me, and ill fix it) | |
651 */ | |
3132 | 652 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2725 | 653 unsigned int width, unsigned int height, |
654 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) | |
2701 | 655 { |
2724 | 656 int y; |
657 const int chromWidth= width>>1; | |
658 for(y=0; y<height; y+=2) | |
659 { | |
2704 | 660 #ifdef HAVE_MMX |
2724 | 661 asm volatile( |
662 "xorl %%eax, %%eax \n\t" | |
663 "pcmpeqw %%mm7, %%mm7 \n\t" | |
664 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
665 ".balign 16 \n\t" |
2724 | 666 "1: \n\t" |
667 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
668 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | |
669 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | |
670 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | |
671 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | |
672 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | |
673 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | |
674 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
675 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
676 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
677 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
678 | |
679 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | |
2704 | 680 |
2724 | 681 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8) |
682 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12) | |
683 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | |
684 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | |
685 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | |
686 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | |
687 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
688 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
689 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
690 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2704 | 691 |
2724 | 692 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" |
693 | |
694 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
695 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
696 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
697 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
698 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
699 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
700 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
701 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2704 | 702 |
2724 | 703 MOVNTQ" %%mm0, (%3, %%eax) \n\t" |
704 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | |
705 | |
706 "addl $8, %%eax \n\t" | |
707 "cmpl %4, %%eax \n\t" | |
708 " jb 1b \n\t" | |
2725 | 709 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) |
710 : "memory", "%eax" | |
711 ); | |
2704 | 712 |
2806 | 713 ydst += lumStride; |
714 src += srcStride; | |
715 | |
2725 | 716 asm volatile( |
717 "xorl %%eax, %%eax \n\t" | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
718 ".balign 16 \n\t" |
2724 | 719 "1: \n\t" |
720 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
721 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | |
722 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | |
723 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | |
724 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | |
725 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
726 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
727 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
728 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
729 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
730 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2704 | 731 |
2724 | 732 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" |
733 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | |
734 | |
735 "addl $8, %%eax \n\t" | |
2725 | 736 "cmpl %4, %%eax \n\t" |
2724 | 737 " jb 1b \n\t" |
2704 | 738 |
2806 | 739 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) |
2724 | 740 : "memory", "%eax" |
741 ); | |
2704 | 742 #else |
2724 | 743 int i; |
744 for(i=0; i<chromWidth; i++) | |
745 { | |
746 ydst[2*i+0] = src[4*i+0]; | |
747 udst[i] = src[4*i+1]; | |
748 ydst[2*i+1] = src[4*i+2]; | |
749 vdst[i] = src[4*i+3]; | |
750 } | |
751 ydst += lumStride; | |
752 src += srcStride; | |
753 | |
754 for(i=0; i<chromWidth; i++) | |
755 { | |
756 ydst[2*i+0] = src[4*i+0]; | |
757 ydst[2*i+1] = src[4*i+2]; | |
758 } | |
759 #endif | |
760 udst += chromStride; | |
761 vdst += chromStride; | |
762 ydst += lumStride; | |
763 src += srcStride; | |
2701 | 764 } |
2724 | 765 #ifdef HAVE_MMX |
2847 | 766 asm volatile( EMMS" \n\t" |
767 SFENCE" \n\t" | |
768 :::"memory"); | |
2704 | 769 #endif |
2723 | 770 } |
2801 | 771 |
772 /** | |
773 * | |
774 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
775 * problem for anyone then tell me, and ill fix it) | |
3132 | 776 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version |
2801 | 777 */ |
3132 | 778 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2801 | 779 unsigned int width, unsigned int height, |
780 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) | |
781 { | |
782 int y; | |
783 const int chromWidth= width>>1; | |
784 for(y=0; y<height; y+=2) | |
785 { | |
2847 | 786 #ifdef HAVE_MMX |
787 asm volatile( | |
788 "xorl %%eax, %%eax \n\t" | |
789 "pcmpeqw %%mm7, %%mm7 \n\t" | |
790 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
791 ".balign 16 \n\t" | |
792 "1: \n\t" | |
793 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
794 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | |
795 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | |
796 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | |
797 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | |
798 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | |
799 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | |
800 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
801 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
802 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
803 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
804 | |
805 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | |
806 | |
807 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) | |
808 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) | |
809 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | |
810 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | |
811 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | |
812 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | |
813 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
814 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
815 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
816 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
817 | |
818 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | |
819 | |
820 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
821 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
822 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
823 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
824 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
825 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
826 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
827 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
828 | |
829 MOVNTQ" %%mm0, (%3, %%eax) \n\t" | |
830 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | |
831 | |
832 "addl $8, %%eax \n\t" | |
833 "cmpl %4, %%eax \n\t" | |
834 " jb 1b \n\t" | |
835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) | |
836 : "memory", "%eax" | |
837 ); | |
838 | |
839 ydst += lumStride; | |
840 src += srcStride; | |
841 | |
842 asm volatile( | |
843 "xorl %%eax, %%eax \n\t" | |
844 ".balign 16 \n\t" | |
845 "1: \n\t" | |
846 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
847 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | |
848 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | |
849 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | |
850 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | |
851 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
852 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
853 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
854 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
855 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
856 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
857 | |
858 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | |
859 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | |
860 | |
861 "addl $8, %%eax \n\t" | |
862 "cmpl %4, %%eax \n\t" | |
863 " jb 1b \n\t" | |
864 | |
865 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) | |
866 : "memory", "%eax" | |
867 ); | |
868 #else | |
2801 | 869 int i; |
870 for(i=0; i<chromWidth; i++) | |
871 { | |
872 udst[i] = src[4*i+0]; | |
873 ydst[2*i+0] = src[4*i+1]; | |
874 vdst[i] = src[4*i+2]; | |
875 ydst[2*i+1] = src[4*i+3]; | |
876 } | |
877 ydst += lumStride; | |
878 src += srcStride; | |
879 | |
880 for(i=0; i<chromWidth; i++) | |
881 { | |
882 ydst[2*i+0] = src[4*i+1]; | |
883 ydst[2*i+1] = src[4*i+3]; | |
884 } | |
2847 | 885 #endif |
2801 | 886 udst += chromStride; |
887 vdst += chromStride; | |
888 ydst += lumStride; | |
889 src += srcStride; | |
890 } | |
2847 | 891 #ifdef HAVE_MMX |
892 asm volatile( EMMS" \n\t" | |
893 SFENCE" \n\t" | |
894 :::"memory"); | |
895 #endif | |
2801 | 896 } |
897 | |
3132 | 898 /** |
899 * | |
900 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
901 * problem for anyone then tell me, and ill fix it) | |
902 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version | |
903 */ | |
904 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
905 unsigned int width, unsigned int height, | |
906 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) | |
907 { | |
908 int y; | |
909 const int chromWidth= width>>1; | |
910 for(y=0; y<height; y+=2) | |
911 { | |
912 int i; | |
913 for(i=0; i<chromWidth; i++) | |
914 { | |
915 unsigned int b= src[6*i+0]; | |
916 unsigned int g= src[6*i+1]; | |
917 unsigned int r= src[6*i+2]; | |
2801 | 918 |
3633 | 919 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
920 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
921 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
3132 | 922 |
923 udst[i] = U; | |
924 vdst[i] = V; | |
925 ydst[2*i] = Y; | |
926 | |
927 b= src[6*i+3]; | |
928 g= src[6*i+4]; | |
929 r= src[6*i+5]; | |
930 | |
3633 | 931 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 932 ydst[2*i+1] = Y; |
933 } | |
934 ydst += lumStride; | |
935 src += srcStride; | |
936 | |
937 for(i=0; i<chromWidth; i++) | |
938 { | |
939 unsigned int b= src[6*i+0]; | |
940 unsigned int g= src[6*i+1]; | |
941 unsigned int r= src[6*i+2]; | |
942 | |
3633 | 943 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 944 |
945 ydst[2*i] = Y; | |
946 | |
947 b= src[6*i+3]; | |
948 g= src[6*i+4]; | |
949 r= src[6*i+5]; | |
950 | |
3633 | 951 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 952 ydst[2*i+1] = Y; |
953 } | |
954 udst += chromStride; | |
955 vdst += chromStride; | |
956 ydst += lumStride; | |
957 src += srcStride; | |
958 } | |
959 } |