Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 5386:e93fc4a8851a
2-pass lavc encoding fixed
author | arpi |
---|---|
date | Fri, 29 Mar 2002 01:01:35 +0000 |
parents | 0bd1c35aa42c |
children | 21bd4b32abb4 |
rev | line source |
---|---|
/*
 *
 *  rgb2rgb.c, Software RGB to RGB converter
 *  pluralize by Software PAL8 to RGB converter
 *               Software YUV to YUV converter
 *               Software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
10 |
/*
 * Per-CPU-flavour instruction selection.
 *
 * Every macro is #undef'ed first, so the definitions below always reflect
 * the HAVE_* switches that are active at this point.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Chunk-size unit used by the MMX loops below (bytes). */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch and byte-average mnemonics.
 *
 * When neither 3DNow! nor MMX2 is available the prefetch macros expand to
 * an assembler comment, which swallows the operands that follow on the
 * same line.  '#' is used instead of '/' because GNU as only treats '/'
 * as a comment character when --divide is not in effect.
 * PAVGB has no fallback and stays undefined in that case.
 */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB	  "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB	  "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal store plus the fence it requires; plain movq otherwise. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
52 | |
53 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size) | |
2504 | 54 { |
2508 | 55 uint8_t *dest = dst; |
2677 | 56 const uint8_t *s = src; |
57 const uint8_t *end; | |
2510 | 58 #ifdef HAVE_MMX |
59 uint8_t *mm_end; | |
60 #endif | |
2504 | 61 end = s + src_size; |
2510 | 62 #ifdef HAVE_MMX |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
63 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); |
2740 | 64 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4)); |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
65 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); |
2740 | 66 if(mm_end == end) mm_end -= MMREG_SIZE*4; |
2510 | 67 while(s < mm_end) |
68 { | |
2511 | 69 __asm __volatile( |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
70 PREFETCH" 32%1\n\t" |
2510 | 71 "movd %1, %%mm0\n\t" |
2738 | 72 "punpckldq 3%1, %%mm0\n\t" |
73 "movd 6%1, %%mm1\n\t" | |
74 "punpckldq 9%1, %%mm1\n\t" | |
75 "movd 12%1, %%mm2\n\t" | |
76 "punpckldq 15%1, %%mm2\n\t" | |
77 "movd 18%1, %%mm3\n\t" | |
78 "punpckldq 21%1, %%mm3\n\t" | |
2510 | 79 "pand %%mm7, %%mm0\n\t" |
2738 | 80 "pand %%mm7, %%mm1\n\t" |
2510 | 81 "pand %%mm7, %%mm2\n\t" |
2738 | 82 "pand %%mm7, %%mm3\n\t" |
2511 | 83 MOVNTQ" %%mm0, %0\n\t" |
2738 | 84 MOVNTQ" %%mm1, 8%0\n\t" |
85 MOVNTQ" %%mm2, 16%0\n\t" | |
86 MOVNTQ" %%mm3, 24%0" | |
2510 | 87 :"=m"(*dest) |
88 :"m"(*s) | |
89 :"memory"); | |
2738 | 90 dest += 32; |
91 s += 24; | |
2510 | 92 } |
2513 | 93 __asm __volatile(SFENCE:::"memory"); |
2511 | 94 __asm __volatile(EMMS:::"memory"); |
2510 | 95 #endif |
2504 | 96 while(s < end) |
97 { | |
2508 | 98 *dest++ = *s++; |
99 *dest++ = *s++; | |
100 *dest++ = *s++; | |
101 *dest++ = 0; | |
2504 | 102 } |
103 } | |
2505 | 104 |
3132 | 105 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size) |
2505 | 106 { |
107 uint8_t *dest = dst; | |
2677 | 108 const uint8_t *s = src; |
109 const uint8_t *end; | |
2517 | 110 #ifdef HAVE_MMX |
111 uint8_t *mm_end; | |
112 #endif | |
2505 | 113 end = s + src_size; |
2517 | 114 #ifdef HAVE_MMX |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
115 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
116 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4)); |
2517 | 117 while(s < mm_end) |
118 { | |
119 __asm __volatile( | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
120 PREFETCH" 32%1\n\t" |
2517 | 121 "movq %1, %%mm0\n\t" |
122 "movq 8%1, %%mm1\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
123 "movq 16%1, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
124 "movq 24%1, %%mm5\n\t" |
2517 | 125 "movq %%mm0, %%mm2\n\t" |
126 "movq %%mm1, %%mm3\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
127 "movq %%mm4, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
128 "movq %%mm5, %%mm7\n\t" |
2517 | 129 "psrlq $8, %%mm2\n\t" |
130 "psrlq $8, %%mm3\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
131 "psrlq $8, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
132 "psrlq $8, %%mm7\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
133 "pand %2, %%mm0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
134 "pand %2, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
135 "pand %2, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
136 "pand %2, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
137 "pand %3, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
138 "pand %3, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
139 "pand %3, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
140 "pand %3, %%mm7\n\t" |
2517 | 141 "por %%mm2, %%mm0\n\t" |
142 "por %%mm3, %%mm1\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
143 "por %%mm6, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
144 "por %%mm7, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
145 |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
146 "movq %%mm1, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
147 "movq %%mm4, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
148 "psllq $48, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
149 "psllq $32, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
150 "pand %4, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
151 "pand %5, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
152 "por %%mm2, %%mm0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
153 "psrlq $16, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
154 "psrlq $32, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
155 "psllq $16, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
156 "por %%mm3, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
157 "pand %6, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
158 "por %%mm5, %%mm4\n\t" |
3132 | 159 |
2517 | 160 MOVNTQ" %%mm0, %0\n\t" |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
161 MOVNTQ" %%mm1, 8%0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
162 MOVNTQ" %%mm4, 16%0" |
2517 | 163 :"=m"(*dest) |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
164 :"m"(*s),"m"(mask24l), |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
165 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
2517 | 166 :"memory"); |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
167 dest += 24; |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
168 s += 32; |
2517 | 169 } |
170 __asm __volatile(SFENCE:::"memory"); | |
171 __asm __volatile(EMMS:::"memory"); | |
172 #endif | |
2505 | 173 while(s < end) |
174 { | |
175 *dest++ = *s++; | |
176 *dest++ = *s++; | |
177 *dest++ = *s++; | |
178 s++; | |
179 } | |
180 } | |
2506 | 181 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
182 /* |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
183 Original by Strepto/Astral |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
184 ported to gcc & bugfixed : A'rpi |
2564 | 185 MMX2, 3DNOW optimization by Nick Kurshev |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
186 32bit c version, and and&add trick by Michael Niedermayer |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
187 */ |
3132 | 188 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size) |
2506 | 189 { |
190 #ifdef HAVE_MMX | |
2677 | 191 register const char* s=src+src_size; |
2506 | 192 register char* d=dst+src_size; |
193 register int offs=-src_size; | |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
194 __asm __volatile(PREFETCH" %0"::"m"(*(s+offs))); |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
195 __asm __volatile( |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
196 "movq %0, %%mm4\n\t" |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
197 ::"m"(mask15s)); |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
198 while(offs<0) |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
199 { |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
200 __asm __volatile( |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
201 PREFETCH" 32%1\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
202 "movq %1, %%mm0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
203 "movq 8%1, %%mm2\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
204 "movq %%mm0, %%mm1\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
205 "movq %%mm2, %%mm3\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
206 "pand %%mm4, %%mm0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
207 "pand %%mm4, %%mm2\n\t" |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
208 "paddw %%mm1, %%mm0\n\t" |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
209 "paddw %%mm3, %%mm2\n\t" |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
210 MOVNTQ" %%mm0, %0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
211 MOVNTQ" %%mm2, 8%0" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
212 :"=m"(*(d+offs)) |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
213 :"m"(*(s+offs)) |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
214 ); |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
215 offs+=16; |
2506 | 216 } |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
217 __asm __volatile(SFENCE:::"memory"); |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
218 __asm __volatile(EMMS:::"memory"); |
2506 | 219 #else |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
220 #if 0 |
2677 | 221 const uint16_t *s1=( uint16_t * )src; |
2506 | 222 uint16_t *d1=( uint16_t * )dst; |
223 uint16_t *e=((uint8_t *)s1)+src_size; | |
224 while( s1<e ){ | |
225 register int x=*( s1++ ); | |
226 /* rrrrrggggggbbbbb | |
227 0rrrrrgggggbbbbb | |
228 0111 1111 1110 0000=0x7FE0 | |
229 00000000000001 1111=0x001F */ | |
230 *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 ); | |
231 } | |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
232 #else |
2718 | 233 const unsigned *s1=( unsigned * )src; |
234 unsigned *d1=( unsigned * )dst; | |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
235 int i; |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
236 int size= src_size>>2; |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
237 for(i=0; i<size; i++) |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
238 { |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
239 register int x= s1[i]; |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
240 // d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
241 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0); |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
242 |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
243 } |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
244 #endif |
2506 | 245 #endif |
246 } | |
2694 | 247 |
3132 | 248 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size) |
2694 | 249 { |
2741 | 250 #ifdef HAVE_MMX |
251 const uint8_t *s = src; | |
252 const uint8_t *end,*mm_end; | |
253 uint16_t *d = (uint16_t *)dst; | |
254 end = s + src_size; | |
255 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2)); | |
256 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
257 __asm __volatile( | |
258 "movq %0, %%mm7\n\t" | |
259 "movq %1, %%mm6\n\t" | |
260 ::"m"(red_16mask),"m"(green_16mask)); | |
261 while(s < mm_end) | |
262 { | |
263 __asm __volatile( | |
264 PREFETCH" 32%1\n\t" | |
265 "movd %1, %%mm0\n\t" | |
266 "movd 4%1, %%mm3\n\t" | |
267 "punpckldq 8%1, %%mm0\n\t" | |
268 "punpckldq 12%1, %%mm3\n\t" | |
269 "movq %%mm0, %%mm1\n\t" | |
270 "movq %%mm0, %%mm2\n\t" | |
271 "movq %%mm3, %%mm4\n\t" | |
272 "movq %%mm3, %%mm5\n\t" | |
273 "psrlq $3, %%mm0\n\t" | |
274 "psrlq $3, %%mm3\n\t" | |
275 "pand %2, %%mm0\n\t" | |
276 "pand %2, %%mm3\n\t" | |
277 "psrlq $5, %%mm1\n\t" | |
278 "psrlq $5, %%mm4\n\t" | |
279 "pand %%mm6, %%mm1\n\t" | |
280 "pand %%mm6, %%mm4\n\t" | |
281 "psrlq $8, %%mm2\n\t" | |
282 "psrlq $8, %%mm5\n\t" | |
283 "pand %%mm7, %%mm2\n\t" | |
284 "pand %%mm7, %%mm5\n\t" | |
285 "por %%mm1, %%mm0\n\t" | |
286 "por %%mm4, %%mm3\n\t" | |
287 "por %%mm2, %%mm0\n\t" | |
288 "por %%mm5, %%mm3\n\t" | |
289 "psllq $16, %%mm3\n\t" | |
290 "por %%mm3, %%mm0\n\t" | |
291 MOVNTQ" %%mm0, %0\n\t" | |
292 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | |
293 d += 4; | |
294 s += 16; | |
295 } | |
296 while(s < end) | |
297 { | |
298 const int b= *s++; | |
299 const int g= *s++; | |
300 const int r= *s++; | |
301 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | |
302 } | |
303 __asm __volatile(SFENCE:::"memory"); | |
304 __asm __volatile(EMMS:::"memory"); | |
305 #else | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
306 unsigned j,i,num_pixels=src_size/4; |
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
307 uint16_t *d = (uint16_t *)dst; |
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
308 for(i=0,j=0; j<num_pixels; i+=4,j++) |
2694 | 309 { |
310 const int b= src[i+0]; | |
311 const int g= src[i+1]; | |
312 const int r= src[i+2]; | |
313 | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
314 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); |
2694 | 315 } |
2741 | 316 #endif |
2694 | 317 } |
318 | |
3132 | 319 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size) |
2694 | 320 { |
2741 | 321 #ifdef HAVE_MMX |
322 const uint8_t *s = src; | |
323 const uint8_t *end,*mm_end; | |
324 uint16_t *d = (uint16_t *)dst; | |
325 end = s + src_size; | |
326 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2)); | |
327 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
328 __asm __volatile( | |
329 "movq %0, %%mm7\n\t" | |
330 "movq %1, %%mm6\n\t" | |
331 ::"m"(red_15mask),"m"(green_15mask)); | |
332 while(s < mm_end) | |
333 { | |
334 __asm __volatile( | |
335 PREFETCH" 32%1\n\t" | |
336 "movd %1, %%mm0\n\t" | |
337 "movd 4%1, %%mm3\n\t" | |
338 "punpckldq 8%1, %%mm0\n\t" | |
339 "punpckldq 12%1, %%mm3\n\t" | |
340 "movq %%mm0, %%mm1\n\t" | |
341 "movq %%mm0, %%mm2\n\t" | |
342 "movq %%mm3, %%mm4\n\t" | |
343 "movq %%mm3, %%mm5\n\t" | |
344 "psrlq $3, %%mm0\n\t" | |
345 "psrlq $3, %%mm3\n\t" | |
346 "pand %2, %%mm0\n\t" | |
347 "pand %2, %%mm3\n\t" | |
348 "psrlq $6, %%mm1\n\t" | |
349 "psrlq $6, %%mm4\n\t" | |
350 "pand %%mm6, %%mm1\n\t" | |
351 "pand %%mm6, %%mm4\n\t" | |
352 "psrlq $9, %%mm2\n\t" | |
353 "psrlq $9, %%mm5\n\t" | |
354 "pand %%mm7, %%mm2\n\t" | |
355 "pand %%mm7, %%mm5\n\t" | |
356 "por %%mm1, %%mm0\n\t" | |
357 "por %%mm4, %%mm3\n\t" | |
358 "por %%mm2, %%mm0\n\t" | |
359 "por %%mm5, %%mm3\n\t" | |
360 "psllq $16, %%mm3\n\t" | |
361 "por %%mm3, %%mm0\n\t" | |
362 MOVNTQ" %%mm0, %0\n\t" | |
363 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | |
364 d += 4; | |
365 s += 16; | |
366 } | |
367 while(s < end) | |
368 { | |
369 const int b= *s++; | |
370 const int g= *s++; | |
371 const int r= *s++; | |
372 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
373 } | |
374 __asm __volatile(SFENCE:::"memory"); | |
375 __asm __volatile(EMMS:::"memory"); | |
376 #else | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
377 unsigned j,i,num_pixels=src_size/4; |
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
378 uint16_t *d = (uint16_t *)dst; |
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
379 for(i=0,j=0; j<num_pixels; i+=4,j++) |
2694 | 380 { |
381 const int b= src[i+0]; | |
382 const int g= src[i+1]; | |
383 const int r= src[i+2]; | |
384 | |
2720 | 385 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); |
2694 | 386 } |
2741 | 387 #endif |
2694 | 388 } |
389 | |
3132 | 390 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size) |
2718 | 391 { |
2740 | 392 #ifdef HAVE_MMX |
393 const uint8_t *s = src; | |
394 const uint8_t *end,*mm_end; | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
395 uint16_t *d = (uint16_t *)dst; |
2740 | 396 end = s + src_size; |
397 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2)); | |
2738 | 398 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
399 __asm __volatile( | |
400 "movq %0, %%mm7\n\t" | |
401 "movq %1, %%mm6\n\t" | |
2741 | 402 ::"m"(red_16mask),"m"(green_16mask)); |
403 if(mm_end == end) mm_end -= MMREG_SIZE*2; | |
2740 | 404 while(s < mm_end) |
2738 | 405 { |
406 __asm __volatile( | |
407 PREFETCH" 32%1\n\t" | |
408 "movd %1, %%mm0\n\t" | |
2740 | 409 "movd 3%1, %%mm3\n\t" |
410 "punpckldq 6%1, %%mm0\n\t" | |
2738 | 411 "punpckldq 9%1, %%mm3\n\t" |
412 "movq %%mm0, %%mm1\n\t" | |
413 "movq %%mm0, %%mm2\n\t" | |
414 "movq %%mm3, %%mm4\n\t" | |
415 "movq %%mm3, %%mm5\n\t" | |
416 "psrlq $3, %%mm0\n\t" | |
417 "psrlq $3, %%mm3\n\t" | |
2740 | 418 "pand %2, %%mm0\n\t" |
419 "pand %2, %%mm3\n\t" | |
420 "psrlq $5, %%mm1\n\t" | |
421 "psrlq $5, %%mm4\n\t" | |
422 "pand %%mm6, %%mm1\n\t" | |
423 "pand %%mm6, %%mm4\n\t" | |
424 "psrlq $8, %%mm2\n\t" | |
425 "psrlq $8, %%mm5\n\t" | |
426 "pand %%mm7, %%mm2\n\t" | |
427 "pand %%mm7, %%mm5\n\t" | |
2738 | 428 "por %%mm1, %%mm0\n\t" |
2740 | 429 "por %%mm4, %%mm3\n\t" |
2738 | 430 "por %%mm2, %%mm0\n\t" |
431 "por %%mm5, %%mm3\n\t" | |
2740 | 432 "psllq $16, %%mm3\n\t" |
433 "por %%mm3, %%mm0\n\t" | |
2738 | 434 MOVNTQ" %%mm0, %0\n\t" |
2741 | 435 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
2740 | 436 d += 4; |
437 s += 12; | |
2738 | 438 } |
2740 | 439 while(s < end) |
440 { | |
441 const int b= *s++; | |
442 const int g= *s++; | |
443 const int r= *s++; | |
444 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | |
445 } | |
446 __asm __volatile(SFENCE:::"memory"); | |
447 __asm __volatile(EMMS:::"memory"); | |
448 #else | |
449 unsigned j,i,num_pixels=src_size/3; | |
450 uint16_t *d = (uint16_t *)dst; | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
451 for(i=0,j=0; j<num_pixels; i+=3,j++) |
2718 | 452 { |
453 const int b= src[i+0]; | |
454 const int g= src[i+1]; | |
455 const int r= src[i+2]; | |
456 | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
457 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); |
2718 | 458 } |
2740 | 459 #endif |
2718 | 460 } |
461 | |
3132 | 462 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size) |
2718 | 463 { |
2741 | 464 #ifdef HAVE_MMX |
465 const uint8_t *s = src; | |
466 const uint8_t *end,*mm_end; | |
467 uint16_t *d = (uint16_t *)dst; | |
468 end = s + src_size; | |
469 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2)); | |
470 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
471 __asm __volatile( | |
472 "movq %0, %%mm7\n\t" | |
473 "movq %1, %%mm6\n\t" | |
474 ::"m"(red_15mask),"m"(green_15mask)); | |
475 if(mm_end == end) mm_end -= MMREG_SIZE*2; | |
476 while(s < mm_end) | |
477 { | |
478 __asm __volatile( | |
479 PREFETCH" 32%1\n\t" | |
480 "movd %1, %%mm0\n\t" | |
481 "movd 3%1, %%mm3\n\t" | |
482 "punpckldq 6%1, %%mm0\n\t" | |
483 "punpckldq 9%1, %%mm3\n\t" | |
484 "movq %%mm0, %%mm1\n\t" | |
485 "movq %%mm0, %%mm2\n\t" | |
486 "movq %%mm3, %%mm4\n\t" | |
487 "movq %%mm3, %%mm5\n\t" | |
488 "psrlq $3, %%mm0\n\t" | |
489 "psrlq $3, %%mm3\n\t" | |
490 "pand %2, %%mm0\n\t" | |
491 "pand %2, %%mm3\n\t" | |
492 "psrlq $6, %%mm1\n\t" | |
493 "psrlq $6, %%mm4\n\t" | |
494 "pand %%mm6, %%mm1\n\t" | |
495 "pand %%mm6, %%mm4\n\t" | |
496 "psrlq $9, %%mm2\n\t" | |
497 "psrlq $9, %%mm5\n\t" | |
498 "pand %%mm7, %%mm2\n\t" | |
499 "pand %%mm7, %%mm5\n\t" | |
500 "por %%mm1, %%mm0\n\t" | |
501 "por %%mm4, %%mm3\n\t" | |
502 "por %%mm2, %%mm0\n\t" | |
503 "por %%mm5, %%mm3\n\t" | |
504 "psllq $16, %%mm3\n\t" | |
505 "por %%mm3, %%mm0\n\t" | |
506 MOVNTQ" %%mm0, %0\n\t" | |
507 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | |
508 d += 4; | |
509 s += 12; | |
510 } | |
511 while(s < end) | |
512 { | |
513 const int b= *s++; | |
514 const int g= *s++; | |
515 const int r= *s++; | |
516 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
517 } | |
518 __asm __volatile(SFENCE:::"memory"); | |
519 __asm __volatile(EMMS:::"memory"); | |
520 #else | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
521 unsigned j,i,num_pixels=src_size/3; |
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
522 uint16_t *d = (uint16_t *)dst; |
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
523 for(i=0,j=0; j<num_pixels; i+=3,j++) |
2718 | 524 { |
525 const int b= src[i+0]; | |
526 const int g= src[i+1]; | |
527 const int r= src[i+2]; | |
528 | |
2720 | 529 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); |
2718 | 530 } |
2741 | 531 #endif |
2718 | 532 } |
2694 | 533 |
3132 | 534 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size) |
2755 | 535 { |
536 int num_pixels= src_size >> 2; | |
537 #ifdef HAVE_MMX | |
538 asm volatile ( | |
539 "xorl %%eax, %%eax \n\t" | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
540 ".balign 16 \n\t" |
2755 | 541 "1: \n\t" |
542 PREFETCH" 32(%0, %%eax) \n\t" | |
543 "movq (%0, %%eax), %%mm0 \n\t" | |
544 "movq %%mm0, %%mm1 \n\t" | |
545 "movq %%mm0, %%mm2 \n\t" | |
546 "pslld $16, %%mm0 \n\t" | |
547 "psrld $16, %%mm1 \n\t" | |
4923 | 548 "pand "MANGLE(mask32r)", %%mm0 \n\t" |
549 "pand "MANGLE(mask32g)", %%mm2 \n\t" | |
550 "pand "MANGLE(mask32b)", %%mm1 \n\t" | |
2755 | 551 "por %%mm0, %%mm2 \n\t" |
552 "por %%mm1, %%mm2 \n\t" | |
553 MOVNTQ" %%mm2, (%1, %%eax) \n\t" | |
554 "addl $2, %%eax \n\t" | |
555 "cmpl %2, %%eax \n\t" | |
556 " jb 1b \n\t" | |
557 :: "r" (src), "r"(dst), "r" (num_pixels) | |
558 : "%eax" | |
559 ); | |
2766 | 560 |
561 __asm __volatile(SFENCE:::"memory"); | |
562 __asm __volatile(EMMS:::"memory"); | |
2755 | 563 #else |
564 int i; | |
565 for(i=0; i<num_pixels; i++) | |
566 { | |
567 dst[4*i + 0] = src[4*i + 2]; | |
568 dst[4*i + 1] = src[4*i + 1]; | |
569 dst[4*i + 2] = src[4*i + 0]; | |
570 } | |
571 #endif | |
572 } | |
573 | |
/**
 * Packs planar YV12 (4:2:0) into YUY2: every 4 output bytes are Y0 U Y1 V,
 * built from two luma samples and one sample of each chroma plane.
 *
 * The chroma pointers only advance after each odd line, so one chroma line is
 * reused for two consecutive luma lines.
 *
 * height should be a multiple of 2 and width should be a multiple of 16: the
 * MMX loop converts 16 pixels (8 chroma samples) per iteration and always
 * executes at least once per line.
 *
 * lumStride, chromStride and dstStride are the byte distances between
 * consecutive lines of ysrc, of usrc/vsrc and of dst respectively.
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	int y;
	const int chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		/* eax counts chroma samples; luma is addressed with scale 2, dst with scale 4 */
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 32(%1, %%eax, 2) \n\t"
			PREFETCH" 32(%2, %%eax) \n\t"
			PREFETCH" 32(%3, %%eax) \n\t"
			"movq (%2, %%eax), %%mm0 \n\t" // U(0)
			"movq %%mm0, %%mm2 \n\t" // U(0)
			"movq (%3, %%eax), %%mm1 \n\t" // V(0)
			"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
			"movq %%mm3, %%mm4 \n\t" // Y(0)
			"movq %%mm5, %%mm6 \n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
			/* "memory": the asm stores through dst, which is only passed as an input */
			: "memory", "%eax"
		);
#else
		int i;
		for(i=0; i<chromWidth; i++)
		{
			dst[4*i+0] = ysrc[2*i+0];
			dst[4*i+1] = usrc[i];
			dst[4*i+2] = ysrc[2*i+1];
			dst[4*i+3] = vsrc[i];
		}
#endif
		/* move to the next chroma line only after an odd luma line */
		if(y&1)
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	asm( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
646 | |
/**
 * Splits packed YUY2 (byte order Y0 U Y1 V) into planar YV12.
 *
 * Two source lines are consumed per loop iteration: the first one yields luma
 * and chroma, the second one luma only, so the chroma of every second line is
 * dropped.
 *
 * height should be a multiple of 2 and width should be a multiple of 16: the
 * MMX loops convert 16 pixels per iteration and always execute at least once.
 *
 * lumStride, chromStride and srcStride are the byte distances between
 * consecutive lines of ydst, of udst/vdst and of src respectively.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	int y;
	const int chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* first line of the pair: extract Y, U and V */
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			"pcmpeqw %%mm7, %%mm7 \n\t"
			"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

			"movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

			"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax) \n\t"
			MOVNTQ" %%mm2, (%2, %%eax) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src += srcStride;

		/* second line of the pair: luma only */
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			/* rebuild the mask here: register contents are not guaranteed to
			   survive from the previous asm statement */
			"pcmpeqw %%mm7, %%mm7 \n\t"
			"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		int i;
		/* first line of the pair: extract Y, U and V */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] = src[4*i+0];
			udst[i] = src[4*i+1];
			ydst[2*i+1] = src[4*i+2];
			vdst[i] = src[4*i+3];
		}
		ydst += lumStride;
		src += srcStride;

		/* second line of the pair: luma only */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] = src[4*i+0];
			ydst[2*i+1] = src[4*i+2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src += srcStride;
	}
#ifdef HAVE_MMX
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
2801 | 771 |
772 /** | |
 * Splits packed UYVY (byte order U Y0 V Y1) into planar YV12: luma is written
 * to ydst, chroma to udst and vdst.
 * Two source lines are consumed per loop iteration; the first yields luma and
 * chroma, the second luma only.
 * lumStride, chromStride and srcStride are added to ydst, udst/vdst and src
 * respectively to step from one line to the next.
773 * | |
774 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
775 * problem for anyone then tell me, and I'll fix it) | |
3132 | 776 * chrominance data is only taken from every second line, others are ignored. FIXME write HQ version |
2801 | 777 */ | |
3132 | 778 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2801 | 779 unsigned int width, unsigned int height, |
780 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) | |
781 { | |
782 int y; | |
783 const int chromWidth= width>>1; | |
784 for(y=0; y<height; y+=2) | |
785 { | |
2847 | 786 #ifdef HAVE_MMX | |
/* First line of the pair: extract Y, U and V. eax counts chroma samples and
   advances by 8 per iteration, i.e. 32 source bytes / 16 pixels. */
787 asm volatile( | |
788 "xorl %%eax, %%eax \n\t" | |
789 "pcmpeqw %%mm7, %%mm7 \n\t" | |
790 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
791 ".balign 16 \n\t" | |
792 "1: \n\t" | |
793 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
794 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | |
795 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | |
796 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | |
797 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | |
798 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | |
799 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | |
800 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
801 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
802 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
803 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
804 | |
805 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | |
806 | |
807 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) | |
808 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) | |
809 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | |
810 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | |
811 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | |
812 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | |
813 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
814 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
815 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
816 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
817 | |
818 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | |
819 | |
/* separate the interleaved UVUV words into one U and one V quadword */
820 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
821 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
822 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
823 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
824 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
825 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
826 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
827 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
828 | |
829 MOVNTQ" %%mm0, (%3, %%eax) \n\t" | |
830 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | |
831 | |
832 "addl $8, %%eax \n\t" | |
833 "cmpl %4, %%eax \n\t" | |
834 " jb 1b \n\t" | |
835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) | |
836 : "memory", "%eax" | |
837 ); | |
838 | |
839 ydst += lumStride; | |
840 src += srcStride; | |
841 | |
/* Second line of the pair: only the luma bytes (high byte of each word) are
   kept; the U and V bytes of this line are shifted out and never stored. */
842 asm volatile( | |
843 "xorl %%eax, %%eax \n\t" | |
844 ".balign 16 \n\t" | |
845 "1: \n\t" | |
846 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
847 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | |
848 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | |
849 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8) | |
850 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12) | |
851 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
852 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
853 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
854 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
855 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
856 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
857 | |
858 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | |
859 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | |
860 | |
861 "addl $8, %%eax \n\t" | |
862 "cmpl %4, %%eax \n\t" | |
863 " jb 1b \n\t" | |
864 | |
865 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) | |
866 : "memory", "%eax" | |
867 ); | |
868 #else | |
2801 | 869 int i; |
/* first line of the pair: U, Y0, V, Y1 are taken from bytes 0..3 of each group */
870 for(i=0; i<chromWidth; i++) | |
871 { | |
872 udst[i] = src[4*i+0]; | |
873 ydst[2*i+0] = src[4*i+1]; | |
874 vdst[i] = src[4*i+2]; | |
875 ydst[2*i+1] = src[4*i+3]; | |
876 } | |
877 ydst += lumStride; | |
878 src += srcStride; | |
879 | |
/* second line of the pair: luma only, chroma bytes are skipped */
880 for(i=0; i<chromWidth; i++) | |
881 { | |
882 ydst[2*i+0] = src[4*i+1]; | |
883 ydst[2*i+1] = src[4*i+3]; | |
884 } | |
2847 | 885 #endif |
/* one chroma line was produced for the two luma lines handled above */
2801 | 886 udst += chromStride; |
887 vdst += chromStride; | |
888 ydst += lumStride; | |
889 src += srcStride; | |
890 } | |
2847 | 891 #ifdef HAVE_MMX |
/* issue the file's EMMS and SFENCE macros once, after all lines are converted */
892 asm volatile( EMMS" \n\t" | |
893 SFENCE" \n\t" | |
894 :::"memory"); | |
895 #endif | |
2801 | 896 } |
897 | |
3132 | 898 /** |
899 * | |
900 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
901 * problem for anyone then tell me, and ill fix it) | |
4622 | 902 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version |
3132 | 903 */ |
904 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
905 unsigned int width, unsigned int height, | |
906 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) | |
907 { | |
908 int y; | |
909 const int chromWidth= width>>1; | |
4622 | 910 #ifdef HAVE_MMX |
911 for(y=0; y<height-2; y+=2) | |
912 { | |
913 int i; | |
914 for(i=0; i<2; i++) | |
915 { | |
916 asm volatile( | |
917 "movl %2, %%eax \n\t" | |
4923 | 918 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
919 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4622 | 920 "pxor %%mm7, %%mm7 \n\t" |
921 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
922 ".balign 16 \n\t" | |
923 "1: \n\t" | |
924 PREFETCH" 64(%0, %%ebx) \n\t" | |
925 "movd (%0, %%ebx), %%mm0 \n\t" | |
926 "movd 3(%0, %%ebx), %%mm1 \n\t" | |
927 "punpcklbw %%mm7, %%mm0 \n\t" | |
928 "punpcklbw %%mm7, %%mm1 \n\t" | |
929 "movd 6(%0, %%ebx), %%mm2 \n\t" | |
930 "movd 9(%0, %%ebx), %%mm3 \n\t" | |
931 "punpcklbw %%mm7, %%mm2 \n\t" | |
932 "punpcklbw %%mm7, %%mm3 \n\t" | |
933 "pmaddwd %%mm6, %%mm0 \n\t" | |
934 "pmaddwd %%mm6, %%mm1 \n\t" | |
935 "pmaddwd %%mm6, %%mm2 \n\t" | |
936 "pmaddwd %%mm6, %%mm3 \n\t" | |
937 #ifndef FAST_BGR2YV12 | |
938 "psrad $8, %%mm0 \n\t" | |
939 "psrad $8, %%mm1 \n\t" | |
940 "psrad $8, %%mm2 \n\t" | |
941 "psrad $8, %%mm3 \n\t" | |
942 #endif | |
943 "packssdw %%mm1, %%mm0 \n\t" | |
944 "packssdw %%mm3, %%mm2 \n\t" | |
945 "pmaddwd %%mm5, %%mm0 \n\t" | |
946 "pmaddwd %%mm5, %%mm2 \n\t" | |
947 "packssdw %%mm2, %%mm0 \n\t" | |
948 "psraw $7, %%mm0 \n\t" | |
949 | |
950 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
951 "movd 15(%0, %%ebx), %%mm1 \n\t" | |
952 "punpcklbw %%mm7, %%mm4 \n\t" | |
953 "punpcklbw %%mm7, %%mm1 \n\t" | |
954 "movd 18(%0, %%ebx), %%mm2 \n\t" | |
955 "movd 21(%0, %%ebx), %%mm3 \n\t" | |
956 "punpcklbw %%mm7, %%mm2 \n\t" | |
957 "punpcklbw %%mm7, %%mm3 \n\t" | |
958 "pmaddwd %%mm6, %%mm4 \n\t" | |
959 "pmaddwd %%mm6, %%mm1 \n\t" | |
960 "pmaddwd %%mm6, %%mm2 \n\t" | |
961 "pmaddwd %%mm6, %%mm3 \n\t" | |
962 #ifndef FAST_BGR2YV12 | |
963 "psrad $8, %%mm4 \n\t" | |
964 "psrad $8, %%mm1 \n\t" | |
965 "psrad $8, %%mm2 \n\t" | |
966 "psrad $8, %%mm3 \n\t" | |
967 #endif | |
968 "packssdw %%mm1, %%mm4 \n\t" | |
969 "packssdw %%mm3, %%mm2 \n\t" | |
970 "pmaddwd %%mm5, %%mm4 \n\t" | |
971 "pmaddwd %%mm5, %%mm2 \n\t" | |
972 "addl $24, %%ebx \n\t" | |
973 "packssdw %%mm2, %%mm4 \n\t" | |
974 "psraw $7, %%mm4 \n\t" | |
975 | |
976 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 977 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
4622 | 978 |
979 MOVNTQ" %%mm0, (%1, %%eax) \n\t" | |
980 "addl $8, %%eax \n\t" | |
981 " js 1b \n\t" | |
982 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) | |
983 : "%eax", "%ebx" | |
984 ); | |
985 ydst += lumStride; | |
986 src += srcStride; | |
987 } | |
988 src -= srcStride*2; | |
989 asm volatile( | |
990 "movl %4, %%eax \n\t" | |
4923 | 991 "movq "MANGLE(w1111)", %%mm5 \n\t" |
992 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4622 | 993 "pxor %%mm7, %%mm7 \n\t" |
994 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
995 "addl %%ebx, %%ebx \n\t" | |
996 ".balign 16 \n\t" | |
997 "1: \n\t" | |
998 PREFETCH" 64(%0, %%ebx) \n\t" | |
999 PREFETCH" 64(%1, %%ebx) \n\t" | |
1000 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1001 "movq (%0, %%ebx), %%mm0 \n\t" | |
1002 "movq (%1, %%ebx), %%mm1 \n\t" | |
1003 "movq 6(%0, %%ebx), %%mm2 \n\t" | |
1004 "movq 6(%1, %%ebx), %%mm3 \n\t" | |
1005 PAVGB" %%mm1, %%mm0 \n\t" | |
1006 PAVGB" %%mm3, %%mm2 \n\t" | |
1007 "movq %%mm0, %%mm1 \n\t" | |
1008 "movq %%mm2, %%mm3 \n\t" | |
1009 "psrlq $24, %%mm0 \n\t" | |
1010 "psrlq $24, %%mm2 \n\t" | |
1011 PAVGB" %%mm1, %%mm0 \n\t" | |
1012 PAVGB" %%mm3, %%mm2 \n\t" | |
1013 "punpcklbw %%mm7, %%mm0 \n\t" | |
1014 "punpcklbw %%mm7, %%mm2 \n\t" | |
1015 #else | |
1016 "movd (%0, %%ebx), %%mm0 \n\t" | |
1017 "movd (%1, %%ebx), %%mm1 \n\t" | |
1018 "movd 3(%0, %%ebx), %%mm2 \n\t" | |
1019 "movd 3(%1, %%ebx), %%mm3 \n\t" | |
1020 "punpcklbw %%mm7, %%mm0 \n\t" | |
1021 "punpcklbw %%mm7, %%mm1 \n\t" | |
1022 "punpcklbw %%mm7, %%mm2 \n\t" | |
1023 "punpcklbw %%mm7, %%mm3 \n\t" | |
1024 "paddw %%mm1, %%mm0 \n\t" | |
1025 "paddw %%mm3, %%mm2 \n\t" | |
1026 "paddw %%mm2, %%mm0 \n\t" | |
1027 "movd 6(%0, %%ebx), %%mm4 \n\t" | |
1028 "movd 6(%1, %%ebx), %%mm1 \n\t" | |
1029 "movd 9(%0, %%ebx), %%mm2 \n\t" | |
1030 "movd 9(%1, %%ebx), %%mm3 \n\t" | |
1031 "punpcklbw %%mm7, %%mm4 \n\t" | |
1032 "punpcklbw %%mm7, %%mm1 \n\t" | |
1033 "punpcklbw %%mm7, %%mm2 \n\t" | |
1034 "punpcklbw %%mm7, %%mm3 \n\t" | |
1035 "paddw %%mm1, %%mm4 \n\t" | |
1036 "paddw %%mm3, %%mm2 \n\t" | |
1037 "paddw %%mm4, %%mm2 \n\t" | |
1038 "psrlw $2, %%mm0 \n\t" | |
1039 "psrlw $2, %%mm2 \n\t" | |
1040 #endif | |
4923 | 1041 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1042 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 1043 |
1044 "pmaddwd %%mm0, %%mm1 \n\t" | |
1045 "pmaddwd %%mm2, %%mm3 \n\t" | |
1046 "pmaddwd %%mm6, %%mm0 \n\t" | |
1047 "pmaddwd %%mm6, %%mm2 \n\t" | |
1048 #ifndef FAST_BGR2YV12 | |
1049 "psrad $8, %%mm0 \n\t" | |
1050 "psrad $8, %%mm1 \n\t" | |
1051 "psrad $8, %%mm2 \n\t" | |
1052 "psrad $8, %%mm3 \n\t" | |
1053 #endif | |
1054 "packssdw %%mm2, %%mm0 \n\t" | |
1055 "packssdw %%mm3, %%mm1 \n\t" | |
1056 "pmaddwd %%mm5, %%mm0 \n\t" | |
1057 "pmaddwd %%mm5, %%mm1 \n\t" | |
1058 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1059 "psraw $7, %%mm0 \n\t" | |
1060 | |
1061 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1062 "movq 12(%0, %%ebx), %%mm4 \n\t" | |
1063 "movq 12(%1, %%ebx), %%mm1 \n\t" | |
1064 "movq 18(%0, %%ebx), %%mm2 \n\t" | |
1065 "movq 18(%1, %%ebx), %%mm3 \n\t" | |
1066 PAVGB" %%mm1, %%mm4 \n\t" | |
1067 PAVGB" %%mm3, %%mm2 \n\t" | |
1068 "movq %%mm4, %%mm1 \n\t" | |
1069 "movq %%mm2, %%mm3 \n\t" | |
1070 "psrlq $24, %%mm4 \n\t" | |
1071 "psrlq $24, %%mm2 \n\t" | |
1072 PAVGB" %%mm1, %%mm4 \n\t" | |
1073 PAVGB" %%mm3, %%mm2 \n\t" | |
1074 "punpcklbw %%mm7, %%mm4 \n\t" | |
1075 "punpcklbw %%mm7, %%mm2 \n\t" | |
1076 #else | |
1077 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1078 "movd 12(%1, %%ebx), %%mm1 \n\t" | |
1079 "movd 15(%0, %%ebx), %%mm2 \n\t" | |
1080 "movd 15(%1, %%ebx), %%mm3 \n\t" | |
1081 "punpcklbw %%mm7, %%mm4 \n\t" | |
1082 "punpcklbw %%mm7, %%mm1 \n\t" | |
1083 "punpcklbw %%mm7, %%mm2 \n\t" | |
1084 "punpcklbw %%mm7, %%mm3 \n\t" | |
1085 "paddw %%mm1, %%mm4 \n\t" | |
1086 "paddw %%mm3, %%mm2 \n\t" | |
1087 "paddw %%mm2, %%mm4 \n\t" | |
1088 "movd 18(%0, %%ebx), %%mm5 \n\t" | |
1089 "movd 18(%1, %%ebx), %%mm1 \n\t" | |
1090 "movd 21(%0, %%ebx), %%mm2 \n\t" | |
1091 "movd 21(%1, %%ebx), %%mm3 \n\t" | |
1092 "punpcklbw %%mm7, %%mm5 \n\t" | |
1093 "punpcklbw %%mm7, %%mm1 \n\t" | |
1094 "punpcklbw %%mm7, %%mm2 \n\t" | |
1095 "punpcklbw %%mm7, %%mm3 \n\t" | |
1096 "paddw %%mm1, %%mm5 \n\t" | |
1097 "paddw %%mm3, %%mm2 \n\t" | |
1098 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 1099 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4622 | 1100 "psrlw $2, %%mm4 \n\t" |
1101 "psrlw $2, %%mm2 \n\t" | |
1102 #endif | |
4923 | 1103 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1104 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 1105 |
1106 "pmaddwd %%mm4, %%mm1 \n\t" | |
1107 "pmaddwd %%mm2, %%mm3 \n\t" | |
1108 "pmaddwd %%mm6, %%mm4 \n\t" | |
1109 "pmaddwd %%mm6, %%mm2 \n\t" | |
1110 #ifndef FAST_BGR2YV12 | |
1111 "psrad $8, %%mm4 \n\t" | |
1112 "psrad $8, %%mm1 \n\t" | |
1113 "psrad $8, %%mm2 \n\t" | |
1114 "psrad $8, %%mm3 \n\t" | |
1115 #endif | |
1116 "packssdw %%mm2, %%mm4 \n\t" | |
1117 "packssdw %%mm3, %%mm1 \n\t" | |
1118 "pmaddwd %%mm5, %%mm4 \n\t" | |
1119 "pmaddwd %%mm5, %%mm1 \n\t" | |
1120 "addl $24, %%ebx \n\t" | |
1121 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |
1122 "psraw $7, %%mm4 \n\t" | |
1123 | |
1124 "movq %%mm0, %%mm1 \n\t" | |
1125 "punpckldq %%mm4, %%mm0 \n\t" | |
1126 "punpckhdq %%mm4, %%mm1 \n\t" | |
1127 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 1128 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
4622 | 1129 |
1130 "movd %%mm0, (%2, %%eax) \n\t" | |
1131 "punpckhdq %%mm0, %%mm0 \n\t" | |
1132 "movd %%mm0, (%3, %%eax) \n\t" | |
1133 "addl $4, %%eax \n\t" | |
1134 " js 1b \n\t" | |
1135 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width) | |
1136 : "%eax", "%ebx" | |
1137 ); | |
1138 | |
1139 udst += chromStride; | |
1140 vdst += chromStride; | |
1141 src += srcStride*2; | |
1142 } | |
1143 | |
1144 asm volatile( EMMS" \n\t" | |
1145 SFENCE" \n\t" | |
1146 :::"memory"); | |
1147 #else | |
1148 y=0; | |
1149 #endif | |
1150 for(; y<height; y+=2) | |
3132 | 1151 { |
1152 int i; | |
1153 for(i=0; i<chromWidth; i++) | |
1154 { | |
1155 unsigned int b= src[6*i+0]; | |
1156 unsigned int g= src[6*i+1]; | |
1157 unsigned int r= src[6*i+2]; | |
2801 | 1158 |
3633 | 1159 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
1160 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
1161 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
3132 | 1162 |
1163 udst[i] = U; | |
1164 vdst[i] = V; | |
1165 ydst[2*i] = Y; | |
1166 | |
1167 b= src[6*i+3]; | |
1168 g= src[6*i+4]; | |
1169 r= src[6*i+5]; | |
1170 | |
3633 | 1171 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1172 ydst[2*i+1] = Y; |
1173 } | |
1174 ydst += lumStride; | |
1175 src += srcStride; | |
1176 | |
1177 for(i=0; i<chromWidth; i++) | |
1178 { | |
1179 unsigned int b= src[6*i+0]; | |
1180 unsigned int g= src[6*i+1]; | |
1181 unsigned int r= src[6*i+2]; | |
1182 | |
3633 | 1183 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1184 |
1185 ydst[2*i] = Y; | |
1186 | |
1187 b= src[6*i+3]; | |
1188 g= src[6*i+4]; | |
1189 r= src[6*i+5]; | |
1190 | |
3633 | 1191 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1192 ydst[2*i+1] = Y; |
1193 } | |
1194 udst += chromStride; | |
1195 vdst += chromStride; | |
1196 ydst += lumStride; | |
1197 src += srcStride; | |
1198 } | |
1199 } | |
5337 | 1200 |
/**
 * Interleaves two byte planes line by line:
 * dest[2*w] = src1[w] and dest[2*w+1] = src2[w] for w in [0, width).
 *
 * height lines are processed; src1Stride, src2Stride and dstStride are the
 * byte distances between consecutive lines of src1, src2 and dest.
 *
 * With HAVE_MMX the bulk of each line is done 16 input bytes at a time in
 * assembly and the remaining width%16 bytes in C. The SSE2 variant uses
 * movdqa/movntdq, which require 16-byte aligned addresses.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
	int width, int height, int src1Stride, int src2Stride, int dstStride){
	int h;

	for(h=0; h < height; h++)
	{
		int w;

#ifdef HAVE_MMX
		/* The asm loops always run at least once and compare against width-15
		   with an unsigned jump, so they must be skipped entirely when the
		   line is shorter than one 16 byte chunk. */
		if(width >= 16)
		{
#ifdef HAVE_SSE2
			asm(
				"xorl %%eax, %%eax \n\t"
				"1: \n\t"
				PREFETCH" 64(%1, %%eax) \n\t"
				PREFETCH" 64(%2, %%eax) \n\t"
				"movdqa (%1, %%eax), %%xmm0 \n\t"
				"movdqa (%1, %%eax), %%xmm1 \n\t"
				"movdqa (%2, %%eax), %%xmm2 \n\t"
				"punpcklbw %%xmm2, %%xmm0 \n\t"
				"punpckhbw %%xmm2, %%xmm1 \n\t"
				"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
				"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
				"addl $16, %%eax \n\t"
				"cmpl %3, %%eax \n\t"
				" jb 1b \n\t"
				::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
				: "memory", "%eax"
			);
#else
			asm(
				"xorl %%eax, %%eax \n\t"
				"1: \n\t"
				PREFETCH" 64(%1, %%eax) \n\t"
				PREFETCH" 64(%2, %%eax) \n\t"
				"movq (%1, %%eax), %%mm0 \n\t"
				"movq 8(%1, %%eax), %%mm2 \n\t"
				"movq %%mm0, %%mm1 \n\t"
				"movq %%mm2, %%mm3 \n\t"
				"movq (%2, %%eax), %%mm4 \n\t"
				"movq 8(%2, %%eax), %%mm5 \n\t"
				"punpcklbw %%mm4, %%mm0 \n\t"
				"punpckhbw %%mm4, %%mm1 \n\t"
				"punpcklbw %%mm5, %%mm2 \n\t"
				"punpckhbw %%mm5, %%mm3 \n\t"
				MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
				MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
				MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
				MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
				"addl $16, %%eax \n\t"
				"cmpl %3, %%eax \n\t"
				" jb 1b \n\t"
				::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
				: "memory", "%eax"
			);
#endif
		}
		/* tail: the bytes after the last complete 16 byte chunk */
		for(w= (width&(~15)); w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#else
		for(w=0; w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#endif
		dest += dstStride;
		src1 += src1Stride;
		src2 += src2Stride;
	}
#ifdef HAVE_MMX
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
	);
#endif
}