Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 5677:a21cab74cde8
bitrate>16000 means bits not kbits - noticed by George Hawkins <george_hawkins@yahoo.com>
author | arpi |
---|---|
date | Thu, 18 Apr 2002 15:23:34 +0000 |
parents | f0fa3373f616 |
children | f4f3cfcd0d64 |
rev | line source |
---|---|
2694 | 1 /* |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
2 * |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
3 * rgb2rgb.c, Software RGB to RGB convertor |
2732 | 4 * pluralize by Software PAL8 to RGB convertor |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
7 * Written by Nick Kurshev. |
3132 | 8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL) |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
9 */ |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
10 |
/*
 * CPU specific helper macros used by the inline assembly below.
 * They are #undef'ed first, presumably so that this template can be
 * included more than once with different HAVE_* switches (TODO confirm
 * against the including file).
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Size in bytes of the widest SIMD register selected by the HAVE_* switches. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch / byte-average mnemonics. Without 3DNow! or MMX2 the prefetch
 * macros expand to an assembler comment, so the rest of the line (the
 * operand) is ignored. '#' is used instead of '/' because GNU as stops
 * treating '/' as a comment character when --divide is given.
 * Note: PAVGB is intentionally left undefined in the plain fallback.
 */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB	  "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB	  "pavgb"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal store plus store fence when MMX2 is available, plain movq otherwise. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
52 | |
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * Each group of three source bytes is copied unchanged and followed by a
 * zero byte. A trailing partial pixel (src_size not a multiple of 3) is
 * ignored instead of being read past the end of the buffer.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, must hold 4 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	__asm __volatile("movq	%0, %%mm7"::"m"(mask32):"memory");
	/*
	 * One pass converts 8 pixels (24 bytes in, 32 bytes out). The last
	 * load (movd at offset 21) touches source byte 24, so 25 readable
	 * bytes are required; whatever is left goes through the scalar loop.
	 */
	while(end - s >= 25)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"punpckldq 3%1, %%mm0\n\t"
			"movd	6%1, %%mm1\n\t"
			"punpckldq 9%1, %%mm1\n\t"
			"movd	12%1, %%mm2\n\t"
			"punpckldq 15%1, %%mm2\n\t"
			"movd	18%1, %%mm3\n\t"
			"punpckldq 21%1, %%mm3\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm1\n\t"
			"pand	%%mm7, %%mm2\n\t"
			"pand	%%mm7, %%mm3\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm1, 8%0\n\t"
			MOVNTQ"	%%mm2, 16%0\n\t"
			MOVNTQ"	%%mm3, 24%0"
			:"=m"(*dest)
			:"m"(*s)
			:"memory");
		dest += 32;
		s += 24;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: whole pixels only. */
	while(end - s >= 3)
	{
		dest[0] = s[0];
		dest[1] = s[1];
		dest[2] = s[2];
		dest[3] = 0;
		dest += 4;
		s += 3;
	}
}
2505 | 104 |
/**
 * Pack 4-byte pixels into 3-byte pixels by dropping every fourth byte.
 *
 * A trailing partial pixel (src_size not a multiple of 4) is ignored
 * instead of being read past the end of the buffer.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, must hold 3 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	/*
	 * One pass reads 8 pixels (32 bytes) and stores 24 bytes, so it may
	 * only run while 32 source bytes remain; the rest is handled by the
	 * scalar loop below.
	 */
	while(end - s >= 32)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm1\n\t"
			"movq	16%1, %%mm4\n\t"
			"movq	24%1, %%mm5\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm1, %%mm3\n\t"
			"movq	%%mm4, %%mm6\n\t"
			"movq	%%mm5, %%mm7\n\t"
			"psrlq	$8, %%mm2\n\t"
			"psrlq	$8, %%mm3\n\t"
			"psrlq	$8, %%mm6\n\t"
			"psrlq	$8, %%mm7\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%2, %%mm1\n\t"
			"pand	%2, %%mm4\n\t"
			"pand	%2, %%mm5\n\t"
			"pand	%3, %%mm2\n\t"
			"pand	%3, %%mm3\n\t"
			"pand	%3, %%mm6\n\t"
			"pand	%3, %%mm7\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm3, %%mm1\n\t"
			"por	%%mm6, %%mm4\n\t"
			"por	%%mm7, %%mm5\n\t"

			/* merge the four partially filled registers into three full ones */
			"movq	%%mm1, %%mm2\n\t"
			"movq	%%mm4, %%mm3\n\t"
			"psllq	$48, %%mm2\n\t"
			"psllq	$32, %%mm3\n\t"
			"pand	%4, %%mm2\n\t"
			"pand	%5, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psrlq	$16, %%mm1\n\t"
			"psrlq	$32, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm3, %%mm1\n\t"
			"pand	%6, %%mm5\n\t"
			"por	%%mm5, %%mm4\n\t"

			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm1, 8%0\n\t"
			MOVNTQ"	%%mm4, 16%0"
			:"=m"(*dest)
			:"m"(*s),"m"(mask24l),
			 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		dest += 24;
		s += 32;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: whole pixels only, fourth byte of each pixel is skipped. */
	while(end - s >= 4)
	{
		dest[0] = s[0];
		dest[1] = s[1];
		dest[2] = s[2];
		dest += 3;
		s += 4;
	}
}
2506 | 181 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
182 /* |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
183 Original by Strepto/Astral |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
184 ported to gcc & bugfixed : A'rpi |
2564 | 185 MMX2, 3DNOW optimization by Nick Kurshev |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
186 32bit c version, and and&add trick by Michael Niedermayer |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
187 */ |
/**
 * Convert 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
 * (rrrrrggggggbbbbb) by moving the upper ten bits one position to the left.
 *
 * The shift is done with the "and & add" trick: adding the masked upper
 * bits to the value doubles them, which equals a left shift by one.
 * A trailing odd byte (src_size not a multiple of 2) is ignored.
 *
 * @param src      source buffer, src_size bytes, accessed as 16/32-bit
 *                 words (same alignment expectation as the original code)
 * @param dst      destination buffer, at least src_size bytes
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	const uint8_t *s = src;
	uint8_t *d = dst;
	const uint8_t *end = src + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s));
	__asm __volatile(
		"movq	%0, %%mm4\n\t"
		::"m"(mask15s));
	/* 16 bytes (8 pixels) per pass; only whole chunks, remainder handled below. */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm2\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm2, %%mm3\n\t"
			"pand	%%mm4, %%mm0\n\t"
			"pand	%%mm4, %%mm2\n\t"
			"paddw	%%mm1, %%mm0\n\t"
			"paddw	%%mm3, %%mm2\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			:"memory");
		d += 16;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Two pixels per 32-bit word. Unsigned arithmetic: the sum can reach
	   0xFFDFFFDF, which would overflow a signed int. The low half never
	   carries into the high half (0x7FFF + 0x7FE0 = 0xFFDF). */
	while(end - s >= 4)
	{
		const uint32_t x = *(const uint32_t *)s;
		*(uint32_t *)d = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
		d += 4;
		s += 4;
	}
	/* Odd pixel count: convert the last single pixel as well. */
	if(end - s >= 2)
	{
		const unsigned x = *(const uint16_t *)s;
		*(uint16_t *)d = (uint16_t)((x&0x7FFF) + (x&0x7FE0));
	}
}
2694 | 247 |
/**
 * Convert 4-byte pixels to 16-bit pixels (5 bits / 6 bits / 5 bits).
 *
 * For every pixel the bytes at offsets 0, 1 and 2 are read as b, g and r;
 * the byte at offset 3 is ignored. The result is
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8). A trailing partial pixel is
 * ignored.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, written as uint16_t, 2 bytes per pixel
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/* 4 pixels (16 bytes in, 8 bytes out) per pass, whole chunks only. */
	while(end - s >= 16)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	4%1, %%mm3\n\t"
		"punpckldq 8%1, %%mm0\n\t"
		"punpckldq 12%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		"psrlq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm3\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm3\n\t"
		"psrlq	$5, %%mm1\n\t"
		"psrlq	$5, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		"psrlq	$8, %%mm2\n\t"
		"psrlq	$8, %%mm5\n\t"
		"pand	%%mm7, %%mm2\n\t"
		"pand	%%mm7, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path, also used for the pixels left over by the MMX loop.
	   The source pointer advances 4 bytes per pixel so the fourth byte
	   is skipped. */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s += 4;
	}
}
318 | |
/**
 * Convert 4-byte pixels to 15-bit pixels (5 bits per component).
 *
 * For every pixel the bytes at offsets 0, 1 and 2 are read as b, g and r;
 * the byte at offset 3 is ignored. The result is
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7). A trailing partial pixel is
 * ignored.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, written as uint16_t, 2 bytes per pixel
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	/* 4 pixels (16 bytes in, 8 bytes out) per pass, whole chunks only. */
	while(end - s >= 16)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	4%1, %%mm3\n\t"
		"punpckldq 8%1, %%mm0\n\t"
		"punpckldq 12%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		"psrlq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm3\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm3\n\t"
		"psrlq	$6, %%mm1\n\t"
		"psrlq	$6, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		"psrlq	$9, %%mm2\n\t"
		"psrlq	$9, %%mm5\n\t"
		"pand	%%mm7, %%mm2\n\t"
		"pand	%%mm7, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path, also used for the pixels left over by the MMX loop.
	   The source pointer advances 4 bytes per pixel so the fourth byte
	   is skipped. */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s += 4;
	}
}
389 | |
/**
 * Convert 3-byte pixels to 16-bit pixels (5 bits / 6 bits / 5 bits).
 *
 * For every pixel the bytes at offsets 0, 1 and 2 are read as b, g and r
 * and combined as (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8). A trailing
 * partial pixel is ignored.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, written as uint16_t, 2 bytes per pixel
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/*
	 * 4 pixels (12 bytes) are consumed per pass, but the load at
	 * offset 9 is 4 bytes wide and touches byte 12, so 13 readable
	 * bytes are required. The remainder goes through the scalar loop.
	 */
	while(end - s >= 13)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		"psrlq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm3\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm3\n\t"
		"psrlq	$5, %%mm1\n\t"
		"psrlq	$5, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		"psrlq	$8, %%mm2\n\t"
		"psrlq	$8, %%mm5\n\t"
		"pand	%%mm7, %%mm2\n\t"
		"pand	%%mm7, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path, also used for the pixels left over by the MMX loop. */
	while(end - s >= 3)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s += 3;
	}
}
461 | |
/**
 * Convert 3-byte pixels to 15-bit pixels (5 bits per component).
 *
 * For every pixel the bytes at offsets 0, 1 and 2 are read as b, g and r
 * and combined as (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7). A trailing
 * partial pixel is ignored.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, written as uint16_t, 2 bytes per pixel
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	/*
	 * 4 pixels (12 bytes) are consumed per pass, but the load at
	 * offset 9 is 4 bytes wide and touches byte 12, so 13 readable
	 * bytes are required. The remainder goes through the scalar loop.
	 */
	while(end - s >= 13)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		"psrlq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm3\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm3\n\t"
		"psrlq	$6, %%mm1\n\t"
		"psrlq	$6, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		"psrlq	$9, %%mm2\n\t"
		"psrlq	$9, %%mm5\n\t"
		"pand	%%mm7, %%mm2\n\t"
		"pand	%%mm7, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path, also used for the pixels left over by the MMX loop. */
	while(end - s >= 3)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s += 3;
	}
}
2694 | 533 |
/**
 * Swap the bytes at offsets 0 and 2 of every 4-byte pixel.
 *
 * In the scalar path the byte at offset 1 is copied and the byte at
 * offset 3 of the destination is left untouched; in the MMX path the
 * fourth byte depends on mask32r/mask32g/mask32b, which are defined
 * outside this block. src and dst may be the same buffer. A trailing
 * partial pixel is ignored.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, at least src_size bytes
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	unsigned i = 0;	/* byte offset of the first pixel not yet converted */
#ifdef HAVE_MMX
	/* The asm loop is do-while shaped and handles 8 bytes per pass,
	   so it must not be entered with fewer than 8 bytes. */
	if(src_size >= 8)
	{
		asm volatile (
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%0, %%eax)		\n\t"
			"movq (%0, %%eax), %%mm0	\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm0, %%mm2		\n\t"
			"pslld $16, %%mm0		\n\t"
			"psrld $16, %%mm1		\n\t"
			"pand "MANGLE(mask32r)", %%mm0	\n\t"
			"pand "MANGLE(mask32g)", %%mm2	\n\t"
			"pand "MANGLE(mask32b)", %%mm1	\n\t"
			"por %%mm0, %%mm2		\n\t"
			"por %%mm1, %%mm2		\n\t"
			MOVNTQ" %%mm2, (%1, %%eax)	\n\t"
			/* eax is a byte offset: advance by the 8 bytes just stored
			   and continue while another full 8 bytes are available */
			"addl $8, %%eax			\n\t"
			"cmpl %2, %%eax			\n\t"
			" jb 1b				\n\t"
			:: "r" (src), "r"(dst), "r" (src_size-7)
			: "%eax", "memory"
		);
		i = src_size & ~7u;
	}

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path; the temporary keeps in-place conversion correct. */
	for(; src_size - i >= 4; i += 4)
	{
		const uint8_t x = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
573 | |
/**
 * Swap the first and third byte of every 3-byte pixel.
 *
 * src and dst may be the same buffer. A trailing partial pixel
 * (src_size not a multiple of 3) is ignored.
 *
 * @param src      source buffer, src_size bytes
 * @param dst      destination buffer, at least src_size bytes
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	unsigned i = 0;	/* byte offset of the first pixel not yet converted */
#ifdef HAVE_MMX
	/*
	 * One pass converts 24 bytes, but its last load (8 bytes at offset 18)
	 * reaches byte 25, so a pass needs 26 readable bytes. The loop is
	 * do-while shaped, hence the explicit guard for short inputs.
	 */
	if(src_size >= 26)
	{
		/* eax counts from a negative start value up to >= 0; the base
		   pointers are biased so that base+eax starts at src / dst. */
		const int start = 25 - (int)src_size;
		int mmx_size = start;
		asm volatile (
			"movq "MANGLE(mask24r)", %%mm5	\n\t"
			"movq "MANGLE(mask24g)", %%mm6	\n\t"
			"movq "MANGLE(mask24b)", %%mm7	\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax)		\n\t"
			"movq   (%1, %%eax), %%mm0	\n\t" // BGR BGR BG
			"movq   (%1, %%eax), %%mm1	\n\t" // BGR BGR BG
			"movq  2(%1, %%eax), %%mm2	\n\t" // R BGR BGR B
			"psllq $16, %%mm0		\n\t" // 00 BGR BGR
			"pand %%mm5, %%mm0		\n\t"
			"pand %%mm6, %%mm1		\n\t"
			"pand %%mm7, %%mm2		\n\t"
			"por %%mm0, %%mm1		\n\t"
			"por %%mm2, %%mm1		\n\t"
			"movq  6(%1, %%eax), %%mm0	\n\t" // BGR BGR BG
			MOVNTQ" %%mm1,   (%2, %%eax)	\n\t" // RGB RGB RG
			"movq  8(%1, %%eax), %%mm1	\n\t" // R BGR BGR B
			"movq 10(%1, %%eax), %%mm2	\n\t" // GR BGR BGR
			"pand %%mm7, %%mm0		\n\t"
			"pand %%mm5, %%mm1		\n\t"
			"pand %%mm6, %%mm2		\n\t"
			"por %%mm0, %%mm1		\n\t"
			"por %%mm2, %%mm1		\n\t"
			"movq 14(%1, %%eax), %%mm0	\n\t" // R BGR BGR B
			MOVNTQ" %%mm1,  8(%2, %%eax)	\n\t" // B RGB RGB R
			"movq 16(%1, %%eax), %%mm1	\n\t" // GR BGR BGR
			"movq 18(%1, %%eax), %%mm2	\n\t" // BGR BGR BG
			"pand %%mm6, %%mm0		\n\t"
			"pand %%mm7, %%mm1		\n\t"
			"pand %%mm5, %%mm2		\n\t"
			"por %%mm0, %%mm1		\n\t"
			"por %%mm2, %%mm1		\n\t"
			MOVNTQ" %%mm1, 16(%2, %%eax)	\n\t"
			"addl $24, %%eax		\n\t"
			" js 1b				\n\t"
			: "+a" (mmx_size)
			: "r" (src-start), "r"(dst-start)
			: "memory"
		);

		__asm __volatile(SFENCE:::"memory");
		__asm __volatile(EMMS:::"memory");

		/* eax advanced by 24 per pass, so the difference is the number
		   of bytes already converted. */
		i = (unsigned)(mmx_size - start);
	}
#endif
	/* Scalar path; the temporary keeps in-place conversion correct. */
	for(; src_size - i >= 3; i += 3)
	{
		const uint8_t x = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
640 | |
5588 | 641 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
2725 | 642 unsigned int width, unsigned int height, |
5588 | 643 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma) |
2701 | 644 { |
2723 | 645 int y; |
646 const int chromWidth= width>>1; | |
647 for(y=0; y<height; y++) | |
648 { | |
2702 | 649 #ifdef HAVE_MMX |
2723 | 650 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) |
651 asm volatile( | |
652 "xorl %%eax, %%eax \n\t" | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
653 ".balign 16 \n\t" |
2723 | 654 "1: \n\t" |
655 PREFETCH" 32(%1, %%eax, 2) \n\t" | |
656 PREFETCH" 32(%2, %%eax) \n\t" | |
657 PREFETCH" 32(%3, %%eax) \n\t" | |
658 "movq (%2, %%eax), %%mm0 \n\t" // U(0) | |
659 "movq %%mm0, %%mm2 \n\t" // U(0) | |
660 "movq (%3, %%eax), %%mm1 \n\t" // V(0) | |
661 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
662 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |
663 | |
664 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0) | |
665 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8) | |
666 "movq %%mm3, %%mm4 \n\t" // Y(0) | |
667 "movq %%mm5, %%mm6 \n\t" // Y(8) | |
668 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | |
669 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | |
670 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | |
671 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | |
2702 | 672 |
2723 | 673 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t" |
674 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t" | |
675 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t" | |
676 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t" | |
2702 | 677 |
2723 | 678 "addl $8, %%eax \n\t" |
679 "cmpl %4, %%eax \n\t" | |
680 " jb 1b \n\t" | |
681 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth) | |
682 : "%eax" | |
683 ); | |
2702 | 684 #else |
2723 | 685 int i; |
686 for(i=0; i<chromWidth; i++) | |
687 { | |
688 dst[4*i+0] = ysrc[2*i+0]; | |
689 dst[4*i+1] = usrc[i]; | |
690 dst[4*i+2] = ysrc[2*i+1]; | |
691 dst[4*i+3] = vsrc[i]; | |
692 } | |
693 #endif | |
5588 | 694 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) |
2723 | 695 { |
696 usrc += chromStride; | |
697 vsrc += chromStride; | |
698 } | |
699 ysrc += lumStride; | |
700 dst += dstStride; | |
2701 | 701 } |
2723 | 702 #ifdef HAVE_MMX |
703 asm( EMMS" \n\t" | |
704 SFENCE" \n\t" | |
705 :::"memory"); | |
2702 | 706 #endif |
2701 | 707 } |
708 | |
2724 | 709 /** |
710 * | |
711 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
712 * problem for anyone then tell me, and ill fix it) | |
713 */ | |
5588 | 714 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
715 unsigned int width, unsigned int height, | |
716 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride) | |
717 { | |
718 //FIXME interpolate chroma | |
719 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
720 } | |
721 | |
722 /** | |
723 * | |
724 * width should be a multiple of 16 | |
725 */ | |
726 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
727 unsigned int width, unsigned int height, | |
728 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride) | |
729 { | |
730 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
731 } | |
732 | |
733 /** |
734 * |
 * Splits packed YUY2 input (bytes Y0 U Y1 V per pixel pair) into separate Y, U and V planes.
 * Rows are handled in pairs: the first row of a pair supplies luma and chroma, the second
 * row supplies luma only, so the chroma planes get one row per two input rows.
 * lumStride, chromStride and srcStride are added to the respective pointers once per row.
 *
735 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a |
736 * problem for anyone then tell me, and I'll fix it) |
737 */ |
3132 | 738 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2725 | 739 unsigned int width, unsigned int height, |
740 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) |
2701 | 741 { |
2724 | 742 int y; |
743 const int chromWidth= width>>1; |
744 for(y=0; y<height; y+=2) |
745 { |
2704 | 746 #ifdef HAVE_MMX |
/* first row of the pair: extract Y, U and V; eax counts chroma samples, 8 per iteration */
2724 | 747 asm volatile( |
748 "xorl %%eax, %%eax \n\t" |
749 "pcmpeqw %%mm7, %%mm7 \n\t" |
750 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
751 ".balign 16 \n\t" |
2724 | 752 "1: \n\t" |
753 PREFETCH" 64(%0, %%eax, 4) \n\t" |
754 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) |
755 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) |
756 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) |
757 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) |
758 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) |
759 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) |
760 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) |
761 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) |
762 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
763 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) |
764 |
765 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" |
2704 | 766 |
2724 | 767 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8) |
768 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12) |
769 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) |
770 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) |
771 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) |
772 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) |
773 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) |
774 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) |
775 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) |
776 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) |
2704 | 777 |
2724 | 778 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" |
779 |
/* separate the interleaved UV bytes: low byte of each word is U, high byte is V */
780 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) |
781 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) |
782 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) |
783 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) |
784 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) |
785 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) |
786 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) |
787 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) |
2704 | 788 |
2724 | 789 MOVNTQ" %%mm0, (%3, %%eax) \n\t" |
790 MOVNTQ" %%mm2, (%2, %%eax) \n\t" |
791 |
792 "addl $8, %%eax \n\t" |
793 "cmpl %4, %%eax \n\t" |
794 " jb 1b \n\t" |
2725 | 795 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) |
796 : "memory", "%eax" |
797 ); |
2704 | 798 |
2806 | 799 ydst += lumStride; |
800 src += srcStride; |
801 |
/*
 * second row of the pair: luma only, the chroma bytes are dropped.
 * NOTE(review): this asm uses the 00FF word mask in mm7 but does not load it;
 * it relies on mm7 still holding the value set by the asm statement above.
 */
2725 | 802 asm volatile( |
803 "xorl %%eax, %%eax \n\t" |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
804 ".balign 16 \n\t" |
2724 | 805 "1: \n\t" |
806 PREFETCH" 64(%0, %%eax, 4) \n\t" |
807 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) |
808 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) |
809 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) |
810 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) |
811 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
812 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) |
813 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) |
814 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) |
815 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) |
816 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) |
2704 | 817 |
2724 | 818 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" |
819 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" |
820 |
821 "addl $8, %%eax \n\t" |
2725 | 822 "cmpl %4, %%eax \n\t" |
2724 | 823 " jb 1b \n\t" |
2704 | 824 |
2806 | 825 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) |
2724 | 826 : "memory", "%eax" |
827 ); |
2704 | 828 #else |
2724 | 829 int i; |
/* first row of the pair: luma and chroma */
830 for(i=0; i<chromWidth; i++) |
831 { |
832 ydst[2*i+0] = src[4*i+0]; |
833 udst[i] = src[4*i+1]; |
834 ydst[2*i+1] = src[4*i+2]; |
835 vdst[i] = src[4*i+3]; |
836 } |
837 ydst += lumStride; |
838 src += srcStride; |
839 |
/* second row of the pair: luma only */
840 for(i=0; i<chromWidth; i++) |
841 { |
842 ydst[2*i+0] = src[4*i+0]; |
843 ydst[2*i+1] = src[4*i+2]; |
844 } |
845 #endif
/* advance past the second row of the pair and to the next chroma row */
846 udst += chromStride; |
847 vdst += chromStride; |
848 ydst += lumStride; |
849 src += srcStride; |
2701 | 850 } |
2724 | 851 #ifdef HAVE_MMX |
2847 | 852 asm volatile( EMMS" \n\t" |
853 SFENCE" \n\t" |
854 :::"memory"); |
2704 | 855 #endif |
2723 | 856 } |
2801 | 857 |
858 /** |
859 * |
 * Splits packed UYVY input (bytes U Y0 V Y1 per pixel pair) into separate Y, U and V planes.
 * Rows are handled in pairs: the first row of a pair supplies luma and chroma, the second
 * row supplies luma only.
 * lumStride, chromStride and srcStride are added to the respective pointers once per row.
 *
860 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a |
861 * problem for anyone then tell me, and I'll fix it) |
3132 | 862 * chrominance data is only taken from every second line others are ignored FIXME write HQ version |
2801 | 863 */ |
3132 | 864 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2801 | 865 unsigned int width, unsigned int height, |
866 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) |
867 { |
868 int y; |
869 const int chromWidth= width>>1; |
870 for(y=0; y<height; y+=2) |
871 { |
2847 | 872 #ifdef HAVE_MMX |
/* first row of the pair: extract Y, U and V; eax counts chroma samples, 8 per iteration */
873 asm volatile( |
874 "xorl %%eax, %%eax \n\t" |
875 "pcmpeqw %%mm7, %%mm7 \n\t" |
876 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... |
877 ".balign 16 \n\t" |
878 "1: \n\t" |
879 PREFETCH" 64(%0, %%eax, 4) \n\t" |
880 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) |
881 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) |
882 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) |
883 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) |
884 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) |
885 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) |
886 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) |
887 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) |
888 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
889 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) |
890 |
891 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" |
892 |
893 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) |
894 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) |
895 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) |
896 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) |
897 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) |
898 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) |
899 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) |
900 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) |
901 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) |
902 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) |
903 |
904 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" |
905 |
/* separate the interleaved UV bytes: low byte of each word is U, high byte is V */
906 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) |
907 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) |
908 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) |
909 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) |
910 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) |
911 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) |
912 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) |
913 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) |
914 |
915 MOVNTQ" %%mm0, (%3, %%eax) \n\t" |
916 MOVNTQ" %%mm2, (%2, %%eax) \n\t" |
917 |
918 "addl $8, %%eax \n\t" |
919 "cmpl %4, %%eax \n\t" |
920 " jb 1b \n\t" |
921 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) |
922 : "memory", "%eax" |
923 ); |
924 |
925 ydst += lumStride; |
926 src += srcStride; |
927 |
/* second row of the pair: luma only; luma is the high byte of each word, so a shift suffices */
928 asm volatile( |
929 "xorl %%eax, %%eax \n\t" |
930 ".balign 16 \n\t" |
931 "1: \n\t" |
932 PREFETCH" 64(%0, %%eax, 4) \n\t" |
933 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) |
934 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) |
935 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8) |
936 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12) |
937 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
938 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) |
939 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) |
940 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) |
941 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) |
942 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) |
943 |
944 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" |
945 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" |
946 |
947 "addl $8, %%eax \n\t" |
948 "cmpl %4, %%eax \n\t" |
949 " jb 1b \n\t" |
950 |
951 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth) |
952 : "memory", "%eax" |
953 ); |
954 #else |
2801 | 955 int i; |
/* first row of the pair: luma and chroma */
956 for(i=0; i<chromWidth; i++) |
957 { |
958 udst[i] = src[4*i+0]; |
959 ydst[2*i+0] = src[4*i+1]; |
960 vdst[i] = src[4*i+2]; |
961 ydst[2*i+1] = src[4*i+3]; |
962 } |
963 ydst += lumStride; |
964 src += srcStride; |
965 |
/* second row of the pair: luma only */
966 for(i=0; i<chromWidth; i++) |
967 { |
968 ydst[2*i+0] = src[4*i+1]; |
969 ydst[2*i+1] = src[4*i+3]; |
970 } |
2847 | 971 #endif
/* advance past the second row of the pair and to the next chroma row */
2801 | 972 udst += chromStride; |
973 vdst += chromStride; |
974 ydst += lumStride; |
975 src += srcStride; |
976 } |
2847 | 977 #ifdef HAVE_MMX |
978 asm volatile( EMMS" \n\t" |
979 SFENCE" \n\t" |
980 :::"memory"); |
981 #endif |
2801 | 982 } |
983 | |
3132 | 984 /** |
985 * | |
986 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
987 * problem for anyone then tell me, and ill fix it) | |
4622 | 988 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version |
3132 | 989 */ |
990 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
991 unsigned int width, unsigned int height, | |
992 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) | |
993 { | |
994 int y; | |
995 const int chromWidth= width>>1; | |
4622 | 996 #ifdef HAVE_MMX |
997 for(y=0; y<height-2; y+=2) | |
998 { | |
999 int i; | |
1000 for(i=0; i<2; i++) | |
1001 { | |
1002 asm volatile( | |
1003 "movl %2, %%eax \n\t" | |
4923 | 1004 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
1005 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4622 | 1006 "pxor %%mm7, %%mm7 \n\t" |
1007 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1008 ".balign 16 \n\t" | |
1009 "1: \n\t" | |
1010 PREFETCH" 64(%0, %%ebx) \n\t" | |
1011 "movd (%0, %%ebx), %%mm0 \n\t" | |
1012 "movd 3(%0, %%ebx), %%mm1 \n\t" | |
1013 "punpcklbw %%mm7, %%mm0 \n\t" | |
1014 "punpcklbw %%mm7, %%mm1 \n\t" | |
1015 "movd 6(%0, %%ebx), %%mm2 \n\t" | |
1016 "movd 9(%0, %%ebx), %%mm3 \n\t" | |
1017 "punpcklbw %%mm7, %%mm2 \n\t" | |
1018 "punpcklbw %%mm7, %%mm3 \n\t" | |
1019 "pmaddwd %%mm6, %%mm0 \n\t" | |
1020 "pmaddwd %%mm6, %%mm1 \n\t" | |
1021 "pmaddwd %%mm6, %%mm2 \n\t" | |
1022 "pmaddwd %%mm6, %%mm3 \n\t" | |
1023 #ifndef FAST_BGR2YV12 | |
1024 "psrad $8, %%mm0 \n\t" | |
1025 "psrad $8, %%mm1 \n\t" | |
1026 "psrad $8, %%mm2 \n\t" | |
1027 "psrad $8, %%mm3 \n\t" | |
1028 #endif | |
1029 "packssdw %%mm1, %%mm0 \n\t" | |
1030 "packssdw %%mm3, %%mm2 \n\t" | |
1031 "pmaddwd %%mm5, %%mm0 \n\t" | |
1032 "pmaddwd %%mm5, %%mm2 \n\t" | |
1033 "packssdw %%mm2, %%mm0 \n\t" | |
1034 "psraw $7, %%mm0 \n\t" | |
1035 | |
1036 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1037 "movd 15(%0, %%ebx), %%mm1 \n\t" | |
1038 "punpcklbw %%mm7, %%mm4 \n\t" | |
1039 "punpcklbw %%mm7, %%mm1 \n\t" | |
1040 "movd 18(%0, %%ebx), %%mm2 \n\t" | |
1041 "movd 21(%0, %%ebx), %%mm3 \n\t" | |
1042 "punpcklbw %%mm7, %%mm2 \n\t" | |
1043 "punpcklbw %%mm7, %%mm3 \n\t" | |
1044 "pmaddwd %%mm6, %%mm4 \n\t" | |
1045 "pmaddwd %%mm6, %%mm1 \n\t" | |
1046 "pmaddwd %%mm6, %%mm2 \n\t" | |
1047 "pmaddwd %%mm6, %%mm3 \n\t" | |
1048 #ifndef FAST_BGR2YV12 | |
1049 "psrad $8, %%mm4 \n\t" | |
1050 "psrad $8, %%mm1 \n\t" | |
1051 "psrad $8, %%mm2 \n\t" | |
1052 "psrad $8, %%mm3 \n\t" | |
1053 #endif | |
1054 "packssdw %%mm1, %%mm4 \n\t" | |
1055 "packssdw %%mm3, %%mm2 \n\t" | |
1056 "pmaddwd %%mm5, %%mm4 \n\t" | |
1057 "pmaddwd %%mm5, %%mm2 \n\t" | |
1058 "addl $24, %%ebx \n\t" | |
1059 "packssdw %%mm2, %%mm4 \n\t" | |
1060 "psraw $7, %%mm4 \n\t" | |
1061 | |
1062 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 1063 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
4622 | 1064 |
1065 MOVNTQ" %%mm0, (%1, %%eax) \n\t" | |
1066 "addl $8, %%eax \n\t" | |
1067 " js 1b \n\t" | |
1068 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) | |
1069 : "%eax", "%ebx" | |
1070 ); | |
1071 ydst += lumStride; | |
1072 src += srcStride; | |
1073 } | |
1074 src -= srcStride*2; | |
1075 asm volatile( | |
1076 "movl %4, %%eax \n\t" | |
4923 | 1077 "movq "MANGLE(w1111)", %%mm5 \n\t" |
1078 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4622 | 1079 "pxor %%mm7, %%mm7 \n\t" |
1080 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1081 "addl %%ebx, %%ebx \n\t" | |
1082 ".balign 16 \n\t" | |
1083 "1: \n\t" | |
1084 PREFETCH" 64(%0, %%ebx) \n\t" | |
1085 PREFETCH" 64(%1, %%ebx) \n\t" | |
1086 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1087 "movq (%0, %%ebx), %%mm0 \n\t" | |
1088 "movq (%1, %%ebx), %%mm1 \n\t" | |
1089 "movq 6(%0, %%ebx), %%mm2 \n\t" | |
1090 "movq 6(%1, %%ebx), %%mm3 \n\t" | |
1091 PAVGB" %%mm1, %%mm0 \n\t" | |
1092 PAVGB" %%mm3, %%mm2 \n\t" | |
1093 "movq %%mm0, %%mm1 \n\t" | |
1094 "movq %%mm2, %%mm3 \n\t" | |
1095 "psrlq $24, %%mm0 \n\t" | |
1096 "psrlq $24, %%mm2 \n\t" | |
1097 PAVGB" %%mm1, %%mm0 \n\t" | |
1098 PAVGB" %%mm3, %%mm2 \n\t" | |
1099 "punpcklbw %%mm7, %%mm0 \n\t" | |
1100 "punpcklbw %%mm7, %%mm2 \n\t" | |
1101 #else | |
1102 "movd (%0, %%ebx), %%mm0 \n\t" | |
1103 "movd (%1, %%ebx), %%mm1 \n\t" | |
1104 "movd 3(%0, %%ebx), %%mm2 \n\t" | |
1105 "movd 3(%1, %%ebx), %%mm3 \n\t" | |
1106 "punpcklbw %%mm7, %%mm0 \n\t" | |
1107 "punpcklbw %%mm7, %%mm1 \n\t" | |
1108 "punpcklbw %%mm7, %%mm2 \n\t" | |
1109 "punpcklbw %%mm7, %%mm3 \n\t" | |
1110 "paddw %%mm1, %%mm0 \n\t" | |
1111 "paddw %%mm3, %%mm2 \n\t" | |
1112 "paddw %%mm2, %%mm0 \n\t" | |
1113 "movd 6(%0, %%ebx), %%mm4 \n\t" | |
1114 "movd 6(%1, %%ebx), %%mm1 \n\t" | |
1115 "movd 9(%0, %%ebx), %%mm2 \n\t" | |
1116 "movd 9(%1, %%ebx), %%mm3 \n\t" | |
1117 "punpcklbw %%mm7, %%mm4 \n\t" | |
1118 "punpcklbw %%mm7, %%mm1 \n\t" | |
1119 "punpcklbw %%mm7, %%mm2 \n\t" | |
1120 "punpcklbw %%mm7, %%mm3 \n\t" | |
1121 "paddw %%mm1, %%mm4 \n\t" | |
1122 "paddw %%mm3, %%mm2 \n\t" | |
1123 "paddw %%mm4, %%mm2 \n\t" | |
1124 "psrlw $2, %%mm0 \n\t" | |
1125 "psrlw $2, %%mm2 \n\t" | |
1126 #endif | |
4923 | 1127 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1128 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 1129 |
1130 "pmaddwd %%mm0, %%mm1 \n\t" | |
1131 "pmaddwd %%mm2, %%mm3 \n\t" | |
1132 "pmaddwd %%mm6, %%mm0 \n\t" | |
1133 "pmaddwd %%mm6, %%mm2 \n\t" | |
1134 #ifndef FAST_BGR2YV12 | |
1135 "psrad $8, %%mm0 \n\t" | |
1136 "psrad $8, %%mm1 \n\t" | |
1137 "psrad $8, %%mm2 \n\t" | |
1138 "psrad $8, %%mm3 \n\t" | |
1139 #endif | |
1140 "packssdw %%mm2, %%mm0 \n\t" | |
1141 "packssdw %%mm3, %%mm1 \n\t" | |
1142 "pmaddwd %%mm5, %%mm0 \n\t" | |
1143 "pmaddwd %%mm5, %%mm1 \n\t" | |
1144 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1145 "psraw $7, %%mm0 \n\t" | |
1146 | |
1147 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1148 "movq 12(%0, %%ebx), %%mm4 \n\t" | |
1149 "movq 12(%1, %%ebx), %%mm1 \n\t" | |
1150 "movq 18(%0, %%ebx), %%mm2 \n\t" | |
1151 "movq 18(%1, %%ebx), %%mm3 \n\t" | |
1152 PAVGB" %%mm1, %%mm4 \n\t" | |
1153 PAVGB" %%mm3, %%mm2 \n\t" | |
1154 "movq %%mm4, %%mm1 \n\t" | |
1155 "movq %%mm2, %%mm3 \n\t" | |
1156 "psrlq $24, %%mm4 \n\t" | |
1157 "psrlq $24, %%mm2 \n\t" | |
1158 PAVGB" %%mm1, %%mm4 \n\t" | |
1159 PAVGB" %%mm3, %%mm2 \n\t" | |
1160 "punpcklbw %%mm7, %%mm4 \n\t" | |
1161 "punpcklbw %%mm7, %%mm2 \n\t" | |
1162 #else | |
1163 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1164 "movd 12(%1, %%ebx), %%mm1 \n\t" | |
1165 "movd 15(%0, %%ebx), %%mm2 \n\t" | |
1166 "movd 15(%1, %%ebx), %%mm3 \n\t" | |
1167 "punpcklbw %%mm7, %%mm4 \n\t" | |
1168 "punpcklbw %%mm7, %%mm1 \n\t" | |
1169 "punpcklbw %%mm7, %%mm2 \n\t" | |
1170 "punpcklbw %%mm7, %%mm3 \n\t" | |
1171 "paddw %%mm1, %%mm4 \n\t" | |
1172 "paddw %%mm3, %%mm2 \n\t" | |
1173 "paddw %%mm2, %%mm4 \n\t" | |
1174 "movd 18(%0, %%ebx), %%mm5 \n\t" | |
1175 "movd 18(%1, %%ebx), %%mm1 \n\t" | |
1176 "movd 21(%0, %%ebx), %%mm2 \n\t" | |
1177 "movd 21(%1, %%ebx), %%mm3 \n\t" | |
1178 "punpcklbw %%mm7, %%mm5 \n\t" | |
1179 "punpcklbw %%mm7, %%mm1 \n\t" | |
1180 "punpcklbw %%mm7, %%mm2 \n\t" | |
1181 "punpcklbw %%mm7, %%mm3 \n\t" | |
1182 "paddw %%mm1, %%mm5 \n\t" | |
1183 "paddw %%mm3, %%mm2 \n\t" | |
1184 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 1185 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4622 | 1186 "psrlw $2, %%mm4 \n\t" |
1187 "psrlw $2, %%mm2 \n\t" | |
1188 #endif | |
4923 | 1189 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1190 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 1191 |
1192 "pmaddwd %%mm4, %%mm1 \n\t" | |
1193 "pmaddwd %%mm2, %%mm3 \n\t" | |
1194 "pmaddwd %%mm6, %%mm4 \n\t" | |
1195 "pmaddwd %%mm6, %%mm2 \n\t" | |
1196 #ifndef FAST_BGR2YV12 | |
1197 "psrad $8, %%mm4 \n\t" | |
1198 "psrad $8, %%mm1 \n\t" | |
1199 "psrad $8, %%mm2 \n\t" | |
1200 "psrad $8, %%mm3 \n\t" | |
1201 #endif | |
1202 "packssdw %%mm2, %%mm4 \n\t" | |
1203 "packssdw %%mm3, %%mm1 \n\t" | |
1204 "pmaddwd %%mm5, %%mm4 \n\t" | |
1205 "pmaddwd %%mm5, %%mm1 \n\t" | |
1206 "addl $24, %%ebx \n\t" | |
1207 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |
1208 "psraw $7, %%mm4 \n\t" | |
1209 | |
1210 "movq %%mm0, %%mm1 \n\t" | |
1211 "punpckldq %%mm4, %%mm0 \n\t" | |
1212 "punpckhdq %%mm4, %%mm1 \n\t" | |
1213 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 1214 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
4622 | 1215 |
1216 "movd %%mm0, (%2, %%eax) \n\t" | |
1217 "punpckhdq %%mm0, %%mm0 \n\t" | |
1218 "movd %%mm0, (%3, %%eax) \n\t" | |
1219 "addl $4, %%eax \n\t" | |
1220 " js 1b \n\t" | |
1221 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width) | |
1222 : "%eax", "%ebx" | |
1223 ); | |
1224 | |
1225 udst += chromStride; | |
1226 vdst += chromStride; | |
1227 src += srcStride*2; | |
1228 } | |
1229 | |
1230 asm volatile( EMMS" \n\t" | |
1231 SFENCE" \n\t" | |
1232 :::"memory"); | |
1233 #else | |
1234 y=0; | |
1235 #endif | |
1236 for(; y<height; y+=2) | |
3132 | 1237 { |
1238 int i; | |
1239 for(i=0; i<chromWidth; i++) | |
1240 { | |
1241 unsigned int b= src[6*i+0]; | |
1242 unsigned int g= src[6*i+1]; | |
1243 unsigned int r= src[6*i+2]; | |
2801 | 1244 |
3633 | 1245 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
1246 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
1247 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
3132 | 1248 |
1249 udst[i] = U; | |
1250 vdst[i] = V; | |
1251 ydst[2*i] = Y; | |
1252 | |
1253 b= src[6*i+3]; | |
1254 g= src[6*i+4]; | |
1255 r= src[6*i+5]; | |
1256 | |
3633 | 1257 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1258 ydst[2*i+1] = Y; |
1259 } | |
1260 ydst += lumStride; | |
1261 src += srcStride; | |
1262 | |
1263 for(i=0; i<chromWidth; i++) | |
1264 { | |
1265 unsigned int b= src[6*i+0]; | |
1266 unsigned int g= src[6*i+1]; | |
1267 unsigned int r= src[6*i+2]; | |
1268 | |
3633 | 1269 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1270 |
1271 ydst[2*i] = Y; | |
1272 | |
1273 b= src[6*i+3]; | |
1274 g= src[6*i+4]; | |
1275 r= src[6*i+5]; | |
1276 | |
3633 | 1277 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 1278 ydst[2*i+1] = Y; |
1279 } | |
1280 udst += chromStride; | |
1281 vdst += chromStride; | |
1282 ydst += lumStride; | |
1283 src += srcStride; | |
1284 } | |
1285 } | |
5337 | 1286 |
1287 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, | |
1288 int width, int height, int src1Stride, int src2Stride, int dstStride){ | |
1289 int h; | |
1290 | |
1291 for(h=0; h < height; h++) | |
1292 { | |
1293 int w; | |
1294 | |
1295 #ifdef HAVE_MMX | |
1296 #ifdef HAVE_SSE2 | |
1297 asm( | |
1298 "xorl %%eax, %%eax \n\t" | |
1299 "1: \n\t" | |
1300 PREFETCH" 64(%1, %%eax) \n\t" | |
1301 PREFETCH" 64(%2, %%eax) \n\t" | |
1302 "movdqa (%1, %%eax), %%xmm0 \n\t" | |
1303 "movdqa (%1, %%eax), %%xmm1 \n\t" | |
1304 "movdqa (%2, %%eax), %%xmm2 \n\t" | |
1305 "punpcklbw %%xmm2, %%xmm0 \n\t" | |
1306 "punpckhbw %%xmm2, %%xmm1 \n\t" | |
1307 "movntdq %%xmm0, (%0, %%eax, 2) \n\t" | |
1308 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t" | |
1309 "addl $16, %%eax \n\t" | |
1310 "cmpl %3, %%eax \n\t" | |
1311 " jb 1b \n\t" | |
1312 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |
1313 : "memory", "%eax" | |
1314 ); | |
1315 #else | |
1316 asm( | |
1317 "xorl %%eax, %%eax \n\t" | |
1318 "1: \n\t" | |
1319 PREFETCH" 64(%1, %%eax) \n\t" | |
1320 PREFETCH" 64(%2, %%eax) \n\t" | |
1321 "movq (%1, %%eax), %%mm0 \n\t" | |
1322 "movq 8(%1, %%eax), %%mm2 \n\t" | |
1323 "movq %%mm0, %%mm1 \n\t" | |
1324 "movq %%mm2, %%mm3 \n\t" | |
1325 "movq (%2, %%eax), %%mm4 \n\t" | |
1326 "movq 8(%2, %%eax), %%mm5 \n\t" | |
1327 "punpcklbw %%mm4, %%mm0 \n\t" | |
1328 "punpckhbw %%mm4, %%mm1 \n\t" | |
1329 "punpcklbw %%mm5, %%mm2 \n\t" | |
1330 "punpckhbw %%mm5, %%mm3 \n\t" | |
1331 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t" | |
1332 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t" | |
1333 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t" | |
1334 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t" | |
1335 "addl $16, %%eax \n\t" | |
1336 "cmpl %3, %%eax \n\t" | |
1337 " jb 1b \n\t" | |
1338 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |
1339 : "memory", "%eax" | |
1340 ); | |
1341 #endif | |
1342 for(w= (width&(~15)); w < width; w++) | |
1343 { | |
1344 dest[2*w+0] = src1[w]; | |
1345 dest[2*w+1] = src2[w]; | |
1346 } | |
1347 #else | |
1348 for(w=0; w < width; w++) | |
1349 { | |
1350 dest[2*w+0] = src1[w]; | |
1351 dest[2*w+1] = src2[w]; | |
1352 } | |
1353 #endif | |
1354 dest += dstStride; | |
1355 src1 += src1Stride; | |
1356 src2 += src2Stride; | |
1357 } | |
1358 #ifdef HAVE_MMX | |
1359 asm( | |
1360 EMMS" \n\t" | |
1361 SFENCE" \n\t" | |
1362 ::: "memory" | |
1363 ); | |
1364 #endif | |
1365 } |