Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 18846:2e73bc9d4b1c
Switch to UTF-8, patch by Rickard Narstrom
author:  gpoirier
date:    Thu, 29 Jun 2006 08:18:08 +0000
parents: 0dd0bcc57c18
/*
 *
 * rgb2rgb.c, Software RGB to RGB convertor
 * pluralize by Software PAL8 to RGB convertor
 *             Software YUV to YUV convertor
 *             Software YUV to RGB convertor
 * Written by Nick Kurshev.
 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 * lot of big-endian byteorder fixes by Alex Beregszaszi
 */

#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#include "asmalign.h"

#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#ifdef __APPLE__
#define PREFETCH  "#"
#define PREFETCHW "#"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#ifdef __APPLE__
#define SFENCE "#"
#else
#define SFENCE "/nop"
#endif
#endif

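/*
 * Note (added for clarity; not part of the original file): this template is
 * compiled several times by the including file with different macro settings,
 * and RENAME() gives each instantiation (typically C, MMX, MMX2 and 3DNow!
 * variants) its own symbol names.  With MMX2 the stores below use the
 * non-temporal "movntq", which bypasses the cache, so every MMX loop is
 * followed by SFENCE; the plain-MMX build substitutes an ordinary "movq" and
 * turns SFENCE into a no-op, whose spelling ("#" vs. "/nop") depends on the
 * assembler.
 */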
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 0;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
#endif
    }
}
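/*
 * Note on rgb24to32 above (explanatory, not in the original): each MMX
 * iteration loads eight packed 24-bit pixels (movd/punpckldq at byte offsets
 * 0,3,...,21), masks every dword with %%mm7 -- loaded from mask32, which is
 * defined by the including file and presumably clears the alpha byte -- and
 * streams out 32 bytes with MOVNTQ.  The C tail converts any remaining
 * pixels one at a time; the WORDS_BIGENDIAN branch writes the bytes in
 * A,B,G,R order, as its own comment states.
 */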

static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}

/*
 Original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32bit c version, and and&add trick by Michael Niedermayer
*/
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while(s<mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
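/*
 * Note on the "and & add" trick in rgb15to16 above (explanatory): a 15-bit
 * pixel has the layout 0RRRRRGGGGGBBBBB.  Adding (x & 0x7FE0) to (x & 0x7FFF)
 * doubles the red/green field, i.e. shifts bits 5-14 one position left while
 * leaving blue (bits 0-4) untouched, yielding the 16-bit layout
 * RRRRRGGGGG0BBBBB with the low green bit cleared.  The 32-bit C loop applies
 * this to two packed pixels per load; the MMX loop does the same with paddw
 * (so carries cannot cross pixel boundaries), using mask15s, which is defined
 * by the including file and presumably replicates 0x7FE0 across the quadword.
 */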

static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while(s<mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $1, %%mm0\n\t"
            "psrlq $1, %%mm2\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm3\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
    asm volatile(
        "movq %3, %%mm5          \n\t"
        "movq %4, %%mm6          \n\t"
        "movq %5, %%mm7          \n\t"
        ASMALIGN16
        "1:                      \n\t"
        PREFETCH" 32(%1)         \n\t"
        "movd (%1), %%mm0        \n\t"
        "movd 4(%1), %%mm3       \n\t"
        "punpckldq 8(%1), %%mm0  \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1       \n\t"
        "movq %%mm3, %%mm4       \n\t"
        "pand %%mm6, %%mm0       \n\t"
        "pand %%mm6, %%mm3       \n\t"
        "pmaddwd %%mm7, %%mm0    \n\t"
        "pmaddwd %%mm7, %%mm3    \n\t"
        "pand %%mm5, %%mm1       \n\t"
        "pand %%mm5, %%mm4       \n\t"
        "por %%mm1, %%mm0        \n\t"
        "por %%mm4, %%mm3        \n\t"
        "psrld $5, %%mm0         \n\t"
        "pslld $11, %%mm3        \n\t"
        "por %%mm3, %%mm0        \n\t"
        MOVNTQ" %%mm0, (%0)      \n\t"
        "add $16, %1             \n\t"
        "add $8, %0              \n\t"
        "cmp %2, %1              \n\t"
        " jb 1b                  \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
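/*
 * Note on the scalar fallback of rgb32to16 above (explanatory): reading the
 * 32-bit pixel as an integer, the low byte contributes its top 5 bits to
 * positions 0-4, the middle byte its top 6 bits to positions 5-10, and the
 * third byte its top 5 bits to positions 11-15:
 *
 *     ((rgb & 0x0000FF) >> 3)  -> bits  0-4
 *     ((rgb & 0x00FC00) >> 5)  -> bits  5-10
 *     ((rgb & 0xF80000) >> 8)  -> bits 11-15
 *
 * The "#if 1" MMX path reaches the same 5-6-5 layout by OR-ing the masked
 * green field (mask3216g) with a pmaddwd of the masked red/blue words
 * (mask3216br, mul3216); those constants are defined by the including file.
 */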

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
    asm volatile(
        "movq %3, %%mm5          \n\t"
        "movq %4, %%mm6          \n\t"
        "movq %5, %%mm7          \n\t"
        ASMALIGN16
        "1:                      \n\t"
        PREFETCH" 32(%1)         \n\t"
        "movd (%1), %%mm0        \n\t"
        "movd 4(%1), %%mm3       \n\t"
        "punpckldq 8(%1), %%mm0  \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1       \n\t"
        "movq %%mm3, %%mm4       \n\t"
        "pand %%mm6, %%mm0       \n\t"
        "pand %%mm6, %%mm3       \n\t"
        "pmaddwd %%mm7, %%mm0    \n\t"
        "pmaddwd %%mm7, %%mm3    \n\t"
        "pand %%mm5, %%mm1       \n\t"
        "pand %%mm5, %%mm4       \n\t"
        "por %%mm1, %%mm0        \n\t"
        "por %%mm4, %%mm3        \n\t"
        "psrld $6, %%mm0         \n\t"
        "pslld $10, %%mm3        \n\t"
        "por %%mm3, %%mm0        \n\t"
        MOVNTQ" %%mm0, (%0)      \n\t"
        "add $16, %1             \n\t"
        "add $8, %0              \n\t"
        "cmp %2, %1              \n\t"
        " jb 1b                  \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
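/*
 * Note on rgb24to16 above (explanatory): the MMX loop consumes 12 input
 * bytes (four 24-bit pixels) and emits 8 output bytes per iteration, which
 * is why mm_end is set to end - 11 -- the loop only runs while a full
 * 12-byte group remains.  The C tail reads the leftover pixels byte by byte
 * (b, g, r in memory order) and packs them into the same 5-6-5 layout.
 */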
706 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
707 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
708 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
709 const uint8_t *s = src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
710 const uint8_t *end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
711 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
712 const uint8_t *mm_end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
713 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
714 uint16_t *d = (uint16_t *)dst; |
715 end = s + src_size; |
716 #ifdef HAVE_MMX |
717 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
718 __asm __volatile( |
719 "movq %0, %%mm7\n\t" |
720 "movq %1, %%mm6\n\t" |
721 ::"m"(red_16mask),"m"(green_16mask)); |
722 mm_end = end - 15; |
723 while(s < mm_end) |
724 { |
725 __asm __volatile( |
726 PREFETCH" 32%1\n\t" |
727 "movd %1, %%mm0\n\t" |
728 "movd 3%1, %%mm3\n\t" |
729 "punpckldq 6%1, %%mm0\n\t" |
730 "punpckldq 9%1, %%mm3\n\t" |
731 "movq %%mm0, %%mm1\n\t" |
732 "movq %%mm0, %%mm2\n\t" |
733 "movq %%mm3, %%mm4\n\t" |
734 "movq %%mm3, %%mm5\n\t" |
735 "psllq $8, %%mm0\n\t" |
736 "psllq $8, %%mm3\n\t" |
737 "pand %%mm7, %%mm0\n\t" |
738 "pand %%mm7, %%mm3\n\t" |
739 "psrlq $5, %%mm1\n\t" |
740 "psrlq $5, %%mm4\n\t" |
741 "pand %%mm6, %%mm1\n\t" |
742 "pand %%mm6, %%mm4\n\t" |
743 "psrlq $19, %%mm2\n\t" |
744 "psrlq $19, %%mm5\n\t" |
745 "pand %2, %%mm2\n\t" |
746 "pand %2, %%mm5\n\t" |
747 "por %%mm1, %%mm0\n\t" |
748 "por %%mm4, %%mm3\n\t" |
749 "por %%mm2, %%mm0\n\t" |
750 "por %%mm5, %%mm3\n\t" |
751 "psllq $16, %%mm3\n\t" |
752 "por %%mm3, %%mm0\n\t" |
753 MOVNTQ" %%mm0, %0\n\t" |
754 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
755 d += 4; |
756 s += 12; |
757 } |
758 __asm __volatile(SFENCE:::"memory"); |
759 __asm __volatile(EMMS:::"memory"); |
760 #endif |
761 while(s < end) |
762 { |
763 const int r= *s++; |
764 const int g= *s++; |
765 const int b= *s++; |
766 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); |
767 } |
768 } |
769 |
770 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size) |
2718 | 771 { |
6492 | 772 const uint8_t *s = src; |
773 const uint8_t *end; | |
2741 | 774 #ifdef HAVE_MMX |
6492 | 775 const uint8_t *mm_end; |
776 #endif | |
2741 | 777 uint16_t *d = (uint16_t *)dst; |
778 end = s + src_size; | |
6492 | 779 #ifdef HAVE_MMX |
2741 | 780 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
781 __asm __volatile( | |
782 "movq %0, %%mm7\n\t" | |
783 "movq %1, %%mm6\n\t" | |
784 ::"m"(red_15mask),"m"(green_15mask)); | |
6605 | 785 mm_end = end - 11; |
2741 | 786 while(s < mm_end) |
787 { | |
788 __asm __volatile( | |
789 PREFETCH" 32%1\n\t" | |
790 "movd %1, %%mm0\n\t" | |
791 "movd 3%1, %%mm3\n\t" | |
792 "punpckldq 6%1, %%mm0\n\t" | |
793 "punpckldq 9%1, %%mm3\n\t" | |
794 "movq %%mm0, %%mm1\n\t" | |
795 "movq %%mm0, %%mm2\n\t" | |
796 "movq %%mm3, %%mm4\n\t" | |
797 "movq %%mm3, %%mm5\n\t" | |
798 "psrlq $3, %%mm0\n\t" | |
799 "psrlq $3, %%mm3\n\t" | |
800 "pand %2, %%mm0\n\t" | |
801 "pand %2, %%mm3\n\t" | |
802 "psrlq $6, %%mm1\n\t" | |
803 "psrlq $6, %%mm4\n\t" | |
804 "pand %%mm6, %%mm1\n\t" | |
805 "pand %%mm6, %%mm4\n\t" | |
806 "psrlq $9, %%mm2\n\t" | |
807 "psrlq $9, %%mm5\n\t" | |
808 "pand %%mm7, %%mm2\n\t" | |
809 "pand %%mm7, %%mm5\n\t" | |
810 "por %%mm1, %%mm0\n\t" | |
811 "por %%mm4, %%mm3\n\t" | |
812 "por %%mm2, %%mm0\n\t" | |
813 "por %%mm5, %%mm3\n\t" | |
814 "psllq $16, %%mm3\n\t" | |
815 "por %%mm3, %%mm0\n\t" | |
816 MOVNTQ" %%mm0, %0\n\t" | |
817 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | |
818 d += 4; | |
819 s += 12; | |
820 } | |
6492 | 821 __asm __volatile(SFENCE:::"memory"); |
822 __asm __volatile(EMMS:::"memory"); | |
823 #endif | |
2741 | 824 while(s < end) |
825 { | |
826 const int b= *s++; | |
827 const int g= *s++; | |
828 const int r= *s++; | |
829 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
830 } | |
6492 | 831 } |
832 | |
833 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) |
834 { |
835 const uint8_t *s = src; |
836 const uint8_t *end; |
837 #ifdef HAVE_MMX |
838 const uint8_t *mm_end; |
839 #endif |
840 uint16_t *d = (uint16_t *)dst; |
841 end = s + src_size; |
842 #ifdef HAVE_MMX |
843 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
844 __asm __volatile( |
845 "movq %0, %%mm7\n\t" |
846 "movq %1, %%mm6\n\t" |
847 ::"m"(red_15mask),"m"(green_15mask)); |
848 mm_end = end - 15; |
849 while(s < mm_end) |
850 { |
851 __asm __volatile( |
852 PREFETCH" 32%1\n\t" |
853 "movd %1, %%mm0\n\t" |
854 "movd 3%1, %%mm3\n\t" |
855 "punpckldq 6%1, %%mm0\n\t" |
856 "punpckldq 9%1, %%mm3\n\t" |
857 "movq %%mm0, %%mm1\n\t" |
858 "movq %%mm0, %%mm2\n\t" |
859 "movq %%mm3, %%mm4\n\t" |
860 "movq %%mm3, %%mm5\n\t" |
861 "psllq $7, %%mm0\n\t" |
862 "psllq $7, %%mm3\n\t" |
863 "pand %%mm7, %%mm0\n\t" |
864 "pand %%mm7, %%mm3\n\t" |
865 "psrlq $6, %%mm1\n\t" |
866 "psrlq $6, %%mm4\n\t" |
867 "pand %%mm6, %%mm1\n\t" |
868 "pand %%mm6, %%mm4\n\t" |
869 "psrlq $19, %%mm2\n\t" |
870 "psrlq $19, %%mm5\n\t" |
871 "pand %2, %%mm2\n\t" |
872 "pand %2, %%mm5\n\t" |
873 "por %%mm1, %%mm0\n\t" |
874 "por %%mm4, %%mm3\n\t" |
875 "por %%mm2, %%mm0\n\t" |
876 "por %%mm5, %%mm3\n\t" |
877 "psllq $16, %%mm3\n\t" |
878 "por %%mm3, %%mm0\n\t" |
879 MOVNTQ" %%mm0, %0\n\t" |
880 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
881 d += 4; |
882 s += 12; |
883 } |
884 __asm __volatile(SFENCE:::"memory"); |
885 __asm __volatile(EMMS:::"memory"); |
886 #endif |
887 while(s < end) |
888 { |
889 const int r= *s++; |
890 const int g= *s++; |
891 const int b= *s++; |
892 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); |
893 } |
894 } |
895 |
6492 | 896 /* |
897 I use here a less accurate approximation by simply |
898 left-shifting the input |
899 value and filling the low order bits with |
900 zeroes. This method improves PNG |
901 compression, but this scheme cannot reproduce white exactly, since it does not |
902 generate an all-ones maximum value; the net effect is to darken the |
903 image slightly. |
904 |
905 A better method would be "left bit replication": |
906 | |
907 4 3 2 1 0 | |
908 --------- | |
909 1 1 0 1 1 | |
910 | |
911 7 6 5 4 3 2 1 0 | |
912 ---------------- | |
913 1 1 0 1 1 1 1 0 | |
914 |=======| |===| | |
915 | Leftmost Bits Repeated to Fill Open Bits | |
916 | | |
917 Original Bits | |
918 */ | |
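/* Illustrative sketch, not part of the original file: "left bit replication"
 * as described in the comment above, with helper names chosen here for
 * clarity only. (x << 3) | (x >> 2) maps 0x1F to 0xFF, so full white stays
 * full white, unlike the plain left shift used by the converters below. */
static inline uint8_t replicate5to8(uint8_t x)
{
    /* 1 1 0 1 1  ->  1 1 0 1 1 1 1 0, matching the diagram above */
    return (uint8_t)((x << 3) | (x >> 2));
}
static inline uint8_t replicate6to8(uint8_t x)
{
    return (uint8_t)((x << 2) | (x >> 4)); /* 0x3F -> 0xFF for 6-bit green */
}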
919 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size) |
6492 | 920 { |
921 const uint16_t *end; | |
922 #ifdef HAVE_MMX | |
923 const uint16_t *mm_end; | |
924 #endif | |
925 uint8_t *d = (uint8_t *)dst; | |
926 const uint16_t *s = (uint16_t *)src; | |
927 end = s + src_size/2; | |
928 #ifdef HAVE_MMX | |
929 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
6605 | 930 mm_end = end - 7; |
6492 | 931 while(s < mm_end) |
932 { | |
933 __asm __volatile( | |
934 PREFETCH" 32%1\n\t" | |
935 "movq %1, %%mm0\n\t" | |
936 "movq %1, %%mm1\n\t" | |
937 "movq %1, %%mm2\n\t" | |
938 "pand %2, %%mm0\n\t" | |
939 "pand %3, %%mm1\n\t" | |
940 "pand %4, %%mm2\n\t" | |
941 "psllq $3, %%mm0\n\t" | |
942 "psrlq $2, %%mm1\n\t" | |
943 "psrlq $7, %%mm2\n\t" | |
944 "movq %%mm0, %%mm3\n\t" | |
945 "movq %%mm1, %%mm4\n\t" | |
946 "movq %%mm2, %%mm5\n\t" | |
947 "punpcklwd %5, %%mm0\n\t" | |
948 "punpcklwd %5, %%mm1\n\t" | |
949 "punpcklwd %5, %%mm2\n\t" | |
950 "punpckhwd %5, %%mm3\n\t" | |
951 "punpckhwd %5, %%mm4\n\t" | |
952 "punpckhwd %5, %%mm5\n\t" | |
953 "psllq $8, %%mm1\n\t" | |
954 "psllq $16, %%mm2\n\t" | |
955 "por %%mm1, %%mm0\n\t" | |
956 "por %%mm2, %%mm0\n\t" | |
957 "psllq $8, %%mm4\n\t" | |
958 "psllq $16, %%mm5\n\t" | |
959 "por %%mm4, %%mm3\n\t" | |
960 "por %%mm5, %%mm3\n\t" | |
961 | |
962 "movq %%mm0, %%mm6\n\t" | |
963 "movq %%mm3, %%mm7\n\t" | |
964 | |
965 "movq 8%1, %%mm0\n\t" | |
966 "movq 8%1, %%mm1\n\t" | |
967 "movq 8%1, %%mm2\n\t" | |
968 "pand %2, %%mm0\n\t" | |
969 "pand %3, %%mm1\n\t" | |
970 "pand %4, %%mm2\n\t" | |
971 "psllq $3, %%mm0\n\t" | |
972 "psrlq $2, %%mm1\n\t" | |
973 "psrlq $7, %%mm2\n\t" | |
974 "movq %%mm0, %%mm3\n\t" | |
975 "movq %%mm1, %%mm4\n\t" | |
976 "movq %%mm2, %%mm5\n\t" | |
977 "punpcklwd %5, %%mm0\n\t" | |
978 "punpcklwd %5, %%mm1\n\t" | |
979 "punpcklwd %5, %%mm2\n\t" | |
980 "punpckhwd %5, %%mm3\n\t" | |
981 "punpckhwd %5, %%mm4\n\t" | |
982 "punpckhwd %5, %%mm5\n\t" | |
983 "psllq $8, %%mm1\n\t" | |
984 "psllq $16, %%mm2\n\t" | |
985 "por %%mm1, %%mm0\n\t" | |
986 "por %%mm2, %%mm0\n\t" | |
987 "psllq $8, %%mm4\n\t" | |
988 "psllq $16, %%mm5\n\t" | |
989 "por %%mm4, %%mm3\n\t" | |
990 "por %%mm5, %%mm3\n\t" | |
991 | |
992 :"=m"(*d) | |
993 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) | |
994 :"memory"); | |
995 /* Borrowed 32 to 24 */ | |
996 __asm __volatile( | |
997 "movq %%mm0, %%mm4\n\t" | |
998 "movq %%mm3, %%mm5\n\t" | |
999 "movq %%mm6, %%mm0\n\t" | |
1000 "movq %%mm7, %%mm1\n\t" | |
1001 | |
1002 "movq %%mm4, %%mm6\n\t" | |
1003 "movq %%mm5, %%mm7\n\t" | |
1004 "movq %%mm0, %%mm2\n\t" | |
1005 "movq %%mm1, %%mm3\n\t" | |
1006 | |
1007 "psrlq $8, %%mm2\n\t" | |
1008 "psrlq $8, %%mm3\n\t" | |
1009 "psrlq $8, %%mm6\n\t" | |
1010 "psrlq $8, %%mm7\n\t" | |
1011 "pand %2, %%mm0\n\t" | |
1012 "pand %2, %%mm1\n\t" | |
1013 "pand %2, %%mm4\n\t" | |
1014 "pand %2, %%mm5\n\t" | |
1015 "pand %3, %%mm2\n\t" | |
1016 "pand %3, %%mm3\n\t" | |
1017 "pand %3, %%mm6\n\t" | |
1018 "pand %3, %%mm7\n\t" | |
1019 "por %%mm2, %%mm0\n\t" | |
1020 "por %%mm3, %%mm1\n\t" | |
1021 "por %%mm6, %%mm4\n\t" | |
1022 "por %%mm7, %%mm5\n\t" | |
1023 | |
1024 "movq %%mm1, %%mm2\n\t" | |
1025 "movq %%mm4, %%mm3\n\t" | |
1026 "psllq $48, %%mm2\n\t" | |
1027 "psllq $32, %%mm3\n\t" | |
1028 "pand %4, %%mm2\n\t" | |
1029 "pand %5, %%mm3\n\t" | |
1030 "por %%mm2, %%mm0\n\t" | |
1031 "psrlq $16, %%mm1\n\t" | |
1032 "psrlq $32, %%mm4\n\t" | |
1033 "psllq $16, %%mm5\n\t" | |
1034 "por %%mm3, %%mm1\n\t" | |
1035 "pand %6, %%mm5\n\t" | |
1036 "por %%mm5, %%mm4\n\t" | |
1037 | |
1038 MOVNTQ" %%mm0, %0\n\t" | |
1039 MOVNTQ" %%mm1, 8%0\n\t" | |
1040 MOVNTQ" %%mm4, 16%0" | |
1041 | |
1042 :"=m"(*d) | |
1043 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | |
1044 :"memory"); | |
1045 d += 24; | |
1046 s += 8; | |
1047 } | |
2741 | 1048 __asm __volatile(SFENCE:::"memory"); |
1049 __asm __volatile(EMMS:::"memory"); | |
6492 | 1050 #endif |
1051 while(s < end) | |
1052 { | |
1053 register uint16_t bgr; | |
1054 bgr = *s++; | |
1055 *d++ = (bgr&0x1F)<<3; | |
1056 *d++ = (bgr&0x3E0)>>2; | |
1057 *d++ = (bgr&0x7C00)>>7; | |
1058 } | |
1059 } | |
1060 | |
1061 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size) |
6492 | 1062 { |
1063 const uint16_t *end; | |
1064 #ifdef HAVE_MMX | |
1065 const uint16_t *mm_end; | |
1066 #endif | |
1067 uint8_t *d = (uint8_t *)dst; | |
1068 const uint16_t *s = (const uint16_t *)src; | |
1069 end = s + src_size/2; | |
1070 #ifdef HAVE_MMX | |
1071 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
6605 | 1072 mm_end = end - 7; |
6492 | 1073 while(s < mm_end) |
2718 | 1074 { |
6492 | 1075 __asm __volatile( |
1076 PREFETCH" 32%1\n\t" | |
1077 "movq %1, %%mm0\n\t" | |
1078 "movq %1, %%mm1\n\t" | |
1079 "movq %1, %%mm2\n\t" | |
1080 "pand %2, %%mm0\n\t" | |
1081 "pand %3, %%mm1\n\t" | |
1082 "pand %4, %%mm2\n\t" | |
1083 "psllq $3, %%mm0\n\t" | |
1084 "psrlq $3, %%mm1\n\t" | |
1085 "psrlq $8, %%mm2\n\t" | |
1086 "movq %%mm0, %%mm3\n\t" | |
1087 "movq %%mm1, %%mm4\n\t" | |
1088 "movq %%mm2, %%mm5\n\t" | |
1089 "punpcklwd %5, %%mm0\n\t" | |
1090 "punpcklwd %5, %%mm1\n\t" | |
1091 "punpcklwd %5, %%mm2\n\t" | |
1092 "punpckhwd %5, %%mm3\n\t" | |
1093 "punpckhwd %5, %%mm4\n\t" | |
1094 "punpckhwd %5, %%mm5\n\t" | |
1095 "psllq $8, %%mm1\n\t" | |
1096 "psllq $16, %%mm2\n\t" | |
1097 "por %%mm1, %%mm0\n\t" | |
1098 "por %%mm2, %%mm0\n\t" | |
1099 "psllq $8, %%mm4\n\t" | |
1100 "psllq $16, %%mm5\n\t" | |
1101 "por %%mm4, %%mm3\n\t" | |
1102 "por %%mm5, %%mm3\n\t" | |
1103 | |
1104 "movq %%mm0, %%mm6\n\t" | |
1105 "movq %%mm3, %%mm7\n\t" | |
1106 | |
1107 "movq 8%1, %%mm0\n\t" | |
1108 "movq 8%1, %%mm1\n\t" | |
1109 "movq 8%1, %%mm2\n\t" | |
1110 "pand %2, %%mm0\n\t" | |
1111 "pand %3, %%mm1\n\t" | |
1112 "pand %4, %%mm2\n\t" | |
1113 "psllq $3, %%mm0\n\t" | |
1114 "psrlq $3, %%mm1\n\t" | |
1115 "psrlq $8, %%mm2\n\t" | |
1116 "movq %%mm0, %%mm3\n\t" | |
1117 "movq %%mm1, %%mm4\n\t" | |
1118 "movq %%mm2, %%mm5\n\t" | |
1119 "punpcklwd %5, %%mm0\n\t" | |
1120 "punpcklwd %5, %%mm1\n\t" | |
1121 "punpcklwd %5, %%mm2\n\t" | |
1122 "punpckhwd %5, %%mm3\n\t" | |
1123 "punpckhwd %5, %%mm4\n\t" | |
1124 "punpckhwd %5, %%mm5\n\t" | |
1125 "psllq $8, %%mm1\n\t" | |
1126 "psllq $16, %%mm2\n\t" | |
1127 "por %%mm1, %%mm0\n\t" | |
1128 "por %%mm2, %%mm0\n\t" | |
1129 "psllq $8, %%mm4\n\t" | |
1130 "psllq $16, %%mm5\n\t" | |
1131 "por %%mm4, %%mm3\n\t" | |
1132 "por %%mm5, %%mm3\n\t" | |
1133 :"=m"(*d) | |
1134 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) | |
1135 :"memory"); | |
1136 /* Borrowed 32 to 24 */ | |
1137 __asm __volatile( | |
1138 "movq %%mm0, %%mm4\n\t" | |
1139 "movq %%mm3, %%mm5\n\t" | |
1140 "movq %%mm6, %%mm0\n\t" | |
1141 "movq %%mm7, %%mm1\n\t" | |
1142 | |
1143 "movq %%mm4, %%mm6\n\t" | |
1144 "movq %%mm5, %%mm7\n\t" | |
1145 "movq %%mm0, %%mm2\n\t" | |
1146 "movq %%mm1, %%mm3\n\t" | |
1147 | |
1148 "psrlq $8, %%mm2\n\t" | |
1149 "psrlq $8, %%mm3\n\t" | |
1150 "psrlq $8, %%mm6\n\t" | |
1151 "psrlq $8, %%mm7\n\t" | |
1152 "pand %2, %%mm0\n\t" | |
1153 "pand %2, %%mm1\n\t" | |
1154 "pand %2, %%mm4\n\t" | |
1155 "pand %2, %%mm5\n\t" | |
1156 "pand %3, %%mm2\n\t" | |
1157 "pand %3, %%mm3\n\t" | |
1158 "pand %3, %%mm6\n\t" | |
1159 "pand %3, %%mm7\n\t" | |
1160 "por %%mm2, %%mm0\n\t" | |
1161 "por %%mm3, %%mm1\n\t" | |
1162 "por %%mm6, %%mm4\n\t" | |
1163 "por %%mm7, %%mm5\n\t" | |
1164 | |
1165 "movq %%mm1, %%mm2\n\t" | |
1166 "movq %%mm4, %%mm3\n\t" | |
1167 "psllq $48, %%mm2\n\t" | |
1168 "psllq $32, %%mm3\n\t" | |
1169 "pand %4, %%mm2\n\t" | |
1170 "pand %5, %%mm3\n\t" | |
1171 "por %%mm2, %%mm0\n\t" | |
1172 "psrlq $16, %%mm1\n\t" | |
1173 "psrlq $32, %%mm4\n\t" | |
1174 "psllq $16, %%mm5\n\t" | |
1175 "por %%mm3, %%mm1\n\t" | |
1176 "pand %6, %%mm5\n\t" | |
1177 "por %%mm5, %%mm4\n\t" | |
1178 | |
1179 MOVNTQ" %%mm0, %0\n\t" | |
1180 MOVNTQ" %%mm1, 8%0\n\t" | |
1181 MOVNTQ" %%mm4, 16%0" | |
1182 | |
1183 :"=m"(*d) | |
1184 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | |
1185 :"memory"); | |
1186 d += 24; | |
1187 s += 8; | |
1188 } | |
1189 __asm __volatile(SFENCE:::"memory"); | |
1190 __asm __volatile(EMMS:::"memory"); | |
1191 #endif | |
1192 while(s < end) | |
1193 { | |
1194 register uint16_t bgr; | |
1195 bgr = *s++; | |
1196 *d++ = (bgr&0x1F)<<3; | |
1197 *d++ = (bgr&0x7E0)>>3; | |
1198 *d++ = (bgr&0xF800)>>8; | |
1199 } | |
1200 } | |
2718 | 1201 |
1202 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size) |
6492 | 1203 { |
1204 const uint16_t *end; | |
1205 #ifdef HAVE_MMX | |
1206 const uint16_t *mm_end; | |
1207 #endif | |
1208 uint8_t *d = (uint8_t *)dst; | |
1209 const uint16_t *s = (const uint16_t *)src; | |
1210 end = s + src_size/2; | |
1211 #ifdef HAVE_MMX | |
1212 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
1213 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); | |
6605 | 1214 mm_end = end - 3; |
6492 | 1215 while(s < mm_end) |
1216 { | |
1217 __asm __volatile( | |
1218 PREFETCH" 32%1\n\t" | |
1219 "movq %1, %%mm0\n\t" | |
1220 "movq %1, %%mm1\n\t" | |
1221 "movq %1, %%mm2\n\t" | |
1222 "pand %2, %%mm0\n\t" | |
1223 "pand %3, %%mm1\n\t" | |
1224 "pand %4, %%mm2\n\t" | |
1225 "psllq $3, %%mm0\n\t" | |
1226 "psrlq $2, %%mm1\n\t" | |
1227 "psrlq $7, %%mm2\n\t" | |
1228 "movq %%mm0, %%mm3\n\t" | |
1229 "movq %%mm1, %%mm4\n\t" | |
1230 "movq %%mm2, %%mm5\n\t" | |
1231 "punpcklwd %%mm7, %%mm0\n\t" | |
1232 "punpcklwd %%mm7, %%mm1\n\t" | |
1233 "punpcklwd %%mm7, %%mm2\n\t" | |
1234 "punpckhwd %%mm7, %%mm3\n\t" | |
1235 "punpckhwd %%mm7, %%mm4\n\t" | |
1236 "punpckhwd %%mm7, %%mm5\n\t" | |
1237 "psllq $8, %%mm1\n\t" | |
1238 "psllq $16, %%mm2\n\t" | |
1239 "por %%mm1, %%mm0\n\t" | |
1240 "por %%mm2, %%mm0\n\t" | |
1241 "psllq $8, %%mm4\n\t" | |
1242 "psllq $16, %%mm5\n\t" | |
1243 "por %%mm4, %%mm3\n\t" | |
1244 "por %%mm5, %%mm3\n\t" | |
1245 MOVNTQ" %%mm0, %0\n\t" | |
1246 MOVNTQ" %%mm3, 8%0\n\t" | |
1247 :"=m"(*d) | |
1248 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) | |
1249 :"memory"); | |
1250 d += 16; | |
1251 s += 4; | |
1252 } | |
1253 __asm __volatile(SFENCE:::"memory"); | |
1254 __asm __volatile(EMMS:::"memory"); | |
1255 #endif | |
1256 while(s < end) | |
1257 { | |
9430 | 1258 #if 0 //slightly slower on athlon |
1259 int bgr= *s++; | |
1260 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9); | |
1261 #else | |
6492 | 1262 register uint16_t bgr; |
1263 bgr = *s++; | |
13423 | 1264 #ifdef WORDS_BIGENDIAN |
1265 *d++ = 0; | |
17586 | 1266 *d++ = (bgr&0x7C00)>>7; |
13423 | 1267 *d++ = (bgr&0x3E0)>>2; |
17586 | 1268 *d++ = (bgr&0x1F)<<3; |
13423 | 1269 #else |
6492 | 1270 *d++ = (bgr&0x1F)<<3; |
1271 *d++ = (bgr&0x3E0)>>2; | |
1272 *d++ = (bgr&0x7C00)>>7; | |
1273 *d++ = 0; | |
9430 | 1274 #endif |
13423 | 1275 |
1276 #endif | |
2718 | 1277 } |
6492 | 1278 } |
1279 | |
1280 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size) |
6492 | 1281 { |
1282 const uint16_t *end; | |
1283 #ifdef HAVE_MMX | |
1284 const uint16_t *mm_end; | |
2741 | 1285 #endif |
6492 | 1286 uint8_t *d = (uint8_t *)dst; |
1287 const uint16_t *s = (uint16_t *)src; | |
1288 end = s + src_size/2; | |
1289 #ifdef HAVE_MMX | |
1290 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
1291 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); | |
6605 | 1292 mm_end = end - 3; |
6492 | 1293 while(s < mm_end) |
1294 { | |
1295 __asm __volatile( | |
1296 PREFETCH" 32%1\n\t" | |
1297 "movq %1, %%mm0\n\t" | |
1298 "movq %1, %%mm1\n\t" | |
1299 "movq %1, %%mm2\n\t" | |
1300 "pand %2, %%mm0\n\t" | |
1301 "pand %3, %%mm1\n\t" | |
1302 "pand %4, %%mm2\n\t" | |
1303 "psllq $3, %%mm0\n\t" | |
1304 "psrlq $3, %%mm1\n\t" | |
1305 "psrlq $8, %%mm2\n\t" | |
1306 "movq %%mm0, %%mm3\n\t" | |
1307 "movq %%mm1, %%mm4\n\t" | |
1308 "movq %%mm2, %%mm5\n\t" | |
1309 "punpcklwd %%mm7, %%mm0\n\t" | |
1310 "punpcklwd %%mm7, %%mm1\n\t" | |
1311 "punpcklwd %%mm7, %%mm2\n\t" | |
1312 "punpckhwd %%mm7, %%mm3\n\t" | |
1313 "punpckhwd %%mm7, %%mm4\n\t" | |
1314 "punpckhwd %%mm7, %%mm5\n\t" | |
1315 "psllq $8, %%mm1\n\t" | |
1316 "psllq $16, %%mm2\n\t" | |
1317 "por %%mm1, %%mm0\n\t" | |
1318 "por %%mm2, %%mm0\n\t" | |
1319 "psllq $8, %%mm4\n\t" | |
1320 "psllq $16, %%mm5\n\t" | |
1321 "por %%mm4, %%mm3\n\t" | |
1322 "por %%mm5, %%mm3\n\t" | |
1323 MOVNTQ" %%mm0, %0\n\t" | |
1324 MOVNTQ" %%mm3, 8%0\n\t" | |
1325 :"=m"(*d) | |
1326 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) | |
1327 :"memory"); | |
1328 d += 16; | |
1329 s += 4; | |
1330 } | |
1331 __asm __volatile(SFENCE:::"memory"); | |
1332 __asm __volatile(EMMS:::"memory"); | |
1333 #endif | |
1334 while(s < end) | |
1335 { | |
1336 register uint16_t bgr; | |
1337 bgr = *s++; | |
13423 | 1338 #ifdef WORDS_BIGENDIAN |
1339 *d++ = 0; | |
17586 | 1340 *d++ = (bgr&0xF800)>>8; |
13423 | 1341 *d++ = (bgr&0x7E0)>>3; |
17586 | 1342 *d++ = (bgr&0x1F)<<3; |
13423 | 1343 #else |
6492 | 1344 *d++ = (bgr&0x1F)<<3; |
1345 *d++ = (bgr&0x7E0)>>3; | |
1346 *d++ = (bgr&0xF800)>>8; | |
1347 *d++ = 0; | |
13423 | 1348 #endif |
6492 | 1349 } |
2718 | 1350 } |
2694 | 1351 |
1352 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) |
2755 | 1353 { |
1354 #ifdef HAVE_MMX | |
6492 | 1355 /* TODO: unroll this loop */ |
2755 | 1356 asm volatile ( |
1357 "xor %%"REG_a", %%"REG_a" \n\t" |
1358 ASMALIGN16 |
2755 | 1359 "1: \n\t" |
1360 PREFETCH" 32(%0, %%"REG_a") \n\t" |
1361 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
2755 | 1362 "movq %%mm0, %%mm1 \n\t" |
1363 "movq %%mm0, %%mm2 \n\t" | |
1364 "pslld $16, %%mm0 \n\t" | |
1365 "psrld $16, %%mm1 \n\t" | |
6492 | 1366 "pand "MANGLE(mask32r)", %%mm0 \n\t" |
1367 "pand "MANGLE(mask32g)", %%mm2 \n\t" | |
1368 "pand "MANGLE(mask32b)", %%mm1 \n\t" | |
2755 | 1369 "por %%mm0, %%mm2 \n\t" |
1370 "por %%mm1, %%mm2 \n\t" | |
1371 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t" |
1372 "add $8, %%"REG_a" \n\t" |
1373 "cmp %2, %%"REG_a" \n\t" |
2755 | 1374 " jb 1b \n\t" |
1375 :: "r" (src), "r"(dst), "r" (src_size-7) |
1376 : "%"REG_a |
2755 | 1377 ); |
2766 | 1378 |
1379 __asm __volatile(SFENCE:::"memory"); | |
1380 __asm __volatile(EMMS:::"memory"); | |
2755 | 1381 #else |
6492 | 1382 unsigned i; |
1383 unsigned num_pixels = src_size >> 2; | |
2755 | 1384 for(i=0; i<num_pixels; i++) |
1385 { | |
1386 #ifdef WORDS_BIGENDIAN |
1387 dst[4*i + 1] = src[4*i + 3]; |
1388 dst[4*i + 2] = src[4*i + 2]; |
1389 dst[4*i + 3] = src[4*i + 1]; |
1390 #else |
1391 dst[4*i + 0] = src[4*i + 2]; |
1392 dst[4*i + 1] = src[4*i + 1]; |
1393 dst[4*i + 2] = src[4*i + 0]; |
1394 #endif |
2755 | 1395 } |
1396 #endif | |
1397 } | |
1398 | |
1399 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) |
5582 | 1400 { |
6492 | 1401 unsigned i; |
5582 | 1402 #ifdef HAVE_MMX |
1403 long mmx_size= 23 - src_size; |
5582 | 1404 asm volatile ( |
1405 "movq "MANGLE(mask24r)", %%mm5 \n\t" |
1406 "movq "MANGLE(mask24g)", %%mm6 \n\t" |
1407 "movq "MANGLE(mask24b)", %%mm7 \n\t" |
1408 ASMALIGN16 |
5582 | 1409 "1: \n\t" |
1410 PREFETCH" 32(%1, %%"REG_a") \n\t" |
1411 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG |
1412 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG |
1413 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B |
5582 | 1414 "psllq $16, %%mm0 \n\t" // 00 BGR BGR |
1415 "pand %%mm5, %%mm0 \n\t" |
1416 "pand %%mm6, %%mm1 \n\t" |
1417 "pand %%mm7, %%mm2 \n\t" |
1418 "por %%mm0, %%mm1 \n\t" |
1419 "por %%mm2, %%mm1 \n\t" |
1420 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG |
1421 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG |
1422 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B |
1423 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR |
5582 | 1424 "pand %%mm7, %%mm0 \n\t" |
1425 "pand %%mm5, %%mm1 \n\t" |
1426 "pand %%mm6, %%mm2 \n\t" |
1427 "por %%mm0, %%mm1 \n\t" |
1428 "por %%mm2, %%mm1 \n\t" |
1429 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B |
1430 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R |
1431 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR |
1432 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG |
5582 | 1433 "pand %%mm6, %%mm0 \n\t" |
1434 "pand %%mm7, %%mm1 \n\t" |
1435 "pand %%mm5, %%mm2 \n\t" |
1436 "por %%mm0, %%mm1 \n\t" |
1437 "por %%mm2, %%mm1 \n\t" |
1438 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t" |
1439 "add $24, %%"REG_a" \n\t" |
5582 | 1440 " js 1b \n\t" |
1441 : "+a" (mmx_size) | |
1442 : "r" (src-mmx_size), "r"(dst-mmx_size) | |
1443 ); | |
1444 | |
1445 __asm __volatile(SFENCE:::"memory"); | |
1446 __asm __volatile(EMMS:::"memory"); | |
1447 | |
6096 | 1448 if(mmx_size==23) return; //finished, was a multiple of 8 |
6492 | 1449 |
5582 | 1450 src+= src_size; |
1451 dst+= src_size; | |
6492 | 1452 src_size= 23-mmx_size; |
5582 | 1453 src-= src_size; |
1454 dst-= src_size; | |
1455 #endif | |
1456 for(i=0; i<src_size; i+=3) | |
1457 { | |
6492 | 1458 register uint8_t x; |
5582 | 1459 x = src[i + 2]; |
1460 dst[i + 1] = src[i + 1]; | |
1461 dst[i + 2] = src[i + 0]; | |
1462 dst[i + 0] = x; | |
1463 } | |
1464 } | |
1465 | |
5588 | 1466 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1467 long width, long height, |
1468 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) |
2701 | 1469 { |
1470 long y; |
1471 const long chromWidth= width>>1; |
2723 | 1472 for(y=0; y<height; y++) |
1473 { | |
2702 | 1474 #ifdef HAVE_MMX |
2723 | 1475 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway) |
1476 asm volatile( | |
1477 "xor %%"REG_a", %%"REG_a" \n\t" |
1478 ASMALIGN16 |
2723 | 1479 "1: \n\t" |
1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
1481 PREFETCH" 32(%2, %%"REG_a") \n\t" |
1482 PREFETCH" 32(%3, %%"REG_a") \n\t" |
1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
2723 | 1484 "movq %%mm0, %%mm2 \n\t" // U(0) |
1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
2723 | 1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |
1488 | |
1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
2723 | 1491 "movq %%mm3, %%mm4 \n\t" // Y(0) |
1492 "movq %%mm5, %%mm6 \n\t" // Y(8) | |
1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | |
1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | |
1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | |
1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | |
2702 | 1497 |
1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t" |
1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" |
1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t" |
1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" |
2702 | 1502 |
1503 "add $8, %%"REG_a" \n\t" |
1504 "cmp %4, %%"REG_a" \n\t" |
2723 | 1505 " jb 1b \n\t" |
1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
1507 : "%"REG_a |
2723 | 1508 ); |
2702 | 1509 #else |
1510 |
1511 #if defined ARCH_ALPHA && defined HAVE_MVI |
1512 #define pl2yuy2(n) \ |
1513 y1 = yc[n]; \ |
1514 y2 = yc2[n]; \ |
1515 u = uc[n]; \ |
1516 v = vc[n]; \ |
1517 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \ |
1518 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \ |
1519 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \ |
1520 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \ |
1521 yuv1 = (u << 8) + (v << 24); \ |
1522 yuv2 = yuv1 + y2; \ |
1523 yuv1 += y1; \ |
1524 qdst[n] = yuv1; \ |
1525 qdst2[n] = yuv2; |
1526 |
1527 int i; |
1528 uint64_t *qdst = (uint64_t *) dst; |
1529 uint64_t *qdst2 = (uint64_t *) (dst + dstStride); |
1530 const uint32_t *yc = (uint32_t *) ysrc; |
1531 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride); |
1532 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc; |
1533 for(i = 0; i < chromWidth; i += 8){ |
1534 uint64_t y1, y2, yuv1, yuv2; |
1535 uint64_t u, v; |
1536 /* Prefetch */ |
1537 asm("ldq $31,64(%0)" :: "r"(yc)); |
1538 asm("ldq $31,64(%0)" :: "r"(yc2)); |
1539 asm("ldq $31,64(%0)" :: "r"(uc)); |
1540 asm("ldq $31,64(%0)" :: "r"(vc)); |
1541 |
1542 pl2yuy2(0); |
1543 pl2yuy2(1); |
1544 pl2yuy2(2); |
1545 pl2yuy2(3); |
1546 |
1547 yc += 4; |
1548 yc2 += 4; |
1549 uc += 4; |
1550 vc += 4; |
1551 qdst += 4; |
1552 qdst2 += 4; |
1553 } |
1554 y++; |
1555 ysrc += lumStride; |
1556 dst += dstStride; |
1557 |
1558 #elif __WORDSIZE >= 64 |
2723 | 1559 int i; |
6492 | 1560 uint64_t *ldst = (uint64_t *) dst; |
1561 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1562 for(i = 0; i < chromWidth; i += 2){ | |
1563 uint64_t k, l; | |
1564 k = yc[0] + (uc[0] << 8) + | |
1565 (yc[1] << 16) + (vc[0] << 24); | |
1566 l = yc[2] + (uc[1] << 8) + | |
1567 (yc[3] << 16) + (vc[1] << 24); | |
1568 *ldst++ = k + (l << 32); | |
1569 yc += 4; | |
1570 uc += 2; | |
1571 vc += 2; | |
2723 | 1572 } |
6492 | 1573 |
1574 #else | |
1575 int i, *idst = (int32_t *) dst; | |
1576 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1577 for(i = 0; i < chromWidth; i++){ | |
1578 #ifdef WORDS_BIGENDIAN |
1579 *idst++ = (yc[0] << 24)+ (uc[0] << 16) + |
1580 (yc[1] << 8) + (vc[0] << 0); |
1581 #else |
6492 | 1582 *idst++ = yc[0] + (uc[0] << 8) + |
1583 (yc[1] << 16) + (vc[0] << 24); | |
1584 #endif |
6492 | 1585 yc += 2; |
1586 uc++; | |
1587 vc++; | |
1588 } | |
1589 #endif | |
2723 | 1590 #endif |
5588 | 1591 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) |
2723 | 1592 { |
1593 usrc += chromStride; | |
1594 vsrc += chromStride; | |
1595 } | |
1596 ysrc += lumStride; | |
1597 dst += dstStride; | |
2701 | 1598 } |
2723 | 1599 #ifdef HAVE_MMX |
1600 asm( EMMS" \n\t" | |
1601 SFENCE" \n\t" | |
1602 :::"memory"); | |
2702 | 1603 #endif |
2701 | 1604 } |
1605 | |
2724 | 1606 /** |
1607 * | |
1608 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1609 * problem for anyone then tell me, and I'll fix it) |
1610 */ | |
5588 | 1611 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1612 long width, long height, |
1613 long lumStride, long chromStride, long dstStride) |
5588 | 1614 { |
1615 //FIXME interpolate chroma | |
1616 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1617 } | |
1618 | |
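/* Illustrative usage sketch, not part of the original file: typical stride
 * values when converting one tightly packed YV12 frame to YUY2 with the
 * converter above. The wrapper name and buffer names are assumptions made
 * for this example only. */
static inline void example_yv12_frame_to_yuy2(const uint8_t *y, const uint8_t *u,
                                              const uint8_t *v, uint8_t *yuy2,
                                              long width, long height)
{
    /* YV12: full-resolution luma plane plus half-width, half-height chroma
     * planes; YUY2: 2 bytes per pixel. width should be a multiple of 16 and
     * height a multiple of 2, as noted in the comment above. */
    RENAME(yv12toyuy2)(y, u, v, yuy2, width, height,
                       width,     /* lumStride            */
                       width / 2, /* chromStride          */
                       width * 2  /* dstStride (in bytes) */);
}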
11068 | 1619 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1620 long width, long height, |
1621 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) |
11068 | 1622 { |
1623 long y; |
1624 const long chromWidth= width>>1; |
11068 | 1625 for(y=0; y<height; y++) |
1626 { | |
11072 | 1627 #ifdef HAVE_MMX |
1628 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway) |
1629 asm volatile( | |
1630 "xor %%"REG_a", %%"REG_a" \n\t" |
1631 ASMALIGN16 |
11072 | 1632 "1: \n\t" |
1633 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
1634 PREFETCH" 32(%2, %%"REG_a") \n\t" |
1635 PREFETCH" 32(%3, %%"REG_a") \n\t" |
1636 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
11072 | 1637 "movq %%mm0, %%mm2 \n\t" // U(0) |
1638 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
11072 | 1639 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1640 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |
1641 | |
1642 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
1643 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
11072 | 1644 "movq %%mm0, %%mm4 \n\t" // Y(0) |
1645 "movq %%mm2, %%mm6 \n\t" // Y(8) | |
1646 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) | |
1647 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) | |
1648 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) | |
1649 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) | |
1650 | |
1651 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t" |
1652 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" |
1653 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t" |
1654 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" |
11072 | 1655 |
1656 "add $8, %%"REG_a" \n\t" |
1657 "cmp %4, %%"REG_a" \n\t" |
11072 | 1658 " jb 1b \n\t" |
1659 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
1660 : "%"REG_a |
11072 | 1661 ); |
1662 #else | |
1663 //FIXME adapt the alpha asm code from yv12->yuy2 | |
1664 | |
11068 | 1665 #if __WORDSIZE >= 64 |
1666 int i; | |
1667 uint64_t *ldst = (uint64_t *) dst; | |
1668 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1669 for(i = 0; i < chromWidth; i += 2){ | |
1670 uint64_t k, l; | |
1671 k = uc[0] + (yc[0] << 8) + | |
1672 (vc[0] << 16) + (yc[1] << 24); | |
1673 l = uc[1] + (yc[2] << 8) + | |
1674 (vc[1] << 16) + (yc[3] << 24); | |
1675 *ldst++ = k + (l << 32); | |
1676 yc += 4; | |
1677 uc += 2; | |
1678 vc += 2; | |
1679 } | |
1680 | |
1681 #else | |
1682 int i, *idst = (int32_t *) dst; | |
1683 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1684 for(i = 0; i < chromWidth; i++){ | |
1685 #ifdef WORDS_BIGENDIAN |
1686 *idst++ = (uc[0] << 24)+ (yc[0] << 16) + |
1687 (vc[0] << 8) + (yc[1] << 0); |
1688 #else |
11068 | 1689 *idst++ = uc[0] + (yc[0] << 8) + |
1690 (vc[0] << 16) + (yc[1] << 24); | |
1691 #endif |
11068 | 1692 yc += 2; |
1693 uc++; | |
1694 vc++; | |
1695 } | |
1696 #endif | |
11072 | 1697 #endif |
11068 | 1698 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) |
1699 { | |
1700 usrc += chromStride; | |
1701 vsrc += chromStride; | |
1702 } | |
1703 ysrc += lumStride; | |
1704 dst += dstStride; | |
1705 } | |
11072 | 1706 #ifdef HAVE_MMX |
1707 asm( EMMS" \n\t" | |
1708 SFENCE" \n\t" | |
1709 :::"memory"); | |
1710 #endif | |
11068 | 1711 } |
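/* Editor's sketch (not part of the original file): the little-endian word store in
   the C fallback above, uc[0] + (yc[0]<<8) + (vc[0]<<16) + (yc[1]<<24), packs one
   chroma pair and two luma samples as U Y V Y.  Written out byte by byte: */
static inline void pack_uyvy_pair_bytes(uint8_t *dst, uint8_t y0, uint8_t y1,
                                        uint8_t u, uint8_t v)
{
	dst[0] = u;  /* low byte of the 32-bit word */
	dst[1] = y0; /* yc[0] <<  8                 */
	dst[2] = v;  /* vc[0] << 16                 */
	dst[3] = y1; /* yc[1] << 24 (high byte)     */
}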
1712 | |
1713 /** | |
1714 * | |
1715 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1716 * problem for anyone then tell me, and I'll fix it)
1717 */ | |
1718 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1719 long width, long height, |
1720 long lumStride, long chromStride, long dstStride) |
11068 | 1721 { |
1722 //FIXME interpolate chroma | |
1723 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1724 } | |
1725 | |
5588 | 1726 /** |
1727 * | |
1728 * width should be a multiple of 16 | |
1729 */ | |
1730 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1731 long width, long height, |
1732 long lumStride, long chromStride, long dstStride) |
5588 | 1733 { |
1734 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1735 } | |
1736 | |
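/* Usage sketch (hypothetical caller, not in the original file): the two wrappers
   above differ only in how often the chroma pointers advance (every second output
   line for 4:2:0, every line for 4:2:2).  Typical strides for tightly packed
   planes, e.g. converting one YV12 frame to packed UYVY: */
static void example_yv12_frame_to_uyvy(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                                       uint8_t *dst, long width, long height)
{
	/* luma stride = width, chroma stride = width/2, packed stride = 2*width */
	RENAME(yv12touyvy)(y, u, v, dst, width, height, width, width/2, 2*width);
}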
1737 /** | |
1738 * | |
1739 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1740 * problem for anyone then tell me, and I'll fix it)
1741 */ | |
3132 | 1742 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
1743 long width, long height, |
1744 long lumStride, long chromStride, long srcStride) |
2701 | 1745 { |
1746 long y; |
1747 const long chromWidth= width>>1; |
2724 | 1748 for(y=0; y<height; y+=2) |
1749 { | |
2704 | 1750 #ifdef HAVE_MMX |
2724 | 1751 asm volatile( |
1752 "xor %%"REG_a", %%"REG_a" \n\t" |
2724 | 1753 "pcmpeqw %%mm7, %%mm7 \n\t" |
1754 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
1755 ASMALIGN16 |
2724 | 1756 "1: \n\t" |
1757 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
1758 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1759 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
2724 | 1760 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) |
1761 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | |
1762 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | |
1763 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | |
1764 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
1765 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
1766 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
1767 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
1768 | |
1769 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t" |
2704 | 1770 |
1771 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8) |
1772 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12) |
2724 | 1773 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) |
1774 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | |
1775 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | |
1776 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | |
1777 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
1778 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
1779 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
1780 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2704 | 1781 |
1782 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t" |
2724 | 1783 |
1784 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
1785 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
1786 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
1787 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
1788 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
1789 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
1790 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
1791 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2704 | 1792 |
1793 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" |
1794 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" |
2724 | 1795 |
1796 "add $8, %%"REG_a" \n\t" |
1797 "cmp %4, %%"REG_a" \n\t" |
2724 | 1798 " jb 1b \n\t" |
1799 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
1800 : "memory", "%"REG_a |
2725 | 1801 ); |
2704 | 1802 |
2806 | 1803 ydst += lumStride; |
1804 src += srcStride; | |
1805 | |
2725 | 1806 asm volatile( |
1807 "xor %%"REG_a", %%"REG_a" \n\t" |
1808 ASMALIGN16 |
2724 | 1809 "1: \n\t" |
1810 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
1811 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1812 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
1813 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8) |
1814 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12) |
2724 | 1815 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
1816 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
1817 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
1818 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
1819 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
1820 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2704 | 1821 |
1822 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t" |
1823 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t" |
2724 | 1824 |
1825 "add $8, %%"REG_a" \n\t" |
1826 "cmp %4, %%"REG_a" \n\t" |
2724 | 1827 " jb 1b \n\t" |
2704 | 1828 |
1829 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
1830 : "memory", "%"REG_a |
2724 | 1831 ); |
2704 | 1832 #else |
1833 long i; |
2724 | 1834 for(i=0; i<chromWidth; i++) |
1835 { | |
1836 ydst[2*i+0] = src[4*i+0]; | |
1837 udst[i] = src[4*i+1]; | |
1838 ydst[2*i+1] = src[4*i+2]; | |
1839 vdst[i] = src[4*i+3]; | |
1840 } | |
1841 ydst += lumStride; | |
1842 src += srcStride; | |
1843 | |
1844 for(i=0; i<chromWidth; i++) | |
1845 { | |
1846 ydst[2*i+0] = src[4*i+0]; | |
1847 ydst[2*i+1] = src[4*i+2]; | |
1848 } | |
1849 #endif | |
1850 udst += chromStride; | |
1851 vdst += chromStride; | |
1852 ydst += lumStride; | |
1853 src += srcStride; | |
2701 | 1854 } |
2724 | 1855 #ifdef HAVE_MMX |
2847 | 1856 asm volatile( EMMS" \n\t" |
1857 SFENCE" \n\t" | |
1858 :::"memory"); | |
2704 | 1859 #endif |
2723 | 1860 } |
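/* Reference form (illustrative sketch, not in the original file): what one pass of
   the two MMX loops above computes for a pair of input lines, in plain C.  As in
   the C fallback, chroma is kept from the even line only. */
static void yuy2_linepair_to_planes(const uint8_t *src_even, const uint8_t *src_odd,
                                    uint8_t *ydst_even, uint8_t *ydst_odd,
                                    uint8_t *udst, uint8_t *vdst, long chromWidth)
{
	long i;
	for(i=0; i<chromWidth; i++)
	{
		ydst_even[2*i+0] = src_even[4*i+0];
		udst[i]          = src_even[4*i+1];
		ydst_even[2*i+1] = src_even[4*i+2];
		vdst[i]          = src_even[4*i+3];
		ydst_odd [2*i+0] = src_odd [4*i+0]; /* odd-line chroma is dropped */
		ydst_odd [2*i+1] = src_odd [4*i+2];
	}
}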
2801 | 1861 |
1862 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, |
1863 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
1864 long width, long height, long lumStride, long chromStride) |
1865 { |
1866 /* Y Plane */ |
1867 memcpy(ydst, ysrc, width*height); |
1868 |
1869 /* XXX: implement upscaling for U,V */ |
1870 } |
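/* One possible fill-in for the XXX above (an assumption, not the author's code):
   nearest-neighbour upscaling of the quarter-resolution 4:1:0 chroma planes to the
   half-resolution planes YV12 expects, i.e. each source sample covers a 2x2 block
   of destination samples. */
static void chroma_nn_upsample_2x(const uint8_t *src, uint8_t *dst,
                                  long srcW, long srcH,
                                  long srcStride, long dstStride)
{
	long x, y;
	for(y=0; y<srcH; y++){
		const uint8_t *s = src + y*srcStride;
		uint8_t *d0 = dst + (2*y  )*dstStride;
		uint8_t *d1 = dst + (2*y+1)*dstStride;
		for(x=0; x<srcW; x++){
			d0[2*x] = d0[2*x+1] = s[x];
			d1[2*x] = d1[2*x+1] = s[x];
		}
	}
}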
1871 |
1872 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride) |
1873 { |
1874 long x,y; |
1875 |
9256 | 1876 dst[0]= src[0]; |
1877 | |
1878 // first line |
9256 | 1879 for(x=0; x<srcWidth-1; x++){ |
1880 dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1881 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1882 } |
9256 | 1883 dst[2*srcWidth-1]= src[srcWidth-1]; |
1884 | |
1885 dst+= dstStride; | |
1886 |
1887 for(y=1; y<srcHeight; y++){ |
1888 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1889 const long mmxSize= srcWidth&~15; |
1890 asm volatile( |
1891 "mov %4, %%"REG_a" \n\t" |
1892 "1: \n\t" |
1893 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
1894 "movq (%1, %%"REG_a"), %%mm1 \n\t" |
1895 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" |
1896 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" |
1897 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" |
1898 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" |
9256 | 1899 PAVGB" %%mm0, %%mm5 \n\t" |
1900 PAVGB" %%mm0, %%mm3 \n\t" | |
1901 PAVGB" %%mm0, %%mm5 \n\t" | |
1902 PAVGB" %%mm0, %%mm3 \n\t" | |
1903 PAVGB" %%mm1, %%mm4 \n\t" | |
1904 PAVGB" %%mm1, %%mm2 \n\t" | |
1905 PAVGB" %%mm1, %%mm4 \n\t" | |
1906 PAVGB" %%mm1, %%mm2 \n\t" | |
1907 "movq %%mm5, %%mm7 \n\t" | |
1908 "movq %%mm4, %%mm6 \n\t" | |
1909 "punpcklbw %%mm3, %%mm5 \n\t" | |
1910 "punpckhbw %%mm3, %%mm7 \n\t" | |
1911 "punpcklbw %%mm2, %%mm4 \n\t" | |
1912 "punpckhbw %%mm2, %%mm6 \n\t" | |
1913 #if 1 |
1914 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t" |
1915 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
1916 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t" |
1917 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
1918 #else |
1919 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" |
1920 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
1921 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" |
1922 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
1923 #endif |
1924 "add $8, %%"REG_a" \n\t" |
1925 " js 1b \n\t" |
9256 | 1926 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), |
1927 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), |
1928 "g" (-mmxSize) |
1929 : "%"REG_a |
1930 |
1931 ); |
1932 #else |
1933 const long mmxSize=1; |
9256 | 1934 #endif |
1935 dst[0 ]= (3*src[0] + src[srcStride])>>2; | |
1936 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; | |
1937 |
9256 | 1938 for(x=mmxSize-1; x<srcWidth-1; x++){ |
1939 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; |
1940 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; |
1941 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; |
1942 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; |
1943 } |
9256 | 1944 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; |
1945 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; | |
1946 |
1947 dst+=dstStride*2; |
1948 src+=srcStride; |
1949 } |
1950 |
1951 // last line |
9256 | 1952 #if 1 |
1953 dst[0]= src[0]; | |
1954 | |
1955 for(x=0; x<srcWidth-1; x++){ | |
1956 dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1957 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1958 } | |
1959 dst[2*srcWidth-1]= src[srcWidth-1]; | |
1960 #else | |
1961 for(x=0; x<srcWidth; x++){ |
1962 dst[2*x+0]= |
1963 dst[2*x+1]= src[x]; |
1964 } |
9256 | 1965 #endif |
1966 | |
1967 #ifdef HAVE_MMX |
1968 asm volatile( EMMS" \n\t" |
1969 SFENCE" \n\t" |
1970 :::"memory"); |
1971 #endif |
1972 } |
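/* Usage sketch (an assumption, not the author's code): planar2x() can supply the
   chroma upscaling that yvu9toyv12() above leaves unimplemented -- each
   width/4 x height/4 chroma plane is doubled once in both directions with the
   3:1 linear filter, giving the width/2 x height/2 planes YV12 expects.  The
   stride parameter names below are hypothetical. */
static void example_yvu9_chroma_upscale(const uint8_t *usrc, const uint8_t *vsrc,
                                        uint8_t *udst, uint8_t *vdst,
                                        long width, long height,
                                        long srcChromStride, long dstChromStride)
{
	RENAME(planar2x)(usrc, udst, width/4, height/4, srcChromStride, dstChromStride);
	RENAME(planar2x)(vsrc, vdst, width/4, height/4, srcChromStride, dstChromStride);
}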
1973 |
2801 | 1974 /** |
1975 * | |
1976 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1977 * problem for anyone then tell me, and I'll fix it)
3132 | 1978 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
2801 | 1979 */ |
3132 | 1980 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
1981 long width, long height, |
1982 long lumStride, long chromStride, long srcStride) |
2801 | 1983 { |
1984 long y; |
1985 const long chromWidth= width>>1; |
2801 | 1986 for(y=0; y<height; y+=2) |
1987 { | |
2847 | 1988 #ifdef HAVE_MMX |
1989 asm volatile( | |
1990 "xorl %%eax, %%eax \n\t" | |
1991 "pcmpeqw %%mm7, %%mm7 \n\t" | |
1992 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
1993 ASMALIGN16 |
2847 | 1994 "1: \n\t" |
1995 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
1996 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | |
1997 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | |
1998 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | |
1999 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | |
2000 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | |
2001 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | |
2002 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
2003 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
2004 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
2005 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
2006 | |
2007 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | |
2008 | |
2009 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) | |
2010 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) | |
2011 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | |
2012 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | |
2013 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | |
2014 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | |
2015 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
2016 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
2017 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
2018 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2019 | |
2020 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | |
2021 | |
2022 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
2023 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
2024 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
2025 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
2026 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
2027 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
2028 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
2029 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2030 | |
2031 MOVNTQ" %%mm0, (%3, %%eax) \n\t" | |
2032 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | |
2033 | |
2034 "addl $8, %%eax \n\t" | |
2035 "cmpl %4, %%eax \n\t" | |
2036 " jb 1b \n\t" | |
9394 | 2037 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2847 | 2038 : "memory", "%eax" |
2039 ); | |
2040 | |
2041 ydst += lumStride; | |
2042 src += srcStride; | |
2043 | |
2044 asm volatile( | |
2045 "xorl %%eax, %%eax \n\t" | |
2046 ASMALIGN16 |
2847 | 2047 "1: \n\t" |
2048 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
2049 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | |
2050 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | |
2051 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | |
2052 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | |
2053 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
2054 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
2055 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
2056 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
2057 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
2058 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2059 | |
2060 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | |
2061 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | |
2062 | |
2063 "addl $8, %%eax \n\t" | |
2064 "cmpl %4, %%eax \n\t" | |
2065 " jb 1b \n\t" | |
2066 | |
9394 | 2067 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2847 | 2068 : "memory", "%eax" |
2069 ); | |
2070 #else | |
2071 long i; |
2801 | 2072 for(i=0; i<chromWidth; i++) |
2073 { | |
2074 udst[i] = src[4*i+0]; | |
2075 ydst[2*i+0] = src[4*i+1]; | |
2076 vdst[i] = src[4*i+2]; | |
2077 ydst[2*i+1] = src[4*i+3]; | |
2078 } | |
2079 ydst += lumStride; | |
2080 src += srcStride; | |
2081 | |
2082 for(i=0; i<chromWidth; i++) | |
2083 { | |
2084 ydst[2*i+0] = src[4*i+1]; | |
2085 ydst[2*i+1] = src[4*i+3]; | |
2086 } | |
2847 | 2087 #endif |
2801 | 2088 udst += chromStride; |
2089 vdst += chromStride; | |
2090 ydst += lumStride; | |
2091 src += srcStride; | |
2092 } | |
2847 | 2093 #ifdef HAVE_MMX |
2094 asm volatile( EMMS" \n\t" | |
2095 SFENCE" \n\t" | |
2096 :::"memory"); | |
2097 #endif | |
2801 | 2098 } |
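/* Note (illustrative, not in the original file): uyvytoyv12() above is the
   byte-swapped twin of yuy2toyv12() -- UYVY stores U Y V Y per pixel pair where
   YUY2 stores Y U Y V, which is why the pand/psrlw roles of luma and chroma are
   exchanged between the two MMX loops.  One UYVY pixel pair unpacks as: */
static void uyvy_pair_to_planes(const uint8_t *s, uint8_t *y0, uint8_t *y1,
                                uint8_t *u, uint8_t *v)
{
	*u  = s[0];
	*y0 = s[1];
	*v  = s[2];
	*y1 = s[3];
}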
2099 | |
3132 | 2100 /** |
2101 * | |
2102 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
2103 * problem for anyone then tell me, and I'll fix it)
4622 | 2104 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
3132 | 2105 */ |
2106 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
2107 long width, long height, |
2108 long lumStride, long chromStride, long srcStride) |
3132 | 2109 { |
2110 long y; |
2111 const long chromWidth= width>>1; |
4622 | 2112 #ifdef HAVE_MMX |
2113 for(y=0; y<height-2; y+=2) | |
2114 { | |
2115 long i; |
4622 | 2116 for(i=0; i<2; i++) |
2117 { | |
2118 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2119 "mov %2, %%"REG_a" \n\t" |
4923 | 2120 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
2121 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4622 | 2122 "pxor %%mm7, %%mm7 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2123 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
18104
7b408d60de9e
add support for intel mac. mp3lib is not fixed yet.
nplourde
parents:
17670
diff
changeset
|
2124 ASMALIGN16 |
4622 | 2125 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2126 PREFETCH" 64(%0, %%"REG_b") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2127 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2128 "movd 3(%0, %%"REG_b"), %%mm1 \n\t" |
4622 | 2129 "punpcklbw %%mm7, %%mm0 \n\t" |
2130 "punpcklbw %%mm7, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2131 "movd 6(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2132 "movd 9(%0, %%"REG_b"), %%mm3 \n\t" |
4622 | 2133 "punpcklbw %%mm7, %%mm2 \n\t" |
2134 "punpcklbw %%mm7, %%mm3 \n\t" | |
2135 "pmaddwd %%mm6, %%mm0 \n\t" | |
2136 "pmaddwd %%mm6, %%mm1 \n\t" | |
2137 "pmaddwd %%mm6, %%mm2 \n\t" | |
2138 "pmaddwd %%mm6, %%mm3 \n\t" | |
2139 #ifndef FAST_BGR2YV12 | |
2140 "psrad $8, %%mm0 \n\t" | |
2141 "psrad $8, %%mm1 \n\t" | |
2142 "psrad $8, %%mm2 \n\t" | |
2143 "psrad $8, %%mm3 \n\t" | |
2144 #endif | |
2145 "packssdw %%mm1, %%mm0 \n\t" | |
2146 "packssdw %%mm3, %%mm2 \n\t" | |
2147 "pmaddwd %%mm5, %%mm0 \n\t" | |
2148 "pmaddwd %%mm5, %%mm2 \n\t" | |
2149 "packssdw %%mm2, %%mm0 \n\t" | |
2150 "psraw $7, %%mm0 \n\t" | |
2151 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2152 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2153 "movd 15(%0, %%"REG_b"), %%mm1 \n\t" |
4622 | 2154 "punpcklbw %%mm7, %%mm4 \n\t" |
2155 "punpcklbw %%mm7, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2156 "movd 18(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2157 "movd 21(%0, %%"REG_b"), %%mm3 \n\t" |
4622 | 2158 "punpcklbw %%mm7, %%mm2 \n\t" |
2159 "punpcklbw %%mm7, %%mm3 \n\t" | |
2160 "pmaddwd %%mm6, %%mm4 \n\t" | |
2161 "pmaddwd %%mm6, %%mm1 \n\t" | |
2162 "pmaddwd %%mm6, %%mm2 \n\t" | |
2163 "pmaddwd %%mm6, %%mm3 \n\t" | |
2164 #ifndef FAST_BGR2YV12 | |
2165 "psrad $8, %%mm4 \n\t" | |
2166 "psrad $8, %%mm1 \n\t" | |
2167 "psrad $8, %%mm2 \n\t" | |
2168 "psrad $8, %%mm3 \n\t" | |
2169 #endif | |
2170 "packssdw %%mm1, %%mm4 \n\t" | |
2171 "packssdw %%mm3, %%mm2 \n\t" | |
2172 "pmaddwd %%mm5, %%mm4 \n\t" | |
2173 "pmaddwd %%mm5, %%mm2 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2174 "add $24, %%"REG_b" \n\t" |
4622 | 2175 "packssdw %%mm2, %%mm4 \n\t" |
2176 "psraw $7, %%mm4 \n\t" | |
2177 | |
2178 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 2179 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
4622 | 2180 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2181 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2182 "add $8, %%"REG_a" \n\t" |
4622 | 2183 " js 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2184 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2185 : "%"REG_a, "%"REG_b |
4622 | 2186 ); |
2187 ydst += lumStride; | |
2188 src += srcStride; | |
2189 } | |
2190 src -= srcStride*2; | |
2191 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2192 "mov %4, %%"REG_a" \n\t" |
4923 | 2193 "movq "MANGLE(w1111)", %%mm5 \n\t" |
2194 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4622 | 2195 "pxor %%mm7, %%mm7 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2196 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2197 "add %%"REG_b", %%"REG_b" \n\t" |
18104
7b408d60de9e
add support for intel mac. mp3lib is not fixed yet.
nplourde
parents:
17670
diff
changeset
|
2198 ASMALIGN16 |
4622 | 2199 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2200 PREFETCH" 64(%0, %%"REG_b") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2201 PREFETCH" 64(%1, %%"REG_b") \n\t" |
4622 | 2202 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2203 "movq (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2204 "movq (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2205 "movq 6(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2206 "movq 6(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2207 PAVGB" %%mm1, %%mm0 \n\t" |
2208 PAVGB" %%mm3, %%mm2 \n\t" | |
2209 "movq %%mm0, %%mm1 \n\t" | |
2210 "movq %%mm2, %%mm3 \n\t" | |
2211 "psrlq $24, %%mm0 \n\t" | |
2212 "psrlq $24, %%mm2 \n\t" | |
2213 PAVGB" %%mm1, %%mm0 \n\t" | |
2214 PAVGB" %%mm3, %%mm2 \n\t" | |
2215 "punpcklbw %%mm7, %%mm0 \n\t" | |
2216 "punpcklbw %%mm7, %%mm2 \n\t" | |
2217 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2218 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2219 "movd (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2220 "movd 3(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2221 "movd 3(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2222 "punpcklbw %%mm7, %%mm0 \n\t" |
2223 "punpcklbw %%mm7, %%mm1 \n\t" | |
2224 "punpcklbw %%mm7, %%mm2 \n\t" | |
2225 "punpcklbw %%mm7, %%mm3 \n\t" | |
2226 "paddw %%mm1, %%mm0 \n\t" | |
2227 "paddw %%mm3, %%mm2 \n\t" | |
2228 "paddw %%mm2, %%mm0 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2229 "movd 6(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2230 "movd 6(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2231 "movd 9(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2232 "movd 9(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2233 "punpcklbw %%mm7, %%mm4 \n\t" |
2234 "punpcklbw %%mm7, %%mm1 \n\t" | |
2235 "punpcklbw %%mm7, %%mm2 \n\t" | |
2236 "punpcklbw %%mm7, %%mm3 \n\t" | |
2237 "paddw %%mm1, %%mm4 \n\t" | |
2238 "paddw %%mm3, %%mm2 \n\t" | |
2239 "paddw %%mm4, %%mm2 \n\t" | |
2240 "psrlw $2, %%mm0 \n\t" | |
2241 "psrlw $2, %%mm2 \n\t" | |
2242 #endif | |
4923 | 2243 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
2244 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 2245 |
2246 "pmaddwd %%mm0, %%mm1 \n\t" | |
2247 "pmaddwd %%mm2, %%mm3 \n\t" | |
2248 "pmaddwd %%mm6, %%mm0 \n\t" | |
2249 "pmaddwd %%mm6, %%mm2 \n\t" | |
2250 #ifndef FAST_BGR2YV12 | |
2251 "psrad $8, %%mm0 \n\t" | |
2252 "psrad $8, %%mm1 \n\t" | |
2253 "psrad $8, %%mm2 \n\t" | |
2254 "psrad $8, %%mm3 \n\t" | |
2255 #endif | |
2256 "packssdw %%mm2, %%mm0 \n\t" | |
2257 "packssdw %%mm3, %%mm1 \n\t" | |
2258 "pmaddwd %%mm5, %%mm0 \n\t" | |
2259 "pmaddwd %%mm5, %%mm1 \n\t" | |
2260 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
2261 "psraw $7, %%mm0 \n\t" | |
2262 | |
2263 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2264 "movq 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2265 "movq 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2266 "movq 18(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2267 "movq 18(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2268 PAVGB" %%mm1, %%mm4 \n\t" |
2269 PAVGB" %%mm3, %%mm2 \n\t" | |
2270 "movq %%mm4, %%mm1 \n\t" | |
2271 "movq %%mm2, %%mm3 \n\t" | |
2272 "psrlq $24, %%mm4 \n\t" | |
2273 "psrlq $24, %%mm2 \n\t" | |
2274 PAVGB" %%mm1, %%mm4 \n\t" | |
2275 PAVGB" %%mm3, %%mm2 \n\t" | |
2276 "punpcklbw %%mm7, %%mm4 \n\t" | |
2277 "punpcklbw %%mm7, %%mm2 \n\t" | |
2278 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2279 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2280 "movd 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2281 "movd 15(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2282 "movd 15(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2283 "punpcklbw %%mm7, %%mm4 \n\t" |
2284 "punpcklbw %%mm7, %%mm1 \n\t" | |
2285 "punpcklbw %%mm7, %%mm2 \n\t" | |
2286 "punpcklbw %%mm7, %%mm3 \n\t" | |
2287 "paddw %%mm1, %%mm4 \n\t" | |
2288 "paddw %%mm3, %%mm2 \n\t" | |
2289 "paddw %%mm2, %%mm4 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2290 "movd 18(%0, %%"REG_b"), %%mm5 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2291 "movd 18(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2292 "movd 21(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2293 "movd 21(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2294 "punpcklbw %%mm7, %%mm5 \n\t" |
2295 "punpcklbw %%mm7, %%mm1 \n\t" | |
2296 "punpcklbw %%mm7, %%mm2 \n\t" | |
2297 "punpcklbw %%mm7, %%mm3 \n\t" | |
2298 "paddw %%mm1, %%mm5 \n\t" | |
2299 "paddw %%mm3, %%mm2 \n\t" | |
2300 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 2301 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4622 | 2302 "psrlw $2, %%mm4 \n\t" |
2303 "psrlw $2, %%mm2 \n\t" | |
2304 #endif | |
4923 | 2305 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
2306 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 2307 |
2308 "pmaddwd %%mm4, %%mm1 \n\t" | |
2309 "pmaddwd %%mm2, %%mm3 \n\t" | |
2310 "pmaddwd %%mm6, %%mm4 \n\t" | |
2311 "pmaddwd %%mm6, %%mm2 \n\t" | |
2312 #ifndef FAST_BGR2YV12 | |
2313 "psrad $8, %%mm4 \n\t" | |
2314 "psrad $8, %%mm1 \n\t" | |
2315 "psrad $8, %%mm2 \n\t" | |
2316 "psrad $8, %%mm3 \n\t" | |
2317 #endif | |
2318 "packssdw %%mm2, %%mm4 \n\t" | |
2319 "packssdw %%mm3, %%mm1 \n\t" | |
2320 "pmaddwd %%mm5, %%mm4 \n\t" | |
2321 "pmaddwd %%mm5, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2322 "add $24, %%"REG_b" \n\t" |
4622 | 2323 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2324 "psraw $7, %%mm4 \n\t" | |
2325 | |
2326 "movq %%mm0, %%mm1 \n\t" | |
2327 "punpckldq %%mm4, %%mm0 \n\t" | |
2328 "punpckhdq %%mm4, %%mm1 \n\t" | |
2329 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 2330 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2331 "movd %%mm0, (%2, %%"REG_a") \n\t" |
4622 | 2332 "punpckhdq %%mm0, %%mm0 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2333 "movd %%mm0, (%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2334 "add $4, %%"REG_a" \n\t" |
4622 | 2335 " js 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2336 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2337 : "%"REG_a, "%"REG_b |
4622 | 2338 ); |
2339 | |
2340 udst += chromStride; | |
2341 vdst += chromStride; | |
2342 src += srcStride*2; | |
2343 } | |
2344 | |
2345 asm volatile( EMMS" \n\t" | |
2346 SFENCE" \n\t" | |
2347 :::"memory"); | |
2348 #else | |
2349 y=0; | |
2350 #endif | |
2351 for(; y<height; y+=2) | |
3132 | 2352 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2353 long i; |
3132 | 2354 for(i=0; i<chromWidth; i++) |
2355 { | |
2356 unsigned int b= src[6*i+0]; | |
2357 unsigned int g= src[6*i+1]; | |
2358 unsigned int r= src[6*i+2]; | |
2801 | 2359 |
3633 | 2360 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
2361 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
2362 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
3132 | 2363 |
2364 udst[i] = U; | |
2365 vdst[i] = V; | |
2366 ydst[2*i] = Y; | |
2367 | |
2368 b= src[6*i+3]; | |
2369 g= src[6*i+4]; | |
2370 r= src[6*i+5]; | |
2371 | |
3633 | 2372 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2373 ydst[2*i+1] = Y; |
2374 } | |
2375 ydst += lumStride; | |
2376 src += srcStride; | |
2377 | |
2378 for(i=0; i<chromWidth; i++) | |
2379 { | |
2380 unsigned int b= src[6*i+0]; | |
2381 unsigned int g= src[6*i+1]; | |
2382 unsigned int r= src[6*i+2]; | |
2383 | |
3633 | 2384 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2385 |
2386 ydst[2*i] = Y; | |
2387 | |
2388 b= src[6*i+3]; | |
2389 g= src[6*i+4]; | |
2390 r= src[6*i+5]; | |
2391 | |
3633 | 2392 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2393 ydst[2*i+1] = Y; |
2394 } | |
2395 udst += chromStride; | |
2396 vdst += chromStride; | |
2397 ydst += lumStride; | |
2398 src += srcStride; | |
2399 } | |
2400 } | |
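/* Reference only (not part of the original file): the integer expressions above
   implement a studio-swing RGB->YUV conversion; the exact RY/GY/BY/RGB2YUV_SHIFT
   constants are defined elsewhere in the library.  For comparison, a commonly
   quoted floating-point form of the BT.601 conversion: */
static void rgb_to_yuv601_float_ref(uint8_t r, uint8_t g, uint8_t b,
                                    uint8_t *Y, uint8_t *U, uint8_t *V)
{
	*Y = (uint8_t)( 16.0 + 0.257*r + 0.504*g + 0.098*b + 0.5);
	*U = (uint8_t)(128.0 - 0.148*r - 0.291*g + 0.439*b + 0.5);
	*V = (uint8_t)(128.0 + 0.439*r - 0.368*g - 0.071*b + 0.5);
}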
5337 | 2401 |
2402 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, | |
2403 long width, long height, long src1Stride, |
2404 long src2Stride, long dstStride){ |
2405 long h; |
5337 | 2406 |
2407 for(h=0; h < height; h++) | |
2408 { | |
2409 long w; |
5337 | 2410 |
2411 #ifdef HAVE_MMX | |
2412 #ifdef HAVE_SSE2 | |
2413 asm( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2414 "xor %%"REG_a", %%"REG_a" \n\t" |
5337 | 2415 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2416 PREFETCH" 64(%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2417 PREFETCH" 64(%2, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2418 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2419 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2420 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" |
5337 | 2421 "punpcklbw %%xmm2, %%xmm0 \n\t" |
2422 "punpckhbw %%xmm2, %%xmm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2423 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2424 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2425 "add $16, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2426 "cmp %3, %%"REG_a" \n\t" |
5337 | 2427 " jb 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2428 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2429 : "memory", "%"REG_a"" |
5337 | 2430 ); |
2431 #else | |
2432 asm( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2433 "xor %%"REG_a", %%"REG_a" \n\t" |
5337 | 2434 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2435 PREFETCH" 64(%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2436 PREFETCH" 64(%2, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2437 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2438 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" |
5337 | 2439 "movq %%mm0, %%mm1 \n\t" |
2440 "movq %%mm2, %%mm3 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2441 "movq (%2, %%"REG_a"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2442 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" |
5337 | 2443 "punpcklbw %%mm4, %%mm0 \n\t" |
2444 "punpckhbw %%mm4, %%mm1 \n\t" | |
2445 "punpcklbw %%mm5, %%mm2 \n\t" | |
2446 "punpckhbw %%mm5, %%mm3 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2447 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2448 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2449 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2450 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2451 "add $16, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2452 "cmp %3, %%"REG_a" \n\t" |
5337 | 2453 " jb 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2454 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2455 : "memory", "%"REG_a |
5337 | 2456 ); |
2457 #endif | |
2458 for(w= (width&(~15)); w < width; w++) | |
2459 { | |
2460 dest[2*w+0] = src1[w]; | |
2461 dest[2*w+1] = src2[w]; | |
2462 } | |
2463 #else | |
2464 for(w=0; w < width; w++) | |
2465 { | |
2466 dest[2*w+0] = src1[w]; | |
2467 dest[2*w+1] = src2[w]; | |
2468 } | |
2469 #endif | |
2470 dest += dstStride; | |
2471 src1 += src1Stride; | |
2472 src2 += src2Stride; | |
2473 } | |
2474 #ifdef HAVE_MMX | |
2475 asm( | |
2476 EMMS" \n\t" | |
2477 SFENCE" \n\t" | |
2478 ::: "memory" | |
2479 ); | |
2480 #endif | |
2481 } | |
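/* Usage sketch (hypothetical, not in the original file): interleaveBytes() simply
   zips two byte planes into one, e.g. packing separate half-size (in each
   direction) U and V planes into a single interleaved UV plane: */
static void example_interleave_uv(uint8_t *u, uint8_t *v, uint8_t *uv,
                                  long width, long height,
                                  long uStride, long vStride, long uvStride)
{
	RENAME(interleaveBytes)(u, v, uv, width/2, height/2, uStride, vStride, uvStride);
}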
2482 |
2483 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, |
2484 uint8_t *dst1, uint8_t *dst2, |
2485 long width, long height, |
2486 long srcStride1, long srcStride2, |
2487 long dstStride1, long dstStride2) |
2488 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2489 long y,x,w,h; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2490 w=width/2; h=height/2; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2491 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2492 asm volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2493 PREFETCH" %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2494 PREFETCH" %1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2495 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2496 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2497 for(y=0;y<h;y++){ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2498 const uint8_t* s1=src1+srcStride1*(y>>1); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2499 uint8_t* d=dst1+dstStride1*y; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2500 x=0; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2501 #ifdef HAVE_MMX |
9392 | 2502 for(;x<w-31;x+=32) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2503 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2504 asm volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2505 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2506 "movq %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2507 "movq 8%1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2508 "movq 16%1, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2509 "movq 24%1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2510 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2511 "movq %%mm2, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2512 "movq %%mm4, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2513 "movq %%mm6, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2514 "punpcklbw %%mm0, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2515 "punpckhbw %%mm1, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2516 "punpcklbw %%mm2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2517 "punpckhbw %%mm3, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2518 "punpcklbw %%mm4, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2519 "punpckhbw %%mm5, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2520 "punpcklbw %%mm6, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2521 "punpckhbw %%mm7, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2522 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2523 MOVNTQ" %%mm1, 8%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2524 MOVNTQ" %%mm2, 16%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2525 MOVNTQ" %%mm3, 24%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2526 MOVNTQ" %%mm4, 32%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2527 MOVNTQ" %%mm5, 40%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2528 MOVNTQ" %%mm6, 48%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2529 MOVNTQ" %%mm7, 56%0" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2530 :"=m"(d[2*x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2531 :"m"(s1[x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2532 :"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2533 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2534 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2535 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2536 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2537 for(y=0;y<h;y++){ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2538 const uint8_t* s2=src2+srcStride2*(y>>1); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2539 uint8_t* d=dst2+dstStride2*y; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2540 x=0; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2541 #ifdef HAVE_MMX |
9392 | 2542 for(;x<w-31;x+=32) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2543 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2544 asm volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2545 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2546 "movq %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2547 "movq 8%1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2548 "movq 16%1, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2549 "movq 24%1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2550 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2551 "movq %%mm2, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2552 "movq %%mm4, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2553 "movq %%mm6, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2554 "punpcklbw %%mm0, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2555 "punpckhbw %%mm1, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2556 "punpcklbw %%mm2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2557 "punpckhbw %%mm3, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2558 "punpcklbw %%mm4, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2559 "punpckhbw %%mm5, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2560 "punpcklbw %%mm6, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2561 "punpckhbw %%mm7, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2562 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2563 MOVNTQ" %%mm1, 8%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2564 MOVNTQ" %%mm2, 16%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2565 MOVNTQ" %%mm3, 24%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2566 MOVNTQ" %%mm4, 32%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2567 MOVNTQ" %%mm5, 40%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2568 MOVNTQ" %%mm6, 48%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2569 MOVNTQ" %%mm7, 56%0" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2570 :"=m"(d[2*x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2571 :"m"(s2[x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2572 :"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2573 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2574 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2575 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2576 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2577 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2578 asm( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2579 EMMS" \n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2580 SFENCE" \n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2581 ::: "memory" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2582 ); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2583 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2584 } |
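
/* What the routine above computes, as a hedged plain-C sketch: each source
 * chroma row feeds two destination rows (y>>1) and every sample is written
 * twice horizontally, for both planes. The helper below and its name,
 * RENAME(vu9_to_vu12_C), are assumptions added for illustration (not part of
 * the original file); it simply mirrors the scalar fallback loops and could
 * serve as a reference when checking the MMX path. */
static inline void RENAME(vu9_to_vu12_C)(const uint8_t *src1, const uint8_t *src2,
                                         uint8_t *dst1, uint8_t *dst2,
                                         long width, long height,
                                         long srcStride1, long srcStride2,
                                         long dstStride1, long dstStride2)
{
    long y, x;
    const long w= width/2, h= height/2;
    for(y=0;y<h;y++){
        const uint8_t* s1= src1 + srcStride1*(y>>1);
        uint8_t* d= dst1 + dstStride1*y;
        for(x=0;x<w;x++) d[2*x]= d[2*x+1]= s1[x]; /* duplicate each sample */
    }
    for(y=0;y<h;y++){
        const uint8_t* s2= src2 + srcStride2*(y>>1);
        uint8_t* d= dst2 + dstStride2*y;
        for(x=0;x<w;x++) d[2*x]= d[2*x+1]= s2[x];
    }
}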

static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    long y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)\n\t"
                PREFETCH" 32(%2, %0)\n\t"
                PREFETCH" 32(%3, %0)\n\t"
                "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2\n\t"    /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4\n\t"       /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5\n\t"       /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t"  /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t"  /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t"  /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t"  /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t"  /* U0V0 U0V0 U1V1 U1V1 */
                "punpcklbw %%mm1, %%mm0\n\t"  /* Y0U0 Y1V0 Y2U0 Y3V0 */
                "punpckhbw %%mm1, %%mm3\n\t"  /* Y4U1 Y5V1 Y6U1 Y7V1 */
                MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"

                "punpckhbw %%mm2, %%mm6\n\t"  /* U2V2 U2V2 U3V3 U3V3 */
                "movq 8(%1, %0, 4), %%mm0\n\t"
                "movq %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t"  /* Y U2 Y V2 Y U2 Y V2 */
                "punpckhbw %%mm6, %%mm3\n\t"  /* Y U3 Y V3 Y U3 Y V3 */
                MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"

                "movq %%mm4, %%mm6\n\t"
                "movq 16(%1, %0, 4), %%mm0\n\t"
                "movq %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t"
                "punpcklbw %%mm4, %%mm0\n\t"  /* Y U4 Y V4 Y U4 Y V4 */
                "punpckhbw %%mm4, %%mm3\n\t"  /* Y U5 Y V5 Y U5 Y V5 */
                MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"

                "punpckhbw %%mm5, %%mm6\n\t"
                "movq 24(%1, %0, 4), %%mm0\n\t"
                "movq %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t"  /* Y U6 Y V6 Y U6 Y V6 */
                "punpckhbw %%mm6, %%mm3\n\t"  /* Y U7 Y V7 Y U7 Y V7 */
                MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        for(; x<w; x++)
        {
            const long x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
        );
#endif
}
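
/* Hedged plain-C sketch of the packing performed by RENAME(yvu9_to_yuy2)
 * above: every group of four luma samples shares one U/V pair in an 8-byte
 * output group (Y U Y V Y U Y V), and each chroma row is reused for four
 * output rows (y>>2). The helper name RENAME(yvu9_to_yuy2_C) is an
 * assumption added for illustration (not part of the original file); it
 * restates the scalar fallback loop and is meant only as a reference. */
static inline void RENAME(yvu9_to_yuy2_C)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                          uint8_t *dst,
                                          long width, long height,
                                          long srcStride1, long srcStride2,
                                          long srcStride3, long dstStride)
{
    long y, x;
    const long w= width/2, h= height;
    for(y=0;y<h;y++){
        const uint8_t* yp= src1 + srcStride1*y;
        const uint8_t* up= src2 + srcStride2*(y>>2);
        const uint8_t* vp= src3 + srcStride3*(y>>2);
        uint8_t* d= dst + dstStride*y;
        for(x=0;x<w;x++){
            const long x2= x<<2;
            d[8*x+0]= yp[x2+0]; d[8*x+1]= up[x];
            d[8*x+2]= yp[x2+1]; d[8*x+3]= vp[x];
            d[8*x+4]= yp[x2+2]; d[8*x+5]= up[x];
            d[8*x+6]= yp[x2+3]; d[8*x+7]= vp[x];
        }
    }
}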