Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 17586:65b39a32a7c4
Fix big-endian color permutation problems.
patch by Alan Curry, pacman_at_TheWorld_dot_com
author | diego |
---|---|
date | Sat, 11 Feb 2006 13:35:46 +0000 |
parents | e91f944f6ed9 |
children | 6a6db6b74735 |
rev | line source |
---|---|
/*
 *
 *  rgb2rgb.c, Software RGB to RGB convertor
 *  pluralize by Software PAL8 to RGB convertor
 *               Software YUV to YUV convertor
 *               Software YUV to RGB convertor
 *  Written by Nick Kurshev.
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 *  lot of big-endian byteorder fixes by Alex Beregszaszi
 */

#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/* This template is compiled multiple times (C, MMX, MMX2, 3DNow variants),
 * so clear any definitions left over from a previous inclusion. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
/* "/nop" assembles to nothing; keeps the asm templates valid without prefetch. */
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
61 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
62 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size) |
2504 | 63 { |
2508 | 64 uint8_t *dest = dst; |
2677 | 65 const uint8_t *s = src; |
66 const uint8_t *end; | |
2510 | 67 #ifdef HAVE_MMX |
6605 | 68 const uint8_t *mm_end; |
2510 | 69 #endif |
2504 | 70 end = s + src_size; |
2510 | 71 #ifdef HAVE_MMX |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
72 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); |
6605 | 73 mm_end = end - 23; |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
74 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); |
2510 | 75 while(s < mm_end) |
76 { | |
2511 | 77 __asm __volatile( |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
78 PREFETCH" 32%1\n\t" |
2510 | 79 "movd %1, %%mm0\n\t" |
2738 | 80 "punpckldq 3%1, %%mm0\n\t" |
81 "movd 6%1, %%mm1\n\t" | |
82 "punpckldq 9%1, %%mm1\n\t" | |
83 "movd 12%1, %%mm2\n\t" | |
84 "punpckldq 15%1, %%mm2\n\t" | |
85 "movd 18%1, %%mm3\n\t" | |
86 "punpckldq 21%1, %%mm3\n\t" | |
2510 | 87 "pand %%mm7, %%mm0\n\t" |
2738 | 88 "pand %%mm7, %%mm1\n\t" |
2510 | 89 "pand %%mm7, %%mm2\n\t" |
2738 | 90 "pand %%mm7, %%mm3\n\t" |
2511 | 91 MOVNTQ" %%mm0, %0\n\t" |
2738 | 92 MOVNTQ" %%mm1, 8%0\n\t" |
93 MOVNTQ" %%mm2, 16%0\n\t" | |
94 MOVNTQ" %%mm3, 24%0" | |
2510 | 95 :"=m"(*dest) |
96 :"m"(*s) | |
97 :"memory"); | |
2738 | 98 dest += 32; |
99 s += 24; | |
2510 | 100 } |
2513 | 101 __asm __volatile(SFENCE:::"memory"); |
2511 | 102 __asm __volatile(EMMS:::"memory"); |
2510 | 103 #endif |
2504 | 104 while(s < end) |
105 { | |
13423 | 106 #ifdef WORDS_BIGENDIAN |
17586 | 107 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ |
13423 | 108 *dest++ = 0; |
17586 | 109 *dest++ = s[2]; |
110 *dest++ = s[1]; | |
111 *dest++ = s[0]; | |
112 s+=3; | |
13423 | 113 #else |
2508 | 114 *dest++ = *s++; |
115 *dest++ = *s++; | |
116 *dest++ = *s++; | |
117 *dest++ = 0; | |
13423 | 118 #endif |
2504 | 119 } |
120 } | |
2505 | 121 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
122 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size) |
2505 | 123 { |
124 uint8_t *dest = dst; | |
2677 | 125 const uint8_t *s = src; |
126 const uint8_t *end; | |
2517 | 127 #ifdef HAVE_MMX |
6605 | 128 const uint8_t *mm_end; |
2517 | 129 #endif |
2505 | 130 end = s + src_size; |
2517 | 131 #ifdef HAVE_MMX |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
132 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); |
6605 | 133 mm_end = end - 31; |
2517 | 134 while(s < mm_end) |
135 { | |
136 __asm __volatile( | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
137 PREFETCH" 32%1\n\t" |
2517 | 138 "movq %1, %%mm0\n\t" |
139 "movq 8%1, %%mm1\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
140 "movq 16%1, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
141 "movq 24%1, %%mm5\n\t" |
2517 | 142 "movq %%mm0, %%mm2\n\t" |
143 "movq %%mm1, %%mm3\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
144 "movq %%mm4, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
145 "movq %%mm5, %%mm7\n\t" |
2517 | 146 "psrlq $8, %%mm2\n\t" |
147 "psrlq $8, %%mm3\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
148 "psrlq $8, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
149 "psrlq $8, %%mm7\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
150 "pand %2, %%mm0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
151 "pand %2, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
152 "pand %2, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
153 "pand %2, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
154 "pand %3, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
155 "pand %3, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
156 "pand %3, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
157 "pand %3, %%mm7\n\t" |
2517 | 158 "por %%mm2, %%mm0\n\t" |
159 "por %%mm3, %%mm1\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
160 "por %%mm6, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
161 "por %%mm7, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
162 |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
163 "movq %%mm1, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
164 "movq %%mm4, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
165 "psllq $48, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
166 "psllq $32, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
167 "pand %4, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
168 "pand %5, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
169 "por %%mm2, %%mm0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
170 "psrlq $16, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
171 "psrlq $32, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
172 "psllq $16, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
173 "por %%mm3, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
174 "pand %6, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
175 "por %%mm5, %%mm4\n\t" |
3132 | 176 |
2517 | 177 MOVNTQ" %%mm0, %0\n\t" |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
178 MOVNTQ" %%mm1, 8%0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
179 MOVNTQ" %%mm4, 16%0" |
2517 | 180 :"=m"(*dest) |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
181 :"m"(*s),"m"(mask24l), |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
182 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
2517 | 183 :"memory"); |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
184 dest += 24; |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
185 s += 32; |
2517 | 186 } |
187 __asm __volatile(SFENCE:::"memory"); | |
188 __asm __volatile(EMMS:::"memory"); | |
189 #endif | |
2505 | 190 while(s < end) |
191 { | |
13423 | 192 #ifdef WORDS_BIGENDIAN |
17586 | 193 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */ |
13423 | 194 s++; |
17586 | 195 dest[2] = *s++; |
196 dest[1] = *s++; | |
197 dest[0] = *s++; | |
198 dest += 3; | |
13423 | 199 #else |
2505 | 200 *dest++ = *s++; |
201 *dest++ = *s++; | |
202 *dest++ = *s++; | |
203 s++; | |
13423 | 204 #endif |
2505 | 205 } |
206 } | |
2506 | 207 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
208 /* |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
209 Original by Strepto/Astral |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
210 ported to gcc & bugfixed : A'rpi |
2564 | 211 MMX2, 3DNOW optimization by Nick Kurshev |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
212 32bit c version, and and&add trick by Michael Niedermayer |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
213 */ |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
214 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size) |
2506 | 215 { |
6492 | 216 register const uint8_t* s=src; |
217 register uint8_t* d=dst; | |
218 register const uint8_t *end; | |
6605 | 219 const uint8_t *mm_end; |
6492 | 220 end = s + src_size; |
2506 | 221 #ifdef HAVE_MMX |
6492 | 222 __asm __volatile(PREFETCH" %0"::"m"(*s)); |
223 __asm __volatile("movq %0, %%mm4"::"m"(mask15s)); | |
6605 | 224 mm_end = end - 15; |
6492 | 225 while(s<mm_end) |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
226 { |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
227 __asm __volatile( |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
228 PREFETCH" 32%1\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
229 "movq %1, %%mm0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
230 "movq 8%1, %%mm2\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
231 "movq %%mm0, %%mm1\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
232 "movq %%mm2, %%mm3\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
233 "pand %%mm4, %%mm0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
234 "pand %%mm4, %%mm2\n\t" |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
235 "paddw %%mm1, %%mm0\n\t" |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
236 "paddw %%mm3, %%mm2\n\t" |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
237 MOVNTQ" %%mm0, %0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
238 MOVNTQ" %%mm2, 8%0" |
6492 | 239 :"=m"(*d) |
240 :"m"(*s) | |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
241 ); |
6492 | 242 d+=16; |
243 s+=16; | |
2506 | 244 } |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
245 __asm __volatile(SFENCE:::"memory"); |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
246 __asm __volatile(EMMS:::"memory"); |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
247 #endif |
6605 | 248 mm_end = end - 3; |
6492 | 249 while(s < mm_end) |
250 { | |
251 register unsigned x= *((uint32_t *)s); | |
252 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); | |
253 d+=4; | |
254 s+=4; | |
255 } | |
256 if(s < end) | |
257 { | |
258 register unsigned short x= *((uint16_t *)s); | |
259 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); | |
260 } | |
2506 | 261 } |
2694 | 262 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
263 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
264 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
265 register const uint8_t* s=src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
266 register uint8_t* d=dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
267 register const uint8_t *end; |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
268 const uint8_t *mm_end; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
269 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
270 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
271 __asm __volatile(PREFETCH" %0"::"m"(*s)); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
272 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg)); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
273 __asm __volatile("movq %0, %%mm6"::"m"(mask15b)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
274 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
275 while(s<mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
276 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
277 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
278 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
279 "movq %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
280 "movq 8%1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
281 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
282 "movq %%mm2, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
283 "psrlq $1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
284 "psrlq $1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
285 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
286 "pand %%mm7, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
287 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
288 "pand %%mm6, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
289 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
290 "por %%mm3, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
291 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
292 MOVNTQ" %%mm2, 8%0" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
293 :"=m"(*d) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
294 :"m"(*s) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
295 ); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
296 d+=16; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
297 s+=16; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
298 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
299 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
300 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
301 #endif |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
302 mm_end = end - 3; |
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
303 while(s < mm_end) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
304 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
305 register uint32_t x= *((uint32_t *)s); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
306 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
307 s+=4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
308 d+=4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
309 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
310 if(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
311 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
312 register uint16_t x= *((uint16_t *)s); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
313 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
314 s+=2; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
315 d+=2; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
316 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
317 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
318 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
319 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size) |
2694 | 320 { |
6492 | 321 const uint8_t *s = src; |
322 const uint8_t *end; | |
2741 | 323 #ifdef HAVE_MMX |
6492 | 324 const uint8_t *mm_end; |
325 #endif | |
2741 | 326 uint16_t *d = (uint16_t *)dst; |
327 end = s + src_size; | |
6492 | 328 #ifdef HAVE_MMX |
9454 | 329 mm_end = end - 15; |
330 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) | |
331 asm volatile( | |
332 "movq %3, %%mm5 \n\t" | |
333 "movq %4, %%mm6 \n\t" | |
334 "movq %5, %%mm7 \n\t" | |
335 ".balign 16 \n\t" | |
336 "1: \n\t" | |
337 PREFETCH" 32(%1) \n\t" | |
338 "movd (%1), %%mm0 \n\t" | |
339 "movd 4(%1), %%mm3 \n\t" | |
340 "punpckldq 8(%1), %%mm0 \n\t" | |
341 "punpckldq 12(%1), %%mm3 \n\t" | |
342 "movq %%mm0, %%mm1 \n\t" | |
343 "movq %%mm3, %%mm4 \n\t" | |
344 "pand %%mm6, %%mm0 \n\t" | |
345 "pand %%mm6, %%mm3 \n\t" | |
346 "pmaddwd %%mm7, %%mm0 \n\t" | |
347 "pmaddwd %%mm7, %%mm3 \n\t" | |
348 "pand %%mm5, %%mm1 \n\t" | |
349 "pand %%mm5, %%mm4 \n\t" | |
350 "por %%mm1, %%mm0 \n\t" | |
351 "por %%mm4, %%mm3 \n\t" | |
352 "psrld $5, %%mm0 \n\t" | |
353 "pslld $11, %%mm3 \n\t" | |
354 "por %%mm3, %%mm0 \n\t" | |
355 MOVNTQ" %%mm0, (%0) \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
356 "add $16, %1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
357 "add $8, %0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
358 "cmp %2, %1 \n\t" |
9454 | 359 " jb 1b \n\t" |
360 : "+r" (d), "+r"(s) | |
361 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | |
362 ); | |
363 #else | |
2741 | 364 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
365 __asm __volatile( | |
366 "movq %0, %%mm7\n\t" | |
367 "movq %1, %%mm6\n\t" | |
368 ::"m"(red_16mask),"m"(green_16mask)); | |
369 while(s < mm_end) | |
370 { | |
371 __asm __volatile( | |
372 PREFETCH" 32%1\n\t" | |
373 "movd %1, %%mm0\n\t" | |
374 "movd 4%1, %%mm3\n\t" | |
375 "punpckldq 8%1, %%mm0\n\t" | |
376 "punpckldq 12%1, %%mm3\n\t" | |
377 "movq %%mm0, %%mm1\n\t" | |
378 "movq %%mm0, %%mm2\n\t" | |
379 "movq %%mm3, %%mm4\n\t" | |
380 "movq %%mm3, %%mm5\n\t" | |
381 "psrlq $3, %%mm0\n\t" | |
382 "psrlq $3, %%mm3\n\t" | |
383 "pand %2, %%mm0\n\t" | |
384 "pand %2, %%mm3\n\t" | |
385 "psrlq $5, %%mm1\n\t" | |
386 "psrlq $5, %%mm4\n\t" | |
387 "pand %%mm6, %%mm1\n\t" | |
388 "pand %%mm6, %%mm4\n\t" | |
389 "psrlq $8, %%mm2\n\t" | |
390 "psrlq $8, %%mm5\n\t" | |
391 "pand %%mm7, %%mm2\n\t" | |
392 "pand %%mm7, %%mm5\n\t" | |
393 "por %%mm1, %%mm0\n\t" | |
394 "por %%mm4, %%mm3\n\t" | |
395 "por %%mm2, %%mm0\n\t" | |
396 "por %%mm5, %%mm3\n\t" | |
397 "psllq $16, %%mm3\n\t" | |
398 "por %%mm3, %%mm0\n\t" | |
399 MOVNTQ" %%mm0, %0\n\t" | |
400 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | |
401 d += 4; | |
402 s += 16; | |
403 } | |
9454 | 404 #endif |
6492 | 405 __asm __volatile(SFENCE:::"memory"); |
406 __asm __volatile(EMMS:::"memory"); | |
407 #endif | |
2741 | 408 while(s < end) |
409 { | |
14982
49dd10a86b23
Fixes rgb32to16 conversion for I think all platforms since the int8
diego
parents:
13720
diff
changeset
|
410 register int rgb = *(uint32_t*)s; s += 4; |
49dd10a86b23
Fixes rgb32to16 conversion for I think all platforms since the int8
diego
parents:
13720
diff
changeset
|
411 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); |
2741 | 412 } |
2694 | 413 } |
414 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
415 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
416 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
417 const uint8_t *s = src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
418 const uint8_t *end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
419 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
420 const uint8_t *mm_end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
421 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
422 uint16_t *d = (uint16_t *)dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
423 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
424 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
425 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
426 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
427 "movq %0, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
428 "movq %1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
429 ::"m"(red_16mask),"m"(green_16mask)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
430 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
431 while(s < mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
432 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
433 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
434 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
435 "movd %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
436 "movd 4%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
437 "punpckldq 8%1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
438 "punpckldq 12%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
439 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
440 "movq %%mm0, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
441 "movq %%mm3, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
442 "movq %%mm3, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
443 "psllq $8, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
444 "psllq $8, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
445 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
446 "pand %%mm7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
447 "psrlq $5, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
448 "psrlq $5, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
449 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
450 "pand %%mm6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
451 "psrlq $19, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
452 "psrlq $19, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
453 "pand %2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
454 "pand %2, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
455 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
456 "por %%mm4, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
457 "por %%mm2, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
458 "por %%mm5, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
459 "psllq $16, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
460 "por %%mm3, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
461 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
462 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
463 d += 4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
464 s += 16; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
465 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
466 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
467 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
468 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
469 while(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
470 { |
13423 | 471 // FIXME on bigendian |
17586 | 472 /* Looks bigendian-OK to me. --Pac. */ |
12385
b5c106b694e4
this isn't actually stupid, but it's not valid C and gcc 3.5 rejects it as such
rfelker
parents:
11072
diff
changeset
|
473 const int src= *s; s += 4; |
9430 | 474 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19); |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
475 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
476 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
477 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
478 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size) |
2694 | 479 { |
6492 | 480 const uint8_t *s = src; |
481 const uint8_t *end; | |
2741 | 482 #ifdef HAVE_MMX |
6492 | 483 const uint8_t *mm_end; |
484 #endif | |
2741 | 485 uint16_t *d = (uint16_t *)dst; |
486 end = s + src_size; | |
6492 | 487 #ifdef HAVE_MMX |
9454 | 488 mm_end = end - 15; |
489 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) | |
490 asm volatile( | |
491 "movq %3, %%mm5 \n\t" | |
492 "movq %4, %%mm6 \n\t" | |
493 "movq %5, %%mm7 \n\t" | |
494 ".balign 16 \n\t" | |
495 "1: \n\t" | |
496 PREFETCH" 32(%1) \n\t" | |
497 "movd (%1), %%mm0 \n\t" | |
498 "movd 4(%1), %%mm3 \n\t" | |
499 "punpckldq 8(%1), %%mm0 \n\t" | |
500 "punpckldq 12(%1), %%mm3 \n\t" | |
501 "movq %%mm0, %%mm1 \n\t" | |
502 "movq %%mm3, %%mm4 \n\t" | |
503 "pand %%mm6, %%mm0 \n\t" | |
504 "pand %%mm6, %%mm3 \n\t" | |
505 "pmaddwd %%mm7, %%mm0 \n\t" | |
506 "pmaddwd %%mm7, %%mm3 \n\t" | |
507 "pand %%mm5, %%mm1 \n\t" | |
508 "pand %%mm5, %%mm4 \n\t" | |
509 "por %%mm1, %%mm0 \n\t" | |
510 "por %%mm4, %%mm3 \n\t" | |
511 "psrld $6, %%mm0 \n\t" | |
512 "pslld $10, %%mm3 \n\t" | |
513 "por %%mm3, %%mm0 \n\t" | |
514 MOVNTQ" %%mm0, (%0) \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
515 "add $16, %1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
516 "add $8, %0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
517 "cmp %2, %1 \n\t" |
9454 | 518 " jb 1b \n\t" |
519 : "+r" (d), "+r"(s) | |
520 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | |
521 ); | |
522 #else | |
2741 | 523 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
524 __asm __volatile( | |
525 "movq %0, %%mm7\n\t" | |
526 "movq %1, %%mm6\n\t" | |
527 ::"m"(red_15mask),"m"(green_15mask)); | |
528 while(s < mm_end) | |
529 { | |
530 __asm __volatile( | |
531 PREFETCH" 32%1\n\t" | |
532 "movd %1, %%mm0\n\t" | |
533 "movd 4%1, %%mm3\n\t" | |
534 "punpckldq 8%1, %%mm0\n\t" | |
535 "punpckldq 12%1, %%mm3\n\t" | |
536 "movq %%mm0, %%mm1\n\t" | |
537 "movq %%mm0, %%mm2\n\t" | |
538 "movq %%mm3, %%mm4\n\t" | |
539 "movq %%mm3, %%mm5\n\t" | |
540 "psrlq $3, %%mm0\n\t" | |
541 "psrlq $3, %%mm3\n\t" | |
542 "pand %2, %%mm0\n\t" | |
543 "pand %2, %%mm3\n\t" | |
544 "psrlq $6, %%mm1\n\t" | |
545 "psrlq $6, %%mm4\n\t" | |
546 "pand %%mm6, %%mm1\n\t" | |
547 "pand %%mm6, %%mm4\n\t" | |
548 "psrlq $9, %%mm2\n\t" | |
549 "psrlq $9, %%mm5\n\t" | |
550 "pand %%mm7, %%mm2\n\t" | |
551 "pand %%mm7, %%mm5\n\t" | |
552 "por %%mm1, %%mm0\n\t" | |
553 "por %%mm4, %%mm3\n\t" | |
554 "por %%mm2, %%mm0\n\t" | |
555 "por %%mm5, %%mm3\n\t" | |
556 "psllq $16, %%mm3\n\t" | |
557 "por %%mm3, %%mm0\n\t" | |
558 MOVNTQ" %%mm0, %0\n\t" | |
559 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | |
560 d += 4; | |
561 s += 16; | |
562 } | |
9454 | 563 #endif |
6492 | 564 __asm __volatile(SFENCE:::"memory"); |
565 __asm __volatile(EMMS:::"memory"); | |
566 #endif | |
2741 | 567 while(s < end) |
568 { | |
13423 | 569 // FIXME on bigendian |
17586 | 570 /* Looks bigendian-OK to me. --Pac. */ |
12385
b5c106b694e4
this isn't actually stupid, but it's not valid C and gcc 3.5 rejects it as such
rfelker
parents:
11072
diff
changeset
|
571 const int src= *s; s += 4; |
9430 | 572 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9); |
2741 | 573 } |
2694 | 574 } |
575 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
576 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
577 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
578 const uint8_t *s = src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
579 const uint8_t *end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
580 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
581 const uint8_t *mm_end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
582 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
583 uint16_t *d = (uint16_t *)dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
584 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
585 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
586 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
587 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
588 "movq %0, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
589 "movq %1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
590 ::"m"(red_15mask),"m"(green_15mask)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
591 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
592 while(s < mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
593 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
594 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
595 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
596 "movd %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
597 "movd 4%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
598 "punpckldq 8%1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
599 "punpckldq 12%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
600 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
601 "movq %%mm0, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
602 "movq %%mm3, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
603 "movq %%mm3, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
604 "psllq $7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
605 "psllq $7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
606 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
607 "pand %%mm7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
608 "psrlq $6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
609 "psrlq $6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
610 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
611 "pand %%mm6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
612 "psrlq $19, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
613 "psrlq $19, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
614 "pand %2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
615 "pand %2, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
616 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
617 "por %%mm4, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
618 "por %%mm2, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
619 "por %%mm5, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
620 "psllq $16, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
621 "por %%mm3, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
622 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
623 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
624 d += 4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
625 s += 16; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
626 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
627 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
628 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
629 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
630 while(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
631 { |
13423 | 632 // FIXME on bigendian |
17586 | 633 /* Looks bigendian-OK to me. --Pac. */ |
12385
b5c106b694e4
this isn't actually stupid, but it's not valid C and gcc 3.5 rejects it as such
rfelker
parents:
11072
diff
changeset
|
634 const int src= *s; s += 4; |
9430 | 635 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19); |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
636 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
637 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
638 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
639 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size) |
2718 | 640 { |
6492 | 641 const uint8_t *s = src; |
642 const uint8_t *end; | |
2740 | 643 #ifdef HAVE_MMX |
6492 | 644 const uint8_t *mm_end; |
645 #endif | |
2719
fafa73d6d80c
Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents:
2718
diff
changeset
|
646 uint16_t *d = (uint16_t *)dst; |
2740 | 647 end = s + src_size; |
6492 | 648 #ifdef HAVE_MMX |
2738 | 649 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
650 __asm __volatile( | |
651 "movq %0, %%mm7\n\t" | |
652 "movq %1, %%mm6\n\t" | |
2741 | 653 ::"m"(red_16mask),"m"(green_16mask)); |
6605 | 654 mm_end = end - 11; |
2740 | 655 while(s < mm_end) |
2738 | 656 { |
657 __asm __volatile( | |
658 PREFETCH" 32%1\n\t" | |
659 "movd %1, %%mm0\n\t" | |
2740 | 660 "movd 3%1, %%mm3\n\t" |
661 "punpckldq 6%1, %%mm0\n\t" | |
2738 | 662 "punpckldq 9%1, %%mm3\n\t" |
663 "movq %%mm0, %%mm1\n\t" | |
664 "movq %%mm0, %%mm2\n\t" | |
665 "movq %%mm3, %%mm4\n\t" | |
666 "movq %%mm3, %%mm5\n\t" | |
667 "psrlq $3, %%mm0\n\t" | |
668 "psrlq $3, %%mm3\n\t" | |
2740 | 669 "pand %2, %%mm0\n\t" |
670 "pand %2, %%mm3\n\t" | |
671 "psrlq $5, %%mm1\n\t" | |
672 "psrlq $5, %%mm4\n\t" | |
673 "pand %%mm6, %%mm1\n\t" | |
674 "pand %%mm6, %%mm4\n\t" | |
675 "psrlq $8, %%mm2\n\t" | |
676 "psrlq $8, %%mm5\n\t" | |
677 "pand %%mm7, %%mm2\n\t" | |
678 "pand %%mm7, %%mm5\n\t" | |
2738 | 679 "por %%mm1, %%mm0\n\t" |
2740 | 680 "por %%mm4, %%mm3\n\t" |
2738 | 681 "por %%mm2, %%mm0\n\t" |
682 "por %%mm5, %%mm3\n\t" | |
2740 | 683 "psllq $16, %%mm3\n\t" |
684 "por %%mm3, %%mm0\n\t" | |
2738 | 685 MOVNTQ" %%mm0, %0\n\t" |
2741 | 686 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
2740 | 687 d += 4; |
688 s += 12; | |
2738 | 689 } |
6492 | 690 __asm __volatile(SFENCE:::"memory"); |
691 __asm __volatile(EMMS:::"memory"); | |
692 #endif | |
2740 | 693 while(s < end) |
694 { | |
695 const int b= *s++; | |
696 const int g= *s++; | |
697 const int r= *s++; | |
698 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | |
699 } | |
2718 | 700 } |
701 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
702 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
703 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
704 const uint8_t *s = src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
705 const uint8_t *end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
706 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
707 const uint8_t *mm_end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
708 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
709 uint16_t *d = (uint16_t *)dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
710 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
711 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
712 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
713 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
714 "movq %0, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
715 "movq %1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
716 ::"m"(red_16mask),"m"(green_16mask)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
717 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
718 while(s < mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
719 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
720 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
721 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
722 "movd %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
723 "movd 3%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
724 "punpckldq 6%1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
725 "punpckldq 9%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
726 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
727 "movq %%mm0, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
728 "movq %%mm3, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
729 "movq %%mm3, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
730 "psllq $8, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
731 "psllq $8, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
732 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
733 "pand %%mm7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
734 "psrlq $5, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
735 "psrlq $5, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
736 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
737 "pand %%mm6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
738 "psrlq $19, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
739 "psrlq $19, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
740 "pand %2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
741 "pand %2, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
742 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
743 "por %%mm4, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
744 "por %%mm2, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
745 "por %%mm5, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
746 "psllq $16, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
747 "por %%mm3, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
748 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
749 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
750 d += 4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
751 s += 12; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
752 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
753 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
754 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
755 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
756 while(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
757 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
758 const int r= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
759 const int g= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
760 const int b= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
761 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
762 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
763 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
764 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
765 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size) |
2718 | 766 { |
6492 | 767 const uint8_t *s = src; |
768 const uint8_t *end; | |
2741 | 769 #ifdef HAVE_MMX |
6492 | 770 const uint8_t *mm_end; |
771 #endif | |
2741 | 772 uint16_t *d = (uint16_t *)dst; |
773 end = s + src_size; | |
6492 | 774 #ifdef HAVE_MMX |
2741 | 775 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
776 __asm __volatile( | |
777 "movq %0, %%mm7\n\t" | |
778 "movq %1, %%mm6\n\t" | |
779 ::"m"(red_15mask),"m"(green_15mask)); | |
6605 | 780 mm_end = end - 11; |
2741 | 781 while(s < mm_end) |
782 { | |
783 __asm __volatile( | |
784 PREFETCH" 32%1\n\t" | |
785 "movd %1, %%mm0\n\t" | |
786 "movd 3%1, %%mm3\n\t" | |
787 "punpckldq 6%1, %%mm0\n\t" | |
788 "punpckldq 9%1, %%mm3\n\t" | |
789 "movq %%mm0, %%mm1\n\t" | |
790 "movq %%mm0, %%mm2\n\t" | |
791 "movq %%mm3, %%mm4\n\t" | |
792 "movq %%mm3, %%mm5\n\t" | |
793 "psrlq $3, %%mm0\n\t" | |
794 "psrlq $3, %%mm3\n\t" | |
795 "pand %2, %%mm0\n\t" | |
796 "pand %2, %%mm3\n\t" | |
797 "psrlq $6, %%mm1\n\t" | |
798 "psrlq $6, %%mm4\n\t" | |
799 "pand %%mm6, %%mm1\n\t" | |
800 "pand %%mm6, %%mm4\n\t" | |
801 "psrlq $9, %%mm2\n\t" | |
802 "psrlq $9, %%mm5\n\t" | |
803 "pand %%mm7, %%mm2\n\t" | |
804 "pand %%mm7, %%mm5\n\t" | |
805 "por %%mm1, %%mm0\n\t" | |
806 "por %%mm4, %%mm3\n\t" | |
807 "por %%mm2, %%mm0\n\t" | |
808 "por %%mm5, %%mm3\n\t" | |
809 "psllq $16, %%mm3\n\t" | |
810 "por %%mm3, %%mm0\n\t" | |
811 MOVNTQ" %%mm0, %0\n\t" | |
812 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | |
813 d += 4; | |
814 s += 12; | |
815 } | |
6492 | 816 __asm __volatile(SFENCE:::"memory"); |
817 __asm __volatile(EMMS:::"memory"); | |
818 #endif | |
2741 | 819 while(s < end) |
820 { | |
821 const int b= *s++; | |
822 const int g= *s++; | |
823 const int r= *s++; | |
824 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
825 } | |
6492 | 826 } |
827 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
828 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
829 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
830 const uint8_t *s = src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
831 const uint8_t *end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
832 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
833 const uint8_t *mm_end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
834 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
835 uint16_t *d = (uint16_t *)dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
836 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
837 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
838 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
839 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
840 "movq %0, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
841 "movq %1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
842 ::"m"(red_15mask),"m"(green_15mask)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
843 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
844 while(s < mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
845 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
846 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
847 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
848 "movd %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
849 "movd 3%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
850 "punpckldq 6%1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
851 "punpckldq 9%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
852 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
853 "movq %%mm0, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
854 "movq %%mm3, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
855 "movq %%mm3, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
856 "psllq $7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
857 "psllq $7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
858 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
859 "pand %%mm7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
860 "psrlq $6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
861 "psrlq $6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
862 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
863 "pand %%mm6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
864 "psrlq $19, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
865 "psrlq $19, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
866 "pand %2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
867 "pand %2, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
868 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
869 "por %%mm4, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
870 "por %%mm2, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
871 "por %%mm5, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
872 "psllq $16, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
873 "por %%mm3, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
874 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
875 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
876 d += 4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
877 s += 12; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
878 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
879 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
880 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
881 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
882 while(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
883 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
884 const int r= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
885 const int g= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
886 const int b= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
887 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
888 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
889 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
890 |
6492 | 891 /* |
	I use here a less accurate approximation: the input value is simply
	left-shifted and the low-order bits are filled with zeroes. This method
	improves PNG compression, but the scheme cannot reproduce white exactly,
	since it never generates an all-ones maximum value; the net effect is to
	darken the image slightly.
899 | |
900 The better method should be "left bit replication": | |
901 | |
902 4 3 2 1 0 | |
903 --------- | |
904 1 1 0 1 1 | |
905 | |
906 7 6 5 4 3 2 1 0 | |
907 ---------------- | |
908 1 1 0 1 1 1 1 0 | |
909 |=======| |===| | |
910 | Leftmost Bits Repeated to Fill Open Bits | |
911 | | |
912 Original Bits | |
913 */ | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
/*
 * Convert packed 15bpp pixels (three 5-bit components per 16-bit word:
 * bits 0-4, 5-9, 10-14) to packed 24bpp, 3 bytes per pixel.
 * src_size is the source size in bytes.  Each component is widened to
 * 8 bits by shifting into the top of the byte; the low bits stay zero
 * (see the "left bit replication" note above this function).
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	/* the MMX loop eats 8 pixels (16 source bytes) per iteration, so stop
	   7 pixels early to avoid reading past the end of the source */
	mm_end = end - 7;
	while(s < mm_end)
	{
		/* unpack 8 pixels into one byte per component; mask15b/mask15g/
		   mask15r and mmx_null are file-scope constants defined elsewhere */
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movq %1, %%mm0\n\t"
			"movq %1, %%mm1\n\t"
			"movq %1, %%mm2\n\t"
			"pand %2, %%mm0\n\t"
			"pand %3, %%mm1\n\t"
			"pand %4, %%mm2\n\t"
			"psllq $3, %%mm0\n\t"
			"psrlq $2, %%mm1\n\t"
			"psrlq $7, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %5, %%mm0\n\t"
			"punpcklwd %5, %%mm1\n\t"
			"punpcklwd %5, %%mm2\n\t"
			"punpckhwd %5, %%mm3\n\t"
			"punpckhwd %5, %%mm4\n\t"
			"punpckhwd %5, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"
			"psllq $16, %%mm2\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"

			"movq %%mm0, %%mm6\n\t"
			"movq %%mm3, %%mm7\n\t"

			"movq 8%1, %%mm0\n\t"
			"movq 8%1, %%mm1\n\t"
			"movq 8%1, %%mm2\n\t"
			"pand %2, %%mm0\n\t"
			"pand %3, %%mm1\n\t"
			"pand %4, %%mm2\n\t"
			"psllq $3, %%mm0\n\t"
			"psrlq $2, %%mm1\n\t"
			"psrlq $7, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %5, %%mm0\n\t"
			"punpcklwd %5, %%mm1\n\t"
			"punpcklwd %5, %%mm2\n\t"
			"punpckhwd %5, %%mm3\n\t"
			"punpckhwd %5, %%mm4\n\t"
			"punpckhwd %5, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"
			"psllq $16, %%mm2\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"

			:"=m"(*d)
			:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
			:"memory");
		/* Borrowed 32 to 24: repack the 32-bit intermediates (left in
		   mm0/mm3/mm6/mm7 by the block above) into 24 output bytes */
		__asm __volatile(
			"movq %%mm0, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"movq %%mm6, %%mm0\n\t"
			"movq %%mm7, %%mm1\n\t"

			"movq %%mm4, %%mm6\n\t"
			"movq %%mm5, %%mm7\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm1, %%mm3\n\t"

			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm3\n\t"
			"psrlq $8, %%mm6\n\t"
			"psrlq $8, %%mm7\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm1\n\t"
			"pand %2, %%mm4\n\t"
			"pand %2, %%mm5\n\t"
			"pand %3, %%mm2\n\t"
			"pand %3, %%mm3\n\t"
			"pand %3, %%mm6\n\t"
			"pand %3, %%mm7\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm3, %%mm1\n\t"
			"por %%mm6, %%mm4\n\t"
			"por %%mm7, %%mm5\n\t"

			"movq %%mm1, %%mm2\n\t"
			"movq %%mm4, %%mm3\n\t"
			"psllq $48, %%mm2\n\t"
			"psllq $32, %%mm3\n\t"
			"pand %4, %%mm2\n\t"
			"pand %5, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"psrlq $16, %%mm1\n\t"
			"psrlq $32, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm3, %%mm1\n\t"
			"pand %6, %%mm5\n\t"
			"por %%mm5, %%mm4\n\t"

			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm4, 16%0"

			:"=m"(*d)
			:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		d += 24;	/* 8 pixels * 3 bytes written */
		s += 8;		/* 8 pixels * 2 bytes read */
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: handles the leftover (and the whole buffer without MMX) */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
	}
}
1055 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
/*
 * Convert packed 16bpp pixels (5:6:5 in 16-bit words: bits 0-4, 5-10,
 * 11-15) to packed 24bpp, 3 bytes per pixel.  src_size is in bytes.
 * Components are widened by left-shifting; low bits stay zero (see the
 * "left bit replication" note above rgb15to24).
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	/* 8 pixels per MMX iteration; stop 7 early so the 16-byte load
	   cannot overrun the source */
	mm_end = end - 7;
	while(s < mm_end)
	{
		/* unpack 8 pixels into one byte per component; mask16b/mask16g/
		   mask16r and mmx_null are file-scope constants defined elsewhere.
		   Only the shift counts differ from the 15bpp variant (6-bit G). */
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq %1, %%mm1\n\t"
			"movq %1, %%mm2\n\t"
			"pand %2, %%mm0\n\t"
			"pand %3, %%mm1\n\t"
			"pand %4, %%mm2\n\t"
			"psllq $3, %%mm0\n\t"
			"psrlq $3, %%mm1\n\t"
			"psrlq $8, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %5, %%mm0\n\t"
			"punpcklwd %5, %%mm1\n\t"
			"punpcklwd %5, %%mm2\n\t"
			"punpckhwd %5, %%mm3\n\t"
			"punpckhwd %5, %%mm4\n\t"
			"punpckhwd %5, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"
			"psllq $16, %%mm2\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"

			"movq %%mm0, %%mm6\n\t"
			"movq %%mm3, %%mm7\n\t"

			"movq 8%1, %%mm0\n\t"
			"movq 8%1, %%mm1\n\t"
			"movq 8%1, %%mm2\n\t"
			"pand %2, %%mm0\n\t"
			"pand %3, %%mm1\n\t"
			"pand %4, %%mm2\n\t"
			"psllq $3, %%mm0\n\t"
			"psrlq $3, %%mm1\n\t"
			"psrlq $8, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %5, %%mm0\n\t"
			"punpcklwd %5, %%mm1\n\t"
			"punpcklwd %5, %%mm2\n\t"
			"punpckhwd %5, %%mm3\n\t"
			"punpckhwd %5, %%mm4\n\t"
			"punpckhwd %5, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"
			"psllq $16, %%mm2\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
			:"memory");
		/* Borrowed 32 to 24: repack the intermediates from
		   mm0/mm3/mm6/mm7 into 24 output bytes */
		__asm __volatile(
			"movq %%mm0, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"movq %%mm6, %%mm0\n\t"
			"movq %%mm7, %%mm1\n\t"

			"movq %%mm4, %%mm6\n\t"
			"movq %%mm5, %%mm7\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm1, %%mm3\n\t"

			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm3\n\t"
			"psrlq $8, %%mm6\n\t"
			"psrlq $8, %%mm7\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm1\n\t"
			"pand %2, %%mm4\n\t"
			"pand %2, %%mm5\n\t"
			"pand %3, %%mm2\n\t"
			"pand %3, %%mm3\n\t"
			"pand %3, %%mm6\n\t"
			"pand %3, %%mm7\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm3, %%mm1\n\t"
			"por %%mm6, %%mm4\n\t"
			"por %%mm7, %%mm5\n\t"

			"movq %%mm1, %%mm2\n\t"
			"movq %%mm4, %%mm3\n\t"
			"psllq $48, %%mm2\n\t"
			"psllq $32, %%mm3\n\t"
			"pand %4, %%mm2\n\t"
			"pand %5, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"psrlq $16, %%mm1\n\t"
			"psrlq $32, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm3, %%mm1\n\t"
			"pand %6, %%mm5\n\t"
			"por %%mm5, %%mm4\n\t"

			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm4, 16%0"

			:"=m"(*d)
			:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		d += 24;	/* 8 pixels * 3 bytes written */
		s += 8;		/* 8 pixels * 2 bytes read */
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: handles the leftover (and the whole buffer without MMX) */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
	}
}
2718 | 1196 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
/*
 * Convert packed 15bpp pixels (5:5:5) to 32bpp, 4 bytes per pixel.
 * src_size is in bytes.  Each component is widened to 8 bits by
 * left-shifting (low bits zero filled); the 4th byte of every output
 * pixel is written as 0 (it is the leading byte on big endian).
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	/* mm7 stays zero for the whole loop; the punpck*wd below use it to
	   insert the zero 4th byte */
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
	/* 4 pixels per MMX iteration */
	mm_end = end - 3;
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq %1, %%mm1\n\t"
			"movq %1, %%mm2\n\t"
			"pand %2, %%mm0\n\t"
			"pand %3, %%mm1\n\t"
			"pand %4, %%mm2\n\t"
			"psllq $3, %%mm0\n\t"
			"psrlq $2, %%mm1\n\t"
			"psrlq $7, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %%mm7, %%mm0\n\t"
			"punpcklwd %%mm7, %%mm1\n\t"
			"punpcklwd %%mm7, %%mm2\n\t"
			"punpckhwd %%mm7, %%mm3\n\t"
			"punpckhwd %%mm7, %%mm4\n\t"
			"punpckhwd %%mm7, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"
			"psllq $16, %%mm2\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm3, 8%0\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
			:"memory");
		d += 16;	/* 4 pixels * 4 bytes written */
		s += 4;		/* 4 pixels * 2 bytes read */
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: handles the leftover (and the whole buffer without MMX) */
	while(s < end)
	{
#if 0 //slightly slower on athlon
		int bgr= *s++;
		*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		/* big endian stores the zero/padding byte first and the
		   components in reversed order */
		*d++ = 0;
		*d++ = (bgr&0x7C00)>>7;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x1F)<<3;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
		*d++ = 0;
#endif

#endif
	}
}
1274 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
/*
 * Convert packed 16bpp pixels (5:6:5) to 32bpp, 4 bytes per pixel.
 * src_size is in bytes.  Components are widened by left-shifting (low
 * bits zero filled); the 4th byte of every output pixel is written as 0
 * (leading byte on big endian).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	/* mm7 stays zero for the whole loop; used by punpck*wd to insert the
	   zero 4th byte */
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
	/* 4 pixels per MMX iteration */
	mm_end = end - 3;
	while(s < mm_end)
	{
		/* same structure as rgb15to32, only the masks and shift counts
		   differ (6-bit green field) */
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq %1, %%mm1\n\t"
			"movq %1, %%mm2\n\t"
			"pand %2, %%mm0\n\t"
			"pand %3, %%mm1\n\t"
			"pand %4, %%mm2\n\t"
			"psllq $3, %%mm0\n\t"
			"psrlq $3, %%mm1\n\t"
			"psrlq $8, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %%mm7, %%mm0\n\t"
			"punpcklwd %%mm7, %%mm1\n\t"
			"punpcklwd %%mm7, %%mm2\n\t"
			"punpckhwd %%mm7, %%mm3\n\t"
			"punpckhwd %%mm7, %%mm4\n\t"
			"punpckhwd %%mm7, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"
			"psllq $16, %%mm2\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm3, 8%0\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
			:"memory");
		d += 16;	/* 4 pixels * 4 bytes written */
		s += 4;		/* 4 pixels * 2 bytes read */
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: handles the leftover (and the whole buffer without MMX) */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		/* big endian stores the zero/padding byte first and the
		   components in reversed order */
		*d++ = 0;
		*d++ = (bgr&0xF800)>>8;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0x1F)<<3;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
		*d++ = 0;
#endif
	}
}
2694 | 1346 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1347 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) |
2755 | 1348 { |
1349 #ifdef HAVE_MMX | |
6492 | 1350 /* TODO: unroll this loop */ |
2755 | 1351 asm volatile ( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1352 "xor %%"REG_a", %%"REG_a" \n\t" |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1353 ".balign 16 \n\t" |
2755 | 1354 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1355 PREFETCH" 32(%0, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1356 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
2755 | 1357 "movq %%mm0, %%mm1 \n\t" |
1358 "movq %%mm0, %%mm2 \n\t" | |
1359 "pslld $16, %%mm0 \n\t" | |
1360 "psrld $16, %%mm1 \n\t" | |
6492 | 1361 "pand "MANGLE(mask32r)", %%mm0 \n\t" |
1362 "pand "MANGLE(mask32g)", %%mm2 \n\t" | |
1363 "pand "MANGLE(mask32b)", %%mm1 \n\t" | |
2755 | 1364 "por %%mm0, %%mm2 \n\t" |
1365 "por %%mm1, %%mm2 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1366 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1367 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1368 "cmp %2, %%"REG_a" \n\t" |
2755 | 1369 " jb 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1370 :: "r" (src), "r"(dst), "r" (src_size-7) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1371 : "%"REG_a |
2755 | 1372 ); |
2766 | 1373 |
1374 __asm __volatile(SFENCE:::"memory"); | |
1375 __asm __volatile(EMMS:::"memory"); | |
2755 | 1376 #else |
6492 | 1377 unsigned i; |
1378 unsigned num_pixels = src_size >> 2; | |
2755 | 1379 for(i=0; i<num_pixels; i++) |
1380 { | |
9988
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1381 #ifdef WORDS_BIGENDIAN |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1382 dst[4*i + 1] = src[4*i + 3]; |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1383 dst[4*i + 2] = src[4*i + 2]; |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1384 dst[4*i + 3] = src[4*i + 1]; |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1385 #else |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1386 dst[4*i + 0] = src[4*i + 2]; |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1387 dst[4*i + 1] = src[4*i + 1]; |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1388 dst[4*i + 2] = src[4*i + 0]; |
a32fb6812221
bigendian fix by (Samuel Kleiner <kleiner at cd dot chalmers dot se>)
michael
parents:
9987
diff
changeset
|
1389 #endif |
2755 | 1390 } |
1391 #endif | |
1392 } | |
1393 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
/*
 * Swap the first and third byte of every packed 24bpp pixel (R<->B),
 * keeping the middle byte.  src_size is in bytes (3 per pixel).
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
	unsigned i;
#ifdef HAVE_MMX
	/* The MMX loop indexes src/dst with a negative offset register that
	   counts up to 0: it starts at mmx_size = 23 - src_size (<= -1 when
	   src_size >= 24) against base pointers biased by -mmx_size, and the
	   "js 1b" loops while the index is still negative.  Each iteration
	   handles 24 bytes (8 pixels). */
	long mmx_size= 23 - src_size;
	asm volatile (
		"movq "MANGLE(mask24r)", %%mm5	\n\t"
		"movq "MANGLE(mask24g)", %%mm6	\n\t"
		"movq "MANGLE(mask24b)", %%mm7	\n\t"
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 32(%1, %%"REG_a")	\n\t"
		"movq (%1, %%"REG_a"), %%mm0	\n\t" // BGR BGR BG
		"movq (%1, %%"REG_a"), %%mm1	\n\t" // BGR BGR BG
		"movq 2(%1, %%"REG_a"), %%mm2	\n\t" // R BGR BGR B
		"psllq $16, %%mm0		\n\t" // 00 BGR BGR
		"pand %%mm5, %%mm0		\n\t"
		"pand %%mm6, %%mm1		\n\t"
		"pand %%mm7, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 6(%1, %%"REG_a"), %%mm0	\n\t" // BGR BGR BG
		MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
		"movq 8(%1, %%"REG_a"), %%mm1	\n\t" // R BGR BGR B
		"movq 10(%1, %%"REG_a"), %%mm2	\n\t" // GR BGR BGR
		"pand %%mm7, %%mm0		\n\t"
		"pand %%mm5, %%mm1		\n\t"
		"pand %%mm6, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 14(%1, %%"REG_a"), %%mm0	\n\t" // R BGR BGR B
		MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
		"movq 16(%1, %%"REG_a"), %%mm1	\n\t" // GR BGR BGR
		"movq 18(%1, %%"REG_a"), %%mm2	\n\t" // BGR BGR BG
		"pand %%mm6, %%mm0		\n\t"
		"pand %%mm7, %%mm1		\n\t"
		"pand %%mm5, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
		"add $24, %%"REG_a"		\n\t"
		" js 1b				\n\t"
		: "+a" (mmx_size)
		: "r" (src-mmx_size), "r"(dst-mmx_size)
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");

	if(mmx_size==23) return; //finished, was multiple of 8

	/* rewind src/dst so the scalar loop below converts only the
	   leftover tail the MMX loop could not handle */
	src+= src_size;
	dst+= src_size;
	src_size= 23-mmx_size;
	src-= src_size;
	dst-= src_size;
#endif
	/* scalar path: per-pixel byte swap (also the full conversion when
	   MMX is not available) */
	for(i=0; i<src_size; i+=3)
	{
		register uint8_t x;
		x = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
1460 | |
5588 | 1461 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1462 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1463 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) |
2701 | 1464 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1465 long y; |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1466 const long chromWidth= width>>1; |
2723 | 1467 for(y=0; y<height; y++) |
1468 { | |
2702 | 1469 #ifdef HAVE_MMX |
2723 | 1470 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) |
1471 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1472 "xor %%"REG_a", %%"REG_a" \n\t" |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1473 ".balign 16 \n\t" |
2723 | 1474 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1475 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1476 PREFETCH" 32(%2, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1477 PREFETCH" 32(%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1478 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
2723 | 1479 "movq %%mm0, %%mm2 \n\t" // U(0) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1480 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
2723 | 1481 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1482 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |
1483 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1484 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1485 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
2723 | 1486 "movq %%mm3, %%mm4 \n\t" // Y(0) |
1487 "movq %%mm5, %%mm6 \n\t" // Y(8) | |
1488 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | |
1489 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | |
1490 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | |
1491 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | |
2702 | 1492 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1493 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1494 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1495 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1496 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" |
2702 | 1497 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1498 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1499 "cmp %4, %%"REG_a" \n\t" |
2723 | 1500 " jb 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1501 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1502 : "%"REG_a |
2723 | 1503 ); |
2702 | 1504 #else |
9393
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1505 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1506 #if defined ARCH_ALPHA && defined HAVE_MVI |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1507 #define pl2yuy2(n) \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1508 y1 = yc[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1509 y2 = yc2[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1510 u = uc[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1511 v = vc[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1512 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1513 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1514 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1515 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1516 yuv1 = (u << 8) + (v << 24); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1517 yuv2 = yuv1 + y2; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1518 yuv1 += y1; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1519 qdst[n] = yuv1; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1520 qdst2[n] = yuv2; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1521 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1522 int i; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1523 uint64_t *qdst = (uint64_t *) dst; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1524 uint64_t *qdst2 = (uint64_t *) (dst + dstStride); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1525 const uint32_t *yc = (uint32_t *) ysrc; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1526 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1527 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1528 for(i = 0; i < chromWidth; i += 8){ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1529 uint64_t y1, y2, yuv1, yuv2; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1530 uint64_t u, v; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1531 /* Prefetch */ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1532 asm("ldq $31,64(%0)" :: "r"(yc)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1533 asm("ldq $31,64(%0)" :: "r"(yc2)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1534 asm("ldq $31,64(%0)" :: "r"(uc)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1535 asm("ldq $31,64(%0)" :: "r"(vc)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1536 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1537 pl2yuy2(0); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1538 pl2yuy2(1); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1539 pl2yuy2(2); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1540 pl2yuy2(3); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1541 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1542 yc += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1543 yc2 += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1544 uc += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1545 vc += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1546 qdst += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1547 qdst2 += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1548 } |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1549 y++; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1550 ysrc += lumStride; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1551 dst += dstStride; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1552 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1553 #elif __WORDSIZE >= 64 |
2723 | 1554 int i; |
6492 | 1555 uint64_t *ldst = (uint64_t *) dst; |
1556 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1557 for(i = 0; i < chromWidth; i += 2){ | |
1558 uint64_t k, l; | |
1559 k = yc[0] + (uc[0] << 8) + | |
1560 (yc[1] << 16) + (vc[0] << 24); | |
1561 l = yc[2] + (uc[1] << 8) + | |
1562 (yc[3] << 16) + (vc[1] << 24); | |
1563 *ldst++ = k + (l << 32); | |
1564 yc += 4; | |
1565 uc += 2; | |
1566 vc += 2; | |
2723 | 1567 } |
6492 | 1568 |
1569 #else | |
1570 int i, *idst = (int32_t *) dst; | |
1571 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1572 for(i = 0; i < chromWidth; i++){ | |
12395
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1573 #ifdef WORDS_BIGENDIAN |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1574 *idst++ = (yc[0] << 24)+ (uc[0] << 16) + |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1575 (yc[1] << 8) + (vc[0] << 0); |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1576 #else |
6492 | 1577 *idst++ = yc[0] + (uc[0] << 8) + |
1578 (yc[1] << 16) + (vc[0] << 24); | |
12395
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1579 #endif |
6492 | 1580 yc += 2; |
1581 uc++; | |
1582 vc++; | |
1583 } | |
1584 #endif | |
2723 | 1585 #endif |
5588 | 1586 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) |
2723 | 1587 { |
1588 usrc += chromStride; | |
1589 vsrc += chromStride; | |
1590 } | |
1591 ysrc += lumStride; | |
1592 dst += dstStride; | |
2701 | 1593 } |
2723 | 1594 #ifdef HAVE_MMX |
1595 asm( EMMS" \n\t" | |
1596 SFENCE" \n\t" | |
1597 :::"memory"); | |
2702 | 1598 #endif |
2701 | 1599 } |
1600 | |
2724 | 1601 /** |
1602 * | |
1603 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1604 * problem for anyone then tell me, and ill fix it) | |
1605 */ | |
5588 | 1606 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1607 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1608 long lumStride, long chromStride, long dstStride) |
5588 | 1609 { |
1610 //FIXME interpolate chroma | |
1611 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1612 } | |
1613 | |
11068 | 1614 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1615 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1616 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) |
11068 | 1617 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1618 long y; |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1619 const long chromWidth= width>>1; |
11068 | 1620 for(y=0; y<height; y++) |
1621 { | |
11072 | 1622 #ifdef HAVE_MMX |
1623 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) | |
1624 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1625 "xor %%"REG_a", %%"REG_a" \n\t" |
11072 | 1626 ".balign 16 \n\t" |
1627 "1: \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1628 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1629 PREFETCH" 32(%2, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1630 PREFETCH" 32(%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1631 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
11072 | 1632 "movq %%mm0, %%mm2 \n\t" // U(0) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1633 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
11072 | 1634 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1635 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |
1636 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1637 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1638 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
11072 | 1639 "movq %%mm0, %%mm4 \n\t" // Y(0) |
1640 "movq %%mm2, %%mm6 \n\t" // Y(8) | |
1641 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) | |
1642 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) | |
1643 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) | |
1644 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) | |
1645 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1646 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1647 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1648 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1649 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" |
11072 | 1650 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1651 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1652 "cmp %4, %%"REG_a" \n\t" |
11072 | 1653 " jb 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1654 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1655 : "%"REG_a |
11072 | 1656 ); |
1657 #else | |
1658 //FIXME adapt the alpha asm code from yv12->yuy2 | |
1659 | |
11068 | 1660 #if __WORDSIZE >= 64 |
1661 int i; | |
1662 uint64_t *ldst = (uint64_t *) dst; | |
1663 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1664 for(i = 0; i < chromWidth; i += 2){ | |
1665 uint64_t k, l; | |
1666 k = uc[0] + (yc[0] << 8) + | |
1667 (vc[0] << 16) + (yc[1] << 24); | |
1668 l = uc[1] + (yc[2] << 8) + | |
1669 (vc[1] << 16) + (yc[3] << 24); | |
1670 *ldst++ = k + (l << 32); | |
1671 yc += 4; | |
1672 uc += 2; | |
1673 vc += 2; | |
1674 } | |
1675 | |
1676 #else | |
1677 int i, *idst = (int32_t *) dst; | |
1678 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1679 for(i = 0; i < chromWidth; i++){ | |
12395
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1680 #ifdef WORDS_BIGENDIAN |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1681 *idst++ = (uc[0] << 24)+ (yc[0] << 16) + |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1682 (vc[0] << 8) + (yc[1] << 0); |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1683 #else |
11068 | 1684 *idst++ = uc[0] + (yc[0] << 8) + |
1685 (vc[0] << 16) + (yc[1] << 24); | |
12395
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1686 #endif |
11068 | 1687 yc += 2; |
1688 uc++; | |
1689 vc++; | |
1690 } | |
1691 #endif | |
11072 | 1692 #endif |
11068 | 1693 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) |
1694 { | |
1695 usrc += chromStride; | |
1696 vsrc += chromStride; | |
1697 } | |
1698 ysrc += lumStride; | |
1699 dst += dstStride; | |
1700 } | |
11072 | 1701 #ifdef HAVE_MMX |
1702 asm( EMMS" \n\t" | |
1703 SFENCE" \n\t" | |
1704 :::"memory"); | |
1705 #endif | |
11068 | 1706 } |
1707 | |
1708 /** | |
1709 * | |
1710 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1711 * problem for anyone then tell me, and ill fix it) | |
1712 */ | |
1713 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1714 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1715 long lumStride, long chromStride, long dstStride) |
11068 | 1716 { |
1717 //FIXME interpolate chroma | |
1718 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1719 } | |
1720 | |
5588 | 1721 /** |
1722 * | |
1723 * width should be a multiple of 16 | |
1724 */ | |
1725 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1726 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1727 long lumStride, long chromStride, long dstStride) |
5588 | 1728 { |
1729 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1730 } | |
1731 | |
1732 /** | |
1733 * | |
1734 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1735 * problem for anyone then tell me, and ill fix it) | |
1736 */ | |
3132 | 1737 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1738 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1739 long lumStride, long chromStride, long srcStride) |
2701 | 1740 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1741 long y; |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1742 const long chromWidth= width>>1; |
2724 | 1743 for(y=0; y<height; y+=2) |
1744 { | |
2704 | 1745 #ifdef HAVE_MMX |
2724 | 1746 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1747 "xor %%"REG_a", %%"REG_a" \n\t" |
2724 | 1748 "pcmpeqw %%mm7, %%mm7 \n\t" |
1749 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1750 ".balign 16 \n\t" |
2724 | 1751 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1752 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1753 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1754 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
2724 | 1755 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) |
1756 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | |
1757 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | |
1758 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | |
1759 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
1760 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
1761 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
1762 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
1763 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1764 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t" |
2704 | 1765 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1766 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1767 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12) |
2724 | 1768 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) |
1769 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | |
1770 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | |
1771 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | |
1772 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
1773 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
1774 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
1775 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2704 | 1776 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1777 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t" |
2724 | 1778 |
1779 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
1780 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
1781 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
1782 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
1783 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
1784 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
1785 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
1786 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2704 | 1787 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1788 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1789 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" |
2724 | 1790 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1791 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1792 "cmp %4, %%"REG_a" \n\t" |
2724 | 1793 " jb 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1794 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1795 : "memory", "%"REG_a |
2725 | 1796 ); |
2704 | 1797 |
2806 | 1798 ydst += lumStride; |
1799 src += srcStride; | |
1800 | |
2725 | 1801 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1802 "xor %%"REG_a", %%"REG_a" \n\t" |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1803 ".balign 16 \n\t" |
2724 | 1804 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1805 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1806 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1807 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1808 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1809 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12) |
2724 | 1810 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
1811 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
1812 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
1813 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
1814 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
1815 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2704 | 1816 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1817 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1818 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t" |
2724 | 1819 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1820 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1821 "cmp %4, %%"REG_a" \n\t" |
2724 | 1822 " jb 1b \n\t" |
2704 | 1823 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1824 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1825 : "memory", "%"REG_a |
2724 | 1826 ); |
2704 | 1827 #else |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1828 long i; |
2724 | 1829 for(i=0; i<chromWidth; i++) |
1830 { | |
1831 ydst[2*i+0] = src[4*i+0]; | |
1832 udst[i] = src[4*i+1]; | |
1833 ydst[2*i+1] = src[4*i+2]; | |
1834 vdst[i] = src[4*i+3]; | |
1835 } | |
1836 ydst += lumStride; | |
1837 src += srcStride; | |
1838 | |
1839 for(i=0; i<chromWidth; i++) | |
1840 { | |
1841 ydst[2*i+0] = src[4*i+0]; | |
1842 ydst[2*i+1] = src[4*i+2]; | |
1843 } | |
1844 #endif | |
1845 udst += chromStride; | |
1846 vdst += chromStride; | |
1847 ydst += lumStride; | |
1848 src += srcStride; | |
2701 | 1849 } |
2724 | 1850 #ifdef HAVE_MMX |
2847 | 1851 asm volatile( EMMS" \n\t" |
1852 SFENCE" \n\t" | |
1853 :::"memory"); | |
2704 | 1854 #endif |
2723 | 1855 } |
2801 | 1856 |
6484
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1857 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, |
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1858 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1859 long width, long height, long lumStride, long chromStride) |
6484
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1860 { |
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1861 /* Y Plane */ |
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1862 memcpy(ydst, ysrc, width*height); |
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1863 |
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1864 /* XXX: implement upscaling for U,V */ |
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1865 } |
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1866 |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1867 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride) |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1868 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1869 long x,y; |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1870 |
9256 | 1871 dst[0]= src[0]; |
1872 | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1873 // first line |
9256 | 1874 for(x=0; x<srcWidth-1; x++){ |
1875 dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1876 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1877 } |
9256 | 1878 dst[2*srcWidth-1]= src[srcWidth-1]; |
1879 | |
1880 dst+= dstStride; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1881 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1882 for(y=1; y<srcHeight; y++){ |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1883 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1884 const long mmxSize= srcWidth&~15; |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1885 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1886 "mov %4, %%"REG_a" \n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1887 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1888 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1889 "movq (%1, %%"REG_a"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1890 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1891 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1892 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1893 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" |
9256 | 1894 PAVGB" %%mm0, %%mm5 \n\t" |
1895 PAVGB" %%mm0, %%mm3 \n\t" | |
1896 PAVGB" %%mm0, %%mm5 \n\t" | |
1897 PAVGB" %%mm0, %%mm3 \n\t" | |
1898 PAVGB" %%mm1, %%mm4 \n\t" | |
1899 PAVGB" %%mm1, %%mm2 \n\t" | |
1900 PAVGB" %%mm1, %%mm4 \n\t" | |
1901 PAVGB" %%mm1, %%mm2 \n\t" | |
1902 "movq %%mm5, %%mm7 \n\t" | |
1903 "movq %%mm4, %%mm6 \n\t" | |
1904 "punpcklbw %%mm3, %%mm5 \n\t" | |
1905 "punpckhbw %%mm3, %%mm7 \n\t" | |
1906 "punpcklbw %%mm2, %%mm4 \n\t" | |
1907 "punpckhbw %%mm2, %%mm6 \n\t" | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1908 #if 1 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1909 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1910 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1911 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1912 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1913 #else |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1914 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1915 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1916 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1917 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1918 #endif |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1919 "add $8, %%"REG_a" \n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1920 " js 1b \n\t" |
9256 | 1921 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1922 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1923 "g" (-mmxSize) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1924 : "%"REG_a |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1925 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1926 ); |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1927 #else |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1928 const long mmxSize=1; |
9256 | 1929 #endif |
1930 dst[0 ]= (3*src[0] + src[srcStride])>>2; | |
1931 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1932 |
9256 | 1933 for(x=mmxSize-1; x<srcWidth-1; x++){ |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1934 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1935 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1936 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1937 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1938 } |
9256 | 1939 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; |
1940 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1941 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1942 dst+=dstStride*2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1943 src+=srcStride; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1944 } |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1945 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1946 // last line |
9256 | 1947 #if 1 |
1948 dst[0]= src[0]; | |
1949 | |
1950 for(x=0; x<srcWidth-1; x++){ | |
1951 dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1952 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1953 } | |
1954 dst[2*srcWidth-1]= src[srcWidth-1]; | |
1955 #else | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1956 for(x=0; x<srcWidth; x++){ |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1957 dst[2*x+0]= |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1958 dst[2*x+1]= src[x]; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1959 } |
9256 | 1960 #endif |
1961 | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1962 #ifdef HAVE_MMX |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1963 asm volatile( EMMS" \n\t" |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1964 SFENCE" \n\t" |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1965 :::"memory"); |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1966 #endif |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1967 } |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1968 |
2801 | 1969 /** |
1970 * | |
1971 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1972 * problem for anyone then tell me, and ill fix it) | |
3132 | 1973 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version |
2801 | 1974 */ |
3132 | 1975 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1976 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1977 long lumStride, long chromStride, long srcStride) |
2801 | 1978 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1979 long y; |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
1980 const long chromWidth= width>>1; |
2801 | 1981 for(y=0; y<height; y+=2) |
1982 { | |
2847 | 1983 #ifdef HAVE_MMX |
1984 asm volatile( | |
1985 "xorl %%eax, %%eax \n\t" | |
1986 "pcmpeqw %%mm7, %%mm7 \n\t" | |
1987 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
1988 ".balign 16 \n\t" | |
1989 "1: \n\t" | |
1990 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
1991 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | |
1992 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | |
1993 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | |
1994 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | |
1995 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | |
1996 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | |
1997 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
1998 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
1999 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
2000 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
2001 | |
2002 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | |
2003 | |
2004 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) | |
2005 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) | |
2006 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | |
2007 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | |
2008 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | |
2009 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | |
2010 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
2011 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
2012 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
2013 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2014 | |
2015 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | |
2016 | |
2017 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
2018 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
2019 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
2020 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
2021 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
2022 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
2023 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
2024 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2025 | |
2026 MOVNTQ" %%mm0, (%3, %%eax) \n\t" | |
2027 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | |
2028 | |
2029 "addl $8, %%eax \n\t" | |
2030 "cmpl %4, %%eax \n\t" | |
2031 " jb 1b \n\t" | |
9394 | 2032 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2847 | 2033 : "memory", "%eax" |
2034 ); | |
2035 | |
2036 ydst += lumStride; | |
2037 src += srcStride; | |
2038 | |
2039 asm volatile( | |
2040 "xorl %%eax, %%eax \n\t" | |
2041 ".balign 16 \n\t" | |
2042 "1: \n\t" | |
2043 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
2044 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | |
2045 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | |
2046 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | |
2047 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | |
2048 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
2049 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
2050 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
2051 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
2052 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
2053 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2054 | |
2055 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | |
2056 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | |
2057 | |
2058 "addl $8, %%eax \n\t" | |
2059 "cmpl %4, %%eax \n\t" | |
2060 " jb 1b \n\t" | |
2061 | |
9394 | 2062 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2847 | 2063 : "memory", "%eax" |
2064 ); | |
2065 #else | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2066 long i; |
2801 | 2067 for(i=0; i<chromWidth; i++) |
2068 { | |
2069 udst[i] = src[4*i+0]; | |
2070 ydst[2*i+0] = src[4*i+1]; | |
2071 vdst[i] = src[4*i+2]; | |
2072 ydst[2*i+1] = src[4*i+3]; | |
2073 } | |
2074 ydst += lumStride; | |
2075 src += srcStride; | |
2076 | |
2077 for(i=0; i<chromWidth; i++) | |
2078 { | |
2079 ydst[2*i+0] = src[4*i+1]; | |
2080 ydst[2*i+1] = src[4*i+3]; | |
2081 } | |
2847 | 2082 #endif |
2801 | 2083 udst += chromStride; |
2084 vdst += chromStride; | |
2085 ydst += lumStride; | |
2086 src += srcStride; | |
2087 } | |
2847 | 2088 #ifdef HAVE_MMX |
2089 asm volatile( EMMS" \n\t" | |
2090 SFENCE" \n\t" | |
2091 :::"memory"); | |
2092 #endif | |
2801 | 2093 } |
2094 | |
3132 | 2095 /** |
2096 * | |
2097 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
2098 * problem for anyone then tell me, and ill fix it) | |
4622 | 2099 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version |
3132 | 2100 */ |
2101 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2102 long width, long height, |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2103 long lumStride, long chromStride, long srcStride) |
3132 | 2104 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2105 long y; |
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2106 const long chromWidth= width>>1; |
4622 | 2107 #ifdef HAVE_MMX |
2108 for(y=0; y<height-2; y+=2) | |
2109 { | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2110 long i; |
4622 | 2111 for(i=0; i<2; i++) |
2112 { | |
2113 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2114 "mov %2, %%"REG_a" \n\t" |
4923 | 2115 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
2116 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4622 | 2117 "pxor %%mm7, %%mm7 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2118 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
4622 | 2119 ".balign 16 \n\t" |
2120 "1: \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2121 PREFETCH" 64(%0, %%"REG_b") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2122 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2123 "movd 3(%0, %%"REG_b"), %%mm1 \n\t" |
4622 | 2124 "punpcklbw %%mm7, %%mm0 \n\t" |
2125 "punpcklbw %%mm7, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2126 "movd 6(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2127 "movd 9(%0, %%"REG_b"), %%mm3 \n\t" |
4622 | 2128 "punpcklbw %%mm7, %%mm2 \n\t" |
2129 "punpcklbw %%mm7, %%mm3 \n\t" | |
2130 "pmaddwd %%mm6, %%mm0 \n\t" | |
2131 "pmaddwd %%mm6, %%mm1 \n\t" | |
2132 "pmaddwd %%mm6, %%mm2 \n\t" | |
2133 "pmaddwd %%mm6, %%mm3 \n\t" | |
2134 #ifndef FAST_BGR2YV12 | |
2135 "psrad $8, %%mm0 \n\t" | |
2136 "psrad $8, %%mm1 \n\t" | |
2137 "psrad $8, %%mm2 \n\t" | |
2138 "psrad $8, %%mm3 \n\t" | |
2139 #endif | |
2140 "packssdw %%mm1, %%mm0 \n\t" | |
2141 "packssdw %%mm3, %%mm2 \n\t" | |
2142 "pmaddwd %%mm5, %%mm0 \n\t" | |
2143 "pmaddwd %%mm5, %%mm2 \n\t" | |
2144 "packssdw %%mm2, %%mm0 \n\t" | |
2145 "psraw $7, %%mm0 \n\t" | |
2146 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2147 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2148 "movd 15(%0, %%"REG_b"), %%mm1 \n\t" |
4622 | 2149 "punpcklbw %%mm7, %%mm4 \n\t" |
2150 "punpcklbw %%mm7, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2151 "movd 18(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2152 "movd 21(%0, %%"REG_b"), %%mm3 \n\t" |
4622 | 2153 "punpcklbw %%mm7, %%mm2 \n\t" |
2154 "punpcklbw %%mm7, %%mm3 \n\t" | |
2155 "pmaddwd %%mm6, %%mm4 \n\t" | |
2156 "pmaddwd %%mm6, %%mm1 \n\t" | |
2157 "pmaddwd %%mm6, %%mm2 \n\t" | |
2158 "pmaddwd %%mm6, %%mm3 \n\t" | |
2159 #ifndef FAST_BGR2YV12 | |
2160 "psrad $8, %%mm4 \n\t" | |
2161 "psrad $8, %%mm1 \n\t" | |
2162 "psrad $8, %%mm2 \n\t" | |
2163 "psrad $8, %%mm3 \n\t" | |
2164 #endif | |
2165 "packssdw %%mm1, %%mm4 \n\t" | |
2166 "packssdw %%mm3, %%mm2 \n\t" | |
2167 "pmaddwd %%mm5, %%mm4 \n\t" | |
2168 "pmaddwd %%mm5, %%mm2 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2169 "add $24, %%"REG_b" \n\t" |
4622 | 2170 "packssdw %%mm2, %%mm4 \n\t" |
2171 "psraw $7, %%mm4 \n\t" | |
2172 | |
2173 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 2174 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
4622 | 2175 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2176 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2177 "add $8, %%"REG_a" \n\t" |
4622 | 2178 " js 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2179 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2180 : "%"REG_a, "%"REG_b |
4622 | 2181 ); |
2182 ydst += lumStride; | |
2183 src += srcStride; | |
2184 } | |
2185 src -= srcStride*2; | |
2186 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2187 "mov %4, %%"REG_a" \n\t" |
4923 | 2188 "movq "MANGLE(w1111)", %%mm5 \n\t" |
2189 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4622 | 2190 "pxor %%mm7, %%mm7 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2191 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2192 "add %%"REG_b", %%"REG_b" \n\t" |
4622 | 2193 ".balign 16 \n\t" |
2194 "1: \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2195 PREFETCH" 64(%0, %%"REG_b") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2196 PREFETCH" 64(%1, %%"REG_b") \n\t" |
4622 | 2197 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2198 "movq (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2199 "movq (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2200 "movq 6(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2201 "movq 6(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2202 PAVGB" %%mm1, %%mm0 \n\t" |
2203 PAVGB" %%mm3, %%mm2 \n\t" | |
2204 "movq %%mm0, %%mm1 \n\t" | |
2205 "movq %%mm2, %%mm3 \n\t" | |
2206 "psrlq $24, %%mm0 \n\t" | |
2207 "psrlq $24, %%mm2 \n\t" | |
2208 PAVGB" %%mm1, %%mm0 \n\t" | |
2209 PAVGB" %%mm3, %%mm2 \n\t" | |
2210 "punpcklbw %%mm7, %%mm0 \n\t" | |
2211 "punpcklbw %%mm7, %%mm2 \n\t" | |
2212 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2213 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2214 "movd (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2215 "movd 3(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2216 "movd 3(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2217 "punpcklbw %%mm7, %%mm0 \n\t" |
2218 "punpcklbw %%mm7, %%mm1 \n\t" | |
2219 "punpcklbw %%mm7, %%mm2 \n\t" | |
2220 "punpcklbw %%mm7, %%mm3 \n\t" | |
2221 "paddw %%mm1, %%mm0 \n\t" | |
2222 "paddw %%mm3, %%mm2 \n\t" | |
2223 "paddw %%mm2, %%mm0 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2224 "movd 6(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2225 "movd 6(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2226 "movd 9(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2227 "movd 9(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2228 "punpcklbw %%mm7, %%mm4 \n\t" |
2229 "punpcklbw %%mm7, %%mm1 \n\t" | |
2230 "punpcklbw %%mm7, %%mm2 \n\t" | |
2231 "punpcklbw %%mm7, %%mm3 \n\t" | |
2232 "paddw %%mm1, %%mm4 \n\t" | |
2233 "paddw %%mm3, %%mm2 \n\t" | |
2234 "paddw %%mm4, %%mm2 \n\t" | |
2235 "psrlw $2, %%mm0 \n\t" | |
2236 "psrlw $2, %%mm2 \n\t" | |
2237 #endif | |
4923 | 2238 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
2239 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 2240 |
2241 "pmaddwd %%mm0, %%mm1 \n\t" | |
2242 "pmaddwd %%mm2, %%mm3 \n\t" | |
2243 "pmaddwd %%mm6, %%mm0 \n\t" | |
2244 "pmaddwd %%mm6, %%mm2 \n\t" | |
2245 #ifndef FAST_BGR2YV12 | |
2246 "psrad $8, %%mm0 \n\t" | |
2247 "psrad $8, %%mm1 \n\t" | |
2248 "psrad $8, %%mm2 \n\t" | |
2249 "psrad $8, %%mm3 \n\t" | |
2250 #endif | |
2251 "packssdw %%mm2, %%mm0 \n\t" | |
2252 "packssdw %%mm3, %%mm1 \n\t" | |
2253 "pmaddwd %%mm5, %%mm0 \n\t" | |
2254 "pmaddwd %%mm5, %%mm1 \n\t" | |
2255 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
2256 "psraw $7, %%mm0 \n\t" | |
2257 | |
2258 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2259 "movq 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2260 "movq 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2261 "movq 18(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2262 "movq 18(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2263 PAVGB" %%mm1, %%mm4 \n\t" |
2264 PAVGB" %%mm3, %%mm2 \n\t" | |
2265 "movq %%mm4, %%mm1 \n\t" | |
2266 "movq %%mm2, %%mm3 \n\t" | |
2267 "psrlq $24, %%mm4 \n\t" | |
2268 "psrlq $24, %%mm2 \n\t" | |
2269 PAVGB" %%mm1, %%mm4 \n\t" | |
2270 PAVGB" %%mm3, %%mm2 \n\t" | |
2271 "punpcklbw %%mm7, %%mm4 \n\t" | |
2272 "punpcklbw %%mm7, %%mm2 \n\t" | |
2273 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2274 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2275 "movd 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2276 "movd 15(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2277 "movd 15(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2278 "punpcklbw %%mm7, %%mm4 \n\t" |
2279 "punpcklbw %%mm7, %%mm1 \n\t" | |
2280 "punpcklbw %%mm7, %%mm2 \n\t" | |
2281 "punpcklbw %%mm7, %%mm3 \n\t" | |
2282 "paddw %%mm1, %%mm4 \n\t" | |
2283 "paddw %%mm3, %%mm2 \n\t" | |
2284 "paddw %%mm2, %%mm4 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2285 "movd 18(%0, %%"REG_b"), %%mm5 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2286 "movd 18(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2287 "movd 21(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2288 "movd 21(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2289 "punpcklbw %%mm7, %%mm5 \n\t" |
2290 "punpcklbw %%mm7, %%mm1 \n\t" | |
2291 "punpcklbw %%mm7, %%mm2 \n\t" | |
2292 "punpcklbw %%mm7, %%mm3 \n\t" | |
2293 "paddw %%mm1, %%mm5 \n\t" | |
2294 "paddw %%mm3, %%mm2 \n\t" | |
2295 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 2296 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4622 | 2297 "psrlw $2, %%mm4 \n\t" |
2298 "psrlw $2, %%mm2 \n\t" | |
2299 #endif | |
4923 | 2300 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
2301 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 2302 |
2303 "pmaddwd %%mm4, %%mm1 \n\t" | |
2304 "pmaddwd %%mm2, %%mm3 \n\t" | |
2305 "pmaddwd %%mm6, %%mm4 \n\t" | |
2306 "pmaddwd %%mm6, %%mm2 \n\t" | |
2307 #ifndef FAST_BGR2YV12 | |
2308 "psrad $8, %%mm4 \n\t" | |
2309 "psrad $8, %%mm1 \n\t" | |
2310 "psrad $8, %%mm2 \n\t" | |
2311 "psrad $8, %%mm3 \n\t" | |
2312 #endif | |
2313 "packssdw %%mm2, %%mm4 \n\t" | |
2314 "packssdw %%mm3, %%mm1 \n\t" | |
2315 "pmaddwd %%mm5, %%mm4 \n\t" | |
2316 "pmaddwd %%mm5, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2317 "add $24, %%"REG_b" \n\t" |
4622 | 2318 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2319 "psraw $7, %%mm4 \n\t" | |
2320 | |
2321 "movq %%mm0, %%mm1 \n\t" | |
2322 "punpckldq %%mm4, %%mm0 \n\t" | |
2323 "punpckhdq %%mm4, %%mm1 \n\t" | |
2324 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 2325 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2326 "movd %%mm0, (%2, %%"REG_a") \n\t" |
4622 | 2327 "punpckhdq %%mm0, %%mm0 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2328 "movd %%mm0, (%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2329 "add $4, %%"REG_a" \n\t" |
4622 | 2330 " js 1b \n\t" |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2331 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2332 : "%"REG_a, "%"REG_b |
4622 | 2333 ); |
2334 | |
2335 udst += chromStride; | |
2336 vdst += chromStride; | |
2337 src += srcStride*2; | |
2338 } | |
2339 | |
2340 asm volatile( EMMS" \n\t" | |
2341 SFENCE" \n\t" | |
2342 :::"memory"); | |
2343 #else | |
2344 y=0; | |
2345 #endif | |
2346 for(; y<height; y+=2) | |
3132 | 2347 { |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
14982
diff
changeset
|
2348 long i; |
3132 | 2349 for(i=0; i<chromWidth; i++) |
2350 { | |
2351 unsigned int b= src[6*i+0]; | |
2352 unsigned int g= src[6*i+1]; | |
2353 unsigned int r= src[6*i+2]; | |
2801 | 2354 |
3633 | 2355 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
2356 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
2357 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
3132 | 2358 |
2359 udst[i] = U; | |
2360 vdst[i] = V; | |
2361 ydst[2*i] = Y; | |
2362 | |
2363 b= src[6*i+3]; | |
2364 g= src[6*i+4]; | |
2365 r= src[6*i+5]; | |
2366 | |
3633 | 2367 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2368 ydst[2*i+1] = Y; |
2369 } | |
2370 ydst += lumStride; | |
2371 src += srcStride; | |
2372 | |
2373 for(i=0; i<chromWidth; i++) | |
2374 { | |
2375 unsigned int b= src[6*i+0]; | |
2376 unsigned int g= src[6*i+1]; | |
2377 unsigned int r= src[6*i+2]; | |
2378 | |
3633 | 2379 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2380 |
2381 ydst[2*i] = Y; | |
2382 | |
2383 b= src[6*i+3]; | |
2384 g= src[6*i+4]; | |
2385 r= src[6*i+5]; | |
2386 | |
3633 | 2387 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2388 ydst[2*i+1] = Y; |
2389 } | |
2390 udst += chromStride; | |
2391 vdst += chromStride; | |
2392 ydst += lumStride; | |
2393 src += srcStride; | |
2394 } | |
2395 } | |
5337 | 2396 |
/*
 * Interleave two byte planes into one: for every row,
 *   dest[2*w+0] = src1[w]; dest[2*w+1] = src2[w];
 * (e.g. merging separate U and V planes into a UV-interleaved plane).
 *
 * src1/src2/dest advance by their respective strides after each row, so
 * strides may differ from width (padded surfaces are fine).
 *
 * PREFETCH, MOVNTQ, REG_a and the HAVE_MMX/HAVE_SSE2 guards are set up by
 * the multi-inclusion machinery earlier in this file; each inclusion of
 * this template builds a CPU-specific variant via RENAME().
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
			     long width, long height, long src1Stride,
			     long src2Stride, long dstStride){
	long h;

	for(h=0; h < height; h++)
	{
		long w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
		/* SSE2 path: 16 source bytes from each plane per iteration.
		 * xmm0 and xmm1 both load the same 16 bytes of src1; the
		 * punpckl/punpckhbw pair against xmm2 (src2) then produces the
		 * low and high interleaved halves. movntdq is a non-temporal
		 * store, so an SFENCE is required afterwards (issued once,
		 * after the row loop). */
		asm(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%"REG_a")	\n\t"
			PREFETCH" 64(%2, %%"REG_a")	\n\t"
			"movdqa (%1, %%"REG_a"), %%xmm0	\n\t"
			"movdqa (%1, %%"REG_a"), %%xmm1	\n\t"
			"movdqa (%2, %%"REG_a"), %%xmm2	\n\t"
			"punpcklbw %%xmm2, %%xmm0	\n\t"
			"punpckhbw %%xmm2, %%xmm1	\n\t"
			"movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
			"movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
			"add $16, %%"REG_a"		\n\t"
			"cmp %3, %%"REG_a"		\n\t"
			" jb 1b				\n\t"
			/* loop bound is width-15 so the asm covers exactly the
			 * first width&~15 bytes; the scalar loop below mops up.
			 * NOTE(review): movdqa requires 16-byte-aligned src1/src2
			 * rows, and width < 16 would make the unsigned `jb`
			 * compare run away -- presumably callers guarantee both;
			 * TODO confirm against call sites. */
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%"REG_a""
		);
#else
		/* MMX path: same interleave, 16 bytes per iteration done as
		 * two 8-byte mm-register pairs. */
		asm(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%"REG_a")	\n\t"
			PREFETCH" 64(%2, %%"REG_a")	\n\t"
			"movq (%1, %%"REG_a"), %%mm0	\n\t"
			"movq 8(%1, %%"REG_a"), %%mm2	\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"movq (%2, %%"REG_a"), %%mm4	\n\t"
			"movq 8(%2, %%"REG_a"), %%mm5	\n\t"
			"punpcklbw %%mm4, %%mm0		\n\t"
			"punpckhbw %%mm4, %%mm1		\n\t"
			"punpcklbw %%mm5, %%mm2		\n\t"
			"punpckhbw %%mm5, %%mm3		\n\t"
			MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
			"add $16, %%"REG_a"		\n\t"
			"cmp %3, %%"REG_a"		\n\t"
			" jb 1b				\n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%"REG_a
		);
#endif
		/* scalar tail: bytes [width&~15, width) not covered by asm */
		for(w= (width&(~15)); w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#else
		/* pure C fallback when no MMX is available */
		for(w=0; w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#endif
		dest += dstStride;
		src1 += src1Stride;
		src2 += src2Stride;
	}
#ifdef HAVE_MMX
	/* leave MMX state and drain the non-temporal stores */
	asm(
		EMMS"	\n\t"
		SFENCE"	\n\t"
		::: "memory"
	);
#endif
}
/*
 * Upscale two quarter-resolution chroma planes (YVU9-style, width/4 x
 * height/4 relative to luma... here expressed against the half-size
 * target) to half-resolution planes (YV12-style):
 *
 *   - horizontally: each source byte is duplicated (d[2x] = d[2x+1] = s[x])
 *   - vertically:   each source row is used for two output rows (y>>1)
 *
 * src1 -> dst1 and src2 -> dst2 are processed independently with the same
 * algorithm (typically the V and U planes).  Only the first w=width/2,
 * h=height/2 portion of the destination is written.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
			uint8_t *dst1, uint8_t *dst2,
			long width, long height,
			long srcStride1, long srcStride2,
			long dstStride1, long dstStride2)
{
    long y,x,w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    /* warm the cache with the second row of each source plane */
    asm volatile(
	PREFETCH" %0\n\t"
	PREFETCH" %1\n\t"
	::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* ---- plane 1: src1 -> dst1 ---- */
    for(y=0;y<h;y++){
	const uint8_t* s1=src1+srcStride1*(y>>1);	/* same src row feeds 2 dst rows */
	uint8_t* d=dst1+dstStride1*y;
	x=0;
#ifdef HAVE_MMX
	/* 32 source bytes -> 64 destination bytes per iteration.
	 * punpcklbw/punpckhbw with a register against itself duplicates
	 * every byte, which is exactly the horizontal 2x upscale. */
	for(;x<w-31;x+=32)
	{
	    asm volatile(
		PREFETCH" 32%1\n\t"
		"movq	%1, %%mm0\n\t"
		"movq	8%1, %%mm2\n\t"
		"movq	16%1, %%mm4\n\t"
		"movq	24%1, %%mm6\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm2, %%mm3\n\t"
		"movq	%%mm4, %%mm5\n\t"
		"movq	%%mm6, %%mm7\n\t"
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm1, 8%0\n\t"
		MOVNTQ" %%mm2, 16%0\n\t"
		MOVNTQ" %%mm3, 24%0\n\t"
		MOVNTQ" %%mm4, 32%0\n\t"
		MOVNTQ" %%mm5, 40%0\n\t"
		MOVNTQ" %%mm6, 48%0\n\t"
		MOVNTQ" %%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s1[x])
		:"memory");
	}
#endif
	/* scalar tail (or whole row without MMX) */
	for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* ---- plane 2: src2 -> dst2 (identical algorithm) ---- */
    for(y=0;y<h;y++){
	const uint8_t* s2=src2+srcStride2*(y>>1);
	uint8_t* d=dst2+dstStride2*y;
	x=0;
#ifdef HAVE_MMX
	for(;x<w-31;x+=32)
	{
	    asm volatile(
		PREFETCH" 32%1\n\t"
		"movq	%1, %%mm0\n\t"
		"movq	8%1, %%mm2\n\t"
		"movq	16%1, %%mm4\n\t"
		"movq	24%1, %%mm6\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm2, %%mm3\n\t"
		"movq	%%mm4, %%mm5\n\t"
		"movq	%%mm6, %%mm7\n\t"
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm1, 8%0\n\t"
		MOVNTQ" %%mm2, 16%0\n\t"
		MOVNTQ" %%mm3, 24%0\n\t"
		MOVNTQ" %%mm4, 32%0\n\t"
		MOVNTQ" %%mm5, 40%0\n\t"
		MOVNTQ" %%mm6, 48%0\n\t"
		MOVNTQ" %%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s2[x])
		:"memory");
	}
#endif
	for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
	/* leave MMX state and drain the non-temporal stores */
	asm(
		EMMS"	\n\t"
		SFENCE"	\n\t"
		::: "memory"
	);
#endif
}
/*
 * Convert planar YVU9 (full-res Y in src1; U in src2 and V in src3 at
 * quarter resolution vertically, y>>2, and quarter horizontally via the
 * up[x]/vp[x] reuse below) into packed YUY2 (Y U Y V byte order) in dst.
 *
 * Per output macro-pixel group the scalar code writes, for x in [0,w):
 *   d[8x+0..7] = Y(4x) U(x) Y(4x+1) V(x) Y(4x+2) U(x) Y(4x+3) V(x)
 * i.e. 4 luma samples share one U and one V sample horizontally.
 *
 * PREFETCH/MOVNTQ/HAVE_MMX come from the template machinery earlier in
 * this file; the MMX path and the scalar tail produce the same layout.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
			uint8_t *dst,
			long width, long height,
			long srcStride1, long srcStride2,
			long srcStride3, long dstStride)
{
    long y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
	const uint8_t* yp=src1+srcStride1*y;
	const uint8_t* up=src2+srcStride2*(y>>2);	/* chroma rows repeat 4x vertically */
	const uint8_t* vp=src3+srcStride3*(y>>2);
	uint8_t* d=dst+dstStride*y;
	x=0;
#ifdef HAVE_MMX
	/* 8 chroma samples + 32 luma samples -> 64 output bytes per
	 * iteration.  mm1/mm2 are doubled low chroma halves, mm4/mm5 the
	 * doubled high halves; each is then interleaved with the matching
	 * 8 luma bytes.  Non-temporal stores; SFENCE issued once at end. */
	for(;x<w-7;x+=8)
	{
	    asm volatile(
		PREFETCH" 32(%1, %0)\n\t"
		PREFETCH" 32(%2, %0)\n\t"
		PREFETCH" 32(%3, %0)\n\t"
		"movq	(%1, %0, 4), %%mm0\n\t"	/* Y0Y1Y2Y3Y4Y5Y6Y7 */
		"movq	(%2, %0), %%mm1\n\t"	/* U0U1U2U3U4U5U6U7 */
		"movq	(%3, %0), %%mm2\n\t"	/* V0V1V2V3V4V5V6V7 */
		"movq	%%mm0, %%mm3\n\t"	/* Y0Y1Y2Y3Y4Y5Y6Y7 */
		"movq	%%mm1, %%mm4\n\t"	/* U0U1U2U3U4U5U6U7 */
		"movq	%%mm2, %%mm5\n\t"	/* V0V1V2V3V4V5V6V7 */
		"punpcklbw %%mm1, %%mm1\n\t"	/* U0U0 U1U1 U2U2 U3U3 */
		"punpcklbw %%mm2, %%mm2\n\t"	/* V0V0 V1V1 V2V2 V3V3 */
		"punpckhbw %%mm4, %%mm4\n\t"	/* U4U4 U5U5 U6U6 U7U7 */
		"punpckhbw %%mm5, %%mm5\n\t"	/* V4V4 V5V5 V6V6 V7V7 */

		"movq	%%mm1, %%mm6\n\t"
		"punpcklbw %%mm2, %%mm1\n\t"	/* U0V0 U0V0 U1V1 U1V1*/
		"punpcklbw %%mm1, %%mm0\n\t"	/* Y0U0 Y1V0 Y2U0 Y3V0*/
		"punpckhbw %%mm1, %%mm3\n\t"	/* Y4U1 Y5V1 Y6U1 Y7V1*/
		MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"

		"punpckhbw %%mm2, %%mm6\n\t"	/* U2V2 U2V2 U3V3 U3V3*/
		"movq	8(%1, %0, 4), %%mm0\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"punpcklbw %%mm6, %%mm0\n\t"	/* Y U2 Y V2 Y U2 Y V2*/
		"punpckhbw %%mm6, %%mm3\n\t"	/* Y U3 Y V3 Y U3 Y V3*/
		MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"

		"movq	%%mm4, %%mm6\n\t"
		"movq	16(%1, %0, 4), %%mm0\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"punpcklbw %%mm5, %%mm4\n\t"
		"punpcklbw %%mm4, %%mm0\n\t"	/* Y U4 Y V4 Y U4 Y V4*/
		"punpckhbw %%mm4, %%mm3\n\t"	/* Y U5 Y V5 Y U5 Y V5*/
		MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"

		"punpckhbw %%mm5, %%mm6\n\t"
		"movq	24(%1, %0, 4), %%mm0\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"punpcklbw %%mm6, %%mm0\n\t"	/* Y U6 Y V6 Y U6 Y V6*/
		"punpckhbw %%mm6, %%mm3\n\t"	/* Y U7 Y V7 Y U7 Y V7*/
		MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"

		: "+r" (x)	/* x is both index and loop-advanced output */
		: "r"(yp), "r" (up), "r"(vp), "r"(d)
		:"memory");
	}
#endif
	/* scalar tail (or whole row without MMX): 4 Y per U/V pair */
	for(; x<w; x++)
	{
	    const long x2= x<<2;
	    d[8*x+0]=yp[x2];
	    d[8*x+1]=up[x];
	    d[8*x+2]=yp[x2+1];
	    d[8*x+3]=vp[x];
	    d[8*x+4]=yp[x2+2];
	    d[8*x+5]=up[x];
	    d[8*x+6]=yp[x2+3];
	    d[8*x+7]=vp[x];
	}
    }
#ifdef HAVE_MMX
	/* leave MMX state and drain the non-temporal stores */
	asm(
		EMMS"	\n\t"
		SFENCE"	\n\t"
		::: "memory"
	);
#endif
}