Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 16287:c06b158cd79f
Sync with 1.1074
author | gpoirier |
---|---|
date | Mon, 22 Aug 2005 08:21:43 +0000 |
parents | 49dd10a86b23 |
children | e91f944f6ed9 |
rev | line source |
---|---|
2694 | 1 /* |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
2 * |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
3 * rgb2rgb.c, Software RGB to RGB convertor |
2732 | 4 * pluralize by Software PAL8 to RGB convertor |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
7 * Written by Nick Kurshev. |
3132 | 8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL) |
13423 | 9 * lot of big-endian byteorder fixes by Alex Beregszaszi |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
10 */ |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
11 |
6492 | 12 #include <stddef.h> |
13 #include <inttypes.h> /* for __WORDSIZE */ | |
14 | |
/*
 * Fall back to MP_WORDSIZE (defined elsewhere) when <inttypes.h> did not
 * provide __WORDSIZE.
 */
#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/*
 * Drop any earlier definitions so the helper macros below can be redefined
 * (presumably because this template is included more than once with
 * different HAVE_* settings -- confirm against the including file).
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Width in bytes of the widest SIMD register assumed to be available. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Instruction mnemonics pasted into the inline asm strings.
 * Without 3DNow!/MMX2 the prefetch macros expand to "/nop"; PAVGB is left
 * undefined in that configuration.
 */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB	  "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB	  "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped onto emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal store and store fence where available, plain movq otherwise. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
61 | |
62 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size) | |
2504 | 63 { |
2508 | 64 uint8_t *dest = dst; |
2677 | 65 const uint8_t *s = src; |
66 const uint8_t *end; | |
2510 | 67 #ifdef HAVE_MMX |
6605 | 68 const uint8_t *mm_end; |
2510 | 69 #endif |
2504 | 70 end = s + src_size; |
2510 | 71 #ifdef HAVE_MMX |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
72 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); |
6605 | 73 mm_end = end - 23; |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
74 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); |
2510 | 75 while(s < mm_end) |
76 { | |
2511 | 77 __asm __volatile( |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
78 PREFETCH" 32%1\n\t" |
2510 | 79 "movd %1, %%mm0\n\t" |
2738 | 80 "punpckldq 3%1, %%mm0\n\t" |
81 "movd 6%1, %%mm1\n\t" | |
82 "punpckldq 9%1, %%mm1\n\t" | |
83 "movd 12%1, %%mm2\n\t" | |
84 "punpckldq 15%1, %%mm2\n\t" | |
85 "movd 18%1, %%mm3\n\t" | |
86 "punpckldq 21%1, %%mm3\n\t" | |
2510 | 87 "pand %%mm7, %%mm0\n\t" |
2738 | 88 "pand %%mm7, %%mm1\n\t" |
2510 | 89 "pand %%mm7, %%mm2\n\t" |
2738 | 90 "pand %%mm7, %%mm3\n\t" |
2511 | 91 MOVNTQ" %%mm0, %0\n\t" |
2738 | 92 MOVNTQ" %%mm1, 8%0\n\t" |
93 MOVNTQ" %%mm2, 16%0\n\t" | |
94 MOVNTQ" %%mm3, 24%0" | |
2510 | 95 :"=m"(*dest) |
96 :"m"(*s) | |
97 :"memory"); | |
2738 | 98 dest += 32; |
99 s += 24; | |
2510 | 100 } |
2513 | 101 __asm __volatile(SFENCE:::"memory"); |
2511 | 102 __asm __volatile(EMMS:::"memory"); |
2510 | 103 #endif |
2504 | 104 while(s < end) |
105 { | |
13423 | 106 #ifdef WORDS_BIGENDIAN |
107 *dest++ = 0; | |
108 *dest++ = *s++; | |
109 *dest++ = *s++; | |
110 *dest++ = *s++; | |
111 #else | |
2508 | 112 *dest++ = *s++; |
113 *dest++ = *s++; | |
114 *dest++ = *s++; | |
115 *dest++ = 0; | |
13423 | 116 #endif |
2504 | 117 } |
118 } | |
2505 | 119 |
3132 | 120 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size) |
2505 | 121 { |
122 uint8_t *dest = dst; | |
2677 | 123 const uint8_t *s = src; |
124 const uint8_t *end; | |
2517 | 125 #ifdef HAVE_MMX |
6605 | 126 const uint8_t *mm_end; |
2517 | 127 #endif |
2505 | 128 end = s + src_size; |
2517 | 129 #ifdef HAVE_MMX |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
130 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); |
6605 | 131 mm_end = end - 31; |
2517 | 132 while(s < mm_end) |
133 { | |
134 __asm __volatile( | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
135 PREFETCH" 32%1\n\t" |
2517 | 136 "movq %1, %%mm0\n\t" |
137 "movq 8%1, %%mm1\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
138 "movq 16%1, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
139 "movq 24%1, %%mm5\n\t" |
2517 | 140 "movq %%mm0, %%mm2\n\t" |
141 "movq %%mm1, %%mm3\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
142 "movq %%mm4, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
143 "movq %%mm5, %%mm7\n\t" |
2517 | 144 "psrlq $8, %%mm2\n\t" |
145 "psrlq $8, %%mm3\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
146 "psrlq $8, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
147 "psrlq $8, %%mm7\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
148 "pand %2, %%mm0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
149 "pand %2, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
150 "pand %2, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
151 "pand %2, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
152 "pand %3, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
153 "pand %3, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
154 "pand %3, %%mm6\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
155 "pand %3, %%mm7\n\t" |
2517 | 156 "por %%mm2, %%mm0\n\t" |
157 "por %%mm3, %%mm1\n\t" | |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
158 "por %%mm6, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
159 "por %%mm7, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
160 |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
161 "movq %%mm1, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
162 "movq %%mm4, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
163 "psllq $48, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
164 "psllq $32, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
165 "pand %4, %%mm2\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
166 "pand %5, %%mm3\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
167 "por %%mm2, %%mm0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
168 "psrlq $16, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
169 "psrlq $32, %%mm4\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
170 "psllq $16, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
171 "por %%mm3, %%mm1\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
172 "pand %6, %%mm5\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
173 "por %%mm5, %%mm4\n\t" |
3132 | 174 |
2517 | 175 MOVNTQ" %%mm0, %0\n\t" |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
176 MOVNTQ" %%mm1, 8%0\n\t" |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
177 MOVNTQ" %%mm4, 16%0" |
2517 | 178 :"=m"(*dest) |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
179 :"m"(*s),"m"(mask24l), |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
180 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
2517 | 181 :"memory"); |
2746
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
182 dest += 24; |
dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
nick
parents:
2741
diff
changeset
|
183 s += 32; |
2517 | 184 } |
185 __asm __volatile(SFENCE:::"memory"); | |
186 __asm __volatile(EMMS:::"memory"); | |
187 #endif | |
2505 | 188 while(s < end) |
189 { | |
13423 | 190 #ifdef WORDS_BIGENDIAN |
191 s++; | |
192 *dest++ = *s++; | |
193 *dest++ = *s++; | |
194 *dest++ = *s++; | |
195 #else | |
2505 | 196 *dest++ = *s++; |
197 *dest++ = *s++; | |
198 *dest++ = *s++; | |
199 s++; | |
13423 | 200 #endif |
2505 | 201 } |
202 } | |
2506 | 203 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
204 /* |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
205 Original by Strepto/Astral |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
206 ported to gcc & bugfixed : A'rpi |
2564 | 207 MMX2, 3DNOW optimization by Nick Kurshev |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
208 32bit c version, and and&add trick by Michael Niedermayer |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
209 */ |
3132 | 210 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size) |
2506 | 211 { |
6492 | 212 register const uint8_t* s=src; |
213 register uint8_t* d=dst; | |
214 register const uint8_t *end; | |
6605 | 215 const uint8_t *mm_end; |
6492 | 216 end = s + src_size; |
2506 | 217 #ifdef HAVE_MMX |
6492 | 218 __asm __volatile(PREFETCH" %0"::"m"(*s)); |
219 __asm __volatile("movq %0, %%mm4"::"m"(mask15s)); | |
6605 | 220 mm_end = end - 15; |
6492 | 221 while(s<mm_end) |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
222 { |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
223 __asm __volatile( |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
224 PREFETCH" 32%1\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
225 "movq %1, %%mm0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
226 "movq 8%1, %%mm2\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
227 "movq %%mm0, %%mm1\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
228 "movq %%mm2, %%mm3\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
229 "pand %%mm4, %%mm0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
230 "pand %%mm4, %%mm2\n\t" |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
231 "paddw %%mm1, %%mm0\n\t" |
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
232 "paddw %%mm3, %%mm2\n\t" |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
233 MOVNTQ" %%mm0, %0\n\t" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
234 MOVNTQ" %%mm2, 8%0" |
6492 | 235 :"=m"(*d) |
236 :"m"(*s) | |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
237 ); |
6492 | 238 d+=16; |
239 s+=16; | |
2506 | 240 } |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
241 __asm __volatile(SFENCE:::"memory"); |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
242 __asm __volatile(EMMS:::"memory"); |
2698
22652c028692
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents:
2697
diff
changeset
|
243 #endif |
6605 | 244 mm_end = end - 3; |
6492 | 245 while(s < mm_end) |
246 { | |
247 register unsigned x= *((uint32_t *)s); | |
248 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); | |
249 d+=4; | |
250 s+=4; | |
251 } | |
252 if(s < end) | |
253 { | |
254 register unsigned short x= *((uint16_t *)s); | |
255 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); | |
256 } | |
2506 | 257 } |
2694 | 258 |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
259 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
260 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
261 register const uint8_t* s=src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
262 register uint8_t* d=dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
263 register const uint8_t *end; |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
264 const uint8_t *mm_end; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
265 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
266 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
267 __asm __volatile(PREFETCH" %0"::"m"(*s)); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
268 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg)); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
269 __asm __volatile("movq %0, %%mm6"::"m"(mask15b)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
270 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
271 while(s<mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
272 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
273 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
274 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
275 "movq %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
276 "movq 8%1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
277 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
278 "movq %%mm2, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
279 "psrlq $1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
280 "psrlq $1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
281 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
282 "pand %%mm7, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
283 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
284 "pand %%mm6, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
285 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
286 "por %%mm3, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
287 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
288 MOVNTQ" %%mm2, 8%0" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
289 :"=m"(*d) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
290 :"m"(*s) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
291 ); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
292 d+=16; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
293 s+=16; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
294 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
295 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
296 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
297 #endif |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
298 mm_end = end - 3; |
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
299 while(s < mm_end) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
300 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
301 register uint32_t x= *((uint32_t *)s); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
302 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
303 s+=4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
304 d+=4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
305 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
306 if(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
307 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
308 register uint16_t x= *((uint16_t *)s); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
309 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
310 s+=2; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
311 d+=2; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
312 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
313 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
314 |
3132 | 315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size) |
2694 | 316 { |
6492 | 317 const uint8_t *s = src; |
318 const uint8_t *end; | |
2741 | 319 #ifdef HAVE_MMX |
6492 | 320 const uint8_t *mm_end; |
321 #endif | |
2741 | 322 uint16_t *d = (uint16_t *)dst; |
323 end = s + src_size; | |
6492 | 324 #ifdef HAVE_MMX |
9454 | 325 mm_end = end - 15; |
326 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) | |
327 asm volatile( | |
328 "movq %3, %%mm5 \n\t" | |
329 "movq %4, %%mm6 \n\t" | |
330 "movq %5, %%mm7 \n\t" | |
331 ".balign 16 \n\t" | |
332 "1: \n\t" | |
333 PREFETCH" 32(%1) \n\t" | |
334 "movd (%1), %%mm0 \n\t" | |
335 "movd 4(%1), %%mm3 \n\t" | |
336 "punpckldq 8(%1), %%mm0 \n\t" | |
337 "punpckldq 12(%1), %%mm3 \n\t" | |
338 "movq %%mm0, %%mm1 \n\t" | |
339 "movq %%mm3, %%mm4 \n\t" | |
340 "pand %%mm6, %%mm0 \n\t" | |
341 "pand %%mm6, %%mm3 \n\t" | |
342 "pmaddwd %%mm7, %%mm0 \n\t" | |
343 "pmaddwd %%mm7, %%mm3 \n\t" | |
344 "pand %%mm5, %%mm1 \n\t" | |
345 "pand %%mm5, %%mm4 \n\t" | |
346 "por %%mm1, %%mm0 \n\t" | |
347 "por %%mm4, %%mm3 \n\t" | |
348 "psrld $5, %%mm0 \n\t" | |
349 "pslld $11, %%mm3 \n\t" | |
350 "por %%mm3, %%mm0 \n\t" | |
351 MOVNTQ" %%mm0, (%0) \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
352 "add $16, %1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
353 "add $8, %0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
354 "cmp %2, %1 \n\t" |
9454 | 355 " jb 1b \n\t" |
356 : "+r" (d), "+r"(s) | |
357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | |
358 ); | |
359 #else | |
2741 | 360 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
361 __asm __volatile( | |
362 "movq %0, %%mm7\n\t" | |
363 "movq %1, %%mm6\n\t" | |
364 ::"m"(red_16mask),"m"(green_16mask)); | |
365 while(s < mm_end) | |
366 { | |
367 __asm __volatile( | |
368 PREFETCH" 32%1\n\t" | |
369 "movd %1, %%mm0\n\t" | |
370 "movd 4%1, %%mm3\n\t" | |
371 "punpckldq 8%1, %%mm0\n\t" | |
372 "punpckldq 12%1, %%mm3\n\t" | |
373 "movq %%mm0, %%mm1\n\t" | |
374 "movq %%mm0, %%mm2\n\t" | |
375 "movq %%mm3, %%mm4\n\t" | |
376 "movq %%mm3, %%mm5\n\t" | |
377 "psrlq $3, %%mm0\n\t" | |
378 "psrlq $3, %%mm3\n\t" | |
379 "pand %2, %%mm0\n\t" | |
380 "pand %2, %%mm3\n\t" | |
381 "psrlq $5, %%mm1\n\t" | |
382 "psrlq $5, %%mm4\n\t" | |
383 "pand %%mm6, %%mm1\n\t" | |
384 "pand %%mm6, %%mm4\n\t" | |
385 "psrlq $8, %%mm2\n\t" | |
386 "psrlq $8, %%mm5\n\t" | |
387 "pand %%mm7, %%mm2\n\t" | |
388 "pand %%mm7, %%mm5\n\t" | |
389 "por %%mm1, %%mm0\n\t" | |
390 "por %%mm4, %%mm3\n\t" | |
391 "por %%mm2, %%mm0\n\t" | |
392 "por %%mm5, %%mm3\n\t" | |
393 "psllq $16, %%mm3\n\t" | |
394 "por %%mm3, %%mm0\n\t" | |
395 MOVNTQ" %%mm0, %0\n\t" | |
396 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | |
397 d += 4; | |
398 s += 16; | |
399 } | |
9454 | 400 #endif |
6492 | 401 __asm __volatile(SFENCE:::"memory"); |
402 __asm __volatile(EMMS:::"memory"); | |
403 #endif | |
2741 | 404 while(s < end) |
405 { | |
14982
49dd10a86b23
Fixes rgb32to16 conversion for I think all platforms since the int8
diego
parents:
13720
diff
changeset
|
406 register int rgb = *(uint32_t*)s; s += 4; |
49dd10a86b23
Fixes rgb32to16 conversion for I think all platforms since the int8
diego
parents:
13720
diff
changeset
|
407 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); |
2741 | 408 } |
2694 | 409 } |
410 | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
412 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
413 const uint8_t *s = src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
414 const uint8_t *end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
415 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
416 const uint8_t *mm_end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
417 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
418 uint16_t *d = (uint16_t *)dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
419 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
420 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
421 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
422 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
423 "movq %0, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
424 "movq %1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
425 ::"m"(red_16mask),"m"(green_16mask)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
426 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
427 while(s < mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
428 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
429 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
430 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
431 "movd %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
432 "movd 4%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
433 "punpckldq 8%1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
434 "punpckldq 12%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
435 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
436 "movq %%mm0, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
437 "movq %%mm3, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
438 "movq %%mm3, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
439 "psllq $8, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
440 "psllq $8, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
441 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
442 "pand %%mm7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
443 "psrlq $5, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
444 "psrlq $5, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
445 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
446 "pand %%mm6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
447 "psrlq $19, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
448 "psrlq $19, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
449 "pand %2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
450 "pand %2, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
451 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
452 "por %%mm4, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
453 "por %%mm2, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
454 "por %%mm5, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
455 "psllq $16, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
456 "por %%mm3, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
457 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
458 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
459 d += 4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
460 s += 16; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
461 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
462 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
463 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
464 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
465 while(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
466 { |
13423 | 467 // FIXME on bigendian |
12385
b5c106b694e4
this isn't actually stupid, but it's not valid C and gcc 3.5 rejects it as such
rfelker
parents:
11072
diff
changeset
|
468 const int src= *s; s += 4; |
9430 | 469 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19); |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
470 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
471 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
472 |
/**
 * Convert packed 32-bit RGB pixels to 15-bit (5-5-5) pixels.
 *
 * Every source pixel is treated as one 32-bit word in host byte order. The
 * five most significant bits of word bits 0-7, 8-15 and 16-23 are packed into
 * bits 0-4, 5-9 and 10-14 of the 16-bit result; word bits 24-31 are ignored.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size number of source bytes; assumed to be a multiple of 4,
 *                 because the scalar loop always consumes whole pixels
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	/* One MMX iteration consumes 16 source bytes (4 pixels). */
	mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
	/*
	 * The asm loop checks its exit condition only after an iteration has
	 * run, so it must not be entered unless a complete 16-byte group is
	 * available; otherwise it would read past the source and store 8 bytes
	 * past the converted output.
	 */
	if(s < mm_end)
	asm volatile(
		"movq %3, %%mm5 \n\t"	/* mm5 = mask3215g  */
		"movq %4, %%mm6 \n\t"	/* mm6 = mask3216br */
		"movq %5, %%mm7 \n\t"	/* mm7 = mul3215    */
		".balign 16 \n\t"
		"1: \n\t"
		PREFETCH" 32(%1) \n\t"
		"movd (%1), %%mm0 \n\t"
		"movd 4(%1), %%mm3 \n\t"
		"punpckldq 8(%1), %%mm0 \n\t"
		"punpckldq 12(%1), %%mm3 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"movq %%mm3, %%mm4 \n\t"
		"pand %%mm6, %%mm0 \n\t"
		"pand %%mm6, %%mm3 \n\t"
		"pmaddwd %%mm7, %%mm0 \n\t"
		"pmaddwd %%mm7, %%mm3 \n\t"
		"pand %%mm5, %%mm1 \n\t"
		"pand %%mm5, %%mm4 \n\t"
		"por %%mm1, %%mm0 \n\t"
		"por %%mm4, %%mm3 \n\t"
		"psrld $6, %%mm0 \n\t"
		"pslld $10, %%mm3 \n\t"
		"por %%mm3, %%mm0 \n\t"
		MOVNTQ" %%mm0, (%0) \n\t"
		"add $16, %1 \n\t"
		"add $8, %0 \n\t"
		"cmp %2, %1 \n\t"
		" jb 1b \n\t"
		: "+r" (d), "+r"(s)
		: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
		/* The loop stores through d, which no output operand describes. */
		: "memory"
	);
#else
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"	/* mm7 = red_15mask   */
	    "movq %1, %%mm6\n\t"	/* mm6 = green_15mask */
	    ::"m"(red_15mask),"m"(green_15mask));
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 4%1, %%mm3\n\t"
		"punpckldq 8%1, %%mm0\n\t"
		"punpckldq 12%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psrlq $3, %%mm0\n\t"
		"psrlq $3, %%mm3\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm3\n\t"
		"psrlq $6, %%mm1\n\t"
		"psrlq $6, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $9, %%mm2\n\t"
		"psrlq $9, %%mm5\n\t"
		"pand %%mm7, %%mm2\n\t"
		"pand %%mm7, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
#endif
	/* SFENCE / EMMS are macros set up earlier in this file; they are issued once the MMX loop is done. */
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: handles everything without MMX, or the pixels the MMX loop left over. */
	while(s < end)
	{
		/*
		 * Fetch the whole 4-byte pixel as a host-order 32-bit word.
		 * Dereferencing the byte pointer directly would yield only the
		 * first byte and leave two of the three channels at zero. The
		 * union copy also avoids an unaligned or aliasing-unsafe
		 * uint32_t access.
		 * NOTE(review): the original marked this loop "FIXME on
		 * bigendian"; confirm host order is the intended layout there.
		 */
		union { uint32_t word; uint8_t bytes[4]; } px;
		uint32_t rgb;
		px.bytes[0] = s[0];
		px.bytes[1] = s[1];
		px.bytes[2] = s[2];
		px.bytes[3] = s[3];
		rgb = px.word;
		s += 4;
		*d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
	}
}
569 | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
/**
 * Convert packed 32-bit RGB pixels to 15-bit (5-5-5) pixels with the first
 * and third channel swapped.
 *
 * Every source pixel is treated as one 32-bit word in host byte order. The
 * five most significant bits of word bits 0-7 land in result bits 10-14,
 * those of bits 8-15 in result bits 5-9 and those of bits 16-23 in result
 * bits 0-4; word bits 24-31 are ignored.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size number of source bytes; assumed to be a multiple of 4,
 *                 because the scalar loop always consumes whole pixels
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"	/* mm7 = red_15mask   */
	    "movq %1, %%mm6\n\t"	/* mm6 = green_15mask */
	    ::"m"(red_15mask),"m"(green_15mask));
	/* One MMX iteration consumes 16 source bytes (4 pixels) and emits 8 bytes. */
	mm_end = end - 15;
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 4%1, %%mm3\n\t"
		"punpckldq 8%1, %%mm0\n\t"
		"punpckldq 12%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psllq $7, %%mm0\n\t"
		"psllq $7, %%mm3\n\t"
		"pand %%mm7, %%mm0\n\t"
		"pand %%mm7, %%mm3\n\t"
		"psrlq $6, %%mm1\n\t"
		"psrlq $6, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $19, %%mm2\n\t"
		"psrlq $19, %%mm5\n\t"
		"pand %2, %%mm2\n\t"
		"pand %2, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	/* SFENCE / EMMS are macros set up earlier in this file; they are issued once the MMX loop is done. */
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: handles everything without MMX, or the pixels the MMX loop left over. */
	while(s < end)
	{
		/*
		 * Fetch the whole 4-byte pixel as a host-order 32-bit word.
		 * Dereferencing the byte pointer directly would yield only the
		 * first byte and leave two of the three channels at zero. The
		 * union copy also avoids an unaligned or aliasing-unsafe
		 * uint32_t access.
		 * NOTE(review): the original marked this loop "FIXME on
		 * bigendian"; confirm host order is the intended layout there.
		 */
		union { uint32_t word; uint8_t bytes[4]; } px;
		uint32_t rgb;
		px.bytes[0] = s[0];
		px.bytes[1] = s[1];
		px.bytes[2] = s[2];
		px.bytes[3] = s[3];
		rgb = px.word;
		s += 4;
		*d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
	}
}
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
631 |
/**
 * Convert packed 24-bit pixels (3 bytes each) to 16-bit (5-6-5) pixels.
 *
 * For every pixel the top 5 bits of byte 0 go to result bits 0-4, the top
 * 6 bits of byte 1 to bits 5-10 and the top 5 bits of byte 2 to bits 11-15.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size number of source bytes; assumed to be a multiple of 3,
 *                 because the scalar loop always consumes whole pixels
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"	/* mm7 = red_16mask   */
	    "movq %1, %%mm6\n\t"	/* mm6 = green_16mask */
	    ::"m"(red_16mask),"m"(green_16mask));
	/*
	 * One iteration converts 4 pixels (12 bytes), but its last 32-bit load
	 * starts at offset 9 and therefore touches byte 12. Require 13 readable
	 * bytes so that load never goes past the end of the source; whatever
	 * remains is converted by the scalar loop below.
	 */
	mm_end = end - 12;
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psrlq $3, %%mm0\n\t"
		"psrlq $3, %%mm3\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm3\n\t"
		"psrlq $5, %%mm1\n\t"
		"psrlq $5, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $8, %%mm2\n\t"
		"psrlq $8, %%mm5\n\t"
		"pand %%mm7, %%mm2\n\t"
		"pand %%mm7, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	/* SFENCE / EMMS are macros set up earlier in this file; they are issued once the MMX loop is done. */
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: handles everything without MMX, or the pixels the MMX loop left over. */
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
694 | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
/**
 * Convert packed 24-bit pixels (3 bytes each) to 16-bit (5-6-5) pixels with
 * the first and third channel swapped.
 *
 * For every pixel the top 5 bits of byte 0 go to result bits 11-15, the top
 * 6 bits of byte 1 to bits 5-10 and the top 5 bits of byte 2 to bits 0-4.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size number of source bytes
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	const uint8_t *in = src;
	const uint8_t *const in_end = src + src_size;
	uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
	/* The MMX loop only runs while at least 16 source bytes remain. */
	const uint8_t *const simd_end = in_end - 15;

	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	/* Keep two of the channel masks in registers for the whole loop. */
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/* Four pixels per pass: 12 source bytes in, 8 destination bytes out. */
	for(; in < simd_end; out += 4, in += 12)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psllq $8, %%mm0\n\t"
		"psllq $8, %%mm3\n\t"
		"pand %%mm7, %%mm0\n\t"
		"pand %%mm7, %%mm3\n\t"
		"psrlq $5, %%mm1\n\t"
		"psrlq $5, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $19, %%mm2\n\t"
		"psrlq $19, %%mm5\n\t"
		"pand %2, %%mm2\n\t"
		"pand %2, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Plain C conversion of whatever the MMX loop did not consume. */
	for(; in < in_end; in += 3)
	{
		const int hi5 = in[0] & 0xF8;	/* ends up in bits 11-15 */
		const int mid6 = in[1] & 0xFC;	/* ends up in bits 5-10  */
		const int lo5 = in[2] >> 3;	/* ends up in bits 0-4   */
		*out++ = lo5 | (mid6 << 3) | (hi5 << 8);
	}
}
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
757 |
/**
 * Convert packed 24-bit pixels (3 bytes each) to 15-bit (5-5-5) pixels.
 *
 * For every pixel the top 5 bits of byte 0 go to result bits 0-4, the top
 * 5 bits of byte 1 to bits 5-9 and the top 5 bits of byte 2 to bits 10-14.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size number of source bytes; assumed to be a multiple of 3,
 *                 because the scalar loop always consumes whole pixels
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"	/* mm7 = red_15mask   */
	    "movq %1, %%mm6\n\t"	/* mm6 = green_15mask */
	    ::"m"(red_15mask),"m"(green_15mask));
	/*
	 * One iteration converts 4 pixels (12 bytes), but its last 32-bit load
	 * starts at offset 9 and therefore touches byte 12. Require 13 readable
	 * bytes so that load never goes past the end of the source; whatever
	 * remains is converted by the scalar loop below.
	 */
	mm_end = end - 12;
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psrlq $3, %%mm0\n\t"
		"psrlq $3, %%mm3\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm3\n\t"
		"psrlq $6, %%mm1\n\t"
		"psrlq $6, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $9, %%mm2\n\t"
		"psrlq $9, %%mm5\n\t"
		"pand %%mm7, %%mm2\n\t"
		"pand %%mm7, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	/* SFENCE / EMMS are macros set up earlier in this file; they are issued once the MMX loop is done. */
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: handles everything without MMX, or the pixels the MMX loop left over. */
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
820 | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
821 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
822 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
823 const uint8_t *s = src; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
824 const uint8_t *end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
825 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
826 const uint8_t *mm_end; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
827 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
828 uint16_t *d = (uint16_t *)dst; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
829 end = s + src_size; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
830 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
831 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
832 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
833 "movq %0, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
834 "movq %1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
835 ::"m"(red_15mask),"m"(green_15mask)); |
6608
da27a1bc1763
fixing memory overwrite bugs in the new converters
michael
parents:
6606
diff
changeset
|
836 mm_end = end - 15; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
837 while(s < mm_end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
838 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
839 __asm __volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
840 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
841 "movd %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
842 "movd 3%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
843 "punpckldq 6%1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
844 "punpckldq 9%1, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
845 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
846 "movq %%mm0, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
847 "movq %%mm3, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
848 "movq %%mm3, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
849 "psllq $7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
850 "psllq $7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
851 "pand %%mm7, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
852 "pand %%mm7, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
853 "psrlq $6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
854 "psrlq $6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
855 "pand %%mm6, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
856 "pand %%mm6, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
857 "psrlq $19, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
858 "psrlq $19, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
859 "pand %2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
860 "pand %2, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
861 "por %%mm1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
862 "por %%mm4, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
863 "por %%mm2, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
864 "por %%mm5, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
865 "psllq $16, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
866 "por %%mm3, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
867 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
868 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
869 d += 4; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
870 s += 12; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
871 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
872 __asm __volatile(SFENCE:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
873 __asm __volatile(EMMS:::"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
874 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
875 while(s < end) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
876 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
877 const int r= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
878 const int g= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
879 const int b= *s++; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
880 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
881 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
882 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
883 |
/*
 * 15/16 bpp -> 24/32 bpp expansion.
 *
 * The converters below use a cheap, less accurate approximation: every 5- or
 * 6-bit channel is simply shifted left into the top of an 8-bit channel and
 * the low-order bits are filled with zeroes. This method improves PNG
 * compression, but it cannot reproduce white exactly because it never
 * generates an all-ones maximum value; the net effect is to darken the image
 * slightly.
 *
 * The better method would be "left bit replication":
 *
 *    4 3 2 1 0
 *    ---------
 *    1 1 0 1 1
 *
 *    7 6 5 4 3  2 1 0
 *    ----------------
 *    1 1 0 1 1  1 1 0
 *    |=======|  |===|
 *        |      Leftmost Bits Repeated to Fill Open Bits
 *        |
 *    Original Bits
 */

/*
 * Expand 16-bit pixels carrying three 5-bit channels (bit 15 ignored) into
 * three bytes per pixel.
 *
 * src      - input, read as native-endian uint16_t values
 * dst      - output, receives 3 bytes per input pixel, in the order
 *            bits 0-4, bits 5-9, bits 10-14 (each shifted into the top of
 *            its byte)
 * src_size - input size in bytes; an odd trailing byte is ignored
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    /* keep the const qualifier, like the sibling converters do */
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /*
     * 8 pixels (16 input bytes -> 24 output bytes) per iteration.
     * The remaining-element test replaces "s < end - 7", which formed a
     * pointer before the start of the buffer for inputs shorter than 7 pixels.
     */
    while(end - s >= 8)
    {
        /* expand pixels 0-3 into mm6/mm7 and pixels 4-7 into mm0/mm3 */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /*
         * Borrowed 32 to 24: squeeze the eight 4-byte pixels left in
         * mm6/mm7/mm0/mm3 by the statement above into 24 output bytes.
         * NOTE(review): this relies on the MMX registers surviving between
         * the two asm statements.
         */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path: everything without MMX, otherwise the last 0-7 pixels */
    while(s < end)
    {
        const uint16_t bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
1048 | |
/*
 * Expand 16-bit 5-6-5 pixels into three bytes per pixel.
 *
 * src      - input, read as native-endian uint16_t values
 * dst      - output, receives 3 bytes per input pixel, in the order
 *            bits 0-4, bits 5-10, bits 11-15 (each shifted into the top of
 *            its byte, low bits zero)
 * src_size - input size in bytes; an odd trailing byte is ignored
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /*
     * 8 pixels (16 input bytes -> 24 output bytes) per iteration.
     * The remaining-element test replaces "s < end - 7", which formed a
     * pointer before the start of the buffer for inputs shorter than 7 pixels.
     */
    while(end - s >= 8)
    {
        /* expand pixels 0-3 into mm6/mm7 and pixels 4-7 into mm0/mm3 */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /*
         * Borrowed 32 to 24: squeeze the eight 4-byte pixels left in
         * mm6/mm7/mm0/mm3 by the statement above into 24 output bytes.
         * NOTE(review): this relies on the MMX registers surviving between
         * the two asm statements.
         */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path: everything without MMX, otherwise the last 0-7 pixels */
    while(s < end)
    {
        const uint16_t bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
2718 | 1189 |
/*
 * Expand 16-bit pixels carrying three 5-bit channels (bit 15 ignored) into
 * four bytes per pixel.
 *
 * src      - input, read as native-endian uint16_t values
 * dst      - output, receives 4 bytes per input pixel: the three channels
 *            (bits 0-4, bits 5-9, bits 10-14, each shifted into the top of
 *            its byte) plus one zero padding byte. The padding byte comes
 *            last, or first when WORDS_BIGENDIAN is defined.
 * src_size - input size in bytes; an odd trailing byte is ignored
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0, used as the zero half when widening words to dwords below */
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    /*
     * 4 pixels (8 input bytes -> 16 output bytes) per iteration.
     * The remaining-element test replaces "s < end - 3", which formed a
     * pointer before the start of the buffer for inputs shorter than 3 pixels.
     */
    while(end - s >= 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /*
     * Scalar path: everything without MMX, otherwise the last 0-3 pixels.
     * Bytes are stored one at a time; a single 32-bit store per pixel was
     * noted as slightly slower on Athlon.
     */
    while(s < end)
    {
        const uint16_t bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif
    }
}
1268 | |
/*
 * Expand 16-bit 5-6-5 pixels into four bytes per pixel.
 *
 * src      - input, read as native-endian uint16_t values
 * dst      - output, receives 4 bytes per input pixel: the three channels
 *            (bits 0-4, bits 5-10, bits 11-15, each shifted into the top of
 *            its byte) plus one zero padding byte. The padding byte comes
 *            last, or first when WORDS_BIGENDIAN is defined.
 * src_size - input size in bytes; an odd trailing byte is ignored
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    /* keep the const qualifier, like the sibling converters do */
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0, used as the zero half when widening words to dwords below */
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    /*
     * 4 pixels (8 input bytes -> 16 output bytes) per iteration.
     * The remaining-element test replaces "s < end - 3", which formed a
     * pointer before the start of the buffer for inputs shorter than 3 pixels.
     */
    while(end - s >= 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path: everything without MMX, otherwise the last 0-3 pixels */
    while(s < end)
    {
        const uint16_t bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
#endif
    }
}
2694 | 1340 |
/*
 * Swap the first and third colour byte of every 4-byte pixel.
 *
 * src      - input pixels, 4 bytes each
 * dst      - output pixels, 4 bytes each; may be the same buffer as src
 * src_size - input size in bytes; trailing bytes that do not form a whole
 *            pixel are ignored
 *
 * The scalar path does not write the fourth (padding) byte of a pixel
 * (byte 3, or byte 0 when WORDS_BIGENDIAN is defined).
 * NOTE(review): the MMX path ANDs each channel with mask32r/mask32g/mask32b,
 * which are defined outside this block; presumably that clears the padding
 * byte, so the two paths may differ there - confirm against the mask values.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i = 0;
    const unsigned num_pixels = src_size >> 2;
#ifdef HAVE_MMX
    /*
     * The MMX loop handles two pixels (8 bytes) per iteration and always
     * executes at least once, so it is only entered when a whole 8-byte
     * group exists. The old bound "(long)src_size-7" turned into a huge
     * unsigned value for src_size < 8 and made the loop overrun both buffers.
     */
    const unsigned mmx_bytes = src_size & ~7U;
    if (mmx_bytes)
    {
        /* TODO: unroll this loop */
        asm volatile (
            "xor %%"REG_a", %%"REG_a"       \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            PREFETCH" 32(%0, %%"REG_a")     \n\t"
            "movq (%0, %%"REG_a"), %%mm0    \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm0, %%mm2              \n\t"
            "pslld $16, %%mm0               \n\t"
            "psrld $16, %%mm1               \n\t"
            "pand "MANGLE(mask32r)", %%mm0  \n\t"
            "pand "MANGLE(mask32g)", %%mm2  \n\t"
            "pand "MANGLE(mask32b)", %%mm1  \n\t"
            "por %%mm0, %%mm2               \n\t"
            "por %%mm1, %%mm2               \n\t"
            MOVNTQ" %%mm2, (%1, %%"REG_a")  \n\t"
            "add $8, %%"REG_a"              \n\t"
            "cmp %2, %%"REG_a"              \n\t"
            " jb 1b                         \n\t"
            :: "r" (src), "r"(dst), "r" ((long)mmx_bytes)
            /* the loop reads *src and writes *dst behind the compiler's back */
            : "%"REG_a, "memory"
        );

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
    }
    /* continue with the odd trailing pixel, if any */
    i = mmx_bytes >> 2;
#endif
    for(; i<num_pixels; i++)
    {
        /* load both swapped bytes before storing so src == dst works */
#ifdef WORDS_BIGENDIAN
        const uint8_t first = src[4*i + 3];
        const uint8_t third = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 2];
        dst[4*i + 1] = first;
        dst[4*i + 3] = third;
#else
        const uint8_t first = src[4*i + 2];
        const uint8_t third = src[4*i + 0];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 0] = first;
        dst[4*i + 2] = third;
#endif
    }
}
1387 | |
/*
 * Swap the first and third byte of every 3-byte pixel.
 *
 * src      - input pixels, 3 bytes each
 * dst      - output pixels, 3 bytes each; the scalar loop reads a pixel
 *            completely before writing it, so src == dst is fine there
 * src_size - input size in bytes; trailing bytes that do not form a whole
 *            pixel are left untouched
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    /*
     * Negative byte counter: the asm loop adds 24 per iteration and stops
     * once it is no longer negative. The subtraction has to be done in
     * long: "23 - src_size" in unsigned arithmetic never becomes negative
     * when long is wider than unsigned.
     */
    long mmx_size= 23 - (long)src_size;
    /*
     * The loop body always runs once, so only enter it when at least one
     * whole 24-byte group exists; otherwise both buffers would be overrun.
     * NOTE(review): the load at offset 18 reads 8 bytes, i.e. up to 2 bytes
     * past the current 24-byte group - confirm callers tolerate that at the
     * very end of src.
     */
    if (mmx_size < 0)
    {
        asm volatile (
            "movq "MANGLE(mask24r)", %%mm5      \n\t"
            "movq "MANGLE(mask24g)", %%mm6      \n\t"
            "movq "MANGLE(mask24b)", %%mm7      \n\t"
            ".balign 16                         \n\t"
            "1:                                 \n\t"
            PREFETCH" 32(%1, %%"REG_a")         \n\t"
            "movq (%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
            "movq (%1, %%"REG_a"), %%mm1        \n\t" // BGR BGR BG
            "movq 2(%1, %%"REG_a"), %%mm2       \n\t" // R BGR BGR B
            "psllq $16, %%mm0                   \n\t" // 00 BGR BGR
            "pand %%mm5, %%mm0                  \n\t"
            "pand %%mm6, %%mm1                  \n\t"
            "pand %%mm7, %%mm2                  \n\t"
            "por %%mm0, %%mm1                   \n\t"
            "por %%mm2, %%mm1                   \n\t"
            "movq 6(%1, %%"REG_a"), %%mm0       \n\t" // BGR BGR BG
            MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
            "movq 8(%1, %%"REG_a"), %%mm1       \n\t" // R BGR BGR B
            "movq 10(%1, %%"REG_a"), %%mm2      \n\t" // GR BGR BGR
            "pand %%mm7, %%mm0                  \n\t"
            "pand %%mm5, %%mm1                  \n\t"
            "pand %%mm6, %%mm2                  \n\t"
            "por %%mm0, %%mm1                   \n\t"
            "por %%mm2, %%mm1                   \n\t"
            "movq 14(%1, %%"REG_a"), %%mm0      \n\t" // R BGR BGR B
            MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
            "movq 16(%1, %%"REG_a"), %%mm1      \n\t" // GR BGR BGR
            "movq 18(%1, %%"REG_a"), %%mm2      \n\t" // BGR BGR BG
            "pand %%mm6, %%mm0                  \n\t"
            "pand %%mm7, %%mm1                  \n\t"
            "pand %%mm5, %%mm2                  \n\t"
            "por %%mm0, %%mm1                   \n\t"
            "por %%mm2, %%mm1                   \n\t"
            MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
            "add $24, %%"REG_a"                 \n\t"
            " js 1b                             \n\t"
            : "+a" (mmx_size)
            : "r" (src-mmx_size), "r"(dst-mmx_size)
            /* the loop reads *src and writes *dst behind the compiler's back */
            : "memory"
        );

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
    }

    if(mmx_size==23) return; //finished, size was a multiple of 24 (or zero)

    /* point src/dst at the 23-mmx_size bytes the MMX loop did not handle */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* whole pixels only: never touch bytes beyond src_size */
    for(i=0; src_size - i >= 3; i+=3)
    {
        const uint8_t x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1454 | |
5588 | 1455 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
2725 | 1456 unsigned int width, unsigned int height, |
9392 | 1457 int lumStride, int chromStride, int dstStride, int vertLumPerChroma) |
2701 | 1458 { |
6492 | 1459 unsigned y; |
1460 const unsigned chromWidth= width>>1; | |
2723 | 1461 for(y=0; y<height; y++) |
1462 { | |
2702 | 1463 #ifdef HAVE_MMX |
2723 | 1464 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) |
1465 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1466 "xor %%"REG_a", %%"REG_a" \n\t" |
2800
7847d6b7ad3d
.balign or we'll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1467 ".balign 16 \n\t" |
2723 | 1468 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1469 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1470 PREFETCH" 32(%2, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1471 PREFETCH" 32(%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1472 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
2723 | 1473 "movq %%mm0, %%mm2 \n\t" // U(0) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1474 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
2723 | 1475 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1476 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |
1477 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1478 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1479 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
2723 | 1480 "movq %%mm3, %%mm4 \n\t" // Y(0) |
1481 "movq %%mm5, %%mm6 \n\t" // Y(8) | |
1482 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | |
1483 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | |
1484 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | |
1485 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | |
2702 | 1486 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1487 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1488 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1489 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1490 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" |
2702 | 1491 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1492 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1493 "cmp %4, %%"REG_a" \n\t" |
2723 | 1494 " jb 1b \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1495 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1496 : "%"REG_a |
2723 | 1497 ); |
2702 | 1498 #else |
9393
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1499 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1500 #if defined ARCH_ALPHA && defined HAVE_MVI |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1501 #define pl2yuy2(n) \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1502 y1 = yc[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1503 y2 = yc2[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1504 u = uc[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1505 v = vc[n]; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1506 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1507 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1508 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1509 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1510 yuv1 = (u << 8) + (v << 24); \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1511 yuv2 = yuv1 + y2; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1512 yuv1 += y1; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1513 qdst[n] = yuv1; \ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1514 qdst2[n] = yuv2; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1515 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1516 int i; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1517 uint64_t *qdst = (uint64_t *) dst; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1518 uint64_t *qdst2 = (uint64_t *) (dst + dstStride); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1519 const uint32_t *yc = (uint32_t *) ysrc; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1520 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1521 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1522 for(i = 0; i < chromWidth; i += 8){ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1523 uint64_t y1, y2, yuv1, yuv2; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1524 uint64_t u, v; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1525 /* Prefetch */ |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1526 asm("ldq $31,64(%0)" :: "r"(yc)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1527 asm("ldq $31,64(%0)" :: "r"(yc2)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1528 asm("ldq $31,64(%0)" :: "r"(uc)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1529 asm("ldq $31,64(%0)" :: "r"(vc)); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1530 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1531 pl2yuy2(0); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1532 pl2yuy2(1); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1533 pl2yuy2(2); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1534 pl2yuy2(3); |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1535 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1536 yc += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1537 yc2 += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1538 uc += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1539 vc += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1540 qdst += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1541 qdst2 += 4; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1542 } |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1543 y++; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1544 ysrc += lumStride; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1545 dst += dstStride; |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1546 |
5f9c97070b56
yv12 -> yuy2 converter in alpha asm (from mplayerxp)
michael
parents:
9392
diff
changeset
|
1547 #elif __WORDSIZE >= 64 |
2723 | 1548 int i; |
6492 | 1549 uint64_t *ldst = (uint64_t *) dst; |
1550 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1551 for(i = 0; i < chromWidth; i += 2){ | |
1552 uint64_t k, l; | |
1553 k = yc[0] + (uc[0] << 8) + | |
1554 (yc[1] << 16) + (vc[0] << 24); | |
1555 l = yc[2] + (uc[1] << 8) + | |
1556 (yc[3] << 16) + (vc[1] << 24); | |
1557 *ldst++ = k + (l << 32); | |
1558 yc += 4; | |
1559 uc += 2; | |
1560 vc += 2; | |
2723 | 1561 } |
6492 | 1562 |
1563 #else | |
1564 int i, *idst = (int32_t *) dst; | |
1565 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; | |
1566 for(i = 0; i < chromWidth; i++){ | |
12395
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1567 #ifdef WORDS_BIGENDIAN |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1568 *idst++ = (yc[0] << 24)+ (uc[0] << 16) + |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1569 (yc[1] << 8) + (vc[0] << 0); |
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1570 #else |
6492 | 1571 *idst++ = yc[0] + (uc[0] << 8) + |
1572 (yc[1] << 16) + (vc[0] << 24); | |
12395
b969547bb0b1
bigendian fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
12385
diff
changeset
|
1573 #endif |
6492 | 1574 yc += 2; |
1575 uc++; | |
1576 vc++; | |
1577 } | |
1578 #endif | |
2723 | 1579 #endif |
5588 | 1580 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) |
2723 | 1581 { |
1582 usrc += chromStride; | |
1583 vsrc += chromStride; | |
1584 } | |
1585 ysrc += lumStride; | |
1586 dst += dstStride; | |
2701 | 1587 } |
2723 | 1588 #ifdef HAVE_MMX |
1589 asm( EMMS" \n\t" | |
1590 SFENCE" \n\t" | |
1591 :::"memory"); | |
2702 | 1592 #endif |
2701 | 1593 } |
1594 | |
2724 | 1595 /** |
1596 * | |
1597 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1598 * problem for anyone then tell me, and ill fix it) | |
1599 */ | |
5588 | 1600 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1601 unsigned int width, unsigned int height, | |
9392 | 1602 int lumStride, int chromStride, int dstStride) |
5588 | 1603 { |
1604 //FIXME interpolate chroma | |
1605 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1606 } | |
1607 | |
/**
 * Interleaves planar Y, U and V into packed UYVY (byte order U Y V Y).
 *
 * One output line is produced per luma line; every chroma sample yields four
 * output bytes. usrc/vsrc are advanced by chromStride only after each
 * vertLumPerChroma-th line.
 *
 * @param ysrc             luma plane
 * @param usrc             U plane
 * @param vsrc             V plane
 * @param dst              packed output
 * @param width            luma width in pixels (chroma width is width>>1)
 * @param height           number of luma lines to convert
 * @param lumStride        byte distance between luma lines
 * @param chromStride      byte distance between chroma lines
 * @param dstStride        byte distance between output lines
 * @param vertLumPerChroma luma lines per chroma line; the line test
 *                         (y & (n-1)) == (n-1) only selects every n-th line
 *                         when n is a power of two
 *
 * The MMX loop consumes 8 chroma samples per iteration and always runs at
 * least once, and the 64 bit C loop consumes 2 per iteration, so width is
 * expected to be a multiple of 16.
 * NOTE(review): the C paths store through uint64_t / uint32_t pointers, so
 * dst and dstStride presumably have to be aligned accordingly - confirm
 * against the callers.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xor %%"REG_a", %%"REG_a" \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
			PREFETCH" 32(%2, %%"REG_a") \n\t"
			PREFETCH" 32(%3, %%"REG_a") \n\t"
			"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
			"movq %%mm0, %%mm2 \n\t" // U(0)
			"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
			"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

			"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
			"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
			"movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
			"movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
			"punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
			"punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
			"punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
			"punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

			MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

			"add $8, %%"REG_a" \n\t"
			"cmp %4, %%"REG_a" \n\t"
			" jb 1b \n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
			/* the loop stores to dst, so the compiler must be told that
			   memory is modified */
			: "memory", "%"REG_a
		);
#else
//FIXME adapt the alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
		/* NOTE(review): unlike the 32 bit path below there is no
		   WORDS_BIGENDIAN variant here; the byte order produced by this
		   path is only UYVY on little-endian hosts. */
		unsigned i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			/* Widen every sample before shifting. Shifting the promoted
			   (signed) int left by 24 overflows for samples >= 128 and the
			   negative result would be sign-extended into the upper half
			   of k, corrupting the second pixel pair. */
			const uint64_t k = (uint64_t)uc[0]        + ((uint64_t)yc[0] <<  8) +
			                  ((uint64_t)vc[0] << 16) + ((uint64_t)yc[1] << 24);
			const uint64_t l = (uint64_t)uc[1]        + ((uint64_t)yc[2] <<  8) +
			                  ((uint64_t)vc[1] << 16) + ((uint64_t)yc[3] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		unsigned i;
		uint32_t *idst = (uint32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
			/* unsigned arithmetic: a sample >= 128 shifted by 24 does not
			   fit into a signed int */
#ifdef WORDS_BIGENDIAN
			*idst++ = ((uint32_t)uc[0] << 24) + ((uint32_t)yc[0] << 16) +
			          ((uint32_t)vc[0] <<  8) + ((uint32_t)yc[1] <<  0);
#else
			*idst++ =  (uint32_t)uc[0]        + ((uint32_t)yc[0] <<  8) +
			          ((uint32_t)vc[0] << 16) + ((uint32_t)yc[1] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* step to the next chroma line after every vertLumPerChroma-th luma line */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1701 | |
1702 /** | |
1703 * | |
1704 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1705 * problem for anyone then tell me, and ill fix it) | |
1706 */ | |
1707 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1708 unsigned int width, unsigned int height, | |
1709 int lumStride, int chromStride, int dstStride) | |
1710 { | |
1711 //FIXME interpolate chroma | |
1712 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1713 } | |
1714 | |
5588 | 1715 /** |
1716 * | |
1717 * width should be a multiple of 16 | |
1718 */ | |
1719 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1720 unsigned int width, unsigned int height, | |
9392 | 1721 int lumStride, int chromStride, int dstStride) |
5588 | 1722 { |
1723 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1724 } | |
1725 | |
1726 /** | |
1727 * | |
1728 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1729 * problem for anyone then tell me, and ill fix it) | |
1730 */ | |
3132 | 1731 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2725 | 1732 unsigned int width, unsigned int height, |
9392 | 1733 int lumStride, int chromStride, int srcStride) |
2701 | 1734 { |
6492 | 1735 unsigned y; |
1736 const unsigned chromWidth= width>>1; | |
2724 | 1737 for(y=0; y<height; y+=2) |
1738 { | |
2704 | 1739 #ifdef HAVE_MMX |
2724 | 1740 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1741 "xor %%"REG_a", %%"REG_a" \n\t" |
2724 | 1742 "pcmpeqw %%mm7, %%mm7 \n\t" |
1743 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1744 ".balign 16 \n\t" |
2724 | 1745 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1746 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1747 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1748 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
2724 | 1749 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) |
1750 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | |
1751 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | |
1752 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | |
1753 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
1754 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
1755 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
1756 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
1757 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1758 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t" |
2704 | 1759 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1760 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1761 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12) |
2724 | 1762 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) |
1763 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | |
1764 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | |
1765 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | |
1766 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
1767 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
1768 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
1769 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2704 | 1770 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1771 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t" |
2724 | 1772 |
1773 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
1774 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
1775 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
1776 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
1777 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
1778 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
1779 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
1780 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2704 | 1781 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1782 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1783 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" |
2724 | 1784 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1785 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1786 "cmp %4, %%"REG_a" \n\t" |
2724 | 1787 " jb 1b \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1788 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1789 : "memory", "%"REG_a |
2725 | 1790 ); |
2704 | 1791 |
2806 | 1792 ydst += lumStride; |
1793 src += srcStride; | |
1794 | |
2725 | 1795 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1796 "xor %%"REG_a", %%"REG_a" \n\t" |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1797 ".balign 16 \n\t" |
2724 | 1798 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1799 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1800 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1801 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1802 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1803 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12) |
2724 | 1804 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
1805 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
1806 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
1807 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
1808 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
1809 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2704 | 1810 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1811 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1812 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t" |
2724 | 1813 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1814 "add $8, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1815 "cmp %4, %%"REG_a" \n\t" |
2724 | 1816 " jb 1b \n\t" |
2704 | 1817 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1818 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1819 : "memory", "%"REG_a |
2724 | 1820 ); |
2704 | 1821 #else |
6492 | 1822 unsigned i; |
2724 | 1823 for(i=0; i<chromWidth; i++) |
1824 { | |
1825 ydst[2*i+0] = src[4*i+0]; | |
1826 udst[i] = src[4*i+1]; | |
1827 ydst[2*i+1] = src[4*i+2]; | |
1828 vdst[i] = src[4*i+3]; | |
1829 } | |
1830 ydst += lumStride; | |
1831 src += srcStride; | |
1832 | |
1833 for(i=0; i<chromWidth; i++) | |
1834 { | |
1835 ydst[2*i+0] = src[4*i+0]; | |
1836 ydst[2*i+1] = src[4*i+2]; | |
1837 } | |
1838 #endif | |
1839 udst += chromStride; | |
1840 vdst += chromStride; | |
1841 ydst += lumStride; | |
1842 src += srcStride; | |
2701 | 1843 } |
2724 | 1844 #ifdef HAVE_MMX |
2847 | 1845 asm volatile( EMMS" \n\t" |
1846 SFENCE" \n\t" | |
1847 :::"memory"); | |
2704 | 1848 #endif |
2723 | 1849 } |
2801 | 1850 |
6484
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
/**
 * Preliminary YVU9 -> YV12 converter: only the luma plane is copied.
 *
 * Copies width*height bytes from ysrc to ydst in one go. usrc, vsrc, udst
 * and vdst are not touched (chroma upscaling is not implemented yet).
 *
 * NOTE(review): lumStride and chromStride are ignored, so the copy is only
 * a correct plane copy when both luma planes are stored without padding
 * between lines - confirm against the callers before relying on it.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
	uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height, int lumStride, int chromStride)
{
	/* Y Plane */
	/* multiply in size_t so the byte count is not truncated to unsigned int */
	memcpy(ydst, ysrc, (size_t)width * height);

	/* XXX: implement upscaling for U,V */
}
c5cf988c6d6f
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
alex
parents:
6096
diff
changeset
|
1860 |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1861 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride) |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1862 { |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1863 int x,y; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1864 |
9256 | 1865 dst[0]= src[0]; |
1866 | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1867 // first line |
9256 | 1868 for(x=0; x<srcWidth-1; x++){ |
1869 dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1870 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1871 } |
9256 | 1872 dst[2*srcWidth-1]= src[srcWidth-1]; |
1873 | |
1874 dst+= dstStride; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1875 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1876 for(y=1; y<srcHeight; y++){ |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1877 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1878 const long mmxSize= srcWidth&~15; |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1879 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1880 "mov %4, %%"REG_a" \n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1881 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1882 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1883 "movq (%1, %%"REG_a"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1884 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1885 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1886 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1887 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" |
9256 | 1888 PAVGB" %%mm0, %%mm5 \n\t" |
1889 PAVGB" %%mm0, %%mm3 \n\t" | |
1890 PAVGB" %%mm0, %%mm5 \n\t" | |
1891 PAVGB" %%mm0, %%mm3 \n\t" | |
1892 PAVGB" %%mm1, %%mm4 \n\t" | |
1893 PAVGB" %%mm1, %%mm2 \n\t" | |
1894 PAVGB" %%mm1, %%mm4 \n\t" | |
1895 PAVGB" %%mm1, %%mm2 \n\t" | |
1896 "movq %%mm5, %%mm7 \n\t" | |
1897 "movq %%mm4, %%mm6 \n\t" | |
1898 "punpcklbw %%mm3, %%mm5 \n\t" | |
1899 "punpckhbw %%mm3, %%mm7 \n\t" | |
1900 "punpcklbw %%mm2, %%mm4 \n\t" | |
1901 "punpckhbw %%mm2, %%mm6 \n\t" | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1902 #if 1 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1903 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1904 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1905 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1906 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1907 #else |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1908 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1909 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1910 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1911 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1912 #endif |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1913 "add $8, %%"REG_a" \n\t" |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1914 " js 1b \n\t" |
9256 | 1915 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1916 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1917 "g" (-mmxSize) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
1918 : "%"REG_a |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1919 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1920 ); |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1921 #else |
9256 | 1922 const int mmxSize=1; |
1923 #endif | |
1924 dst[0 ]= (3*src[0] + src[srcStride])>>2; | |
1925 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1926 |
9256 | 1927 for(x=mmxSize-1; x<srcWidth-1; x++){ |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1928 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1929 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1930 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1931 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1932 } |
9256 | 1933 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; |
1934 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1935 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1936 dst+=dstStride*2; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1937 src+=srcStride; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1938 } |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1939 |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1940 // last line |
9256 | 1941 #if 1 |
1942 dst[0]= src[0]; | |
1943 | |
1944 for(x=0; x<srcWidth-1; x++){ | |
1945 dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1946 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1947 } | |
1948 dst[2*srcWidth-1]= src[srcWidth-1]; | |
1949 #else | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1950 for(x=0; x<srcWidth; x++){ |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1951 dst[2*x+0]= |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1952 dst[2*x+1]= src[x]; |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1953 } |
9256 | 1954 #endif |
1955 | |
6582
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1956 #ifdef HAVE_MMX |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1957 asm volatile( EMMS" \n\t" |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1958 SFENCE" \n\t" |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1959 :::"memory"); |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1960 #endif |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1961 } |
f98313dcd428
yvu9 -> yv12 unscaled converter with linear chroma scaling
michael
parents:
6492
diff
changeset
|
1962 |
2801 | 1963 /** |
1964 * | |
1965 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1966 * problem for anyone then tell me, and ill fix it) | |
3132 | 1967 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version |
2801 | 1968 */ |
3132 | 1969 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2801 | 1970 unsigned int width, unsigned int height, |
9392 | 1971 int lumStride, int chromStride, int srcStride) |
2801 | 1972 { |
6492 | 1973 unsigned y; |
1974 const unsigned chromWidth= width>>1; | |
2801 | 1975 for(y=0; y<height; y+=2) |
1976 { | |
2847 | 1977 #ifdef HAVE_MMX |
1978 asm volatile( | |
1979 "xorl %%eax, %%eax \n\t" | |
1980 "pcmpeqw %%mm7, %%mm7 \n\t" | |
1981 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
1982 ".balign 16 \n\t" | |
1983 "1: \n\t" | |
1984 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
1985 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | |
1986 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | |
1987 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | |
1988 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | |
1989 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | |
1990 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | |
1991 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
1992 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
1993 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
1994 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
1995 | |
1996 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | |
1997 | |
1998 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) | |
1999 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) | |
2000 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | |
2001 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | |
2002 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | |
2003 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | |
2004 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
2005 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
2006 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
2007 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2008 | |
2009 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | |
2010 | |
2011 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
2012 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
2013 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
2014 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
2015 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
2016 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
2017 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
2018 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2019 | |
2020 MOVNTQ" %%mm0, (%3, %%eax) \n\t" | |
2021 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | |
2022 | |
2023 "addl $8, %%eax \n\t" | |
2024 "cmpl %4, %%eax \n\t" | |
2025 " jb 1b \n\t" | |
9394 | 2026 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2847 | 2027 : "memory", "%eax" |
2028 ); | |
2029 | |
2030 ydst += lumStride; | |
2031 src += srcStride; | |
2032 | |
2033 asm volatile( | |
2034 "xorl %%eax, %%eax \n\t" | |
2035 ".balign 16 \n\t" | |
2036 "1: \n\t" | |
2037 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
2038 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | |
2039 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | |
2040 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | |
2041 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | |
2042 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
2043 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
2044 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
2045 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
2046 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
2047 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2048 | |
2049 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | |
2050 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | |
2051 | |
2052 "addl $8, %%eax \n\t" | |
2053 "cmpl %4, %%eax \n\t" | |
2054 " jb 1b \n\t" | |
2055 | |
9394 | 2056 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
2847 | 2057 : "memory", "%eax" |
2058 ); | |
2059 #else | |
6492 | 2060 unsigned i; |
2801 | 2061 for(i=0; i<chromWidth; i++) |
2062 { | |
2063 udst[i] = src[4*i+0]; | |
2064 ydst[2*i+0] = src[4*i+1]; | |
2065 vdst[i] = src[4*i+2]; | |
2066 ydst[2*i+1] = src[4*i+3]; | |
2067 } | |
2068 ydst += lumStride; | |
2069 src += srcStride; | |
2070 | |
2071 for(i=0; i<chromWidth; i++) | |
2072 { | |
2073 ydst[2*i+0] = src[4*i+1]; | |
2074 ydst[2*i+1] = src[4*i+3]; | |
2075 } | |
2847 | 2076 #endif |
2801 | 2077 udst += chromStride; |
2078 vdst += chromStride; | |
2079 ydst += lumStride; | |
2080 src += srcStride; | |
2081 } | |
2847 | 2082 #ifdef HAVE_MMX |
2083 asm volatile( EMMS" \n\t" | |
2084 SFENCE" \n\t" | |
2085 :::"memory"); | |
2086 #endif | |
2801 | 2087 } |
2088 | |
3132 | 2089 /** |
2090 * | |
2091 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
2092 * problem for anyone then tell me, and ill fix it) | |
4622 | 2093 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version |
3132 | 2094 */ |
2095 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
2096 unsigned int width, unsigned int height, | |
9392 | 2097 int lumStride, int chromStride, int srcStride) |
3132 | 2098 { |
6492 | 2099 unsigned y; |
2100 const unsigned chromWidth= width>>1; | |
4622 | 2101 #ifdef HAVE_MMX |
2102 for(y=0; y<height-2; y+=2) | |
2103 { | |
6492 | 2104 unsigned i; |
4622 | 2105 for(i=0; i<2; i++) |
2106 { | |
2107 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2108 "mov %2, %%"REG_a" \n\t" |
4923 | 2109 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
2110 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4622 | 2111 "pxor %%mm7, %%mm7 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2112 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
4622 | 2113 ".balign 16 \n\t" |
2114 "1: \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2115 PREFETCH" 64(%0, %%"REG_b") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2116 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2117 "movd 3(%0, %%"REG_b"), %%mm1 \n\t" |
4622 | 2118 "punpcklbw %%mm7, %%mm0 \n\t" |
2119 "punpcklbw %%mm7, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2120 "movd 6(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2121 "movd 9(%0, %%"REG_b"), %%mm3 \n\t" |
4622 | 2122 "punpcklbw %%mm7, %%mm2 \n\t" |
2123 "punpcklbw %%mm7, %%mm3 \n\t" | |
2124 "pmaddwd %%mm6, %%mm0 \n\t" | |
2125 "pmaddwd %%mm6, %%mm1 \n\t" | |
2126 "pmaddwd %%mm6, %%mm2 \n\t" | |
2127 "pmaddwd %%mm6, %%mm3 \n\t" | |
2128 #ifndef FAST_BGR2YV12 | |
2129 "psrad $8, %%mm0 \n\t" | |
2130 "psrad $8, %%mm1 \n\t" | |
2131 "psrad $8, %%mm2 \n\t" | |
2132 "psrad $8, %%mm3 \n\t" | |
2133 #endif | |
2134 "packssdw %%mm1, %%mm0 \n\t" | |
2135 "packssdw %%mm3, %%mm2 \n\t" | |
2136 "pmaddwd %%mm5, %%mm0 \n\t" | |
2137 "pmaddwd %%mm5, %%mm2 \n\t" | |
2138 "packssdw %%mm2, %%mm0 \n\t" | |
2139 "psraw $7, %%mm0 \n\t" | |
2140 | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2141 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2142 "movd 15(%0, %%"REG_b"), %%mm1 \n\t" |
4622 | 2143 "punpcklbw %%mm7, %%mm4 \n\t" |
2144 "punpcklbw %%mm7, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2145 "movd 18(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2146 "movd 21(%0, %%"REG_b"), %%mm3 \n\t" |
4622 | 2147 "punpcklbw %%mm7, %%mm2 \n\t" |
2148 "punpcklbw %%mm7, %%mm3 \n\t" | |
2149 "pmaddwd %%mm6, %%mm4 \n\t" | |
2150 "pmaddwd %%mm6, %%mm1 \n\t" | |
2151 "pmaddwd %%mm6, %%mm2 \n\t" | |
2152 "pmaddwd %%mm6, %%mm3 \n\t" | |
2153 #ifndef FAST_BGR2YV12 | |
2154 "psrad $8, %%mm4 \n\t" | |
2155 "psrad $8, %%mm1 \n\t" | |
2156 "psrad $8, %%mm2 \n\t" | |
2157 "psrad $8, %%mm3 \n\t" | |
2158 #endif | |
2159 "packssdw %%mm1, %%mm4 \n\t" | |
2160 "packssdw %%mm3, %%mm2 \n\t" | |
2161 "pmaddwd %%mm5, %%mm4 \n\t" | |
2162 "pmaddwd %%mm5, %%mm2 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2163 "add $24, %%"REG_b" \n\t" |
4622 | 2164 "packssdw %%mm2, %%mm4 \n\t" |
2165 "psraw $7, %%mm4 \n\t" | |
2166 | |
2167 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 2168 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
4622 | 2169 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2170 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2171 "add $8, %%"REG_a" \n\t" |
4622 | 2172 " js 1b \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2173 : : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2174 : "%"REG_a, "%"REG_b |
4622 | 2175 ); |
2176 ydst += lumStride; | |
2177 src += srcStride; | |
2178 } | |
2179 src -= srcStride*2; | |
2180 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2181 "mov %4, %%"REG_a" \n\t" |
4923 | 2182 "movq "MANGLE(w1111)", %%mm5 \n\t" |
2183 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4622 | 2184 "pxor %%mm7, %%mm7 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2185 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2186 "add %%"REG_b", %%"REG_b" \n\t" |
4622 | 2187 ".balign 16 \n\t" |
2188 "1: \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2189 PREFETCH" 64(%0, %%"REG_b") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2190 PREFETCH" 64(%1, %%"REG_b") \n\t" |
4622 | 2191 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2192 "movq (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2193 "movq (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2194 "movq 6(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2195 "movq 6(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2196 PAVGB" %%mm1, %%mm0 \n\t" |
2197 PAVGB" %%mm3, %%mm2 \n\t" | |
2198 "movq %%mm0, %%mm1 \n\t" | |
2199 "movq %%mm2, %%mm3 \n\t" | |
2200 "psrlq $24, %%mm0 \n\t" | |
2201 "psrlq $24, %%mm2 \n\t" | |
2202 PAVGB" %%mm1, %%mm0 \n\t" | |
2203 PAVGB" %%mm3, %%mm2 \n\t" | |
2204 "punpcklbw %%mm7, %%mm0 \n\t" | |
2205 "punpcklbw %%mm7, %%mm2 \n\t" | |
2206 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2207 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2208 "movd (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2209 "movd 3(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2210 "movd 3(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2211 "punpcklbw %%mm7, %%mm0 \n\t" |
2212 "punpcklbw %%mm7, %%mm1 \n\t" | |
2213 "punpcklbw %%mm7, %%mm2 \n\t" | |
2214 "punpcklbw %%mm7, %%mm3 \n\t" | |
2215 "paddw %%mm1, %%mm0 \n\t" | |
2216 "paddw %%mm3, %%mm2 \n\t" | |
2217 "paddw %%mm2, %%mm0 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2218 "movd 6(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2219 "movd 6(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2220 "movd 9(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2221 "movd 9(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2222 "punpcklbw %%mm7, %%mm4 \n\t" |
2223 "punpcklbw %%mm7, %%mm1 \n\t" | |
2224 "punpcklbw %%mm7, %%mm2 \n\t" | |
2225 "punpcklbw %%mm7, %%mm3 \n\t" | |
2226 "paddw %%mm1, %%mm4 \n\t" | |
2227 "paddw %%mm3, %%mm2 \n\t" | |
2228 "paddw %%mm4, %%mm2 \n\t" | |
2229 "psrlw $2, %%mm0 \n\t" | |
2230 "psrlw $2, %%mm2 \n\t" | |
2231 #endif | |
4923 | 2232 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
2233 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 2234 |
2235 "pmaddwd %%mm0, %%mm1 \n\t" | |
2236 "pmaddwd %%mm2, %%mm3 \n\t" | |
2237 "pmaddwd %%mm6, %%mm0 \n\t" | |
2238 "pmaddwd %%mm6, %%mm2 \n\t" | |
2239 #ifndef FAST_BGR2YV12 | |
2240 "psrad $8, %%mm0 \n\t" | |
2241 "psrad $8, %%mm1 \n\t" | |
2242 "psrad $8, %%mm2 \n\t" | |
2243 "psrad $8, %%mm3 \n\t" | |
2244 #endif | |
2245 "packssdw %%mm2, %%mm0 \n\t" | |
2246 "packssdw %%mm3, %%mm1 \n\t" | |
2247 "pmaddwd %%mm5, %%mm0 \n\t" | |
2248 "pmaddwd %%mm5, %%mm1 \n\t" | |
2249 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
2250 "psraw $7, %%mm0 \n\t" | |
2251 | |
2252 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2253 "movq 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2254 "movq 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2255 "movq 18(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2256 "movq 18(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2257 PAVGB" %%mm1, %%mm4 \n\t" |
2258 PAVGB" %%mm3, %%mm2 \n\t" | |
2259 "movq %%mm4, %%mm1 \n\t" | |
2260 "movq %%mm2, %%mm3 \n\t" | |
2261 "psrlq $24, %%mm4 \n\t" | |
2262 "psrlq $24, %%mm2 \n\t" | |
2263 PAVGB" %%mm1, %%mm4 \n\t" | |
2264 PAVGB" %%mm3, %%mm2 \n\t" | |
2265 "punpcklbw %%mm7, %%mm4 \n\t" | |
2266 "punpcklbw %%mm7, %%mm2 \n\t" | |
2267 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2268 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2269 "movd 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2270 "movd 15(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2271 "movd 15(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2272 "punpcklbw %%mm7, %%mm4 \n\t" |
2273 "punpcklbw %%mm7, %%mm1 \n\t" | |
2274 "punpcklbw %%mm7, %%mm2 \n\t" | |
2275 "punpcklbw %%mm7, %%mm3 \n\t" | |
2276 "paddw %%mm1, %%mm4 \n\t" | |
2277 "paddw %%mm3, %%mm2 \n\t" | |
2278 "paddw %%mm2, %%mm4 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2279 "movd 18(%0, %%"REG_b"), %%mm5 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2280 "movd 18(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2281 "movd 21(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2282 "movd 21(%1, %%"REG_b"), %%mm3 \n\t" |
4622 | 2283 "punpcklbw %%mm7, %%mm5 \n\t" |
2284 "punpcklbw %%mm7, %%mm1 \n\t" | |
2285 "punpcklbw %%mm7, %%mm2 \n\t" | |
2286 "punpcklbw %%mm7, %%mm3 \n\t" | |
2287 "paddw %%mm1, %%mm5 \n\t" | |
2288 "paddw %%mm3, %%mm2 \n\t" | |
2289 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 2290 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4622 | 2291 "psrlw $2, %%mm4 \n\t" |
2292 "psrlw $2, %%mm2 \n\t" | |
2293 #endif | |
4923 | 2294 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
2295 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4622 | 2296 |
2297 "pmaddwd %%mm4, %%mm1 \n\t" | |
2298 "pmaddwd %%mm2, %%mm3 \n\t" | |
2299 "pmaddwd %%mm6, %%mm4 \n\t" | |
2300 "pmaddwd %%mm6, %%mm2 \n\t" | |
2301 #ifndef FAST_BGR2YV12 | |
2302 "psrad $8, %%mm4 \n\t" | |
2303 "psrad $8, %%mm1 \n\t" | |
2304 "psrad $8, %%mm2 \n\t" | |
2305 "psrad $8, %%mm3 \n\t" | |
2306 #endif | |
2307 "packssdw %%mm2, %%mm4 \n\t" | |
2308 "packssdw %%mm3, %%mm1 \n\t" | |
2309 "pmaddwd %%mm5, %%mm4 \n\t" | |
2310 "pmaddwd %%mm5, %%mm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2311 "add $24, %%"REG_b" \n\t" |
4622 | 2312 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2313 "psraw $7, %%mm4 \n\t" | |
2314 | |
2315 "movq %%mm0, %%mm1 \n\t" | |
2316 "punpckldq %%mm4, %%mm0 \n\t" | |
2317 "punpckhdq %%mm4, %%mm1 \n\t" | |
2318 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 2319 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2320 "movd %%mm0, (%2, %%"REG_a") \n\t" |
4622 | 2321 "punpckhdq %%mm0, %%mm0 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2322 "movd %%mm0, (%3, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2323 "add $4, %%"REG_a" \n\t" |
4622 | 2324 " js 1b \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2325 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2326 : "%"REG_a, "%"REG_b |
4622 | 2327 ); |
2328 | |
2329 udst += chromStride; | |
2330 vdst += chromStride; | |
2331 src += srcStride*2; | |
2332 } | |
2333 | |
2334 asm volatile( EMMS" \n\t" | |
2335 SFENCE" \n\t" | |
2336 :::"memory"); | |
2337 #else | |
2338 y=0; | |
2339 #endif | |
2340 for(; y<height; y+=2) | |
3132 | 2341 { |
6492 | 2342 unsigned i; |
3132 | 2343 for(i=0; i<chromWidth; i++) |
2344 { | |
2345 unsigned int b= src[6*i+0]; | |
2346 unsigned int g= src[6*i+1]; | |
2347 unsigned int r= src[6*i+2]; | |
2801 | 2348 |
3633 | 2349 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
2350 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
2351 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
3132 | 2352 |
2353 udst[i] = U; | |
2354 vdst[i] = V; | |
2355 ydst[2*i] = Y; | |
2356 | |
2357 b= src[6*i+3]; | |
2358 g= src[6*i+4]; | |
2359 r= src[6*i+5]; | |
2360 | |
3633 | 2361 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2362 ydst[2*i+1] = Y; |
2363 } | |
2364 ydst += lumStride; | |
2365 src += srcStride; | |
2366 | |
2367 for(i=0; i<chromWidth; i++) | |
2368 { | |
2369 unsigned int b= src[6*i+0]; | |
2370 unsigned int g= src[6*i+1]; | |
2371 unsigned int r= src[6*i+2]; | |
2372 | |
3633 | 2373 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2374 |
2375 ydst[2*i] = Y; | |
2376 | |
2377 b= src[6*i+3]; | |
2378 g= src[6*i+4]; | |
2379 r= src[6*i+5]; | |
2380 | |
3633 | 2381 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; |
3132 | 2382 ydst[2*i+1] = Y; |
2383 } | |
2384 udst += chromStride; | |
2385 vdst += chromStride; | |
2386 ydst += lumStride; | |
2387 src += srcStride; | |
2388 } | |
2389 } | |
5337 | 2390 |
2391 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, | |
9392 | 2392 unsigned width, unsigned height, int src1Stride, |
2393 int src2Stride, int dstStride){ | |
6492 | 2394 unsigned h; |
5337 | 2395 |
2396 for(h=0; h < height; h++) | |
2397 { | |
6492 | 2398 unsigned w; |
5337 | 2399 |
2400 #ifdef HAVE_MMX | |
2401 #ifdef HAVE_SSE2 | |
2402 asm( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2403 "xor %%"REG_a", %%"REG_a" \n\t" |
5337 | 2404 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2405 PREFETCH" 64(%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2406 PREFETCH" 64(%2, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2407 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2408 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2409 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" |
5337 | 2410 "punpcklbw %%xmm2, %%xmm0 \n\t" |
2411 "punpckhbw %%xmm2, %%xmm1 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2412 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2413 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2414 "add $16, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2415 "cmp %3, %%"REG_a" \n\t" |
5337 | 2416 " jb 1b \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2417 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2418 : "memory", "%"REG_a"" |
5337 | 2419 ); |
2420 #else | |
2421 asm( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2422 "xor %%"REG_a", %%"REG_a" \n\t" |
5337 | 2423 "1: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2424 PREFETCH" 64(%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2425 PREFETCH" 64(%2, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2426 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2427 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" |
5337 | 2428 "movq %%mm0, %%mm1 \n\t" |
2429 "movq %%mm2, %%mm3 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2430 "movq (%2, %%"REG_a"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2431 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" |
5337 | 2432 "punpcklbw %%mm4, %%mm0 \n\t" |
2433 "punpckhbw %%mm4, %%mm1 \n\t" | |
2434 "punpcklbw %%mm5, %%mm2 \n\t" | |
2435 "punpckhbw %%mm5, %%mm3 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2436 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2437 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2438 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2439 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2440 "add $16, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2441 "cmp %3, %%"REG_a" \n\t" |
5337 | 2442 " jb 1b \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2443 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2444 : "memory", "%"REG_a |
5337 | 2445 ); |
2446 #endif | |
2447 for(w= (width&(~15)); w < width; w++) | |
2448 { | |
2449 dest[2*w+0] = src1[w]; | |
2450 dest[2*w+1] = src2[w]; | |
2451 } | |
2452 #else | |
2453 for(w=0; w < width; w++) | |
2454 { | |
2455 dest[2*w+0] = src1[w]; | |
2456 dest[2*w+1] = src2[w]; | |
2457 } | |
2458 #endif | |
2459 dest += dstStride; | |
2460 src1 += src1Stride; | |
2461 src2 += src2Stride; | |
2462 } | |
2463 #ifdef HAVE_MMX | |
2464 asm( | |
2465 EMMS" \n\t" | |
2466 SFENCE" \n\t" | |
2467 ::: "memory" | |
2468 ); | |
2469 #endif | |
2470 } | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2471 |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2472 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2473 uint8_t *dst1, uint8_t *dst2, |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2474 unsigned width, unsigned height, |
9392 | 2475 int srcStride1, int srcStride2, |
2476 int dstStride1, int dstStride2) | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2477 { |
9392 | 2478 unsigned int y,x,h; |
2479 int w; | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2480 w=width/2; h=height/2; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2481 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2482 asm volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2483 PREFETCH" %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2484 PREFETCH" %1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2485 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2486 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2487 for(y=0;y<h;y++){ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2488 const uint8_t* s1=src1+srcStride1*(y>>1); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2489 uint8_t* d=dst1+dstStride1*y; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2490 x=0; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2491 #ifdef HAVE_MMX |
9392 | 2492 for(;x<w-31;x+=32) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2493 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2494 asm volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2495 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2496 "movq %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2497 "movq 8%1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2498 "movq 16%1, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2499 "movq 24%1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2500 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2501 "movq %%mm2, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2502 "movq %%mm4, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2503 "movq %%mm6, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2504 "punpcklbw %%mm0, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2505 "punpckhbw %%mm1, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2506 "punpcklbw %%mm2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2507 "punpckhbw %%mm3, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2508 "punpcklbw %%mm4, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2509 "punpckhbw %%mm5, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2510 "punpcklbw %%mm6, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2511 "punpckhbw %%mm7, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2512 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2513 MOVNTQ" %%mm1, 8%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2514 MOVNTQ" %%mm2, 16%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2515 MOVNTQ" %%mm3, 24%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2516 MOVNTQ" %%mm4, 32%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2517 MOVNTQ" %%mm5, 40%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2518 MOVNTQ" %%mm6, 48%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2519 MOVNTQ" %%mm7, 56%0" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2520 :"=m"(d[2*x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2521 :"m"(s1[x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2522 :"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2523 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2524 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2525 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2526 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2527 for(y=0;y<h;y++){ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2528 const uint8_t* s2=src2+srcStride2*(y>>1); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2529 uint8_t* d=dst2+dstStride2*y; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2530 x=0; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2531 #ifdef HAVE_MMX |
9392 | 2532 for(;x<w-31;x+=32) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2533 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2534 asm volatile( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2535 PREFETCH" 32%1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2536 "movq %1, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2537 "movq 8%1, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2538 "movq 16%1, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2539 "movq 24%1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2540 "movq %%mm0, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2541 "movq %%mm2, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2542 "movq %%mm4, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2543 "movq %%mm6, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2544 "punpcklbw %%mm0, %%mm0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2545 "punpckhbw %%mm1, %%mm1\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2546 "punpcklbw %%mm2, %%mm2\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2547 "punpckhbw %%mm3, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2548 "punpcklbw %%mm4, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2549 "punpckhbw %%mm5, %%mm5\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2550 "punpcklbw %%mm6, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2551 "punpckhbw %%mm7, %%mm7\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2552 MOVNTQ" %%mm0, %0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2553 MOVNTQ" %%mm1, 8%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2554 MOVNTQ" %%mm2, 16%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2555 MOVNTQ" %%mm3, 24%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2556 MOVNTQ" %%mm4, 32%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2557 MOVNTQ" %%mm5, 40%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2558 MOVNTQ" %%mm6, 48%0\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2559 MOVNTQ" %%mm7, 56%0" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2560 :"=m"(d[2*x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2561 :"m"(s2[x]) |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2562 :"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2563 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2564 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2565 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2566 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2567 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2568 asm( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2569 EMMS" \n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2570 SFENCE" \n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2571 ::: "memory" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2572 ); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2573 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2574 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2575 |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2576 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2577 uint8_t *dst, |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2578 unsigned width, unsigned height, |
9392 | 2579 int srcStride1, int srcStride2, |
2580 int srcStride3, int dstStride) | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2581 { |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
13423
diff
changeset
|
2582 unsigned long y,x,w,h; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2583 w=width/2; h=height; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2584 for(y=0;y<h;y++){ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2585 const uint8_t* yp=src1+srcStride1*y; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2586 const uint8_t* up=src2+srcStride2*(y>>2); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2587 const uint8_t* vp=src3+srcStride3*(y>>2); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2588 uint8_t* d=dst+dstStride*y; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2589 x=0; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2590 #ifdef HAVE_MMX |
9394 | 2591 for(;x<w-7;x+=8) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2592 { |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2593 asm volatile( |
9394 | 2594 PREFETCH" 32(%1, %0)\n\t" |
2595 PREFETCH" 32(%2, %0)\n\t" | |
2596 PREFETCH" 32(%3, %0)\n\t" | |
2597 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | |
2598 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */ | |
2599 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */ | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2600 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2601 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2602 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2603 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2604 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2605 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2606 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2607 |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2608 "movq %%mm1, %%mm6\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2609 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2610 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2611 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ |
9394 | 2612 MOVNTQ" %%mm0, (%4, %0, 8)\n\t" |
2613 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t" | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2614 |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2615 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/ |
9394 | 2616 "movq 8(%1, %0, 4), %%mm0\n\t" |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2617 "movq %%mm0, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2618 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2619 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/ |
9394 | 2620 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t" |
2621 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t" | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2622 |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2623 "movq %%mm4, %%mm6\n\t" |
9394 | 2624 "movq 16(%1, %0, 4), %%mm0\n\t" |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2625 "movq %%mm0, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2626 "punpcklbw %%mm5, %%mm4\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2627 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2628 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/ |
9394 | 2629 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t" |
2630 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t" | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2631 |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2632 "punpckhbw %%mm5, %%mm6\n\t" |
9394 | 2633 "movq 24(%1, %0, 4), %%mm0\n\t" |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2634 "movq %%mm0, %%mm3\n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2635 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/ |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2636 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/ |
9394 | 2637 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t" |
2638 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t" | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2639 |
9394 | 2640 : "+r" (x) |
2641 : "r"(yp), "r" (up), "r"(vp), "r"(d) | |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2642 :"memory"); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2643 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2644 #endif |
9394 | 2645 for(; x<w; x++) |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2646 { |
9394 | 2647 const int x2= x<<2; |
6606
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2648 d[8*x+0]=yp[x2]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2649 d[8*x+1]=up[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2650 d[8*x+2]=yp[x2+1]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2651 d[8*x+3]=vp[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2652 d[8*x+4]=yp[x2+2]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2653 d[8*x+5]=up[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2654 d[8*x+6]=yp[x2+3]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2655 d[8*x+7]=vp[x]; |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2656 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2657 } |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2658 #ifdef HAVE_MMX |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2659 asm( |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2660 EMMS" \n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2661 SFENCE" \n\t" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2662 ::: "memory" |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2663 ); |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2664 #endif |
50b5d8367318
merging changes from mplayerxp (rgb2rgb*.{c,h} only)
michael
parents:
6605
diff
changeset
|
2665 } |