Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 2741:b8a692c59b64
MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
author | nick |
---|---|
date | Tue, 06 Nov 2001 16:35:17 +0000 |
parents | 1583214489a2 |
children | dece635a28e3 |
rev | line source |
---|---|
2694 | 1 /* |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
2 * |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
3 * rgb2rgb.c, Software RGB to RGB convertor |
2732 | 4 * pluralize by Software PAL8 to RGB convertor |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
7 * Written by Nick Kurshev. |
2702 | 8 * palette stuff & yuv stuff by Michael |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
9 */ |
2504 | 10 #include <inttypes.h> |
11 #include "../config.h" | |
12 #include "rgb2rgb.h" | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
13 #include "../mmx_defs.h" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
14 |
#ifdef HAVE_MMX
/*
 * 64-bit constants used as memory operands by the MMX code paths below.
 * All are 8-byte aligned so they can be loaded with movq / used with pand.
 */

/* rgb24to32: keeps the low three bytes of each 32-bit lane, clears the 4th */
static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
/* rgb32to24: select the first / second packed 24-bit pixel inside a quadword */
static const uint64_t mask24l  __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h  __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
/* rgb15to16: the bits of every 16-bit word that get added to the word itself */
static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;

/*
 * Per-component masks for the rgb32/24 -> 16/15 converters.  Those routines
 * keep one pixel in each 32-bit half of an MMX register, so both halves of a
 * mask must select the same bit field.
 */
static const uint64_t red_16mask   __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL;
static const uint64_t blue_16mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
/* 15-bit layout: red = bits 10-14, green = bits 5-9, blue = bits 0-4 */
static const uint64_t red_15mask   __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000003e0ULL;
static const uint64_t blue_15mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
#endif
2513 | 29 |
/**
 * Expand packed 24-bit pixels to 32-bit pixels.
 *
 * The three bytes of every source pixel are copied unchanged and followed by
 * one zero byte.
 *
 * @param src       source buffer, 3 bytes per pixel
 * @param dst       destination buffer, receives 4 bytes per source pixel
 * @param src_size  size of src in bytes; a trailing partial pixel is ignored
 */
void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint8_t *dest = dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    /*
     * One pass converts 8 pixels (24 bytes in, 32 bytes out).  The last load
     * ("punpckldq 21%1") fetches 4 bytes starting at offset 21, so 25 source
     * bytes must be readable.  The bound is taken from the remaining byte
     * count, not from the alignment of the end address, so the loop can never
     * run past either buffer.
     */
    while(end - s >= 25)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path: whole pixels only, so an odd trailing byte is never read */
    while(end - s >= 3)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
2505 | 81 |
/**
 * Pack 32-bit pixels into 24-bit pixels.
 *
 * The first three bytes of every 4-byte source pixel are copied, the fourth
 * byte is dropped.
 *
 * @param src       source buffer, 4 bytes per pixel
 * @param dst       destination buffer, receives 3 bytes per source pixel
 * @param src_size  size of src in bytes; a trailing partial pixel is ignored
 */
void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint8_t *dest = dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6"
        ::"m"(mask24l),"m"(mask24h):"memory");
    /*
     * One pass converts 4 pixels (16 bytes in, 12 bytes out), but the second
     * store is a full 8-byte store at offset 6 and therefore touches
     * dest[12] and dest[13] as well.  The loop only runs while at least one
     * more whole pixel (4 bytes) remains after the 16 it consumes, so those
     * two spill bytes are always rewritten by a later pixel and never land
     * beyond the end of dst.
     */
    while(end - s >= 20)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm6, %%mm2\n\t"
            "pand %%mm6, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 6%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 12;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path: whole pixels only */
    while(end - s >= 4)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
2506 | 133 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
/*
 Original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32bit c version, and and&add trick by Michael Niedermayer
*/
/**
 * Convert 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
 * (rrrrrggggggbbbbb).
 *
 * Red and green move up by one bit, blue stays in place, and the new lowest
 * green bit is 0.  This is done with the "and & add" trick: adding
 * (x & 0x7FE0) to x doubles the red/green field, i.e. shifts it left by one.
 * Bit 15 of the input is ignored.
 *
 * @param src       source buffer, 2 bytes per pixel (native byte order);
 *                  read through 32-bit accesses, so it is assumed to be
 *                  suitably aligned, as in the previous implementation
 * @param dst       destination buffer, same size and alignment as src
 * @param src_size  size of src in bytes; an odd trailing byte is ignored
 */
void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint8_t *d = dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile(
        "movq %0, %%mm4\n\t"
        ::"m"(mask15s));
    /* 8 pixels (16 bytes) per pass; whatever is left goes to the C code below */
    while(end - s >= 16)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            :"memory"   /* 16 bytes are written, not only *d */
            );
        d += 16;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /*
     * Two pixels per 32-bit word.  The arithmetic is unsigned: the sum can
     * reach 0xFFDFFFDF, which would overflow a signed int.  The low half can
     * reach at most 0xFFDF, so nothing carries into the upper pixel.
     */
    while(end - s >= 4)
    {
        const uint32_t x = *(const uint32_t *)s;
        *(uint32_t *)d = (x & 0x7FFF7FFFu) + (x & 0x7FE07FE0u);
        s += 4;
        d += 4;
    }
    /* odd pixel count: convert the last single pixel as well */
    if(end - s >= 2)
    {
        const unsigned x = *(const uint16_t *)s;
        *(uint16_t *)d = (uint16_t)((x & 0x7FFFu) + (x & 0x7FE0u));
    }
}
2694 | 199 |
/**
 * Expand 8-bit palettized pixels to 32-bit pixels.
 *
 * Palette is assumed to contain bgr32: 256 entries of 4 bytes each, read and
 * written as whole 32-bit words (palette and dst are accessed through
 * uint32_t pointers).
 *
 * @param src         one palette index per pixel
 * @param dst         destination buffer, receives 4 bytes per pixel
 * @param num_pixels  number of pixels to convert
 * @param palette     lookup table indexed by the source byte
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    /* uint32_t instead of plain unsigned: a palette entry is exactly 4 bytes */
    const uint32_t *entries = (const uint32_t *)palette;
    uint32_t *out = (uint32_t *)dst;
    unsigned i;
    for(i=0; i<num_pixels; i++)
        out[i] = entries[ src[i] ];
}
209 | |
/**
 * Expand 8-bit palettized pixels to packed 24-bit pixels.
 *
 * Palette is assumed to contain bgr32: each entry occupies 4 bytes, of which
 * the first three are copied to the output.
 *
 * @param src         one palette index per pixel
 * @param dst         destination buffer, receives 3 bytes per pixel
 * @param num_pixels  number of pixels to convert
 * @param palette     lookup table, 4 bytes per entry
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    const uint8_t *index = src;
    uint8_t *out = dst;
    unsigned remaining;

    /*
     * Copied byte by byte on purpose: storing a whole 32-bit palette entry
     * would write one byte too much and might cause alignment issues on some
     * architectures.
     */
    for(remaining = num_pixels; remaining != 0; remaining--)
    {
        const uint8_t *entry = palette + *index * 4;
        out[0] = entry[0];
        out[1] = entry[1];
        out[2] = entry[2];
        out += 3;
        index++;
    }
}
230 | |
/**
 * Convert 32-bit pixels to 16-bit (5-6-5) pixels.
 *
 * Source bytes are taken as blue, green, red, unused.  The result is
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8), stored as a native uint16_t.
 *
 * @param src       source buffer, 4 bytes per pixel
 * @param dst       destination buffer, receives 2 bytes per pixel
 * @param src_size  size of src in bytes; a trailing partial pixel is ignored
 */
void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /*
     * 4 pixels (16 bytes in, 8 bytes out) per pass.  The bound comes from the
     * number of bytes left, so the loop never reads past src or writes more
     * pixels than the input contains.
     */
    while(end - s >= 16)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path, also handles the pixels left over by the MMX loop */
    while(end - s >= 4)
    {
        const int b= s[0];
        const int g= s[1];
        const int r= s[2];

        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s += 4; /* step over the unused 4th byte too */
    }
}
301 | |
/**
 * Convert 32-bit pixels to 15-bit (5-5-5) pixels.
 *
 * Source bytes are taken as blue, green, red, unused.  The result is
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7), stored as a native uint16_t.
 *
 * @param src       source buffer, 4 bytes per pixel
 * @param dst       destination buffer, receives 2 bytes per pixel
 * @param src_size  size of src in bytes; a trailing partial pixel is ignored
 */
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /*
     * 4 pixels (16 bytes in, 8 bytes out) per pass.  The bound comes from the
     * number of bytes left, so the loop never reads past src or writes more
     * pixels than the input contains.
     */
    while(end - s >= 16)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path, also handles the pixels left over by the MMX loop */
    while(end - s >= 4)
    {
        const int b= s[0];
        const int g= s[1];
        const int r= s[2];

        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s += 4; /* step over the unused 4th byte too */
    }
}
372 | |
/**
 * Convert packed 24-bit pixels to 16-bit (5-6-5) pixels.
 *
 * Source bytes are taken as blue, green, red.  The result is
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8), stored as a native uint16_t.
 *
 * @param src       source buffer, 3 bytes per pixel
 * @param dst       destination buffer, receives 2 bytes per pixel
 * @param src_size  size of src in bytes; a trailing partial pixel is ignored
 */
void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /*
     * 4 pixels (12 bytes in, 8 bytes out) per pass.  The last load
     * ("punpckldq 9%1") fetches 4 bytes starting at offset 9, so 13 source
     * bytes must be readable; the remaining pixels go to the C loop below.
     */
    while(end - s >= 13)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path, also handles the pixels left over by the MMX loop */
    while(end - s >= 3)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;

        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
444 | |
/**
 * Convert packed 24-bit pixels to 15-bit (5-5-5) pixels.
 *
 * Source bytes are taken as blue, green, red.  The result is
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7), stored as a native uint16_t.
 *
 * @param src       source buffer, 3 bytes per pixel
 * @param dst       destination buffer, receives 2 bytes per pixel
 * @param src_size  size of src in bytes; a trailing partial pixel is ignored
 */
void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = src + src_size;
    uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /*
     * 4 pixels (12 bytes in, 8 bytes out) per pass.  The last load
     * ("punpckldq 9%1") fetches 4 bytes starting at offset 9, so 13 source
     * bytes must be readable; the remaining pixels go to the C loop below.
     */
    while(end - s >= 13)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar path, also handles the pixels left over by the MMX loop */
    while(end - s >= 3)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;

        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
2694 | 516 |
/**
 * Expand 8-bit palettized pixels to 16-bit pixels.
 *
 * Palette is assumed to contain bgr16 (one uint16_t per entry); see
 * rgb32to16 to convert the palette.
 *
 * @param src         one palette index per pixel
 * @param dst         destination buffer, receives 2 bytes per pixel
 * @param num_pixels  number of pixels to convert
 * @param palette     lookup table indexed by the source byte
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    const uint16_t *lut = (const uint16_t *)palette;
    const uint8_t *index = src;
    uint16_t *out = (uint16_t *)dst;
    unsigned left = num_pixels;

    while(left-- != 0)
        *out++ = lut[*index++];
}
526 | |
/**
 * Expand 8-bit palettized pixels to 15-bit pixels.
 *
 * Palette is assumed to contain bgr15 (one uint16_t per entry); see
 * rgb32to15 to convert the palette.
 *
 * @param src         one palette index per pixel
 * @param dst         destination buffer, receives 2 bytes per pixel
 * @param num_pixels  number of pixels to convert
 * @param palette     lookup table indexed by the source byte
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    const uint16_t *lut = (const uint16_t *)palette;
    const uint8_t *index = src;
    uint16_t *out = (uint16_t *)dst;
    unsigned left = num_pixels;

    while(left-- != 0)
        *out++ = lut[*index++];
}
/**
 * Interleave separate Y, U and V planes into packed Y U Y V bytes.
 *
 * Every output line takes one luma line; a chroma line is shared by two
 * consecutive output lines (the chroma pointers advance after every odd
 * line).
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * @param ysrc         luma plane
 * @param usrc         first chroma plane (written to byte 1 of each group)
 * @param vsrc         second chroma plane (written to byte 3 of each group)
 * @param dst          packed output, 2 bytes per pixel
 * @param width        picture width in pixels
 * @param height       picture height in lines
 * @param lumStride    bytes between luma lines
 * @param chromStride  bytes between chroma lines
 * @param dstStride    bytes between output lines
 */
void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    unsigned int y;
    const unsigned int chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        /*
         * 8 chroma samples (16 pixels) per pass.  The loop is do-while
         * shaped, hence the width requirement above.  The "memory" clobber
         * is required because the stores go through plain "r" operands.
         * NOTE(review): uses %eax as index with 32-bit addl/cmpl -- looks
         * 32-bit-x86 only, confirm before building for x86-64.
         */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        unsigned int i;
        for(i=0; i<chromWidth; i++)
        {
            dst[4*i+0] = ysrc[2*i+0];
            dst[4*i+1] = usrc[i];
            dst[4*i+2] = ysrc[2*i+1];
            dst[4*i+3] = vsrc[i];
        }
#endif
        /* one chroma line serves two luma lines */
        if(y&1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
607 | |
/**
 * Split packed Y U Y V bytes into separate Y, U and V planes.
 *
 * Lines are handled in pairs: the first line of a pair supplies luma and
 * both chroma samples, the second line supplies luma only (its chroma bytes
 * are discarded).
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * @param src          packed input, 2 bytes per pixel
 * @param ydst         luma output plane
 * @param udst         output plane for byte 1 of each 4-byte group
 * @param vdst         output plane for byte 3 of each 4-byte group
 * @param width        picture width in pixels
 * @param height       picture height in lines
 * @param lumStride    bytes between luma lines
 * @param chromStride  bytes between chroma lines
 * @param srcStride    bytes between input lines
 */
void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned int y;
#ifndef HAVE_MMX
    unsigned int i;
#endif
    const unsigned int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
        /* ---- first line of the pair: luma + chroma ---- */
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        }
#endif
        /*
         * Step to the second line of the pair.  This is done for BOTH code
         * paths; together with the advance at the bottom of the loop every
         * iteration moves src and ydst by two lines, matching y+=2.
         */
        ydst += lumStride;
        src += srcStride;

        /* ---- second line of the pair: luma only ---- */
#ifdef HAVE_MMX
        /* relies on %mm7 still holding the FF,00,... mask set up above */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"

            /* udst/vdst are unused here; kept so that %4 stays chromWidth */
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}