Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 3069:e6bbc5e6a054
wrote a very annoying bug
author | alex |
---|---|
date | Thu, 22 Nov 2001 15:10:38 +0000 |
parents | 1d92268eb8fc |
children | ab67556586fa |
rev | line source |
---|---|
2694 | 1 /* |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
2 * |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
3 * rgb2rgb.c, Software RGB to RGB convertor |
2732 | 4 * pluralize by Software PAL8 to RGB convertor |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
7 * Written by Nick Kurshev. |
2702 | 8 * palette stuff & yuv stuff by Michael |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
9 */ |
2504 | 10 #include <inttypes.h> |
11 #include "../config.h" | |
12 #include "rgb2rgb.h" | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
13 #include "../mmx_defs.h" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
14 |
#ifdef HAVE_MMX
/*
 * 64-bit constants used as memory/register operands by the MMX code paths
 * in this file. All of them are declared 8-byte aligned.
 */

/* Per-channel and whole-pixel masks for two 32-bit pixels per quadword. */
static const uint64_t mask32b    __attribute__((aligned(8))) = 0x000000FF000000FFULL;
static const uint64_t mask32g    __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
static const uint64_t mask32r    __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
static const uint64_t mask32     __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;

/* Byte-range masks used by rgb32to24 while packing 3-byte pixels. */
static const uint64_t mask24l    __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h    __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh   __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh  __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;

/* Masks for four 15-bit pixels per quadword. */
static const uint64_t mask15b    __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
static const uint64_t mask15rg   __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
static const uint64_t mask15s    __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;

/* Channel masks for one 16-bit (5-6-5) pixel in each 32-bit half. */
static const uint64_t red_16mask   __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL;
static const uint64_t blue_16mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;

/*
 * Channel masks for one 15-bit (5-5-5) pixel in each 32-bit half.
 * The rgbXXto15 MMX loops keep one pixel per 32-bit half, so both halves
 * must carry the same 5-5-5 mask; the low half previously held the 5-6-5
 * values (0xf800 / 0x07e0), which corrupted every even pixel.
 */
static const uint64_t red_15mask   __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000003e0ULL;
static const uint64_t blue_15mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;

#if 0
/* Dither tables, currently compiled out. */
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
	0x0103010301030103LL,
	0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
	0x0602060206020602LL,
	0x0004000400040004LL,};
#endif
#endif
2513 | 49 |
/**
 * Expands packed 3-byte pixels to 4-byte pixels.
 *
 * The three bytes of every source pixel are copied unchanged into the first
 * three bytes of the destination pixel; the fourth byte is written as 0.
 *
 * @param src      source buffer of packed 3-byte pixels
 * @param dst      destination buffer, receives 4 bytes per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
	/*
	 * One pass converts 8 pixels (24 source bytes) but its last 4-byte load
	 * starts at offset 21, so it touches 25 source bytes. Bound the loop by
	 * the bytes actually remaining instead of by the alignment of `end`,
	 * which could let the last pass run past both buffers.
	 */
	while(end - s >= 25)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"punpckldq 3%1, %%mm0\n\t"
			"movd 6%1, %%mm1\n\t"
			"punpckldq 9%1, %%mm1\n\t"
			"movd 12%1, %%mm2\n\t"
			"punpckldq 15%1, %%mm2\n\t"
			"movd 18%1, %%mm3\n\t"
			"punpckldq 21%1, %%mm3\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm1\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm2, 16%0\n\t"
			MOVNTQ" %%mm3, 24%0"
			:"=m"(*dest)
			:"m"(*s)
			:"memory");
		dest += 32;
		s += 24;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: remaining (or, without MMX, all) complete pixels. */
	while(end - s >= 3)
	{
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = 0;
	}
}
2505 | 101 |
/**
 * Packs 4-byte pixels into 3-byte pixels.
 *
 * The first three bytes of every source pixel are copied unchanged to the
 * destination; the fourth source byte is dropped.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 3 bytes per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	/*
	 * One pass reads 32 source bytes (8 pixels) and stores 24 bytes. Bound
	 * the loop by the bytes actually remaining; deriving the bound from the
	 * alignment of `end` let the last pass overrun both buffers whenever
	 * `src` was not 32-byte aligned relative to it.
	 */
	while(end - s >= 32)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm1\n\t"
			"movq 16%1, %%mm4\n\t"
			"movq 24%1, %%mm5\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm1, %%mm3\n\t"
			"movq %%mm4, %%mm6\n\t"
			"movq %%mm5, %%mm7\n\t"
			/* pack each pair of pixels into the low 6 bytes of its register */
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm3\n\t"
			"psrlq $8, %%mm6\n\t"
			"psrlq $8, %%mm7\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm1\n\t"
			"pand %2, %%mm4\n\t"
			"pand %2, %%mm5\n\t"
			"pand %3, %%mm2\n\t"
			"pand %3, %%mm3\n\t"
			"pand %3, %%mm6\n\t"
			"pand %3, %%mm7\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm3, %%mm1\n\t"
			"por %%mm6, %%mm4\n\t"
			"por %%mm7, %%mm5\n\t"

			/* merge the four 6-byte groups into three full quadwords */
			"movq %%mm1, %%mm2\n\t"
			"movq %%mm4, %%mm3\n\t"
			"psllq $48, %%mm2\n\t"
			"psllq $32, %%mm3\n\t"
			"pand %4, %%mm2\n\t"
			"pand %5, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"psrlq $16, %%mm1\n\t"
			"psrlq $32, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm3, %%mm1\n\t"
			"pand %6, %%mm5\n\t"
			"por %%mm5, %%mm4\n\t"

			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm4, 16%0"
			:"=m"(*dest)
			:"m"(*s),"m"(mask24l),
			 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		dest += 24;
		s += 32;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: remaining (or, without MMX, all) complete pixels. */
	while(end - s >= 4)
	{
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
		s++;
	}
}
2506 | 178 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
/*
 Original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32bit c version, and and&add trick by Michael Niedermayer
*/
/**
 * Converts 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
 * (rrrrrggggggbbbbb), pixels being native-endian 16-bit words.
 *
 * Adding the red/green field (mask 0x7FE0) to the pixel itself shifts that
 * field left by one bit and leaves blue in place; the top bit of the source
 * pixel is discarded.
 *
 * @param src      source buffer of 15-bit pixels
 * @param dst      destination buffer, at least src_size bytes
 * @param src_size size of src in bytes; a trailing odd byte is ignored
 */
void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint8_t *d = dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s));
	__asm __volatile(
		"movq %0, %%mm4\n\t"
		::"m"(mask15s));
	/*
	 * One pass converts 16 bytes (8 pixels). Only whole chunks are handled
	 * here so that sizes which are not a multiple of 16 no longer read and
	 * write past the buffers; the scalar code below finishes the rest.
	 */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm2\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm2, %%mm3\n\t"
			"pand %%mm4, %%mm0\n\t"
			"pand %%mm4, %%mm2\n\t"
			"paddw %%mm1, %%mm0\n\t"
			"paddw %%mm3, %%mm2\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			:"memory");
		s += 16;
		d += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/*
	 * Two pixels per 32-bit word. x + (x&0x7FE07FE0) would be cheaper but
	 * needs the top bit of every pixel to be 0, which might not always be
	 * true, so that bit is masked off first. Unsigned arithmetic keeps the
	 * addition well defined.
	 */
	while(end - s >= 4)
	{
		const uint32_t x = *(const uint32_t *)s;
		*(uint32_t *)d = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
		s += 4;
		d += 4;
	}
	/* An odd pixel count leaves one last 16-bit pixel to convert. */
	if(end - s >= 2)
	{
		const unsigned x = *(const uint16_t *)s;
		*(uint16_t *)d = (uint16_t)((x&0x7FFF) + (x&0x7FE0));
	}
}
2694 | 244 |
/**
 * Expands 8-bit palette indices into 32-bit pixels.
 *
 * The palette is assumed to contain bgr32 entries. For every source index
 * one `unsigned`-sized palette entry is copied to the destination, so both
 * dst and palette are accessed as arrays of `unsigned`.
 *
 * @param src        palette indices, one byte per pixel
 * @param dst        destination buffer, one `unsigned` per pixel
 * @param num_pixels number of pixels to convert
 * @param palette    lookup table indexed by the source byte
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	const unsigned *lut = (unsigned *)palette;
	unsigned *out = (unsigned *)dst;
	const uint8_t *in = src;
	unsigned remaining;

	for(remaining = num_pixels; remaining > 0; remaining--)
		*out++ = lut[*in++];
}
254 | |
/**
 * Expands 8-bit palette indices into packed 3-byte pixels.
 *
 * The palette is assumed to contain bgr32 entries (4 bytes each); the first
 * three bytes of the selected entry are copied to the destination.
 *
 * @param src        palette indices, one byte per pixel
 * @param dst        destination buffer, 3 bytes per pixel
 * @param num_pixels number of pixels to convert
 * @param palette    lookup table, 4 bytes per entry
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	const uint8_t *in = src;
	const uint8_t *const stop = src + num_pixels;
	uint8_t *out = dst;

	/*
	 * Storing the whole palette word per pixel is deliberately not done:
	 * it writes 1 byte too much and might cause alignment issues on some
	 * architectures.
	 */
	while(in < stop)
	{
		//FIXME slow?
		const uint8_t *entry = palette + *in * 4;

		out[0] = entry[0];
		out[1] = entry[1];
		out[2] = entry[2];
		out += 3;
		in++;
	}
}
275 | |
/**
 * Converts 4-byte pixels to 16-bit (5-6-5) pixels.
 *
 * Source bytes 0, 1 and 2 of every pixel are taken as blue, green and red;
 * byte 3 is ignored. Each output pixel is the native-endian 16-bit word
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 2 bytes per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
		"movq %0, %%mm7\n\t"
		"movq %1, %%mm6\n\t"
		::"m"(red_16mask),"m"(green_16mask));
	/*
	 * One pass converts 4 pixels (16 source bytes). Bound the loop by the
	 * bytes actually remaining rather than by the alignment of `end`, which
	 * could let the last pass run past both buffers.
	 */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 */
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			/* interleave: pixels 1 and 3 go to the odd 16-bit words */
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/*
	 * Scalar path: remaining (or, without MMX, all) complete pixels.
	 * Every pixel occupies 4 source bytes, so the unused fourth byte has to
	 * be skipped as well.
	 */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s += 4;
	}
}
346 | |
/**
 * Converts 4-byte pixels to 15-bit (5-5-5) pixels.
 *
 * Source bytes 0, 1 and 2 of every pixel are taken as blue, green and red;
 * byte 3 is ignored. Each output pixel is the native-endian 16-bit word
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 2 bytes per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
		"movq %0, %%mm7\n\t"
		"movq %1, %%mm6\n\t"
		::"m"(red_15mask),"m"(green_15mask));
	/*
	 * One pass converts 4 pixels (16 source bytes). Bound the loop by the
	 * bytes actually remaining rather than by the alignment of `end`, which
	 * could let the last pass run past both buffers.
	 */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 */
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $6, %%mm1\n\t"
			"psrlq $6, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $9, %%mm2\n\t"
			"psrlq $9, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			/* interleave: pixels 1 and 3 go to the odd 16-bit words */
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/*
	 * Scalar path: remaining (or, without MMX, all) complete pixels.
	 * Every pixel occupies 4 source bytes, so the unused fourth byte has to
	 * be skipped as well.
	 */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s += 4;
	}
}
417 | |
/**
 * Converts packed 3-byte pixels to 16-bit (5-6-5) pixels.
 *
 * Source bytes 0, 1 and 2 of every pixel are taken as blue, green and red.
 * Each output pixel is the native-endian 16-bit word
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 *
 * @param src      source buffer of packed 3-byte pixels
 * @param dst      destination buffer, receives 2 bytes per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
		"movq %0, %%mm7\n\t"
		"movq %1, %%mm6\n\t"
		::"m"(red_16mask),"m"(green_16mask));
	/*
	 * One pass converts 4 pixels (12 source bytes) but its last 4-byte load
	 * starts at offset 9, so it touches 13 source bytes. Bound the loop by
	 * the bytes actually remaining instead of by the alignment of `end`,
	 * which could let the last pass run past both buffers.
	 */
	while(end - s >= 13)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 */
			"movd %1, %%mm0\n\t"
			"movd 3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			/* interleave: pixels 1 and 3 go to the odd 16-bit words */
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: remaining (or, without MMX, all) complete pixels. */
	while(end - s >= 3)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;

		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
489 | |
/**
 * Converts packed 3-byte pixels to 15-bit (5-5-5) pixels.
 *
 * Source bytes 0, 1 and 2 of every pixel are taken as blue, green and red.
 * Each output pixel is the native-endian 16-bit word
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 *
 * @param src      source buffer of packed 3-byte pixels
 * @param dst      destination buffer, receives 2 bytes per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
		"movq %0, %%mm7\n\t"
		"movq %1, %%mm6\n\t"
		::"m"(red_15mask),"m"(green_15mask));
	/*
	 * One pass converts 4 pixels (12 source bytes) but its last 4-byte load
	 * starts at offset 9, so it touches 13 source bytes. Bound the loop by
	 * the bytes actually remaining instead of by the alignment of `end`,
	 * which could let the last pass run past both buffers.
	 */
	while(end - s >= 13)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 */
			"movd %1, %%mm0\n\t"
			"movd 3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $6, %%mm1\n\t"
			"psrlq $6, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $9, %%mm2\n\t"
			"psrlq $9, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			/* interleave: pixels 1 and 3 go to the odd 16-bit words */
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: remaining (or, without MMX, all) complete pixels. */
	while(end - s >= 3)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;

		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
2694 | 561 |
/**
 * Expands 8 bit palette indices into 16 bit pixels.
 * Palette is assumed to contain bgr16 (one uint16_t per entry, indexed by the
 * source byte), see rgb32to16 to convert the palette.
 * dst receives num_pixels uint16_t values.
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	/* keep the palette const instead of casting the qualifier away */
	const uint16_t *pal = (const uint16_t *)palette;
	uint16_t *out = (uint16_t *)dst;
	unsigned i;
	for(i=0; i<num_pixels; i++)
		out[i] = pal[ src[i] ];
}
571 | |
/**
 * Expands 8 bit palette indices into 15 bit pixels.
 * Palette is assumed to contain bgr15 (one uint16_t per entry, indexed by the
 * source byte), see rgb32to15 to convert the palette.
 * dst receives num_pixels uint16_t values.
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	/* keep the palette const instead of casting the qualifier away */
	const uint16_t *pal = (const uint16_t *)palette;
	uint16_t *out = (uint16_t *)dst;
	unsigned i;
	for(i=0; i<num_pixels; i++)
		out[i] = pal[ src[i] ];
}
2755 | 581 |
/**
 * Swaps the first and third byte of every 4 byte pixel (rgb32 <-> bgr32).
 *
 * src_size is in bytes; trailing bytes that do not form a whole pixel are
 * ignored. The MMX path stores 0 in the 4th byte of each pixel, the plain C
 * path leaves the 4th destination byte untouched.
 */
void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	int num_pixels= src_size >> 2;
	int i;
#ifdef HAVE_MMX
	/* the MMX loop converts 2 pixels (8 bytes) per iteration */
	const int mmx_pixels= num_pixels & ~1;
	if(mmx_pixels > 0) // the asm loop is do-while shaped, never enter it with 0 pixels
	{
		asm volatile (
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			/* eax counts pixels, so scale it by 4 to address bytes */
			PREFETCH" 32(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm0, %%mm2		\n\t"
			"pslld $16, %%mm0		\n\t"
			"psrld $16, %%mm1		\n\t"
			"pand mask32r, %%mm0		\n\t"
			"pand mask32g, %%mm2		\n\t"
			"pand mask32b, %%mm1		\n\t"
			"por %%mm0, %%mm2		\n\t"
			"por %%mm1, %%mm2		\n\t"
			MOVNTQ" %%mm2, (%1, %%eax, 4)	\n\t"
			"addl $2, %%eax			\n\t"
			"cmpl %2, %%eax			\n\t"
			" jb 1b				\n\t"
			:: "r" (src), "r"(dst), "r" (mmx_pixels)
			: "memory", "%eax"
		);

		__asm __volatile(SFENCE:::"memory");
		__asm __volatile(EMMS:::"memory");
	}
	/* odd trailing pixel; clear the 4th byte like the masks above do */
	for(i=mmx_pixels; i<num_pixels; i++)
	{
		dst[4*i + 0] = src[4*i + 2];
		dst[4*i + 1] = src[4*i + 1];
		dst[4*i + 2] = src[4*i + 0];
		dst[4*i + 3] = 0;
	}
#else
	for(i=0; i<num_pixels; i++)
	{
		dst[4*i + 0] = src[4*i + 2];
		dst[4*i + 1] = src[4*i + 1];
		dst[4*i + 2] = src[4*i + 0];
	}
#endif
}
621 | |
/**
 * Interleaves planar Y, U and V into packed YUY2 (bytes Y0 U Y1 V).
 *
 * One chroma line is shared by two consecutive luma lines: usrc/vsrc advance
 * by chromStride only after every odd line.
 *
 * height should be a multiple of 2 and width should be a multiple of 16
 * (the MMX loop converts 16 pixels per iteration); if this is a problem for
 * anyone then tell me, and I'll fix it.
 * All strides are in bytes.
 */
void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	unsigned int y; // unsigned like height, avoids a mixed-sign comparison
	const int chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax, 2)	\n\t"
			PREFETCH" 32(%2, %%eax)		\n\t"
			PREFETCH" 32(%3, %%eax)		\n\t"
			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
			"movq %%mm3, %%mm4		\n\t" // Y(0)
			"movq %%mm5, %%mm6		\n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3		\n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4		\n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5		\n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6		\n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
			: "memory", "%eax" // the asm stores to dst, tell the compiler
		);
#else
		int i;
		for(i=0; i<chromWidth; i++)
		{
			dst[4*i+0] = ysrc[2*i+0];
			dst[4*i+1] = usrc[i];
			dst[4*i+2] = ysrc[2*i+1];
			dst[4*i+3] = vsrc[i];
		}
#endif
		if(y&1)
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	asm volatile(	EMMS" \n\t"
			SFENCE" \n\t"
			:::"memory");
#endif
}
694 | |
/**
 * Splits packed YUY2 (bytes Y0 U Y1 V) into separate Y, U and V planes.
 *
 * Lines are handled in pairs: luma is taken from both lines, chroma only from
 * the first line of each pair (the chroma of the second line is dropped).
 * If height is odd, the final line is treated as the first line of a pair
 * without a second line.
 *
 * width should be a multiple of 16 (the MMX loops convert 16 pixels per
 * iteration); if this is a problem for anyone then tell me, and I'll fix it.
 * All strides are in bytes.
 */
void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned int y; // unsigned like height, avoids a mixed-sign comparison
	const int chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* first line of the pair: luma and chroma */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2		\n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		if(y+1 >= height) break; // odd height: this pair has no second line

		/* second line of the pair: luma only; uses the mask left in mm7 by the block above */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		int i;
		/* first line of the pair: luma and chroma */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			udst[i] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+2];
			vdst[i] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		if(y+1 >= height) break; // odd height: this pair has no second line

		/* second line of the pair: luma only */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			ydst[2*i+1] 	= src[4*i+2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	asm volatile(	EMMS" \n\t"
			SFENCE" \n\t"
			:::"memory");
#endif
}
2801 | 819 |
/**
 * Splits packed UYVY (bytes U Y0 V Y1) into separate Y, U and V planes.
 *
 * Lines are handled in pairs: luma is taken from both lines, chroma only from
 * the first line of each pair (the chroma of the second line is dropped).
 * If height is odd, the final line is treated as the first line of a pair
 * without a second line.
 *
 * width should be a multiple of 16 (the MMX loops convert 16 pixels per
 * iteration); if this is a problem for anyone then tell me, and I'll fix it.
 * All strides are in bytes.
 */
void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned int y; // unsigned like height, avoids a mixed-sign comparison
	const int chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* first line of the pair: luma and chroma */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
			"movq %%mm0, %%mm2		\n\t" // UYVY UYVY(0)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(4)
			"pand %%mm7, %%mm0		\n\t" // U0V0 U0V0(0)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(12)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(8)
			"movq %%mm2, %%mm4		\n\t" // UYVY UYVY(12)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0V0 U0V0(12)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		if(y+1 >= height) break; // odd height: this pair has no second line

		/* second line of the pair: luma only (the high byte of every word) */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // UYVY UYVY(12)
			"psrlw $8, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		int i;
		/* first line of the pair: luma and chroma */
		for(i=0; i<chromWidth; i++)
		{
			udst[i] 	= src[4*i+0];
			ydst[2*i+0] 	= src[4*i+1];
			vdst[i] 	= src[4*i+2];
			ydst[2*i+1] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		if(y+1 >= height) break; // odd height: this pair has no second line

		/* second line of the pair: luma only */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+3];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	asm volatile(	EMMS" \n\t"
			SFENCE" \n\t"
			:::"memory");
#endif
}
944 | |
945 |