Mercurial > mplayer.hg
annotate postproc/rgb2rgb_template.c @ 2746:dece635a28e3
Minor speedup of rgb32to24. (performance is not successful)
author | nick |
---|---|
date | Tue, 06 Nov 2001 17:14:22 +0000 |
parents | b8a692c59b64 |
children | 2f93f4351765 |
rev | line source |
---|---|
2694 | 1 /* |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
2 * |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
3 * rgb2rgb.c, Software RGB to RGB convertor |
2732 | 4 * pluralize by Software PAL8 to RGB convertor |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
7 * Written by Nick Kurshev. |
2702 | 8 * palette stuff & yuv stuff by Michael |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
9 */ |
2504 | 10 #include <inttypes.h> |
11 #include "../config.h" | |
12 #include "rgb2rgb.h" | |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
13 #include "../mmx_defs.h" |
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
14 |
2535 | 15 #ifdef HAVE_MMX |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
/*
 * 64-bit constants used as MMX operands by the converters in this file.
 * Each one holds the same pattern for every pixel lane of the register it
 * is combined with.
 */

/* Keeps the low three bytes of each 32-bit lane. */
static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;

/* Byte-range selectors used by rgb32to24 when packing 4-byte pixels to 3. */
static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;

/* Per-16-bit-lane masks for 15-bit pixels. */
static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;

/* Per-32-bit-lane channel masks for a 16-bit 5-6-5 pixel. */
static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;

/*
 * Per-32-bit-lane channel masks for a 15-bit 5-5-5 pixel. Both lanes must
 * carry the 5-5-5 layout; the low lane previously held the 5-6-5 values.
 */
static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000003e0ULL;
static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
2535 | 31 #endif |
2513 | 32 |
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * The three bytes of every source pixel are copied unchanged and followed
 * by one zero byte.
 *
 * @param src      packed 3-byte-per-pixel input
 * @param dst      output; receives 4 bytes for every complete source pixel
 * @param src_size input size in bytes; a trailing partial pixel is ignored
 */
void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
	/*
	 * One iteration converts 8 pixels (24 bytes in, 32 bytes out). The last
	 * load fetches 4 bytes at offset 21, so 25 readable bytes are required.
	 * The bound is taken from the bytes that remain, not from the alignment
	 * of the end address, so an unaligned src cannot run past the buffer.
	 */
	while(end - s >= 25)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"punpckldq 3%1, %%mm0\n\t"
			"movd 6%1, %%mm1\n\t"
			"punpckldq 9%1, %%mm1\n\t"
			"movd 12%1, %%mm2\n\t"
			"punpckldq 15%1, %%mm2\n\t"
			"movd 18%1, %%mm3\n\t"
			"punpckldq 21%1, %%mm3\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm1\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm2, 16%0\n\t"
			MOVNTQ" %%mm3, 24%0"
			:"=m"(*dest)
			:"m"(*s)
			:"memory");
		dest += 32;
		s += 24;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: whole pixels only, so nothing is read beyond end. */
	while(end - s >= 3)
	{
		dest[0] = s[0];
		dest[1] = s[1];
		dest[2] = s[2];
		dest[3] = 0;
		dest += 4;
		s += 3;
	}
}
2505 | 84 |
/**
 * Pack 4-byte pixels into 3-byte pixels.
 *
 * The first three bytes of every source pixel are copied unchanged; the
 * fourth byte is dropped.
 *
 * @param src      4-byte-per-pixel input
 * @param dst      output; receives 3 bytes for every complete source pixel
 * @param src_size input size in bytes; a trailing partial pixel is ignored
 */
void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	/*
	 * One iteration converts 8 pixels (32 bytes in, 24 bytes out). The bound
	 * is taken from the bytes that remain, not from the alignment of the end
	 * address, so an unaligned src cannot run past the buffer.
	 */
	while(end - s >= 32)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm1\n\t"
			"movq 16%1, %%mm4\n\t"
			"movq 24%1, %%mm5\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm1, %%mm3\n\t"
			"movq %%mm4, %%mm6\n\t"
			"movq %%mm5, %%mm7\n\t"
			/* squeeze each pixel pair into the low 6 bytes of its register */
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm3\n\t"
			"psrlq $8, %%mm6\n\t"
			"psrlq $8, %%mm7\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm1\n\t"
			"pand %2, %%mm4\n\t"
			"pand %2, %%mm5\n\t"
			"pand %3, %%mm2\n\t"
			"pand %3, %%mm3\n\t"
			"pand %3, %%mm6\n\t"
			"pand %3, %%mm7\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm3, %%mm1\n\t"
			"por %%mm6, %%mm4\n\t"
			"por %%mm7, %%mm5\n\t"

			/* splice the four 6-byte groups into three 8-byte words */
			"movq %%mm1, %%mm2\n\t"
			"movq %%mm4, %%mm3\n\t"
			"psllq $48, %%mm2\n\t"
			"psllq $32, %%mm3\n\t"
			"pand %4, %%mm2\n\t"
			"pand %5, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"psrlq $16, %%mm1\n\t"
			"psrlq $32, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm3, %%mm1\n\t"
			"pand %6, %%mm5\n\t"
			"por %%mm5, %%mm4\n\t"

			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm4, 16%0"
			:"=m"(*dest)
			:"m"(*s),"m"(mask24l),
			 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		dest += 24;
		s += 32;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: whole pixels only, so nothing is read beyond end. */
	while(end - s >= 4)
	{
		dest[0] = s[0];
		dest[1] = s[1];
		dest[2] = s[2];
		dest += 3;
		s += 4;
	}
}
2506 | 161 |
2538
71320898b333
Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents:
2535
diff
changeset
|
/*
 Original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32bit c version, and and&add trick by Michael Niedermayer
*/
/**
 * Convert 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
 * (rrrrrggggg0bbbbb).
 *
 * Adding a pixel's red/green bits to the pixel itself shifts those bits left
 * by one while leaving the blue bits in place. The top bit of each input
 * pixel is ignored.
 *
 * @param src      input pixels, 2 bytes each
 * @param dst      output pixels, 2 bytes each
 * @param src_size input size in bytes; a trailing odd byte is ignored
 */
void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint8_t *d = dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s));
	__asm __volatile(
		"movq %0, %%mm4\n\t"
		::"m"(mask15s));
	/* 8 pixels (16 bytes) per iteration; only while 16 bytes remain. */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm2\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm2, %%mm3\n\t"
			"pand %%mm4, %%mm0\n\t"
			"pand %%mm4, %%mm2\n\t"
			"paddw %%mm1, %%mm0\n\t"
			"paddw %%mm3, %%mm2\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			:"memory");
		d += 16;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Two pixels at a time. Unsigned arithmetic: the sum can reach 0xFFDFFFDF. */
	while(end - s >= 4)
	{
		const uint32_t x = *(const uint32_t *)s;
		*(uint32_t *)d = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
		s += 4;
		d += 4;
	}
	/* A single remaining pixel when the pixel count is odd. */
	if(end - s >= 2)
	{
		const unsigned x = *(const uint16_t *)s;
		*(uint16_t *)d = (uint16_t)((x&0x7FFF) + (x&0x7FE0));
	}
}
2694 | 227 |
/**
 * Expand 8-bit palette indices to 4-byte pixels.
 *
 * Palette is assumed to contain bgr32; each 4-byte entry is copied as one
 * 32-bit word.
 *
 * @param src        one palette index per pixel
 * @param dst        output, 4 bytes per pixel
 * @param num_pixels number of pixels to convert
 * @param palette    table of 4-byte entries, indexed by the values in src
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	/* uint32_t rather than unsigned: the entry width is fixed at 4 bytes. */
	const uint32_t *entries = (const uint32_t *)palette;
	uint32_t *out = (uint32_t *)dst;
	unsigned i;
	for(i=0; i<num_pixels; i++)
		out[i] = entries[ src[i] ];
}
237 | |
/**
 * Expand 8-bit palette indices to packed 3-byte pixels.
 *
 * Palette is assumed to contain bgr32: entries are 4 bytes apart and only
 * the first three bytes of an entry are copied.
 *
 * @param src        one palette index per pixel
 * @param dst        output, 3 bytes per pixel
 * @param num_pixels number of pixels to convert
 * @param palette    table of 4-byte entries, indexed by the values in src
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	const uint8_t *const last = src + num_pixels;
	/*
	 * Copied byte by byte: storing a whole 32-bit entry per pixel would
	 * write one byte too much and might cause alignment issues on some
	 * architectures.
	 */
	while(src != last)
	{
		const uint8_t *entry = palette + *src * 4;
		*dst++ = entry[0];
		*dst++ = entry[1];
		*dst++ = entry[2];
		src++;
	}
}
258 | |
/**
 * Convert 4-byte pixels to 16-bit 5-6-5 pixels.
 *
 * Bytes 0, 1 and 2 of every source pixel are taken as blue, green and red;
 * byte 3 is skipped. The result is (r>>3)<<11 | (g>>2)<<5 | (b>>3).
 *
 * @param src      4-byte-per-pixel input
 * @param dst      output, one 16-bit value per complete source pixel
 * @param src_size input size in bytes; a trailing partial pixel is ignored
 */
void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/*
	 * 4 pixels (16 bytes) per iteration. The bound is taken from the bytes
	 * that remain, not from the alignment of the end address, so an
	 * unaligned src cannot run past the buffer.
	 */
	while(end - s >= 16)
	{
		__asm __volatile(
		    PREFETCH" 32%1\n\t"
		    "movd %1, %%mm0\n\t"
		    "movd 4%1, %%mm3\n\t"
		    "punpckldq 8%1, %%mm0\n\t"
		    "punpckldq 12%1, %%mm3\n\t"
		    "movq %%mm0, %%mm1\n\t"
		    "movq %%mm0, %%mm2\n\t"
		    "movq %%mm3, %%mm4\n\t"
		    "movq %%mm3, %%mm5\n\t"
		    "psrlq $3, %%mm0\n\t"
		    "psrlq $3, %%mm3\n\t"
		    "pand %2, %%mm0\n\t"
		    "pand %2, %%mm3\n\t"
		    "psrlq $5, %%mm1\n\t"
		    "psrlq $5, %%mm4\n\t"
		    "pand %%mm6, %%mm1\n\t"
		    "pand %%mm6, %%mm4\n\t"
		    "psrlq $8, %%mm2\n\t"
		    "psrlq $8, %%mm5\n\t"
		    "pand %%mm7, %%mm2\n\t"
		    "pand %%mm7, %%mm5\n\t"
		    "por %%mm1, %%mm0\n\t"
		    "por %%mm4, %%mm3\n\t"
		    "por %%mm2, %%mm0\n\t"
		    "por %%mm5, %%mm3\n\t"
		    "psllq $16, %%mm3\n\t"
		    "por %%mm3, %%mm0\n\t"
		    MOVNTQ" %%mm0, %0\n\t"
		    :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path; advances a full 4 bytes so the fourth byte is skipped. */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s += 4;
	}
}
329 | |
/**
 * Convert 4-byte pixels to 15-bit 5-5-5 pixels.
 *
 * Bytes 0, 1 and 2 of every source pixel are taken as blue, green and red;
 * byte 3 is skipped. The result is (r>>3)<<10 | (g>>3)<<5 | (b>>3).
 *
 * @param src      4-byte-per-pixel input
 * @param dst      output, one 16-bit value per complete source pixel
 * @param src_size input size in bytes; a trailing partial pixel is ignored
 */
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	/*
	 * 4 pixels (16 bytes) per iteration. The bound is taken from the bytes
	 * that remain, not from the alignment of the end address, so an
	 * unaligned src cannot run past the buffer.
	 */
	while(end - s >= 16)
	{
		__asm __volatile(
		    PREFETCH" 32%1\n\t"
		    "movd %1, %%mm0\n\t"
		    "movd 4%1, %%mm3\n\t"
		    "punpckldq 8%1, %%mm0\n\t"
		    "punpckldq 12%1, %%mm3\n\t"
		    "movq %%mm0, %%mm1\n\t"
		    "movq %%mm0, %%mm2\n\t"
		    "movq %%mm3, %%mm4\n\t"
		    "movq %%mm3, %%mm5\n\t"
		    "psrlq $3, %%mm0\n\t"
		    "psrlq $3, %%mm3\n\t"
		    "pand %2, %%mm0\n\t"
		    "pand %2, %%mm3\n\t"
		    "psrlq $6, %%mm1\n\t"
		    "psrlq $6, %%mm4\n\t"
		    "pand %%mm6, %%mm1\n\t"
		    "pand %%mm6, %%mm4\n\t"
		    "psrlq $9, %%mm2\n\t"
		    "psrlq $9, %%mm5\n\t"
		    "pand %%mm7, %%mm2\n\t"
		    "pand %%mm7, %%mm5\n\t"
		    "por %%mm1, %%mm0\n\t"
		    "por %%mm4, %%mm3\n\t"
		    "por %%mm2, %%mm0\n\t"
		    "por %%mm5, %%mm3\n\t"
		    "psllq $16, %%mm3\n\t"
		    "por %%mm3, %%mm0\n\t"
		    MOVNTQ" %%mm0, %0\n\t"
		    :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path; advances a full 4 bytes so the fourth byte is skipped. */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s += 4;
	}
}
400 | |
/**
 * Convert packed 3-byte pixels to 16-bit 5-6-5 pixels.
 *
 * Bytes 0, 1 and 2 of every source pixel are taken as blue, green and red.
 * The result is (r>>3)<<11 | (g>>2)<<5 | (b>>3).
 *
 * @param src      packed 3-byte-per-pixel input
 * @param dst      output, one 16-bit value per complete source pixel
 * @param src_size input size in bytes; a trailing partial pixel is ignored
 */
void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/*
	 * 4 pixels (12 bytes) per iteration. The last load fetches 4 bytes at
	 * offset 9, so 13 readable bytes are required. The bound is taken from
	 * the bytes that remain, not from the alignment of the end address.
	 */
	while(end - s >= 13)
	{
		__asm __volatile(
		    PREFETCH" 32%1\n\t"
		    "movd %1, %%mm0\n\t"
		    "movd 3%1, %%mm3\n\t"
		    "punpckldq 6%1, %%mm0\n\t"
		    "punpckldq 9%1, %%mm3\n\t"
		    "movq %%mm0, %%mm1\n\t"
		    "movq %%mm0, %%mm2\n\t"
		    "movq %%mm3, %%mm4\n\t"
		    "movq %%mm3, %%mm5\n\t"
		    "psrlq $3, %%mm0\n\t"
		    "psrlq $3, %%mm3\n\t"
		    "pand %2, %%mm0\n\t"
		    "pand %2, %%mm3\n\t"
		    "psrlq $5, %%mm1\n\t"
		    "psrlq $5, %%mm4\n\t"
		    "pand %%mm6, %%mm1\n\t"
		    "pand %%mm6, %%mm4\n\t"
		    "psrlq $8, %%mm2\n\t"
		    "psrlq $8, %%mm5\n\t"
		    "pand %%mm7, %%mm2\n\t"
		    "pand %%mm7, %%mm5\n\t"
		    "por %%mm1, %%mm0\n\t"
		    "por %%mm4, %%mm3\n\t"
		    "por %%mm2, %%mm0\n\t"
		    "por %%mm5, %%mm3\n\t"
		    "psllq $16, %%mm3\n\t"
		    "por %%mm3, %%mm0\n\t"
		    MOVNTQ" %%mm0, %0\n\t"
		    :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: whole pixels only, so nothing is read beyond end. */
	while(end - s >= 3)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s += 3;
	}
}
472 | |
/**
 * Convert packed 3-byte pixels to 15-bit 5-5-5 pixels.
 *
 * Bytes 0, 1 and 2 of every source pixel are taken as blue, green and red.
 * The result is (r>>3)<<10 | (g>>3)<<5 | (b>>3).
 *
 * @param src      packed 3-byte-per-pixel input
 * @param dst      output, one 16-bit value per complete source pixel
 * @param src_size input size in bytes; a trailing partial pixel is ignored
 */
void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	/*
	 * 4 pixels (12 bytes) per iteration. The last load fetches 4 bytes at
	 * offset 9, so 13 readable bytes are required. The bound is taken from
	 * the bytes that remain, not from the alignment of the end address.
	 */
	while(end - s >= 13)
	{
		__asm __volatile(
		    PREFETCH" 32%1\n\t"
		    "movd %1, %%mm0\n\t"
		    "movd 3%1, %%mm3\n\t"
		    "punpckldq 6%1, %%mm0\n\t"
		    "punpckldq 9%1, %%mm3\n\t"
		    "movq %%mm0, %%mm1\n\t"
		    "movq %%mm0, %%mm2\n\t"
		    "movq %%mm3, %%mm4\n\t"
		    "movq %%mm3, %%mm5\n\t"
		    "psrlq $3, %%mm0\n\t"
		    "psrlq $3, %%mm3\n\t"
		    "pand %2, %%mm0\n\t"
		    "pand %2, %%mm3\n\t"
		    "psrlq $6, %%mm1\n\t"
		    "psrlq $6, %%mm4\n\t"
		    "pand %%mm6, %%mm1\n\t"
		    "pand %%mm6, %%mm4\n\t"
		    "psrlq $9, %%mm2\n\t"
		    "psrlq $9, %%mm5\n\t"
		    "pand %%mm7, %%mm2\n\t"
		    "pand %%mm7, %%mm5\n\t"
		    "por %%mm1, %%mm0\n\t"
		    "por %%mm4, %%mm3\n\t"
		    "por %%mm2, %%mm0\n\t"
		    "por %%mm5, %%mm3\n\t"
		    "psllq $16, %%mm3\n\t"
		    "por %%mm3, %%mm0\n\t"
		    MOVNTQ" %%mm0, %0\n\t"
		    :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar path: whole pixels only, so nothing is read beyond end. */
	while(end - s >= 3)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s += 3;
	}
}
2694 | 544 |
/**
 * Expands 8-bit palette indices into 16-bit pixels by table lookup.
 *
 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette.
 *
 * @param src        num_pixels palette indices, one byte per pixel
 * @param dst        output buffer; written as num_pixels uint16_t values
 * @param num_pixels number of pixels to convert
 * @param palette    lookup table, read as uint16_t entries indexed by the
 *                   source byte; it must hold an entry for every index that
 *                   occurs in src
 *
 * dst and palette are accessed through uint16_t pointers, so callers are
 * expected to pass buffers that are suitably aligned for 16-bit access.
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	/* Typed views of the byte buffers; the table stays const-qualified. */
	const uint16_t *lut = (const uint16_t *)palette;
	uint16_t *out = (uint16_t *)dst;
	unsigned i;

	for(i=0; i<num_pixels; i++)
		out[i] = lut[ src[i] ];
}
554 | |
/**
 * Expands 8-bit palette indices into 15-bit pixels (stored in 16 bits each)
 * by table lookup.
 *
 * Palette is assumed to contain bgr15, see rgb32to15 to convert the palette.
 *
 * @param src        num_pixels palette indices, one byte per pixel
 * @param dst        output buffer; written as num_pixels uint16_t values
 * @param num_pixels number of pixels to convert
 * @param palette    lookup table, read as uint16_t entries indexed by the
 *                   source byte; it must hold an entry for every index that
 *                   occurs in src
 *
 * dst and palette are accessed through uint16_t pointers, so callers are
 * expected to pass buffers that are suitably aligned for 16-bit access.
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	/* Typed views of the byte buffers; the table stays const-qualified. */
	const uint16_t *lut = (const uint16_t *)palette;
	uint16_t *out = (uint16_t *)dst;
	unsigned i;

	for(i=0; i<num_pixels; i++)
		out[i] = lut[ src[i] ];
}
/**
 * Interleaves three separate planes (Y, U, V) into one packed buffer whose
 * byte order per pixel pair is Y0 U Y1 V.
 *
 * Each chroma line is used for two consecutive luma lines: the chroma
 * pointers only advance after every odd-numbered line.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * The MMX loop handles 8 chroma samples (16 pixels) per pass and always runs
 * at least once per line; the C fallback handles width/2 pixel pairs, so an
 * odd trailing pixel is not converted.
 *
 * @param ysrc        luma plane
 * @param usrc        U plane
 * @param vsrc        V plane
 * @param dst         packed output, 2 bytes per pixel
 * @param width       picture width in pixels
 * @param height      picture height in lines
 * @param lumStride   bytes between the starts of consecutive luma lines
 * @param chromStride bytes between the starts of consecutive chroma lines
 * @param dstStride   bytes between the starts of consecutive output lines
 */
void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	/* unsigned counters: the dimensions they are compared against are unsigned */
	unsigned int y;
	const unsigned int chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			"1: \n\t"
			PREFETCH" 32(%1, %%eax, 2) \n\t"
			PREFETCH" 32(%2, %%eax) \n\t"
			PREFETCH" 32(%3, %%eax) \n\t"
			"movq (%2, %%eax), %%mm0 \n\t" // U(0)
			"movq %%mm0, %%mm2 \n\t" // U(0)
			"movq (%3, %%eax), %%mm1 \n\t" // V(0)
			"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
			"movq %%mm3, %%mm4 \n\t" // Y(0)
			"movq %%mm5, %%mm6 \n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
			/* "memory": the block stores to dst, which is not an output operand */
			: "memory", "%eax"
		);
#else
		unsigned int i;
		for(i=0; i<chromWidth; i++)
		{
			dst[4*i+0] = ysrc[2*i+0];
			dst[4*i+1] = usrc[i];
			dst[4*i+2] = ysrc[2*i+1];
			dst[4*i+3] = vsrc[i];
		}
#endif
		/* one chroma line serves two luma lines */
		if(y&1)
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	/* leave MMX state and order the MOVNTQ stores before returning */
	asm( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
635 | |
/**
 * Splits a packed buffer (byte order Y0 U Y1 V per pixel pair) into three
 * separate planes: Y, U and V.
 *
 * Lines are handled in pairs: the first line of each pair supplies luma and
 * chroma, the second line supplies luma only (its chroma bytes are ignored).
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * The loop always converts two lines per pass, so an odd height touches one
 * line beyond the picture. The MMX loops handle 8 chroma samples (16 pixels)
 * per pass and always run at least once per line.
 *
 * @param src         packed input, 2 bytes per pixel
 * @param ydst        luma output plane
 * @param udst        U output plane
 * @param vdst        V output plane
 * @param width       picture width in pixels
 * @param height      picture height in lines
 * @param lumStride   bytes between the starts of consecutive luma lines
 * @param chromStride bytes between the starts of consecutive chroma lines
 * @param srcStride   bytes between the starts of consecutive input lines
 */
void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	/* unsigned counters: the dimensions they are compared against are unsigned */
	unsigned int y;
	const unsigned int chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
		/* first line of the pair: extract Y, U and V */
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			"pcmpeqw %%mm7, %%mm7 \n\t"
			"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

			"movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

			"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax) \n\t"
			MOVNTQ" %%mm2, (%2, %%eax) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned int i;
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] = src[4*i+0];
			udst[i] = src[4*i+1];
			ydst[2*i+1] = src[4*i+2];
			vdst[i] = src[4*i+3];
		}
#endif
		/*
		 * Step to the second line of the pair. This is shared by both code
		 * paths so that each pass advances src/ydst by two lines in total;
		 * previously the MMX path advanced by only one line per pass.
		 */
		ydst += lumStride;
		src += srcStride;

		/* second line of the pair: extract Y only */
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			/* rebuild the mask: register contents are not guaranteed to
			   survive between separate asm statements */
			"pcmpeqw %%mm7, %%mm7 \n\t"
			"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %2, %%eax \n\t"
			" jb 1b \n\t"

			::"r"(src), "r"(ydst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] = src[4*i+0];
			ydst[2*i+1] = src[4*i+2];
		}
#endif
		/* move on to the next pair of lines */
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src += srcStride;
	}
#ifdef HAVE_MMX
	/* leave MMX state and order the MOVNTQ stores before returning */
	asm( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}