Mercurial > mplayer.hg
comparison postproc/rgb2rgb.c @ 2698:22652c028692
Faster 15-to-16-bit RGB conversion: the MMX routine is limited by memory speed, so there is no difference there, but the C routine is faster.
author | michael |
---|---|
date | Sun, 04 Nov 2001 23:04:26 +0000 |
parents | 1eaf3f89e49f |
children | 9b47bc409083 |
comparison
equal
deleted
inserted
replaced
2697:1eaf3f89e49f | 2698:22652c028692 |
---|---|
12 static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL; | 12 static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL; |
13 static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; | 13 static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; |
14 static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; | 14 static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; |
15 static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ | 15 static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ |
16 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ | 16 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ |
17 static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; | |
17 #endif | 18 #endif |
18 | 19 |
19 void rgb24to32(const uint8_t *src,uint8_t *dst,uint32_t src_size) | 20 void rgb24to32(const uint8_t *src,uint8_t *dst,uint32_t src_size) |
20 { | 21 { |
21 uint8_t *dest = dst; | 22 uint8_t *dest = dst; |
117 | 118 |
118 /* | 119 /* |
119 Original by Strepto/Astral | 120 Original by Strepto/Astral |
120 ported to gcc & bugfixed : A'rpi | 121 ported to gcc & bugfixed : A'rpi |
121 MMX2, 3DNOW optimization by Nick Kurshev | 122 MMX2, 3DNOW optimization by Nick Kurshev |
123 32-bit C version and the and&add trick by Michael Niedermayer | |
122 */ | 124 */ |
123 void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) | 125 void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) |
124 { | 126 { |
125 #ifdef HAVE_MMX | 127 #ifdef HAVE_MMX |
126 register const char* s=src+src_size; | 128 register const char* s=src+src_size; |
127 register char* d=dst+src_size; | 129 register char* d=dst+src_size; |
128 register int offs=-src_size; | 130 register int offs=-src_size; |
129 __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)):"memory"); | 131 __asm __volatile(PREFETCH" %0"::"m"(*(s+offs))); |
130 __asm __volatile( | 132 __asm __volatile( |
131 "movq %0, %%mm4\n\t" | 133 "movq %0, %%mm4\n\t" |
132 "movq %1, %%mm5" | 134 ::"m"(mask15s)); |
133 ::"m"(mask15b), "m"(mask15rg):"memory"); | |
134 while(offs<0) | 135 while(offs<0) |
135 { | 136 { |
136 __asm __volatile( | 137 __asm __volatile( |
137 PREFETCH" 32%1\n\t" | 138 PREFETCH" 32%1\n\t" |
138 "movq %1, %%mm0\n\t" | 139 "movq %1, %%mm0\n\t" |
139 "movq 8%1, %%mm2\n\t" | 140 "movq 8%1, %%mm2\n\t" |
140 "movq %%mm0, %%mm1\n\t" | 141 "movq %%mm0, %%mm1\n\t" |
141 "movq %%mm2, %%mm3\n\t" | 142 "movq %%mm2, %%mm3\n\t" |
142 "pand %%mm4, %%mm0\n\t" | 143 "pand %%mm4, %%mm0\n\t" |
143 "pand %%mm5, %%mm1\n\t" | |
144 "pand %%mm4, %%mm2\n\t" | 144 "pand %%mm4, %%mm2\n\t" |
145 "pand %%mm5, %%mm3\n\t" | 145 "paddw %%mm1, %%mm0\n\t" |
146 "psllq $1, %%mm1\n\t" | 146 "paddw %%mm3, %%mm2\n\t" |
147 "psllq $1, %%mm3\n\t" | |
148 "por %%mm1, %%mm0\n\t" | |
149 "por %%mm3, %%mm2\n\t" | |
150 MOVNTQ" %%mm0, %0\n\t" | 147 MOVNTQ" %%mm0, %0\n\t" |
151 MOVNTQ" %%mm2, 8%0" | 148 MOVNTQ" %%mm2, 8%0" |
152 :"=m"(*(d+offs)) | 149 :"=m"(*(d+offs)) |
153 :"m"(*(s+offs)) | 150 :"m"(*(s+offs)) |
154 :"memory"); | 151 ); |
155 offs+=16; | 152 offs+=16; |
156 } | 153 } |
157 __asm __volatile(SFENCE:::"memory"); | 154 __asm __volatile(SFENCE:::"memory"); |
158 __asm __volatile(EMMS:::"memory"); | 155 __asm __volatile(EMMS:::"memory"); |
159 #else | 156 #else |
157 #if 0 | |
160 const uint16_t *s1=( uint16_t * )src; | 158 const uint16_t *s1=( uint16_t * )src; |
161 uint16_t *d1=( uint16_t * )dst; | 159 uint16_t *d1=( uint16_t * )dst; |
162 uint16_t *e=((uint8_t *)s1)+src_size; | 160 uint16_t *e=((uint8_t *)s1)+src_size; |
163 while( s1<e ){ | 161 while( s1<e ){ |
164 register int x=*( s1++ ); | 162 register int x=*( s1++ ); |
166 0rrrrrgggggbbbbb | 164 0rrrrrgggggbbbbb |
167 0111 1111 1110 0000=0x7FE0 | 165 0111 1111 1110 0000=0x7FE0 |
168 0000 0000 0001 1111=0x001F */ | 166 0000 0000 0001 1111=0x001F */ |
169 *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 ); | 167 *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 ); |
170 } | 168 } |
169 #else | |
170 const uint32_t *s1=( uint32_t * )src; | |
171 uint32_t *d1=( uint32_t * )dst; | |
172 int i; | |
173 int size= src_size>>2; | |
174 for(i=0; i<size; i++) | |
175 { | |
176 register int x= s1[i]; | |
177 // d1[i] = x + (x&0x7FE07FE0); // faster, but needs msbit = 0, which might not always be true | |
178 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0); | |
179 | |
180 } | |
181 #endif | |
171 #endif | 182 #endif |
172 } | 183 } |
173 | 184 |
174 /** | 185 /** |
175 * Palette is assumed to contain bgr32 | 186 * Palette is assumed to contain bgr32 |