Mercurial > libavcodec.hg
comparison dsputil.c @ 997:4dfe15ae0078 libavcodec
sse16 & pix_norm1 optimization patch by Felix von Leitner <felix-ffmpeg at fefe dot de> (with some modifications)
author | michaelni |
---|---|
date | Sat, 11 Jan 2003 22:32:56 +0000 |
parents | ad44196ea483 |
children | b2cf2a1d9a51 |
comparison
equal
deleted
inserted
replaced
996:ad44196ea483 | 997:4dfe15ae0078 |
---|---|
127 UINT32 *sq = squareTbl + 256; | 127 UINT32 *sq = squareTbl + 256; |
128 | 128 |
129 s = 0; | 129 s = 0; |
130 for (i = 0; i < 16; i++) { | 130 for (i = 0; i < 16; i++) { |
131 for (j = 0; j < 16; j += 8) { | 131 for (j = 0; j < 16; j += 8) { |
132 #if 0 | |
132 s += sq[pix[0]]; | 133 s += sq[pix[0]]; |
133 s += sq[pix[1]]; | 134 s += sq[pix[1]]; |
134 s += sq[pix[2]]; | 135 s += sq[pix[2]]; |
135 s += sq[pix[3]]; | 136 s += sq[pix[3]]; |
136 s += sq[pix[4]]; | 137 s += sq[pix[4]]; |
137 s += sq[pix[5]]; | 138 s += sq[pix[5]]; |
138 s += sq[pix[6]]; | 139 s += sq[pix[6]]; |
139 s += sq[pix[7]]; | 140 s += sq[pix[7]]; |
141 #else | |
142 #if LONG_MAX > 2147483647 | |
143 register uint64_t x=*(uint64_t*)pix; | |
144 s += sq[x&0xff]; | |
145 s += sq[(x>>8)&0xff]; | |
146 s += sq[(x>>16)&0xff]; | |
147 s += sq[(x>>24)&0xff]; | |
148 s += sq[(x>>32)&0xff]; | |
149 s += sq[(x>>40)&0xff]; | |
150 s += sq[(x>>48)&0xff]; | |
151 s += sq[(x>>56)&0xff]; | |
152 #else | |
153 register uint32_t x=*(uint32_t*)pix; | |
154 s += sq[x&0xff]; | |
155 s += sq[(x>>8)&0xff]; | |
156 s += sq[(x>>16)&0xff]; | |
157 s += sq[(x>>24)&0xff]; | |
158 x=*(uint32_t*)(pix+4); | |
159 s += sq[x&0xff]; | |
160 s += sq[(x>>8)&0xff]; | |
161 s += sq[(x>>16)&0xff]; | |
162 s += sq[(x>>24)&0xff]; | |
163 #endif | |
164 #endif | |
140 pix += 8; | 165 pix += 8; |
141 } | 166 } |
142 pix += line_size - 16; | 167 pix += line_size - 16; |
143 } | 168 } |
144 return s; | 169 return s; |
172 UINT32 *sq = squareTbl + 256; | 197 UINT32 *sq = squareTbl + 256; |
173 | 198 |
174 s = 0; | 199 s = 0; |
175 for (i = 0; i < 16; i++) { | 200 for (i = 0; i < 16; i++) { |
176 for (j = 0; j < 16; j += 8) { | 201 for (j = 0; j < 16; j += 8) { |
202 #if 1 | |
203 #if LONG_MAX > 2147483647 | |
204 uint64_t x,y; | |
205 x=*(uint64_t*)pix1; | |
206 y=*(uint64_t*)pix2; | |
207 | |
208 s += sq[(x&0xff) - (y&0xff)]; | |
209 s += sq[((x>>8)&0xff) - ((y>>8)&0xff)]; | |
210 s += sq[((x>>16)&0xff) - ((y>>16)&0xff)]; | |
211 s += sq[((x>>24)&0xff) - ((y>>24)&0xff)]; | |
212 s += sq[((x>>32)&0xff) - ((y>>32)&0xff)]; | |
213 s += sq[((x>>40)&0xff) - ((y>>40)&0xff)]; | |
214 s += sq[((x>>48)&0xff) - ((y>>48)&0xff)]; | |
215 s += sq[((x>>56)&0xff) - ((y>>56)&0xff)]; | |
216 #else | |
217 uint32_t x,y; | |
218 x=*(uint32_t*)pix1; | |
219 y=*(uint32_t*)pix2; | |
220 | |
221 s += sq[(x&0xff) - (y&0xff)]; | |
222 s += sq[((x>>8)&0xff) - ((y>>8)&0xff)]; | |
223 s += sq[((x>>16)&0xff) - ((y>>16)&0xff)]; | |
224 s += sq[((x>>24)&0xff) - ((y>>24)&0xff)]; | |
225 | |
226 x=*(uint32_t*)(pix1+4); | |
227 y=*(uint32_t*)(pix2+4); | |
228 s += sq[(x&0xff) - (y&0xff)]; | |
229 s += sq[((x>>8)&0xff) - ((y>>8)&0xff)]; | |
230 s += sq[((x>>16)&0xff) - ((y>>16)&0xff)]; | |
231 s += sq[((x>>24)&0xff) - ((y>>24)&0xff)]; | |
232 #endif | |
233 #else | |
177 s += sq[pix1[0] - pix2[0]]; | 234 s += sq[pix1[0] - pix2[0]]; |
178 s += sq[pix1[1] - pix2[1]]; | 235 s += sq[pix1[1] - pix2[1]]; |
179 s += sq[pix1[2] - pix2[2]]; | 236 s += sq[pix1[2] - pix2[2]]; |
180 s += sq[pix1[3] - pix2[3]]; | 237 s += sq[pix1[3] - pix2[3]]; |
181 s += sq[pix1[4] - pix2[4]]; | 238 s += sq[pix1[4] - pix2[4]]; |
182 s += sq[pix1[5] - pix2[5]]; | 239 s += sq[pix1[5] - pix2[5]]; |
183 s += sq[pix1[6] - pix2[6]]; | 240 s += sq[pix1[6] - pix2[6]]; |
184 s += sq[pix1[7] - pix2[7]]; | 241 s += sq[pix1[7] - pix2[7]]; |
242 #endif | |
185 pix1 += 8; | 243 pix1 += 8; |
186 pix2 += 8; | 244 pix2 += 8; |
187 } | 245 } |
188 pix1 += line_size - 16; | 246 pix1 += line_size - 16; |
189 pix2 += line_size - 16; | 247 pix2 += line_size - 16; |