Mercurial > mplayer.hg
comparison postproc/swscale_altivec_template.c @ 12130:2ef24558b732
AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
author | michael |
---|---|
date | Tue, 06 Apr 2004 00:25:47 +0000 |
parents | 21e5cb258a95 |
children | 79a2af950cf7 |
comparison
equal
deleted
inserted
replaced
12129:c1aff21286dd | 12130:2ef24558b732 |
---|---|
18 You should have received a copy of the GNU General Public License | 18 You should have received a copy of the GNU General Public License |
19 along with this program; if not, write to the Free Software | 19 along with this program; if not, write to the Free Software |
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 */ | 21 */ |
22 | 22 |
23 static const vector unsigned int altivec_vectorShiftInt19 = {19, 19, 19, 19}; | 23 #ifdef CONFIG_DARWIN |
24 static const vector signed int vzero = | |
25 (vector signed int)(0, 0, 0, 0); | |
26 static const vector unsigned int altivec_vectorShiftInt19 = | |
27 (vector unsigned int)(19, 19, 19, 19); | |
28 #else | |
29 static const vector signed int vzero = | |
30 (vector signed int){0,0,0,0}; | |
31 static const vector unsigned int altivec_vectorShiftInt19 = | |
32 (vector unsigned int){19, 19, 19, 19}; | |
33 | |
34 #endif | |
35 | |
24 static inline void | 36 static inline void |
25 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) { | 37 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) { |
26 register int i; | 38 register int i; |
27 if ((unsigned long)dest % 16) { | 39 if ((unsigned long)dest % 16) { |
28 /* badly aligned store, we force store alignment */ | 40 /* badly aligned store, we force store alignment */ |
199 } | 211 } |
200 altivec_packIntArrayToCharArray(u,uDest,chrDstW); | 212 altivec_packIntArrayToCharArray(u,uDest,chrDstW); |
201 altivec_packIntArrayToCharArray(v,vDest,chrDstW); | 213 altivec_packIntArrayToCharArray(v,vDest,chrDstW); |
202 } | 214 } |
203 } | 215 } |
216 | |
217 static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) { | |
218 register int i; | |
219 int __attribute__ ((aligned (16))) tempo[4]; | |
220 | |
221 if (filterSize % 4) { | |
222 for(i=0; i<dstW; i++) { | |
223 register int j; | |
224 register int srcPos = filterPos[i]; | |
225 register int val = 0; | |
226 for(j=0; j<filterSize; j++) { | |
227 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
228 } | |
229 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); | |
230 } | |
231 } | |
232 else | |
233 switch (filterSize) { | |
234 case 4: | |
235 { | |
236 for(i=0; i<dstW; i++) { | |
237 register int j; | |
238 register int srcPos = filterPos[i]; | |
239 | |
240 vector unsigned char src_v0 = vec_ld(srcPos, src); | |
241 vector unsigned char src_v1; | |
242 if ((((int)src + srcPos)% 16) > 12) { | |
243 src_v1 = vec_ld(srcPos + 16, src); | |
244 } | |
245 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); | |
246 | |
247 vector signed short src_v = // vec_unpackh sign-extends... | |
248 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); | |
249 // now put our elements in the even slots | |
250 src_v = vec_mergeh(src_v, (vector signed short)vzero); | |
251 | |
252 vector signed short filter_v = vec_ld(i << 3, filter); | |
253 // the 3 above is 2 (log2 of filterSize == 4) + 1 (log2 of sizeof(short) == 2) | |
254 | |
255 // the neat trick: we only care about half the elements, | |
256 // high or low depending on (i<<3)%16 (it's 0 or 8 here), | |
257 // and we're going to use vec_mule, so we choose | |
258 // carefully how to "unpack" the elements into the even slots | |
259 if ((i << 3) % 16) | |
260 filter_v = vec_mergel(filter_v,(vector signed short)vzero); | |
261 else | |
262 filter_v = vec_mergeh(filter_v,(vector signed short)vzero); | |
263 | |
264 vector signed int val_vEven = vec_mule(src_v, filter_v); | |
265 vector signed int val_s = vec_sums(val_vEven, vzero); | |
266 vec_st(val_s, 0, tempo); | |
267 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1); | |
268 } | |
269 } | |
270 break; | |
271 | |
272 case 8: | |
273 { | |
274 for(i=0; i<dstW; i++) { | |
275 register int srcPos = filterPos[i]; | |
276 | |
277 vector unsigned char src_v0 = vec_ld(srcPos, src); | |
278 vector unsigned char src_v1; | |
279 if ((((int)src + srcPos)% 16) > 8) { | |
280 src_v1 = vec_ld(srcPos + 16, src); | |
281 } | |
282 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); | |
283 | |
284 vector signed short src_v = // vec_unpackh sign-extends... | |
285 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); | |
286 vector signed short filter_v = vec_ld(i << 4, filter); | |
287 // the 4 above is 3 (log2 of filterSize == 8) + 1 (log2 of sizeof(short) == 2) | |
288 | |
289 vector signed int val_v = vec_msums(src_v, filter_v, (vector signed int)vzero); | |
290 vector signed int val_s = vec_sums(val_v, vzero); | |
291 vec_st(val_s, 0, tempo); | |
292 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1); | |
293 } | |
294 } | |
295 break; | |
296 | |
297 case 16: | |
298 { | |
299 for(i=0; i<dstW; i++) { | |
300 register int srcPos = filterPos[i]; | |
301 | |
302 vector unsigned char src_v0 = vec_ld(srcPos, src); | |
303 vector unsigned char src_v1 = vec_ld(srcPos + 16, src); | |
304 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); | |
305 | |
306 vector signed short src_vA = // vec_unpackh sign-extends... | |
307 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); | |
308 vector signed short src_vB = // vec_unpackh sign-extends... | |
309 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); | |
310 | |
311 vector signed short filter_v0 = vec_ld(i << 5, filter); | |
312 vector signed short filter_v1 = vec_ld((i << 5) + 16, filter); | |
313 // the 5s above are 4 (log2 of filterSize == 16) + 1 (log2 of sizeof(short) == 2) | |
314 | |
315 vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero); | |
316 vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc); | |
317 | |
318 vector signed int val_s = vec_sums(val_v, vzero); | |
319 | |
320 vec_st(val_s, 0, tempo); | |
321 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1); | |
322 } | |
323 } | |
324 break; | |
325 | |
326 default: | |
327 { | |
328 for(i=0; i<dstW; i++) { | |
329 register int j; | |
330 register int srcPos = filterPos[i]; | |
331 | |
332 vector signed int val_v = (vector signed int)vzero; | |
333 vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter); | |
334 vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter); | |
335 | |
336 vector unsigned char src_v0 = vec_ld(srcPos, src); | |
337 vector unsigned char permS = vec_lvsl(srcPos, src); | |
338 | |
339 for (j = 0 ; j < filterSize - 15; j += 16) { | |
340 vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src); | |
341 vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS); | |
342 | |
343 vector signed short src_vA = // vec_unpackh sign-extends... | |
344 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); | |
345 vector signed short src_vB = // vec_unpackh sign-extends... | |
346 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); | |
347 | |
348 vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); | |
349 vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter); | |
350 vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF); | |
351 vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF); | |
352 | |
353 vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v); | |
354 val_v = vec_msums(src_vB, filter_v1, val_acc); | |
355 | |
356 filter_v0R = filter_v2R; | |
357 src_v0 = src_v1; | |
358 } | |
359 | |
360 if (j < (filterSize-7)) { | |
361 // loading src_v0 is useless, it's already done above | |
362 //vector unsigned char src_v0 = vec_ld(srcPos + j, src); | |
363 vector unsigned char src_v1; | |
364 if ((((int)src + srcPos)% 16) > 8) { | |
365 src_v1 = vec_ld(srcPos + j + 16, src); | |
366 } | |
367 vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS); | |
368 | |
369 vector signed short src_v = // vec_unpackh sign-extends... | |
370 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); | |
371 // loading filter_v0R is useless, it's already done above | |
372 //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter); | |
373 vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); | |
374 vector signed short filter_v = vec_perm(filter_v0R, filter_v1R, permF); | |
375 | |
376 val_v = vec_msums(src_v, filter_v, val_v); | |
377 } | |
378 | |
379 vector signed int val_s = vec_sums(val_v, vzero); | |
380 | |
381 vec_st(val_s, 0, tempo); | |
382 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1); | |
383 } | |
384 | |
385 } | |
386 } | |
387 } |