comparison: libavcodec/imgresample.c @ 5750:09f99af1db40

Sanitize altivec code so it can be built with runtime check properly

author   lu_zero
date     Tue, 02 Oct 2007 11:39:32 +0000
parents  9968f39d03aa
children 80103098c797

comparing parent 5749:784dcbdc910f with 5750:09f99af1db40
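Note: the point of moving the AltiVec routine behind ppc/imgresample_altivec.h is that the generic file can be compiled without AltiVec code generation while the decision to call the vector routine is made at run time. A minimal sketch of that pattern, assuming a hypothetical has_altivec() CPU-detection helper and a plain-C fallback named v_resample_c() (neither name is taken from this changeset; the header is assumed to declare v_resample16_altivec()):

#include <stdint.h>

#ifdef HAVE_ALTIVEC
#include "ppc/imgresample_altivec.h"   /* assumed to declare v_resample16_altivec() */
#endif

/* Hypothetical helpers, not part of this changeset. */
int  has_altivec(void);
void v_resample_c(uint8_t *dst, int dst_width, const uint8_t *src,
                  int wrap, int16_t *filter);

static void v_resample_dispatch(uint8_t *dst, int dst_width,
                                const uint8_t *src, int wrap, int16_t *filter)
{
#ifdef HAVE_ALTIVEC
    if (has_altivec()) {               /* run-time CPU check */
        v_resample16_altivec(dst, dst_width, src, wrap, filter);
        return;
    }
#endif
    v_resample_c(dst, dst_width, src, wrap, filter);
}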
--- a/libavcodec/imgresample.c	(parent 5749:784dcbdc910f)
+++ b/libavcodec/imgresample.c	(5750:09f99af1db40)
@@ -25,10 +25,14 @@
  */
 
 #include "avcodec.h"
 #include "swscale.h"
 #include "dsputil.h"
+
+#ifdef HAVE_ALTIVEC
+#include "ppc/imgresample_altivec.h"
+#endif
 
 #define NB_COMPONENTS 3
 
 #define PHASE_BITS 4
 #define NB_PHASES (1 << PHASE_BITS)
@@ -278,137 +282,10 @@
         dst_width--;
     }
     emms();
 }
 #endif /* HAVE_MMX */
-
-#ifdef HAVE_ALTIVEC
-typedef union {
-    vector unsigned char v;
-    unsigned char c[16];
-} vec_uc_t;
-
-typedef union {
-    vector signed short v;
-    signed short s[8];
-} vec_ss_t;
-
-void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
-                          int wrap, int16_t *filter)
-{
-    int sum, i;
-    const uint8_t *s;
-    vector unsigned char *tv, tmp, dstv, zero;
-    vec_ss_t srchv[4], srclv[4], fv[4];
-    vector signed short zeros, sumhv, sumlv;
-    s = src;
-
-    for(i=0;i<4;i++)
-    {
-        /*
-           The vec_madds later on does an implicit >>15 on the result.
-           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
-           a signed short, we have just enough bits to pre-shift our
-           filter constants <<7 to compensate for vec_madds.
-        */
-        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
-        fv[i].v = vec_splat(fv[i].v, 0);
-    }
-
-    zero = vec_splat_u8(0);
-    zeros = vec_splat_s16(0);
-
-
-    /*
-       When we're resampling, we'd ideally like both our input buffers,
-       and output buffers to be 16-byte aligned, so we can do both aligned
-       reads and writes. Sadly we can't always have this at the moment, so
-       we opt for aligned writes, as unaligned writes have a huge overhead.
-       To do this, do enough scalar resamples to get dst 16-byte aligned.
-    */
-    i = (-(int)dst) & 0xf;
-    while(i>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-        i--;
-    }
-
-    /* Do our altivec resampling on 16 pixels at once. */
-    while(dst_width>=16) {
-        /*
-           Read 16 (potentially unaligned) bytes from each of
-           4 lines into 4 vectors, and split them into shorts.
-           Interleave the multipy/accumulate for the resample
-           filter with the loads to hide the 3 cycle latency
-           the vec_madds have.
-        */
-        tv = (vector unsigned char *) &s[0 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
-        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
-        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
-
-        tv = (vector unsigned char *) &s[1 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
-        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
-        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
-
-        tv = (vector unsigned char *) &s[2 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
-        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
-        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
-
-        tv = (vector unsigned char *) &s[3 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
-        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
-        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
-
-        /*
-           Pack the results into our destination vector,
-           and do an aligned write of that back to memory.
-        */
-        dstv = vec_packsu(sumhv, sumlv) ;
-        vec_st(dstv, 0, (vector unsigned char *) dst);
-
-        dst+=16;
-        s+=16;
-        dst_width-=16;
-    }
-
-    /*
-       If there are any leftover pixels, resample them
-       with the slow scalar method.
-    */
-    while(dst_width>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-    }
-}
-#endif /* HAVE_ALTIVEC */
 
 /* slow version to handle limit cases. Does not need optimisation */
 static void h_resample_slow(uint8_t *dst, int dst_width,
                             const uint8_t *src, int src_width,
                             int src_start, int src_incr, int16_t *filters)
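
The pre-shift comment in the removed v_resample16_altivec() is worth a worked check. vec_madds (vmhraddshs) computes ((a*b + 0x4000) >> 15) + c with saturation, so scaling each FILTER_BITS-precision tap by << (15 - FILTER_BITS) makes that implicit >> 15 land back on the >> FILTER_BITS scale used by the scalar loop; the vector op additionally rounds where the scalar code truncates, so lanes can differ by at most 1. A small self-contained model of one lane, under those assumptions:

#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS 8

/* Scalar model of one lane of vec_madds (vmhraddshs):
 * ((a * b + 0x4000) >> 15) + c, ignoring the final saturation,
 * which cannot trigger for the small values used here. */
static int madds_lane(int16_t a, int16_t b, int16_t c)
{
    return (int)(((int32_t)a * b + 0x4000) >> 15) + c;
}

int main(void)
{
    int16_t pixel = 200;                        /* one 8-bit source sample */
    int16_t tap   = 37;                         /* filter tap, FILTER_BITS precision */
    int16_t pre   = (int16_t)(tap << (15 - FILTER_BITS)); /* pre-shifted as in fv[i] */

    /* Scalar path of the removed code (truncating). */
    int scalar = (pixel * tap) >> FILTER_BITS;

    /* Vector path model: the implicit >>15 undoes the <<7 pre-shift,
     * giving a result on the same scale, rounded instead of truncated. */
    int vec = madds_lane(pixel, pre, 0);

    printf("scalar = %d, vec_madds model = %d\n", scalar, vec);
    return 0;
}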
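
The alignment comment in the removed code ("do enough scalar resamples to get dst 16-byte aligned") relies on the identity that negating an address modulo 16 gives the number of bytes to the next 16-byte boundary. A tiny self-contained check of that identity (variable names are illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Bytes needed to advance p to the next 16-byte boundary; 0 if already
 * aligned.  Same trick as "i = (-(int)dst) & 0xf;" in the removed code,
 * written with uintptr_t so it stays well defined on 64-bit hosts. */
static unsigned bytes_to_align16(const void *p)
{
    return (unsigned)(-(uintptr_t)p & 0xf);
}

int main(void)
{
    /* Force a 16-byte aligned base so the offsets below are meaningful. */
    static _Alignas(16) unsigned char buf[32];

    for (unsigned off = 0; off < 4; off++)
        printf("offset %u -> %u scalar pixels before the vector loop\n",
               off, bytes_to_align16(buf + off));
    return 0;
}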
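
The vec_perm/vec_lvsl pairs in the removed inner loop are the classic AltiVec unaligned-load idiom: two aligned 16-byte loads bracket the unaligned address, and the permute selects the 16 bytes that start at it. A scalar model of that idiom (illustrative only; like the real vector loads, it assumes both aligned 16-byte blocks are readable):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar model of tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, src)):
 * load the aligned 16-byte block containing src and the one after it,
 * then copy out the 16 bytes starting at src's misalignment offset. */
static void load_unaligned16(uint8_t out[16], const uint8_t *src)
{
    const uint8_t *base  = (const uint8_t *)((uintptr_t)src & ~(uintptr_t)0xf);
    unsigned       shift = (unsigned)((uintptr_t)src & 0xf); /* what vec_lvsl encodes */
    uint8_t        both[32];

    memcpy(both,      base,      16);   /* tv[0]: first aligned load  */
    memcpy(both + 16, base + 16, 16);   /* tv[1]: second aligned load */
    memcpy(out, both + shift, 16);      /* the permute: select 16 bytes */
}

int main(void)
{
    static _Alignas(16) uint8_t line[48];
    uint8_t pixels[16];

    for (unsigned i = 0; i < sizeof(line); i++)
        line[i] = (uint8_t)i;

    load_unaligned16(pixels, line + 5);          /* deliberately misaligned */
    printf("first byte = %u (expected 5)\n", pixels[0]);
    return 0;
}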