comparison imgresample.c @ 5750:09f99af1db40 libavcodec
Sanitize altivec code so it can be built with runtime check properly
author      lu_zero
date        Tue, 02 Oct 2007 11:39:32 +0000
parents     9968f39d03aa
children    80103098c797
5749:784dcbdc910f vs. 5750:09f99af1db40
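This revision pulls the AltiVec vertical resampler out of imgresample.c and reaches it through ppc/imgresample_altivec.h instead, so the AltiVec code can be compiled in its own unit (with AltiVec code generation enabled) and selected by a runtime CPU check rather than being hard-wired at build time. A minimal sketch of what such a runtime dispatch can look like follows; only the v_resample16_altivec() prototype is taken from the code removed in this changeset, while select_v_resample(), v_resample_c() and the cpu_has_altivec/owidth parameters are illustrative names, not necessarily the ones libavcodec uses.

#include <stdint.h>

/* Prototype as it appears in the removed code; in the new layout it is
 * expected to be declared by ppc/imgresample_altivec.h and defined in a
 * file built with AltiVec enabled. */
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter);

/* Illustrative scalar fallback with the same shape as the plain C path. */
void v_resample_c(uint8_t *dst, int dst_width, const uint8_t *src,
                  int wrap, int16_t *filter);

typedef void (*v_resample_fn)(uint8_t *dst, int dst_width, const uint8_t *src,
                              int wrap, int16_t *filter);

/* cpu_has_altivec stands in for whatever runtime CPU-feature probe the
 * build uses (libavcodec of this era keeps such flags around dsputil's
 * mm_flags); owidth is the output row width. */
static v_resample_fn select_v_resample(int cpu_has_altivec, int owidth)
{
    /* The AltiVec loop produces 16 output pixels per iteration and prefers
     * an aligned destination, so it is only worth choosing for rows whose
     * width is a multiple of 16. */
    if (cpu_has_altivec && !(owidth & 15))
        return v_resample16_altivec;
    return v_resample_c;
}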
@@ -25,10 +25,14 @@
  */
 
 #include "avcodec.h"
 #include "swscale.h"
 #include "dsputil.h"
+
+#ifdef HAVE_ALTIVEC
+#include "ppc/imgresample_altivec.h"
+#endif
 
 #define NB_COMPONENTS 3
 
 #define PHASE_BITS 4
 #define NB_PHASES  (1 << PHASE_BITS)
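Before the implementation disappears from this file in the hunk below, the pre-shift trick its first comment describes is worth unpacking: vec_madds applies an implicit arithmetic >>15 to each product, so feeding it coefficients shifted left by 15 - FILTER_BITS = 7 reproduces the >> FILTER_BITS scaling of the scalar loop. Here is a small self-checking scalar model of that bookkeeping, assuming FILTER_BITS is 8 as the removed code states; the saturating accumulate that vec_madds also performs is deliberately left out.

#include <assert.h>

#define FILTER_BITS 8   /* value used by imgresample.c */

/* One filter tap the way the scalar resampling loops compute it. */
static int tap_scalar(int pixel, int coeff)
{
    return (pixel * coeff) >> FILTER_BITS;
}

/* One filter tap the way the AltiVec loop computes it: the coefficient is
 * pre-shifted once per filter phase, and vec_madds then applies an implicit
 * >> 15 to every product.  The pre-shift is written as a multiply so the
 * sketch stays well-defined in C for negative taps; the real code uses <<. */
static int tap_vec_madds_model(int pixel, int coeff)
{
    int pre = coeff * (1 << (15 - FILTER_BITS));
    return (pixel * pre) >> 15;
}

int main(void)
{
    /* 8-bit pixels against signed taps of up to 8 bits of magnitude: both
     * formulations agree (assuming the usual arithmetic right shift), which
     * is why the <<7 pre-shift compensates for vec_madds' >>15. */
    for (int pixel = 0; pixel < 256; pixel++)
        for (int coeff = -128; coeff <= 255; coeff++)
            assert(tap_scalar(pixel, coeff) == tap_vec_madds_model(pixel, coeff));
    return 0;
}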
@@ -278,137 +282,10 @@
         dst_width--;
     }
     emms();
 }
 #endif /* HAVE_MMX */
-
-#ifdef HAVE_ALTIVEC
-typedef union {
-    vector unsigned char v;
-    unsigned char c[16];
-} vec_uc_t;
-
-typedef union {
-    vector signed short v;
-    signed short s[8];
-} vec_ss_t;
-
-void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
-                          int wrap, int16_t *filter)
-{
-    int sum, i;
-    const uint8_t *s;
-    vector unsigned char *tv, tmp, dstv, zero;
-    vec_ss_t srchv[4], srclv[4], fv[4];
-    vector signed short zeros, sumhv, sumlv;
-    s = src;
-
-    for(i=0;i<4;i++)
-    {
-        /*
-           The vec_madds later on does an implicit >>15 on the result.
-           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
-           a signed short, we have just enough bits to pre-shift our
-           filter constants <<7 to compensate for vec_madds.
-        */
-        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
-        fv[i].v = vec_splat(fv[i].v, 0);
-    }
-
-    zero = vec_splat_u8(0);
-    zeros = vec_splat_s16(0);
-
-
-    /*
-       When we're resampling, we'd ideally like both our input buffers,
-       and output buffers to be 16-byte aligned, so we can do both aligned
-       reads and writes. Sadly we can't always have this at the moment, so
-       we opt for aligned writes, as unaligned writes have a huge overhead.
-       To do this, do enough scalar resamples to get dst 16-byte aligned.
-    */
-    i = (-(int)dst) & 0xf;
-    while(i>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-        i--;
-    }
-
-    /* Do our altivec resampling on 16 pixels at once. */
-    while(dst_width>=16) {
-        /*
-           Read 16 (potentially unaligned) bytes from each of
-           4 lines into 4 vectors, and split them into shorts.
-           Interleave the multiply/accumulate for the resample
-           filter with the loads to hide the 3 cycle latency
-           the vec_madds have.
-        */
-        tv = (vector unsigned char *) &s[0 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
-        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
-        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
-
-        tv = (vector unsigned char *) &s[1 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
-        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
-        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
-
-        tv = (vector unsigned char *) &s[2 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
-        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
-        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
-
-        tv = (vector unsigned char *) &s[3 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
-        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
-        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
-
-        /*
-           Pack the results into our destination vector,
-           and do an aligned write of that back to memory.
-        */
-        dstv = vec_packsu(sumhv, sumlv) ;
-        vec_st(dstv, 0, (vector unsigned char *) dst);
-
-        dst+=16;
-        s+=16;
-        dst_width-=16;
-    }
-
-    /*
-       If there are any leftover pixels, resample them
-       with the slow scalar method.
-    */
-    while(dst_width>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-    }
-}
-#endif /* HAVE_ALTIVEC */
 
 /* slow version to handle limit cases. Does not need optimisation */
 static void h_resample_slow(uint8_t *dst, int dst_width,
                             const uint8_t *src, int src_width,
                             int src_start, int src_incr, int16_t *filters)