changeset 894:a408778eff87 libavcodec
altivec accelerated v-resample patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
| author   | michaelni                                          |
|----------|----------------------------------------------------|
| date     | Tue, 26 Nov 2002 09:21:01 +0000                    |
| parents  | 6faecb29b3b5                                       |
| children | 1a5926aeed5f                                       |
| files    | dsputil.h imgresample.c ppc/dsputil_ppc.c          |
| diffstat | 3 files changed, 143 insertions(+), 1 deletions(-) |
--- a/dsputil.h  Tue Nov 26 08:58:24 2002 +0000
+++ b/dsputil.h  Tue Nov 26 09:21:01 2002 +0000
@@ -190,6 +190,10 @@
 
 #elif defined(ARCH_POWERPC)
 
+#define MM_ALTIVEC    0x0001 /* standard AltiVec */
+
+extern int mm_flags;
+
 #define __align8 __attribute__ ((aligned (16)))
 
 void dsputil_init_ppc(DSPContext* c, unsigned mask);
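Note on usage: MM_ALTIVEC mirrors the existing x86-style MM_* capability bits in this header; callers test mm_flags at run time and fall back to the portable C routine when the bit is absent. Below is a minimal sketch of that dispatch pattern; scale_row_altivec() and scale_row_c() are hypothetical stand-ins, not functions added by this changeset.

#include <stdint.h>

extern int mm_flags;
#define MM_ALTIVEC 0x0001

/* hypothetical implementations, declared only so the sketch compiles */
void scale_row_altivec(uint8_t *dst, const uint8_t *src, int n);
void scale_row_c(uint8_t *dst, const uint8_t *src, int n);

static void scale_row(uint8_t *dst, const uint8_t *src, int n)
{
#ifdef HAVE_ALTIVEC
    if (mm_flags & MM_ALTIVEC) {         /* bit set once in dsputil_init_ppc() */
        scale_row_altivec(dst, src, n);  /* vector path, 16 pixels per pass    */
        return;
    }
#endif
    scale_row_c(dst, src, n);            /* portable scalar fallback           */
}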
--- a/imgresample.c  Tue Nov 26 08:58:24 2002 +0000
+++ b/imgresample.c  Tue Nov 26 09:21:01 2002 +0000
@@ -22,7 +22,7 @@
 #ifdef USE_FASTMEMCPY
 #include "fastmemcpy.h"
 #endif
-
+extern int mm_flags;
 
 #define NB_COMPONENTS 3
 
@@ -264,6 +264,133 @@
 }
 #endif
 
+#ifdef HAVE_ALTIVEC
+typedef union {
+    vector unsigned char v;
+    unsigned char c[16];
+} vec_uc_t;
+
+typedef union {
+    vector signed short v;
+    signed short s[8];
+} vec_ss_t;
+
+void v_resample16_altivec(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
+                          INT16 *filter)
+{
+    int sum, i;
+    uint8_t *s;
+    vector unsigned char *tv, tmp, dstv, zero;
+    vec_ss_t srchv[4], srclv[4], fv[4];
+    vector signed short zeros, sumhv, sumlv;
+    s = src;
+
+    for(i=0;i<4;i++)
+    {
+        /*
+           The vec_madds later on does an implicit >>15 on the result.
+           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
+           a signed short, we have just enough bits to pre-shift our
+           filter constants <<7 to compensate for vec_madds.
+        */
+        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
+        fv[i].v = vec_splat(fv[i].v, 0);
+    }
+
+    zero = vec_splat_u8(0);
+    zeros = vec_splat_s16(0);
+
+
+    /*
+       When we're resampling, we'd ideally like both our input buffers,
+       and output buffers to be 16-byte aligned, so we can do both aligned
+       reads and writes. Sadly we can't always have this at the moment, so
+       we opt for aligned writes, as unaligned writes have a huge overhead.
+       To do this, do enough scalar resamples to get dst 16-byte aligned.
+    */
+    i = (16-((int)dst) & 0xf) & 0xf;
+    while(i>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+        i--;
+    }
+
+    /* Do our altivec resampling on 16 pixels at once. */
+    while(dst_width>=16) {
+        /*
+           Read 16 (potentially unaligned) bytes from each of
+           4 lines into 4 vectors, and split them into shorts.
+           Interleave the multiply/accumulate for the resample
+           filter with the loads to hide the 3 cycle latency
+           the vec_madds have.
+        */
+        tv = (vector unsigned char *) &s[0 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap]));
+        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
+        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
+
+        tv = (vector unsigned char *) &s[1 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
+        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
+        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
+
+        tv = (vector unsigned char *) &s[2 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
+        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
+        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
+
+        tv = (vector unsigned char *) &s[3 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
+        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
+        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
+
+        /*
+           Pack the results into our destination vector,
+           and do an aligned write of that back to memory.
+        */
+        dstv = vec_packsu(sumhv, sumlv);
+        vec_st(dstv, 0, (vector unsigned char *) dst);
+
+        dst+=16;
+        s+=16;
+        dst_width-=16;
+    }
+
+    /*
+       If there are any leftover pixels, resample them
+       with the slow scalar method.
+    */
+    while(dst_width>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+    }
+}
+#endif
+
 /* slow version to handle limit cases. Does not need optimisation */
 static void h_resample_slow(UINT8 *dst, int dst_width,
                             UINT8 *src, int src_width, int src_start, int src_incr, INT16 *filters)
@@ -384,6 +511,13 @@
                        &s->v_filters[phase_y][0]);
     else
 #endif
+#ifdef HAVE_ALTIVEC
+    if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS == 8)
+        v_resample16_altivec(output, owidth,
+                       s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
+                       &s->v_filters[phase_y][0]);
+    else
+#endif
     v_resample(output, owidth,
                s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                &s->v_filters[phase_y][0]);
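The least obvious piece of v_resample16_altivec() is the coefficient pre-shift in the first loop. vec_madds applies an implicit, rounded >>15 to each 16-bit product, whereas the scalar v_resample() shifts its accumulated sum right by FILTER_BITS (8); shifting each filter coefficient left by 15-FILTER_BITS = 7 up front brings the vector taps back to the same scale. The following plain-C model of a single tap, with made-up pixel and coefficient values (not taken from the patch), shows the correspondence; the two can differ by at most one per tap, since vec_madds rounds per tap while the scalar path truncates once on the final sum.

#include <stdio.h>

#define FILTER_BITS 8

/* one tap as computed by the scalar v_resample() loop */
static int tap_scalar(int pixel, int coeff)
{
    return (pixel * coeff) >> FILTER_BITS;
}

/* one tap as computed on the AltiVec path: the coefficient is
 * pre-shifted <<(15-FILTER_BITS), then vec_madds applies a rounded
 * >>15 (modelled here in plain C) */
static int tap_madds_model(int pixel, int coeff)
{
    int pre = coeff << (15 - FILTER_BITS);   /* what fv[i].s[0] holds    */
    return (pixel * pre + 0x4000) >> 15;     /* vmhraddshs: round, >>15  */
}

int main(void)
{
    int pixel = 200, coeff = 64;             /* arbitrary example values */
    printf("scalar tap: %d, vec_madds model: %d\n",
           tap_scalar(pixel, coeff), tap_madds_model(pixel, coeff));
    return 0;
}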
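Also worth spelling out is the alignment prologue. vec_st needs a 16-byte-aligned destination, so the function first resamples (16 - (dst mod 16)) mod 16 pixels with scalar code, after which every vector store lands on an aligned address. A tiny stand-alone check of that index arithmetic follows, written with uintptr_t instead of the patch's (int) cast and using example addresses only.

#include <stdio.h>
#include <stdint.h>

/* Number of scalar pixels resampled before dst reaches a 16-byte boundary.
 * Equivalent to the patch's "i = (16-((int)dst) & 0xf) & 0xf;", written
 * here with uintptr_t so the sketch is portable to 64-bit hosts. */
static int leading_scalar_pixels(uintptr_t dst_addr)
{
    return (int)((16 - (dst_addr & 0xf)) & 0xf);
}

int main(void)
{
    /* example addresses only: aligned, 1 byte past, 15 bytes past a boundary */
    uintptr_t addrs[] = { 0x1000, 0x1001, 0x100f };
    for (int i = 0; i < 3; i++)
        printf("dst = %#lx -> %d scalar pixels first\n",
               (unsigned long)addrs[i], leading_scalar_pixels(addrs[i]));
    return 0;
}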
--- a/ppc/dsputil_ppc.c  Tue Nov 26 08:58:24 2002 +0000
+++ b/ppc/dsputil_ppc.c  Tue Nov 26 09:21:01 2002 +0000
@@ -23,6 +23,8 @@
 #include "dsputil_altivec.h"
 #endif
 
+int mm_flags = 0;
+
 void dsputil_init_ppc(DSPContext* c, unsigned mask)
 {
     // Common optimisations whether Altivec or not
@@ -31,6 +33,8 @@
 
 #if HAVE_ALTIVEC
     if (has_altivec()) {
+        mm_flags |= MM_ALTIVEC;
+
         // Altivec specific optimisations
         c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
         c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
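mm_flags is filled in once, when dsputil_init_ppc() runs, so the check in imgresample.c is a single bitmask test per call. has_altivec() itself is declared in dsputil_altivec.h and implemented elsewhere in the tree; as background, one common way to probe for a vector unit on Mac OS X of this era is the hw.vectorunit sysctl, sketched below as an illustration of the general technique rather than a claim about how this tree implements it.

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

/* Darwin-only sketch: ask the kernel whether the CPU has a vector unit.
 * Other systems of the era typically probed by executing an AltiVec
 * instruction under a SIGILL handler instead. */
static int probe_altivec(void)
{
    int sels[2] = { CTL_HW, HW_VECTORUNIT };
    int has_vu = 0;
    size_t len = sizeof(has_vu);

    if (sysctl(sels, 2, &has_vu, &len, NULL, 0) == 0)
        return has_vu != 0;
    return 0;   /* unknown or no vector unit: stay on the scalar path */
}

int main(void)
{
    printf("AltiVec available: %d\n", probe_altivec());
    return 0;
}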