# HG changeset patch
# User lu_zero
# Date 1145994842 0
# Node ID b2a0cb3561b323503abbe8929b1ccc6f4dd31a5a
# Parent  e5349b5289d41d8c178ea567dbc5c70b98d68ad7
13% faster inner_add_yblock

diff -r e5349b5289d4 -r b2a0cb3561b3 ppc/dsputil_snow_altivec.c
--- a/ppc/dsputil_snow_altivec.c	Tue Apr 25 17:29:31 2006 +0000
+++ b/ppc/dsputil_snow_altivec.c	Tue Apr 25 19:54:02 2006 +0000
@@ -413,6 +413,96 @@
     }
 }
 
+#define LOAD_BLOCKS \
+        tmp1 = vec_ld(0, &block[3][y*src_stride]);\
+        align = vec_lvsl(0, &block[3][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[3][y*src_stride]);\
+\
+        b3 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, &block[2][y*src_stride]);\
+        align = vec_lvsl(0, &block[2][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[2][y*src_stride]);\
+\
+        b2 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, &block[1][y*src_stride]);\
+        align = vec_lvsl(0, &block[1][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[1][y*src_stride]);\
+\
+        b1 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, &block[0][y*src_stride]);\
+        align = vec_lvsl(0, &block[0][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[0][y*src_stride]);\
+\
+        b0 = vec_perm(tmp1,tmp2,align);
+
+#define LOAD_OBMCS \
+        tmp1 = vec_ld(0, obmc1);\
+        align = vec_lvsl(0, obmc1);\
+        tmp2 = vec_ld(15, obmc1);\
+\
+        ob1 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, obmc2);\
+        align = vec_lvsl(0, obmc2);\
+        tmp2 = vec_ld(15, obmc2);\
+\
+        ob2 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, obmc3);\
+        align = vec_lvsl(0, obmc3);\
+        tmp2 = vec_ld(15, obmc3);\
+\
+        ob3 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, obmc4);\
+        align = vec_lvsl(0, obmc4);\
+        tmp2 = vec_ld(15, obmc4);\
+\
+        ob4 = vec_perm(tmp1,tmp2,align);
+
+/* interleave logic
+ * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
+ * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
+ * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
+ */
+
+#define STEPS_0_1\
+        h1 = (vector unsigned short)\
+            vec_mergeh(ob1, ob2);\
+\
+        h2 = (vector unsigned short)\
+            vec_mergeh(ob3, ob4);\
+\
+        ih = (vector unsigned char)\
+            vec_mergeh(h1,h2);\
+\
+        l1 = (vector unsigned short) vec_mergeh(b3, b2);\
+\
+        ih1 = (vector unsigned char) vec_mergel(h1, h2);\
+\
+        l2 = (vector unsigned short) vec_mergeh(b1, b0);\
+\
+        il = (vector unsigned char) vec_mergeh(l1, l2);\
+\
+        v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
+\
+        il1 = (vector unsigned char) vec_mergel(l1, l2);\
+\
+        v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
+
+#define FINAL_STEP_SCALAR\
+        for(x=0; x<b_w; x++)\
+            if(add){\
+                vbuf[x] += dst[x + src_x];\
+                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
+                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
+                dst8[x + y*src_stride] = vbuf[x];\
+            }else{\
+                dst[x + src_x] -= vbuf[x];\
+            }
+
 static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                              const int obmc_stride,
@@ -423,11 +513,13 @@
 {
     int y, x;
     DWTELEM * dst;
-//    vector bool int mask;
-//    vector signed int vs;
     vector unsigned short h1, h2, l1, l2;
-    vector unsigned char ih, il, tmp1, tmp2, align;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
     vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
+
+    DECLARE_ALIGNED_16(int, vbuf[16]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
 
     for(y=0; y<b_h; y++){
         //FIXME ugly misuse of obmc_stride
         uint8_t *obmc1= obmc + y*obmc_stride;
         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
-#if 1
-        vector unsigned char ob1;
-        vector unsigned char ob2;
-        vector unsigned char ob3;
-        vector unsigned char ob4;
-
-#endif
-        DECLARE_ALIGNED_16(int, vbuf[16]);
-        vector signed int *v = (vector signed int *)vbuf, *d;
 
         dst = slice_buffer_get_line(sb, src_y + y);
         d = (vector signed int *)(dst + src_x);
-#if 0
-        for(x=0; x<b_w; x++)
-            if(add){
-                vbuf[x] += dst[x + src_x];
-                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
-                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
-                dst8[x + y*src_stride] = vbuf[x];
-            }else{
-                dst[x + src_x] -= vbuf[x];
-            }
-#else
-        if(add)
-        {
-            for(x=0; x<b_w/4; x++)
-            {
-                v[x] = vec_add(v[x], d[x]);
-                v[x] = vec_sra(vec_add(v[x],
-                                       vec_sl( vec_splat_s32(1),
-                                               vec_splat_u32(7))),
-                               vec_splat_u32(8));
-            }
-
-            for(x=0; x<b_w; x++)
-                dst8[x + y*src_stride] = vbuf[x];
-        }
-        else
-            for(x=0; x<b_w/4; x++)
-                d[x] = vec_sub(d[x], v[x]);
-#endif
+
+        // load blocks
+        LOAD_BLOCKS
+
+        // load obmcs
+        LOAD_OBMCS
+
+        // steps 0 1
+        STEPS_0_1
+
+        FINAL_STEP_SCALAR
+
     }
 }
 
+#define STEPS_2_3\
+        h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
+\
+        h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
+\
+        ih = (vector unsigned char) vec_mergeh(h1,h2);\
+\
+        l1 = (vector unsigned short) vec_mergel(b3, b2);\
+\
+        ih1 = (vector unsigned char) vec_mergel(h1,h2);\
+\
+        l2 = (vector unsigned short) vec_mergel(b1, b0);\
+\
+        il = (vector unsigned char) vec_mergeh(l1,l2);\
+\
+        v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
+\
+        il1 = (vector unsigned char) vec_mergel(l1,l2);\
+\
+        v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
+
+static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
+                                             const int obmc_stride,
+                                             uint8_t * * block, int b_w,
+                                             int b_h, int src_x, int src_y,
+                                             int src_stride, slice_buffer * sb,
+                                             int add, uint8_t * dst8)
+{
+    int y, x;
+    DWTELEM * dst;
+    vector unsigned short h1, h2, l1, l2;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
+    vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
+
+    DECLARE_ALIGNED_16(int, vbuf[b_w]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
+
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+
+        dst = slice_buffer_get_line(sb, src_y + y);
+        d = (vector signed int *)(dst + src_x);
+
+        // load blocks
+        LOAD_BLOCKS
+
+        // load obmcs
+        LOAD_OBMCS
+
+        // steps 0 1 2 3
+        STEPS_0_1
+
+        STEPS_2_3
+
+        FINAL_STEP_SCALAR
+
+    }
+}
+
+#define FINAL_STEP_VEC \
+\
+    if(add)\
+        {\
+            for(x=0; x<b_w/4; x++)\
+                {\
+                    v[x] = vec_add(v[x], d[x]);\
+                    v[x] = vec_sra(vec_add(v[x],\
+                                           vec_sl( vec_splat_s32(1),\
+                                                   vec_splat_u32(7))),\
+                                   vec_splat_u32(8));\
+\
+                    mask = (vector bool int) vec_sl((vector signed int)\
+                                                    vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
+                    mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\
+\
+                    mask = (vector bool int)\
+                        vec_cmpeq((vector signed int)mask,\
+                                  (vector signed int)vec_splat_u32(0));\
+\
+                    vs = vec_sra(v[x],vec_splat_u32(8));\
+                    vs = vec_sra(v[x],vec_splat_u32(15));\
+\
+                    vs = vec_nor(vs,vs);\
+\
+                    v[x]= vec_sel(v[x],vs,mask);\
+                }\
+\
+            for(x=0; x<b_w; x++)\
+                dst8[x + y*src_stride] = vbuf[x];\
+\
+        }\
+    else\
+        for(x=0; x<b_w/4; x++)\
+            d[x] = vec_sub(d[x], v[x]);
+
+static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
+                                             const int obmc_stride,
+                                             uint8_t * * block, int b_w,
+                                             int b_h, int src_x, int src_y,
+                                             int src_stride, slice_buffer * sb,
+                                             int add, uint8_t * dst8)
+{
+    int y, x;
+    DWTELEM * dst;
+    vector bool int mask;
+    vector signed int vs;
+    vector unsigned short h1, h2, l1, l2;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
+    vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
+
+    DECLARE_ALIGNED_16(int, vbuf[16]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
+
     for(y=0; y<b_h; y++){
         //FIXME ugly misuse of obmc_stride
         uint8_t *obmc1= obmc + y*obmc_stride;
         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
-        vector unsigned char ob1;
-        vector unsigned char ob2;
-        vector unsigned char ob3;
-        vector unsigned char ob4;
+
+        dst = slice_buffer_get_line(sb, src_y + y);
+        d = (vector signed int *)(dst + src_x);
+
+//FIXME i could avoid some loads!
+
+        // load blocks
+        LOAD_BLOCKS
+
+        // load obmcs
+        LOAD_OBMCS
+
+        // steps 0 1
+        STEPS_0_1
+
+        FINAL_STEP_VEC
+
+    }
+
+}
-        DECLARE_ALIGNED_16(int, vbuf[b_w]);
-        vector signed int *v = (vector signed int *)vbuf, *d;
+static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
+                                             const int obmc_stride,
+                                             uint8_t * * block, int b_w,
+                                             int b_h, int src_x, int src_y,
+                                             int src_stride, slice_buffer * sb,
+                                             int add, uint8_t * dst8)
+{
+    int y, x;
+    DWTELEM * dst;
+    vector bool int mask;
+    vector signed int vs;
+    vector unsigned short h1, h2, l1, l2;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
+    vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
+    DECLARE_ALIGNED_16(int, vbuf[b_w]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
+
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
 
         dst = slice_buffer_get_line(sb, src_y + y);
         d = (vector signed int *)(dst + src_x);
 
         // load blocks
+        LOAD_BLOCKS
-
-        tmp1 = vec_ld(0, &block[3][y*src_stride]);
-        align = vec_lvsl(0, &block[3][y*src_stride]);
-        tmp2 = vec_ld(15, &block[3][y*src_stride]);
-
-        b3 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, &block[2][y*src_stride]);
-        align = vec_lvsl(0, &block[2][y*src_stride]);
-        tmp2 = vec_ld(15, &block[2][y*src_stride]);
-
-        b2 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, &block[1][y*src_stride]);
-        align = vec_lvsl(0, &block[1][y*src_stride]);
-        tmp2 = vec_ld(15, &block[1][y*src_stride]);
-
-        b1 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, &block[0][y*src_stride]);
-        align = vec_lvsl(0, &block[0][y*src_stride]);
-        tmp2 = vec_ld(15, &block[0][y*src_stride]);
-
-        b0 = vec_perm(tmp1,tmp2,align);
-
-        // load obmcs
-
-        tmp1 = vec_ld(0, obmc1);
-        align = vec_lvsl(0, obmc1);
-        tmp2 = vec_ld(15, obmc1);
-
-        ob1 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, obmc2);
-        align = vec_lvsl(0, obmc2);
-        tmp2 = vec_ld(15, obmc2);
-
-        ob2 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, obmc3);
-        align = vec_lvsl(0, obmc3);
-        tmp2 = vec_ld(15, obmc3);
-
-        ob3 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, obmc4);
-        align = vec_lvsl(0, obmc4);
-        tmp2 = vec_ld(15, obmc4);
-
-        ob4 = vec_perm(tmp1,tmp2,align);
-
-//step0
-        h1 = (vector unsigned short)
-            vec_mergeh(ob1, ob2); /*h1 <- [ a,b,a,b,
-                                            a,b,a,b,
-                                            a,b,a,b,
-                                            a,b,a,b ] */
-        h2 = (vector unsigned short)
-            vec_mergeh(ob3, ob4); /*h2 <- [ c,d,c,d,
-                                            c,d,c,d,
-                                            c,d,c,d,
-                                            c,d,c,d ] */
-
-        ih = (vector unsigned char)
-            vec_mergeh(h1,h2);    /*ih <- [ a,b,c,d,
-                                            a,b,c,d,
-                                            a,b,c,d,
-                                            a,b,c,d ]*/
-
-        l1 = (vector unsigned short) vec_mergeh(b3, b2);
-
-        l2 = (vector unsigned short) vec_mergeh(b1, b0);
-
-        il = (vector unsigned char) vec_mergeh(l1,l2);
-
-        v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-//step1
-
-        h1 = (vector unsigned short) vec_mergeh(ob1, ob2);
-
-        h2 = (vector unsigned short) vec_mergeh(ob3, ob4);
+        // load obmcs
+        LOAD_OBMCS
-
-        ih = (vector unsigned char) vec_mergel(h1,h2);
-
-        l1 = (vector unsigned short) vec_mergeh(b3, b2);
-
-        l2 = (vector unsigned short) vec_mergeh(b1, b0);
-
-        il = (vector unsigned char) vec_mergel(l1,l2);
-
-        v[1] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-
-//step2
-        h1 = (vector unsigned short) vec_mergel(ob1, ob2);
-
-        h2 = (vector unsigned short) vec_mergel(ob3, ob4);
-
-        ih = (vector unsigned char) vec_mergeh(h1,h2);
-
-        l1 = (vector unsigned short) vec_mergel(b3, b2);
-
-        l2 = (vector unsigned short) vec_mergel(b1, b0);
-
-        il = (vector unsigned char) vec_mergeh(l1,l2);
-
-        v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-
-//step3
-        h1 = (vector unsigned short) vec_mergel(ob1, ob2);
-
-        h2 = (vector unsigned short) vec_mergel(ob3, ob4);
-
-        ih = (vector unsigned char) vec_mergel(h1,h2);
-
-        l1 = (vector unsigned short) vec_mergel(b3, b2);
-
-        l2 = (vector unsigned short) vec_mergel(b1, b0);
-
-        il = (vector unsigned char) vec_mergel(l1,l2);
+        // steps 0 1 2 3
+        STEPS_0_1
-
-        v[3] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-#if 1
-        for(x=0; x<b_w; x++)
-            if(add){
-                vbuf[x] += dst[x + src_x];
-                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
-                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
-                dst8[x + y*src_stride] = vbuf[x];
-            }else{
-                dst[x + src_x] -= vbuf[x];
-            }
-#else
-        if(add)
-        {
-            for(x=0; x<b_w/4; x++)
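The gain claimed in the commit message comes from two changes visible above: the unaligned loads are hoisted into LOAD_BLOCKS/LOAD_OBMCS (the vec_ld + vec_lvsl + vec_perm idiom, one per source), and STEPS_0_1 derives both v[0] and v[1] from a single pair of vec_mergeh results (h1, h2), where the removed //step0 and //step1 code recomputed those merges for every step. After the merges place the weight bytes [ob1,ob2,ob3,ob4] and the prediction bytes [b3,b2,b1,b0] side by side, each vec_msum lane forms a complete 4-tap OBMC sum in one instruction. As a reading aid, here is a minimal scalar model of what one row computes, assuming the usual snow pairing of obmc weights to blocks; the function name and parameter layout are illustrative, not part of the patch:

    #include <stdint.h>

    /* Scalar model of one row of inner_add_yblock: every output sample is a
     * 4-tap weighted sum of the four overlapping block predictions, which is
     * exactly what one vec_msum() lane computes after the merge steps. */
    static void model_row(const uint8_t *obmc1, const uint8_t *obmc2,
                          const uint8_t *obmc3, const uint8_t *obmc4,
                          uint8_t *const *block, int b_w, int y,
                          int src_stride, int *vbuf)
    {
        int x;
        for (x = 0; x < b_w; x++)
            vbuf[x] = obmc1[x] * block[3][x + y*src_stride]
                    + obmc2[x] * block[2][x + y*src_stride]
                    + obmc3[x] * block[1][x + y*src_stride]
                    + obmc4[x] * block[0][x + y*src_stride];
        /* In the add case, FINAL_STEP_SCALAR then adds dst[], rounds by
         * FRAC_BITS and clamps to 0..255 before storing into dst8[];
         * otherwise it subtracts vbuf[] from dst[]. */
    }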