diff alpha/dsputil_alpha.c @ 509:cab79946302f libavcodec
Implement put_pixels_clamped and add_pixels_clamped in assembly. This
allows better scheduling of the memory accesses and is portable across
all compilers.
author   | mellum
date     | Mon, 01 Jul 2002 04:26:07 +0000
parents  | 7a976bf93394
children | fa4425cf6b31
--- a/alpha/dsputil_alpha.c	Sat Jun 29 15:01:23 2002 +0000
+++ b/alpha/dsputil_alpha.c	Mon Jul 01 04:26:07 2002 +0000
@@ -22,64 +22,86 @@
 
 void simple_idct_axp(DCTELEM *block);
 
-static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes. */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                                   int line_size)
 {
     int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
 
     ASM_ACCEPT_MVI;
 
     do {
-        UINT64 shorts;
+        uint64_t shorts0, shorts1;
 
-        shorts = ldq(block);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels);
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
 
-        shorts = ldq(block + 4);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
 
-        pixels += line_size;
-        block += 8;
+        pixels += line_size;
+        block += 8;
     } while (--i);
 }
 
-static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                            int line_size)
 {
-    int i = 8;
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-). */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask = zap(-1, 0x33);
+    signmask ^= signmask >> 1; /* 0x8000800080008000 */
 
     ASM_ACCEPT_MVI;
 
     do {
-        UINT64 shorts;
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
 
-        shorts = ldq(block);
-        shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
-        shorts += unpkbw(ldl(pixels));
-        shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
-        shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
-        shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
-        shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
-        stl(pkwb(shorts), pixels);
+        pix0 = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw). */
+        signs0 = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
 
-        /* next 4 */
-        shorts = ldq(block + 4);
-        shorts &= ~WORD_VEC(0x8000);
-        shorts += unpkbw(ldl(pixels + 4));
-        shorts &= ~WORD_VEC(0x8000);
-        shorts = minuw4(shorts, WORD_VEC(0x4000));
-        shorts &= ~WORD_VEC(0x4000);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        /* Next 4. */
+        pix1 = unpkbw(ldl(pixels + 4));
+        signs1 = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
 
-        pixels += line_size;
-        block += 8;
-    } while (--i);
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
 }
+#endif
 
 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
    Since the immediate result could be greater than 255, we do the
@@ -222,7 +244,7 @@
 
     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
-        put_pixels_clamped = put_pixels_clamped_axp;
-        add_pixels_clamped = add_pixels_clamped_axp;
+        put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        add_pixels_clamped = add_pixels_clamped_mvi_asm;
     }
 }
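
The reference C kept under #if 0 above relies on one trick worth spelling out: the sign bit of every 16-bit lane is saved and cleared before the 64-bit add, so no carry can cross a lane boundary (the pixel lanes are zero-extended bytes, at most 0x00ff), and XORing the saved sign bits back in restores the correct per-lane signed sums, which maxsw4/minsw4 then clamp to 0..255. The sketch below restates that arithmetic in portable C for checking on any host; it is an illustration only, the helper name paddw_clamp_u8 and the scalar clamp loop are invented for this example and are not part of the patch, which does the clamp with the Alpha MVI maxsw4/minsw4 instructions.

    /* Minimal, self-contained sketch (not from the patch): emulate the
     * per-word signed add and 0..255 clamp of add_pixels_clamped_mvi in
     * portable C.  paddw_clamp_u8 is a made-up name for this example. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t paddw_clamp_u8(uint64_t blocks, uint64_t pix)
    {
        const uint64_t signmask = 0x8000800080008000ULL;

        /* Save and clear the sign bit of each 16-bit lane.  Because the
         * pixel lanes are zero-extended bytes (<= 0x00ff), the add below
         * cannot carry out of a lane, and XORing the saved sign bits back
         * in yields the correct low 16 bits of each signed sum. */
        uint64_t signs = blocks & signmask;
        uint64_t sum   = (blocks & ~signmask) + pix;
        sum ^= signs;

        /* Clamp every lane to 0..255 (one maxsw4 and one minsw4 on Alpha;
         * done here with a scalar loop for clarity). */
        uint64_t out = 0;
        for (int i = 0; i < 4; i++) {
            int16_t v = (int16_t)(sum >> (16 * i));
            if (v < 0)   v = 0;
            if (v > 255) v = 255;
            out |= (uint64_t)(uint16_t)v << (16 * i);
        }
        return out;
    }

    int main(void)
    {
        /* Lanes, low to high: -2, 300, 10, -40 added to pixels 3, 0, 200, 100. */
        uint64_t blocks = 0xffd8000a012cfffeULL;
        uint64_t pix    = 0x006400c800000003ULL;
        /* Prints 003c00d200ff0001, i.e. lanes 1, 255 (clamped from 300), 210, 60. */
        printf("%016llx\n", (unsigned long long)paddw_clamp_u8(blocks, pix));
        return 0;
    }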