# HG changeset patch # User nickols_k # Date 1011538082 0 # Node ID 73df666cacc7f7d21f3860d8e0d6425510ff075f # Parent e80ad397d30e0f9aee25c33386ec64251a2776ee Alpha optimizations by Falk Hueffner diff -r e80ad397d30e -r 73df666cacc7 Makefile --- a/Makefile Sun Jan 20 14:30:34 2002 +0000 +++ b/Makefile Sun Jan 20 14:48:02 2002 +0000 @@ -37,6 +37,12 @@ CFLAGS += $(MLIB_INC) endif +# alpha specific stuff +ifeq ($(TARGET_ARCH_ALPHA),yes) +OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o +CFLAGS += -Wa,-mpca56 +endif + SRCS = $(OBJS:.o=.c) $(ASM_OBJS:.o=.s) LIB= libavcodec.a @@ -74,6 +80,7 @@ rm -f *.o *~ $(LIB) $(SLIB) *.so i386/*.o i386/*~ \ armv4l/*.o armv4l/*~ \ mlib/*.o mlib/*~ \ + alpha/*.o alpha/*~ \ libac3/*.o libac3/*~ \ apiexample $(TESTS) diff -r e80ad397d30e -r 73df666cacc7 alpha/asm.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alpha/asm.h Sun Jan 20 14:48:02 2002 +0000 @@ -0,0 +1,141 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef LIBAVCODEC_ALPHA_ASM_H +#define LIBAVCODEC_ALPHA_ASM_H + +#include <inttypes.h> + +#define AMASK_BWX (1 << 0) +#define AMASK_FIX (1 << 1) +#define AMASK_MVI (1 << 8) + +static inline uint64_t BYTE_VEC(uint64_t x) +{ + x |= x << 8; + x |= x << 16; + x |= x << 32; + return x; +} +static inline uint64_t WORD_VEC(uint64_t x) +{ + x |= x << 16; + x |= x << 32; + return x; +} + +static inline int32_t ldl(const void* p) +{ + return *(const int32_t*) p; +} +static inline uint64_t ldq(const void* p) +{ + return *(const uint64_t*) p; +} +/* FIXME ccc doesn't seem to get it? Use inline asm? */ +static inline uint64_t ldq_u(const void* p) +{ + return *(const uint64_t*) ((uintptr_t) p & ~7ul); +} +static inline void stl(uint32_t l, void* p) +{ + *(uint32_t*) p = l; +} +static inline void stq(uint64_t l, void* p) +{ + *(uint64_t*) p = l; +} + +#ifdef __GNUC__ +#define OPCODE1(name) \ +static inline uint64_t name(uint64_t l) \ +{ \ + uint64_t r; \ + asm (#name " %1, %0" : "=r" (r) : "r" (l)); \ + return r; \ +} + +#define OPCODE2(name) \ +static inline uint64_t name(uint64_t l1, uint64_t l2) \ +{ \ + uint64_t r; \ + asm (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2)); \ + return r; \ +} + +/* We don't want gcc to move this around or combine it with another + rpcc, so mark it volatile.
*/ +static inline uint64_t rpcc(void) +{ + uint64_t r; + asm volatile ("rpcc %0" : "=r" (r)); + return r; +} + +static inline uint64_t uldq(const void* v) +{ + struct foo { + unsigned long l; + } __attribute__((packed)); + + return ((const struct foo*) v)->l; +} + +#elif defined(__DECC) /* Compaq "ccc" compiler */ + +#include <c_asm.h> +#define OPCODE1(name) \ +static inline uint64_t name(uint64_t l) \ +{ \ + return asm (#name " %a0, %v0", l); \ +} + +#define OPCODE2(name) \ +static inline uint64_t name(uint64_t l1, uint64_t l2) \ +{ \ + return asm (#name " %a0, %a1, %v0", l1, l2); \ +} + +static inline uint64_t rpcc(void) +{ + return asm ("rpcc %v0"); +} + +static inline uint64_t uldq(const void* v) +{ + return *(const __unaligned uint64_t *) v; +} + +#endif + +OPCODE1(amask); +OPCODE1(unpkbw); +OPCODE1(pkwb); +OPCODE2(extql); +OPCODE2(extqh); +OPCODE2(zap); +OPCODE2(cmpbge); +OPCODE2(minsw4); +OPCODE2(minuw4); +OPCODE2(minub8); +OPCODE2(maxsw4); +OPCODE2(maxuw4); +OPCODE2(perr); + +#endif /* LIBAVCODEC_ALPHA_ASM_H */ diff -r e80ad397d30e -r 73df666cacc7 alpha/dsputil_alpha.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alpha/dsputil_alpha.c Sun Jan 20 14:48:02 2002 +0000 @@ -0,0 +1,223 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */ + +#include "asm.h" +#include "../dsputil.h" + +void simple_idct_axp(DCTELEM *block); + +static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, + int line_size) +{ + int i = 8; + do { + UINT64 shorts; + + shorts = ldq(block); + shorts = maxsw4(shorts, 0); + shorts = minsw4(shorts, WORD_VEC(0x00ff)); + stl(pkwb(shorts), pixels); + + shorts = ldq(block + 4); + shorts = maxsw4(shorts, 0); + shorts = minsw4(shorts, WORD_VEC(0x00ff)); + stl(pkwb(shorts), pixels + 4); + + pixels += line_size; + block += 8; + } while (--i); +} + +static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, + int line_size) +{ + int i = 8; + do { + UINT64 shorts; + + shorts = ldq(block); + shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */ + shorts += unpkbw(ldl(pixels)); + shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e.g. -2 + 3 */ + shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */ + shorts &= ~WORD_VEC(0x4000); /* ...and zap them */ + shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */ + stl(pkwb(shorts), pixels); + + /* next 4 */ + shorts = ldq(block + 4); + shorts &= ~WORD_VEC(0x8000); + shorts += unpkbw(ldl(pixels + 4)); + shorts &= ~WORD_VEC(0x8000); + shorts = minuw4(shorts, WORD_VEC(0x4000)); + shorts &= ~WORD_VEC(0x4000); + shorts = minsw4(shorts, WORD_VEC(0x00ff)); + stl(pkwb(shorts), pixels + 4); + + pixels += line_size; + block += 8; + } while (--i); +} + +/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1 + Since the intermediate result could be greater than 255, we do the + shift first. The result is too low by one if the bytes were both + odd, so we need to add (l1 & l2) & BYTE_VEC(0x01).
*/ +static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2) +{ + UINT64 correction = (l1 & l2) & BYTE_VEC(0x01); + l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; + l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; + return l1 + l2 + correction; +} + +/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1 + The '1' only has an effect when one byte is even and the other odd, + i.e. on top of the both-odd term (l1 & l2) & BYTE_VEC(0x01) we must add (l1 ^ l2) & BYTE_VEC(0x01). + The sum of these two corrections is simply (l1 | l2) & BYTE_VEC(0x01). */ +static inline UINT64 avg2(UINT64 l1, UINT64 l2) +{ + UINT64 correction = (l1 | l2) & BYTE_VEC(0x01); + l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; + l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; + return l1 + l2 + correction; +} + +static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) +{ + UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) + + ((l2 & ~BYTE_VEC(0x03)) >> 2) + + ((l3 & ~BYTE_VEC(0x03)) >> 2) + + ((l4 & ~BYTE_VEC(0x03)) >> 2); + UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) + + (l2 & BYTE_VEC(0x03)) + + (l3 & BYTE_VEC(0x03)) + + (l4 & BYTE_VEC(0x03)) + + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); + return r1 + r2; +} + +static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) +{ + UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) + + ((l2 & ~BYTE_VEC(0x03)) >> 2) + + ((l3 & ~BYTE_VEC(0x03)) >> 2) + + ((l4 & ~BYTE_VEC(0x03)) >> 2); + UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) + + (l2 & BYTE_VEC(0x03)) + + (l3 & BYTE_VEC(0x03)) + + (l4 & BYTE_VEC(0x03)) + + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03); + return r1 + r2; +} + +#define PIXOPNAME(suffix) put ## suffix +#define BTYPE UINT8 +#define AVG2 avg2 +#define AVG4 avg4 +#define STORE(l, b) stq(l, b) +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#define PIXOPNAME(suffix) put_no_rnd ## suffix +#define BTYPE UINT8 +#define AVG2 avg2_no_rnd +#define AVG4 avg4_no_rnd +#define STORE(l, b) stq(l, b) +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +/* The following functions
are untested. */ +#if 0 + +#define PIXOPNAME(suffix) avg ## suffix +#define BTYPE UINT8 +#define AVG2 avg2 +#define AVG4 avg4 +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#define PIXOPNAME(suffix) avg_no_rnd ## suffix +#define BTYPE UINT8 +#define AVG2 avg2_no_rnd +#define AVG4 avg4_no_rnd +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#define PIXOPNAME(suffix) sub ## suffix +#define BTYPE DCTELEM +#define AVG2 avg2 +#define AVG4 avg4 +#define STORE(l, block) do { \ + UINT64 xxx = l; \ + (block)[0] -= (xxx >> 0) & 0xff; \ + (block)[1] -= (xxx >> 8) & 0xff; \ + (block)[2] -= (xxx >> 16) & 0xff; \ + (block)[3] -= (xxx >> 24) & 0xff; \ + (block)[4] -= (xxx >> 32) & 0xff; \ + (block)[5] -= (xxx >> 40) & 0xff; \ + (block)[6] -= (xxx >> 48) & 0xff; \ + (block)[7] -= (xxx >> 56) & 0xff; \ +} while (0) +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#endif + +void dsputil_init_alpha(void) +{ + put_pixels_tab[0] = put_pixels_axp; + put_pixels_tab[1] = put_pixels_x2_axp; + put_pixels_tab[2] = put_pixels_y2_axp; + put_pixels_tab[3] = put_pixels_xy2_axp; + + put_no_rnd_pixels_tab[0] = put_pixels_axp; + put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; + put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; + put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; + + /* amask clears all bits that correspond to present features. 
*/ + if (amask(AMASK_MVI) == 0) { + fprintf(stderr, "MVI extension detected\n"); + put_pixels_clamped = put_pixels_clamped_axp; + add_pixels_clamped = add_pixels_clamped_axp; + } +} diff -r e80ad397d30e -r 73df666cacc7 alpha/mpegvideo_alpha.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alpha/mpegvideo_alpha.c Sun Jan 20 14:48:02 2002 +0000 @@ -0,0 +1,88 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "asm.h" +#include "../dsputil.h" +#include "../mpegvideo.h" + +extern UINT8 zigzag_end[64]; + +static void dct_unquantize_h263_axp(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int i, level; + UINT64 qmul, qadd; + if (s->mb_intra) { + if (n < 4) + block[0] = block[0] * s->y_dc_scale; + else + block[0] = block[0] * s->c_dc_scale; + /* Catch up to aligned point. 
*/ + qmul = s->qscale << 1; + qadd = (s->qscale - 1) | 1; + for (i = 1; i < 4; ++i) { + level = block[i]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[i] = level; + } + } + block += 4; + i = 60 / 4; + } else { + i = zigzag_end[s->block_last_index[n]] / 4; + } + qmul = s->qscale << 1; + qadd = WORD_VEC((qscale - 1) | 1); + do { + UINT64 levels, negmask, zeromask, corr; + levels = ldq(block); + if (levels == 0) + continue; + zeromask = cmpbge(0, levels); + zeromask &= zeromask >> 1; + /* Negate all negative words. */ + negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */ + negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */ + corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */ + levels ^= negmask; + levels += corr; + + levels = levels * qmul; + levels += zap(qadd, zeromask); + + /* Re-negate negative words. */ + levels -= corr; + levels ^= negmask; + + stq(levels, block); + } while (block += 4, --i); +} + +void MPV_common_init_axp(MpegEncContext *s) +{ + if (amask(AMASK_MVI) == 0) { + if (s->out_format == FMT_H263) + s->dct_unquantize = dct_unquantize_h263_axp; + } +} diff -r e80ad397d30e -r 73df666cacc7 alpha/pixops.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alpha/pixops.h Sun Jan 20 14:48:02 2002 +0000 @@ -0,0 +1,135 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* This file is intended to be #included with proper definitions of + * PIXOPNAME, BTYPE, AVG2, AVG4 and STORE. */ + +static void PIXOPNAME(_pixels_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + do { + STORE(uldq(pixels), block); + pixels += line_size; + block += line_size; + } while (--h); + } else { + do { + STORE(ldq(pixels), block); + pixels += line_size; + block += line_size; + } while (--h); + } +} + +static void PIXOPNAME(_pixels_x2_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + do { + UINT64 pix1, pix2; + + pix1 = uldq(pixels); + pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + STORE(AVG2(pix1, pix2), block); + pixels += line_size; + block += line_size; + } while (--h); + } else { + do { + UINT64 pix1, pix2; + + pix1 = ldq(pixels); + pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + STORE(AVG2(pix1, pix2), block); + pixels += line_size; + block += line_size; + } while (--h); + } +} + +static void PIXOPNAME(_pixels_y2_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + UINT64 pix = uldq(pixels); + do { + UINT64 next_pix; + + pixels += line_size; + next_pix = uldq(pixels); + STORE(AVG2(pix, next_pix), block); + block += line_size; + pix = next_pix; + } while (--h); + } else { + UINT64 pix = ldq(pixels); + do { + UINT64 next_pix; + + pixels += line_size; + next_pix = ldq(pixels); + STORE(AVG2(pix, next_pix), block); + block += line_size; + pix = next_pix; + } while (--h); + } +} + +/* This could be further sped up by recycling AVG4 intermediate + results from the previous loop pass. 
*/ +static void PIXOPNAME(_pixels_xy2_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + UINT64 pix1 = uldq(pixels); + UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + + do { + UINT64 next_pix1, next_pix2; + + pixels += line_size; + next_pix1 = uldq(pixels); + next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); + + STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block); + + block += line_size; + pix1 = next_pix1; + pix2 = next_pix2; + } while (--h); + } else { + UINT64 pix1 = ldq(pixels); + UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + + do { + UINT64 next_pix1, next_pix2; + + pixels += line_size; + next_pix1 = ldq(pixels); + next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); + + STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block); + + block += line_size; + pix1 = next_pix1; + pix2 = next_pix2; + } while (--h); + } +} diff -r e80ad397d30e -r 73df666cacc7 dsputil.c --- a/dsputil.c Sun Jan 20 14:30:34 2002 +0000 +++ b/dsputil.c Sun Jan 20 14:48:02 2002 +0000 @@ -497,6 +497,10 @@ dsputil_init_mlib(); use_permuted_idct = 0; #endif +#ifdef ARCH_ALPHA + dsputil_init_alpha(); + use_permuted_idct = 0; +#endif #ifdef SIMPLE_IDCT if(ff_idct == simple_idct) use_permuted_idct=0; diff -r e80ad397d30e -r 73df666cacc7 dsputil.h --- a/dsputil.h Sun Jan 20 14:30:34 2002 +0000 +++ b/dsputil.h Sun Jan 20 14:48:02 2002 +0000 @@ -123,6 +123,13 @@ void dsputil_init_mlib(void); +#elif defined(ARCH_ALPHA) + +#define emms_c() +#define __align8 __attribute__ ((aligned (8))) + +void dsputil_init_alpha(void); + #else #define emms_c() diff -r e80ad397d30e -r 73df666cacc7 msmpeg4.c --- a/msmpeg4.c Sun Jan 20 14:30:34 2002 +0000 +++ b/msmpeg4.c Sun Jan 20 14:48:02 2002 +0000 @@ -460,7 +460,19 @@ : "r" (scale) : "%eax", "%edx" ); -#else +#elif defined (ARCH_ALPHA) + /* Divisions are extremely costly on Alpha; optimize the most + common case. 
*/ + if (scale == 8) { + a = (a + (8 >> 1)) / 8; + b = (b + (8 >> 1)) / 8; + c = (c + (8 >> 1)) / 8; + } else { + a = (a + (scale >> 1)) / scale; + b = (b + (scale >> 1)) / scale; + c = (c + (scale >> 1)) / scale; + } +#else a = (a + (scale >> 1)) / scale; b = (b + (scale >> 1)) / scale; c = (c + (scale >> 1)) / scale; diff -r e80ad397d30e -r 73df666cacc7 simple_idct.c --- a/simple_idct.c Sun Jan 20 14:30:34 2002 +0000 +++ b/simple_idct.c Sun Jan 20 14:48:02 2002 +0000 @@ -23,6 +23,7 @@ #include #include "simple_idct.h" +#include "../config.h" #if 0 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ @@ -102,6 +103,107 @@ return 1; } +#ifdef ARCH_ALPHA +static int inline idctRowCondDC(int16_t *row) +{ + int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; + uint64_t *lrow = (uint64_t *) row; + + if (lrow[1] == 0) { + if (lrow[0] == 0) + return 0; + if ((lrow[0] & ~0xffffULL) == 0) { + uint64_t v; + + a0 = W4 * row[0]; + a0 += 1 << (ROW_SHIFT - 1); + a0 >>= ROW_SHIFT; + v = (uint16_t) a0; + v += v << 16; + v += v << 32; + lrow[0] = v; + lrow[1] = v; + + return 1; + } + } + + a0 = W4 * row[0]; + a1 = W4 * row[0]; + a2 = W4 * row[0]; + a3 = W4 * row[0]; + + if (row[2]) { + a0 += W2 * row[2]; + a1 += W6 * row[2]; + a2 -= W6 * row[2]; + a3 -= W2 * row[2]; + } + + if (row[4]) { + a0 += W4 * row[4]; + a1 -= W4 * row[4]; + a2 -= W4 * row[4]; + a3 += W4 * row[4]; + } + + if (row[6]) { + a0 += W6 * row[6]; + a1 -= W2 * row[6]; + a2 += W2 * row[6]; + a3 -= W6 * row[6]; + } + + a0 += 1 << (ROW_SHIFT - 1); + a1 += 1 << (ROW_SHIFT - 1); + a2 += 1 << (ROW_SHIFT - 1); + a3 += 1 << (ROW_SHIFT - 1); + + if (row[1]) { + b0 = W1 * row[1]; + b1 = W3 * row[1]; + b2 = W5 * row[1]; + b3 = W7 * row[1]; + } else { + b0 = 0; + b1 = 0; + b2 = 0; + b3 = 0; + } + + if (row[3]) { + b0 += W3 * row[3]; + b1 -= W7 * row[3]; + b2 -= W1 * row[3]; + b3 -= W5 * row[3]; + } + + if (row[5]) { + b0 += W5 * row[5]; + b1 -= W1 * row[5]; + b2 += W7 * row[5]; + b3 += W3 * row[5]; + } + + if (row[7]) { + b0 += W7 * 
row[7]; + b1 -= W5 * row[7]; + b2 += W3 * row[7]; + b3 -= W1 * row[7]; + } + + row[0] = (a0 + b0) >> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; + + return 1; +} +#else /* not ARCH_ALPHA */ static int inline idctRowCondDC (int16_t * row) { int a0, a1, a2, a3, b0, b1, b2, b3; @@ -147,6 +249,7 @@ return 1; } +#endif /* not ARCH_ALPHA */ static void inline idctCol (int16_t * col) { @@ -243,6 +346,7 @@ b3 += - W1*col[8*7]; } +#ifndef ARCH_ALPHA if(!(b0|b1|b2|b3)){ col[8*0] = (a0) >> COL_SHIFT; col[8*7] = (a0) >> COL_SHIFT; @@ -253,6 +357,7 @@ col[8*3] = (a3) >> COL_SHIFT; col[8*4] = (a3) >> COL_SHIFT; }else{ +#endif col[8*0] = (a0 + b0) >> COL_SHIFT; col[8*7] = (a0 - b0) >> COL_SHIFT; col[8*1] = (a1 + b1) >> COL_SHIFT; @@ -261,7 +366,9 @@ col[8*5] = (a2 - b2) >> COL_SHIFT; col[8*3] = (a3 + b3) >> COL_SHIFT; col[8*4] = (a3 - b3) >> COL_SHIFT; +#ifndef ARCH_ALPHA } +#endif } static void inline idctSparse2Col (int16_t * col) @@ -337,6 +444,34 @@ col[8*4] = (a3 - b3) >> COL_SHIFT; } +#ifdef ARCH_ALPHA +/* If all rows but the first one are zero after row transformation, + all rows will be identical after column transformation. 
*/ +static inline void idctCol2(int16_t *col) +{ + int i; + uint64_t l, r; + uint64_t *lcol = (uint64_t *) col; + + for (i = 0; i < 8; ++i) { + int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4; + + a0 *= W4; + col[0] = a0 >> COL_SHIFT; + ++col; + } + + l = lcol[0]; + r = lcol[1]; + lcol[ 2] = l; lcol[ 3] = r; + lcol[ 4] = l; lcol[ 5] = r; + lcol[ 6] = l; lcol[ 7] = r; + lcol[ 8] = l; lcol[ 9] = r; + lcol[10] = l; lcol[11] = r; + lcol[12] = l; lcol[13] = r; + lcol[14] = l; lcol[15] = r; +} +#endif void simple_idct (short *block) { @@ -411,7 +546,22 @@ for(i=0; i<8; i++) idctSparse2Col(block + i); } -#else +#elif defined(ARCH_ALPHA) + int shortcut = 1; + + for (i = 0; i < 8; i++) { + int anynonzero = idctRowCondDC(block + 8 * i); + if (i > 0 && anynonzero) + shortcut = 0; + } + + if (shortcut) { + idctCol2(block); + } else { + for (i = 0; i < 8; i++) + idctSparseCol(block + i); + } +#else for(i=0; i<8; i++) idctRowCondDC(block + i*8);