libavcodec.hg: view of i386/idct_mmx.c @ 3990:746a60ba3177
enable CMOV_IS_FAST, as it is faster or equal in speed on every CPU (Duron, Athlon, Pentium M, P3) from which I've seen benchmarks; it might be slower on the P4, but no one has posted benchmarks ...
author:   michael
date:     Wed, 11 Oct 2006 12:23:40 +0000
parents:  ea9fe1c9d126
children: 65fd98452a4e
/*
 * idct_mmx.c
 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "common.h"
#include "../dsputil.h"

#include "mmx.h"

#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

#define ROW_SHIFT 11
#define COL_SHIFT 6

#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
#define rounder(bias) {round (bias), round (bias)}

#if 0
/* C row IDCT - its just here to document the MMXEXT and MMX versions */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif

/* MMXEXT row IDCT */

#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2, -c4, -c2,   \
                                              c4,  c6,  c4,  c6,   \
                                              c1,  c3, -c1, -c5,   \
                                              c5,  c7,  c3, -c7,   \
                                              c4, -c6,  c4, -c6,   \
                                             -c4,  c2,  c4, -c2,   \
                                              c5, -c1,  c3, -c1,   \
                                              c7,  c3,  c7, -c5 }

static inline void mmxext_row_head (int16_t * row, int offset, const int16_t * table)
{
    movq_m2r (*(row+offset), mm2);      // mm2 = x6 x4 x2 x0
    movq_m2r (*(row+offset+4), mm5);    // mm5 = x7 x5 x3 x1
    movq_r2r (mm2, mm0);                // mm0 = x6 x4 x2 x0

    movq_m2r (*table, mm3);             // mm3 = -C2 -C4 C2 C4
    movq_r2r (mm5, mm6);                // mm6 = x7 x5 x3 x1

    movq_m2r (*(table+4), mm4);         // mm4 = C6 C4 C6 C4
    pmaddwd_r2r (mm0, mm3);             // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2

    pshufw_r2r (mm2, mm2, 0x4e);        // mm2 = x2 x0 x6 x4
}

static inline void mmxext_row (const int16_t * table, const int32_t * rounder)
{
    movq_m2r (*(table+8), mm1);         // mm1 = -C5 -C1 C3 C1
    pmaddwd_r2r (mm2, mm4);             // mm4 = C4*x0+C6*x2 C4*x4+C6*x6

    pmaddwd_m2r (*(table+16), mm0);     // mm0 = C4*x4-C6*x6 C4*x0-C6*x2
    pshufw_r2r (mm6, mm6, 0x4e);        // mm6 = x3 x1 x7 x5

    movq_m2r (*(table+12), mm7);        // mm7 = -C7 C3 C7 C5
    pmaddwd_r2r (mm5, mm1);             // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3

    paddd_m2r (*rounder, mm3);          // mm3 += rounder
    pmaddwd_r2r (mm6, mm7);             // mm7 = C3*x1-C7*x3 C5*x5+C7*x7

    pmaddwd_m2r (*(table+20), mm2);     // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
    paddd_r2r (mm4, mm3);               // mm3 = a1 a0 + rounder

    pmaddwd_m2r (*(table+24), mm5);     // mm5 = C3*x5-C1*x7 C5*x1-C1*x3
    movq_r2r (mm3, mm4);                // mm4 = a1 a0 + rounder

    pmaddwd_m2r (*(table+28), mm6);     // mm6 = C7*x1-C5*x3 C7*x5+C3*x7
    paddd_r2r (mm7, mm1);               // mm1 = b1 b0

    paddd_m2r (*rounder, mm0);          // mm0 += rounder
    psubd_r2r (mm1, mm3);               // mm3 = a1-b1 a0-b0 + rounder

    psrad_i2r (ROW_SHIFT, mm3);         // mm3 = y6 y7
    paddd_r2r (mm4, mm1);               // mm1 = a1+b1 a0+b0 + rounder

    paddd_r2r (mm2, mm0);               // mm0 = a3 a2 + rounder
    psrad_i2r (ROW_SHIFT, mm1);         // mm1 = y1 y0

    paddd_r2r (mm6, mm5);               // mm5 = b3 b2
    movq_r2r (mm0, mm4);                // mm4 = a3 a2 + rounder

    paddd_r2r (mm5, mm0);               // mm0 = a3+b3 a2+b2 + rounder
    psubd_r2r (mm5, mm4);               // mm4 = a3-b3 a2-b2 + rounder
}

static inline void mmxext_row_tail (int16_t * row, int store)
{
    psrad_i2r (ROW_SHIFT, mm0);         // mm0 = y3 y2
    psrad_i2r (ROW_SHIFT, mm4);         // mm4 = y4 y5

    packssdw_r2r (mm0, mm1);            // mm1 = y3 y2 y1 y0
    packssdw_r2r (mm3, mm4);            // mm4 = y6 y7 y4 y5

    movq_r2m (mm1, *(row+store));       // save y3 y2 y1 y0
    pshufw_r2r (mm4, mm4, 0xb1);        // mm4 = y7 y6 y5 y4

    /* slot */

    movq_r2m (mm4, *(row+store+4));     // save y7 y6 y5 y4
}

static inline void mmxext_row_mid (int16_t * row, int store,
                                   int offset, const int16_t * table)
{
    movq_m2r (*(row+offset), mm2);      // mm2 = x6 x4 x2 x0
    psrad_i2r (ROW_SHIFT, mm0);         // mm0 = y3 y2

    movq_m2r (*(row+offset+4), mm5);    // mm5 = x7 x5 x3 x1
    psrad_i2r (ROW_SHIFT, mm4);         // mm4 = y4 y5

    packssdw_r2r (mm0, mm1);            // mm1 = y3 y2 y1 y0
    movq_r2r (mm5, mm6);                // mm6 = x7 x5 x3 x1

    packssdw_r2r (mm3, mm4);            // mm4 = y6 y7 y4 y5
    movq_r2r (mm2, mm0);                // mm0 = x6 x4 x2 x0

    movq_r2m (mm1, *(row+store));       // save y3 y2 y1 y0
    pshufw_r2r (mm4, mm4, 0xb1);        // mm4 = y7 y6 y5 y4

    movq_m2r (*table, mm3);             // mm3 = -C2 -C4 C2 C4
    movq_r2m (mm4, *(row+store+4));     // save y7 y6 y5 y4

    pmaddwd_r2r (mm0, mm3);             // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2

    movq_m2r (*(table+4), mm4);         // mm4 = C6 C4 C6 C4
    pshufw_r2r (mm2, mm2, 0x4e);        // mm2 = x2 x0 x6 x4
}

/* MMX row IDCT */

#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
                                           c4,  c6, -c4, -c2,   \
                                           c1,  c3,  c3, -c7,   \
                                           c5,  c7, -c1, -c5,   \
                                           c4, -c6,  c4, -c2,   \
                                          -c4,  c2,  c4, -c6,   \
                                           c5, -c1,  c7, -c5,   \
                                           c7,  c3,  c3, -c1 }

static inline void mmx_row_head (int16_t * row, int offset, const int16_t * table)
{
    movq_m2r (*(row+offset), mm2);      // mm2 = x6 x4 x2 x0
    movq_m2r (*(row+offset+4), mm5);    // mm5 = x7 x5 x3 x1
    movq_r2r (mm2, mm0);                // mm0 = x6 x4 x2 x0

    movq_m2r (*table, mm3);             // mm3 = C6 C4 C2 C4
    movq_r2r (mm5, mm6);                // mm6 = x7 x5 x3 x1

    punpckldq_r2r (mm0, mm0);           // mm0 = x2 x0 x2 x0

    movq_m2r (*(table+4), mm4);         // mm4 = -C2 -C4 C6 C4
    pmaddwd_r2r (mm0, mm3);             // mm3 = C4*x0+C6*x2 C4*x0+C2*x2

    movq_m2r (*(table+8), mm1);         // mm1 = -C7 C3 C3 C1
    punpckhdq_r2r (mm2, mm2);           // mm2 = x6 x4 x6 x4
}

static inline void mmx_row (const int16_t * table, const int32_t * rounder)
{
    pmaddwd_r2r (mm2, mm4);             // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
    punpckldq_r2r (mm5, mm5);           // mm5 = x3 x1 x3 x1

    pmaddwd_m2r (*(table+16), mm0);     // mm0 = C4*x0-C2*x2 C4*x0-C6*x2
    punpckhdq_r2r (mm6, mm6);           // mm6 = x7 x5 x7 x5

    movq_m2r (*(table+12), mm7);        // mm7 = -C5 -C1 C7 C5
    pmaddwd_r2r (mm5, mm1);             // mm1 = C3*x1-C7*x3 C1*x1+C3*x3

    paddd_m2r (*rounder, mm3);          // mm3 += rounder
    pmaddwd_r2r (mm6, mm7);             // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7

    pmaddwd_m2r (*(table+20), mm2);     // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6
    paddd_r2r (mm4, mm3);               // mm3 = a1 a0 + rounder

    pmaddwd_m2r (*(table+24), mm5);     // mm5 = C7*x1-C5*x3 C5*x1-C1*x3
    movq_r2r (mm3, mm4);                // mm4 = a1 a0 + rounder

    pmaddwd_m2r (*(table+28), mm6);     // mm6 = C3*x5-C1*x7 C7*x5+C3*x7
    paddd_r2r (mm7, mm1);               // mm1 = b1 b0

    paddd_m2r (*rounder, mm0);          // mm0 += rounder
    psubd_r2r (mm1, mm3);               // mm3 = a1-b1 a0-b0 + rounder

    psrad_i2r (ROW_SHIFT, mm3);         // mm3 = y6 y7
    paddd_r2r (mm4, mm1);               // mm1 = a1+b1 a0+b0 + rounder

    paddd_r2r (mm2, mm0);               // mm0 = a3 a2 + rounder
    psrad_i2r (ROW_SHIFT, mm1);         // mm1 = y1 y0

    paddd_r2r (mm6, mm5);               // mm5 = b3 b2
    movq_r2r (mm0, mm7);                // mm7 = a3 a2 + rounder

    paddd_r2r (mm5, mm0);               // mm0 = a3+b3 a2+b2 + rounder
    psubd_r2r (mm5, mm7);               // mm7 = a3-b3 a2-b2 + rounder
}

static inline void mmx_row_tail (int16_t * row, int store)
{
    psrad_i2r (ROW_SHIFT, mm0);         // mm0 = y3 y2
    psrad_i2r (ROW_SHIFT, mm7);         // mm7 = y4 y5

    packssdw_r2r (mm0, mm1);            // mm1 = y3 y2 y1 y0
    packssdw_r2r (mm3, mm7);            // mm7 = y6 y7 y4 y5

    movq_r2m (mm1, *(row+store));       // save y3 y2 y1 y0
    movq_r2r (mm7, mm4);                // mm4 = y6 y7 y4 y5

    pslld_i2r (16, mm7);                // mm7 = y7 0 y5 0
    psrld_i2r (16, mm4);                // mm4 = 0 y6 0 y4

    por_r2r (mm4, mm7);                 // mm7 = y7 y6 y5 y4

    /* slot */

    movq_r2m (mm7, *(row+store+4));     // save y7 y6 y5 y4
}

static inline void mmx_row_mid (int16_t * row, int store,
                                int offset, const int16_t * table)
{
    movq_m2r (*(row+offset), mm2);      // mm2 = x6 x4 x2 x0
    psrad_i2r (ROW_SHIFT, mm0);         // mm0 = y3 y2

    movq_m2r (*(row+offset+4), mm5);    // mm5 = x7 x5 x3 x1
    psrad_i2r (ROW_SHIFT, mm7);         // mm7 = y4 y5

    packssdw_r2r (mm0, mm1);            // mm1 = y3 y2 y1 y0
    movq_r2r (mm5, mm6);                // mm6 = x7 x5 x3 x1

    packssdw_r2r (mm3, mm7);            // mm7 = y6 y7 y4 y5
    movq_r2r (mm2, mm0);                // mm0 = x6 x4 x2 x0

    movq_r2m (mm1, *(row+store));       // save y3 y2 y1 y0
    movq_r2r (mm7, mm1);                // mm1 = y6 y7 y4 y5

    punpckldq_r2r (mm0, mm0);           // mm0 = x2 x0 x2 x0
    psrld_i2r (16, mm7);                // mm7 = 0 y6 0 y4

    movq_m2r (*table, mm3);             // mm3 = C6 C4 C2 C4
    pslld_i2r (16, mm1);                // mm1 = y7 0 y5 0

    movq_m2r (*(table+4), mm4);         // mm4 = -C2 -C4 C6 C4
    por_r2r (mm1, mm7);                 // mm7 = y7 y6 y5 y4

    movq_m2r (*(table+8), mm1);         // mm1 = -C7 C3 C3 C1
    punpckhdq_r2r (mm2, mm2);           // mm2 = x6 x4 x6 x4

    movq_r2m (mm7, *(row+store+4));     // save y7 y6 y5 y4
    pmaddwd_r2r (mm0, mm3);             // mm3 = C4*x0+C6*x2 C4*x0+C2*x2
}

#if 0
// C column IDCT - its just here to document the MMXEXT and MMX versions
static inline void idct_col (int16_t * col, int offset)
{
/* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

/* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif

// MMX column IDCT
static inline void idct_col (int16_t * col, int offset)
{
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170

    static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
    static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
    static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
    static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};

    /* column code adapted from peter gubanov */
    /* http://www.elecard.com/peter/idct.shtml */

    movq_m2r (*_T1, mm0);               // mm0 = T1

    movq_m2r (*(col+offset+1*8), mm1);  // mm1 = x1
    movq_r2r (mm0, mm2);                // mm2 = T1

    movq_m2r (*(col+offset+7*8), mm4);  // mm4 = x7
    pmulhw_r2r (mm1, mm0);              // mm0 = T1*x1

    movq_m2r (*_T3, mm5);               // mm5 = T3
    pmulhw_r2r (mm4, mm2);              // mm2 = T1*x7

    movq_m2r (*(col+offset+5*8), mm6);  // mm6 = x5
    movq_r2r (mm5, mm7);                // mm7 = T3-1

    movq_m2r (*(col+offset+3*8), mm3);  // mm3 = x3
    psubsw_r2r (mm4, mm0);              // mm0 = v17

    movq_m2r (*_T2, mm4);               // mm4 = T2
    pmulhw_r2r (mm3, mm5);              // mm5 = (T3-1)*x3

    paddsw_r2r (mm2, mm1);              // mm1 = u17
    pmulhw_r2r (mm6, mm7);              // mm7 = (T3-1)*x5

    /* slot */

    movq_r2r (mm4, mm2);                // mm2 = T2
    paddsw_r2r (mm3, mm5);              // mm5 = T3*x3

    pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2
    paddsw_r2r (mm6, mm7);              // mm7 = T3*x5

    psubsw_r2r (mm6, mm5);              // mm5 = v35
    paddsw_r2r (mm3, mm7);              // mm7 = u35

    movq_m2r (*(col+offset+6*8), mm3);  // mm3 = x6
    movq_r2r (mm0, mm6);                // mm6 = v17

    pmulhw_r2r (mm3, mm2);              // mm2 = T2*x6
    psubsw_r2r (mm5, mm0);              // mm0 = b3

    psubsw_r2r (mm3, mm4);              // mm4 = v26
    paddsw_r2r (mm6, mm5);              // mm5 = v12

    movq_r2m (mm0, *(col+offset+3*8));  // save b3 in scratch0
    movq_r2r (mm1, mm6);                // mm6 = u17

    paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26
    paddsw_r2r (mm7, mm6);              // mm6 = b0

    psubsw_r2r (mm7, mm1);              // mm1 = u12
    movq_r2r (mm1, mm7);                // mm7 = u12

    movq_m2r (*(col+offset+0*8), mm3);  // mm3 = x0
    paddsw_r2r (mm5, mm1);              // mm1 = u12+v12

    movq_m2r (*_C4, mm0);               // mm0 = C4/2
    psubsw_r2r (mm5, mm7);              // mm7 = u12-v12

    movq_r2m (mm6, *(col+offset+5*8));  // save b0 in scratch1
    pmulhw_r2r (mm0, mm1);              // mm1 = b1/2

    movq_r2r (mm4, mm6);                // mm6 = v26
    pmulhw_r2r (mm0, mm7);              // mm7 = b2/2

    movq_m2r (*(col+offset+4*8), mm5);  // mm5 = x4
    movq_r2r (mm3, mm0);                // mm0 = x0

    psubsw_r2r (mm5, mm3);              // mm3 = v04
    paddsw_r2r (mm5, mm0);              // mm0 = u04

    paddsw_r2r (mm3, mm4);              // mm4 = a1
    movq_r2r (mm0, mm5);                // mm5 = u04

    psubsw_r2r (mm6, mm3);              // mm3 = a2
    paddsw_r2r (mm2, mm5);              // mm5 = a0

    paddsw_r2r (mm1, mm1);              // mm1 = b1
    psubsw_r2r (mm2, mm0);              // mm0 = a3

    paddsw_r2r (mm7, mm7);              // mm7 = b2
    movq_r2r (mm3, mm2);                // mm2 = a2

    movq_r2r (mm4, mm6);                // mm6 = a1
    paddsw_r2r (mm7, mm3);              // mm3 = a2+b2

    psraw_i2r (COL_SHIFT, mm3);         // mm3 = y2
    paddsw_r2r (mm1, mm4);              // mm4 = a1+b1

    psraw_i2r (COL_SHIFT, mm4);         // mm4 = y1
    psubsw_r2r (mm1, mm6);              // mm6 = a1-b1

    movq_m2r (*(col+offset+5*8), mm1);  // mm1 = b0
    psubsw_r2r (mm7, mm2);              // mm2 = a2-b2

    psraw_i2r (COL_SHIFT, mm6);         // mm6 = y6
    movq_r2r (mm5, mm7);                // mm7 = a0

    movq_r2m (mm4, *(col+offset+1*8));  // save y1
    psraw_i2r (COL_SHIFT, mm2);         // mm2 = y5

    movq_r2m (mm3, *(col+offset+2*8));  // save y2
    paddsw_r2r (mm1, mm5);              // mm5 = a0+b0

    movq_m2r (*(col+offset+3*8), mm4);  // mm4 = b3
    psubsw_r2r (mm1, mm7);              // mm7 = a0-b0

    psraw_i2r (COL_SHIFT, mm5);         // mm5 = y0
    movq_r2r (mm0, mm3);                // mm3 = a3

    movq_r2m (mm2, *(col+offset+5*8));  // save y5
    psubsw_r2r (mm4, mm3);              // mm3 = a3-b3

    psraw_i2r (COL_SHIFT, mm7);         // mm7 = y7
    paddsw_r2r (mm0, mm4);              // mm4 = a3+b3

    movq_r2m (mm5, *(col+offset+0*8));  // save y0
    psraw_i2r (COL_SHIFT, mm3);         // mm3 = y4

    movq_r2m (mm6, *(col+offset+6*8));  // save y6
    psraw_i2r (COL_SHIFT, mm4);         // mm4 = y3

    movq_r2m (mm7, *(col+offset+7*8));  // save y7

    movq_r2m (mm3, *(col+offset+4*8));  // save y4

    movq_r2m (mm4, *(col+offset+3*8));  // save y3

#undef T1
#undef T2
#undef T3
#undef C4
}

static const int32_t rounder0[] ATTR_ALIGN(8) =
    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
static const int32_t rounder1[] ATTR_ALIGN(8) =
    rounder (1.25683487303);        /* C1*(C1/C4+C1+C7)/2 */
static const int32_t rounder7[] ATTR_ALIGN(8) =
    rounder (-0.25);                /* C1*(C7/C4+C7-C1)/2 */
static const int32_t rounder2[] ATTR_ALIGN(8) =
    rounder (0.60355339059);        /* C2 * (C6+C2)/2 */
static const int32_t rounder6[] ATTR_ALIGN(8) =
    rounder (-0.25);                /* C2 * (C6-C2)/2 */
static const int32_t rounder3[] ATTR_ALIGN(8) =
    rounder (0.087788325588);       /* C3*(-C3/C4+C3+C5)/2 */
static const int32_t rounder5[] ATTR_ALIGN(8) =
    rounder (-0.441341716183);      /* C3*(-C5/C4+C5-C3)/2 */

#undef COL_SHIFT
#undef ROW_SHIFT

#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * block)                                             \
{                                                                       \
    static const int16_t table04[] ATTR_ALIGN(16) =                     \
        table (22725, 21407, 19266, 16384, 12873, 8867, 4520);          \
    static const int16_t table17[] ATTR_ALIGN(16) =                     \
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);         \
    static const int16_t table26[] ATTR_ALIGN(16) =                     \
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);         \
    static const int16_t table35[] ATTR_ALIGN(16) =                     \
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);         \
                                                                        \
    idct_row_head (block, 0*8, table04);                                \
    idct_row (table04, rounder0);                                       \
    idct_row_mid (block, 0*8, 4*8, table04);                            \
    idct_row (table04, rounder4);                                       \
    idct_row_mid (block, 4*8, 1*8, table17);                            \
    idct_row (table17, rounder1);                                       \
    idct_row_mid (block, 1*8, 7*8, table17);                            \
    idct_row (table17, rounder7);                                       \
    idct_row_mid (block, 7*8, 2*8, table26);                            \
    idct_row (table26, rounder2);                                       \
    idct_row_mid (block, 2*8, 6*8, table26);                            \
    idct_row (table26, rounder6);                                       \
    idct_row_mid (block, 6*8, 3*8, table35);                            \
    idct_row (table35, rounder3);                                       \
    idct_row_mid (block, 3*8, 5*8, table35);                            \
    idct_row (table35, rounder5);                                       \
    idct_row_tail (block, 5*8);                                         \
                                                                        \
    idct_col (block, 0);                                                \
    idct_col (block, 4);                                                \
}

void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

declare_idct (ff_mmxext_idct, mmxext_table,
              mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)

declare_idct (ff_mmx_idct, mmx_table,
              mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
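
/*
 * Usage sketch (illustrative only, kept in an #if 0 block like the C
 * reference code above): how a caller might drive one of the exported
 * IDCT entry points on a single 8x8 block of dequantized coefficients.
 * It assumes DCTELEM is a 16-bit integer type and that the block is
 * aligned as the movq loads above require; example_idct_put, dest and
 * stride are hypothetical names, not part of this file.  In libavcodec
 * the choice between ff_mmx_idct and ff_mmxext_idct is normally made at
 * dsputil initialization time from the detected CPU features.
 */
#if 0
static void example_idct_put (uint8_t * dest, int stride, DCTELEM * block)
{
    int i, j;

    ff_mmx_idct (block);                /* in-place 8x8 inverse DCT */

    for (i = 0; i < 8; i++)             /* clamp to 0..255 and store */
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            dest[i * stride + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}
#endif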