# HG changeset patch # User diego # Date 1273416329 0 # Node ID 1aece15222b5c4722a239c13085d7d40fd2ba9d5 # Parent 4592275518191e495cbf4d8a37fa48d84ccacf14 Remove internal liba52 copy. Nowadays FFmpeg is faster than liba52 and external liba52 is well supported. diff -r 459227551819 -r 1aece15222b5 Copyright --- a/Copyright Sun May 09 12:28:15 2010 +0000 +++ b/Copyright Sun May 09 14:45:29 2010 +0000 @@ -37,18 +37,6 @@ License: permissive, see libmpcodecs/native/xa_gsm.c -Name: liba52 -Version: 0.7.4 + patches -URL: http://liba52.sourceforge.net/ -Directory: liba52 -Copyright: 1999-2000 Aaron Holtzman - 2000-2001 Michel Lespinasse - 2000 Yuqing Deng - 2002 Nick Kurshev - 2004 Romain Dolbeau -License: GNU General Public License - - Name: libdvdcss Version: 1.2.10 URL: http://developers.videolan.org/libdvdcss/ diff -r 459227551819 -r 1aece15222b5 DOCS/tech/MAINTAINERS --- a/DOCS/tech/MAINTAINERS Sun May 09 12:28:15 2010 +0000 +++ b/DOCS/tech/MAINTAINERS Sun May 09 14:45:29 2010 +0000 @@ -93,7 +93,6 @@ * VIDIX core: Benjamin Zores * mp3lib: None * loader: None - * liba52: None * libmpeg2: None * libdvdcss: Diego Biurrun * libdvdread: Diego Biurrun diff -r 459227551819 -r 1aece15222b5 DOCS/tech/binary-packaging.txt --- a/DOCS/tech/binary-packaging.txt Sun May 09 12:28:15 2010 +0000 +++ b/DOCS/tech/binary-packaging.txt Sun May 09 14:45:29 2010 +0000 @@ -45,7 +45,7 @@ * codecs - FAAD(internal) - libavcodec(internal) - - native codecs (libmpeg2/liba52/mp3lib) + - native codecs (libmpeg2/mp3lib) - Vorbis Tremor codec(internal) - RealPlayer codecs support (*) - Win32/VfW/DShow/QT codecs support (*) diff -r 459227551819 -r 1aece15222b5 DOCS/tech/general.txt --- a/DOCS/tech/general.txt Sun May 09 12:28:15 2010 +0000 +++ b/DOCS/tech/general.txt Sun May 09 14:45:29 2010 +0000 @@ -196,7 +196,7 @@ Only used if none of the above works. 4. Codecs. Consists of libmpcodecs/* and separate files or libs, - for example liba52, libmpeg2, loader, mp3lib. + for example libmpeg2, loader, mp3lib. mplayer.c doesn't call them directly, but through the dec_audio.c and dec_video.c files, so the mplayer.c doesn't have to know anything about diff -r 459227551819 -r 1aece15222b5 Makefile --- a/Makefile Sun May 09 12:28:15 2010 +0000 +++ b/Makefile Sun May 09 14:45:29 2010 +0000 @@ -108,16 +108,6 @@ SRCS_COMMON-$(JPEG) += libmpcodecs/vd_ijpg.c SRCS_COMMON-$(LADSPA) += libaf/af_ladspa.c SRCS_COMMON-$(LIBA52) += libmpcodecs/ad_liba52.c -SRCS_LIBA52_INTERNAL += liba52/crc.c \ - liba52/resample.c \ - liba52/bit_allocate.c \ - liba52/bitstream.c \ - liba52/downmix.c \ - liba52/imdct.c \ - liba52/parse.c \ - -SRCS_COMMON-$(LIBA52_INTERNAL) += $(SRCS_LIBA52_INTERNAL) - SRCS_COMMON-$(LIBASS) += libmpcodecs/vf_ass.c \ libass/ass_mp.c \ @@ -758,7 +748,6 @@ gui/wm \ gui/win32 \ input \ - liba52 \ libaf \ libao2 \ libass \ @@ -1020,8 +1009,6 @@ codecs2html$(EXESUF): codec-cfg.c help_mp.h $(TEST_OBJS) $(CC) -I. 
-DCODECS2HTML -o $@ $^ -liba52/test$(EXESUF): cpudetect.o $(SRCS_LIBA52_INTERNAL:.c=.o) -lm - libvo/aspecttest$(EXESUF): libvo/aspect.o libvo/geometry.o $(TEST_OBJS) LOADER_TEST_OBJS = $(SRCS_WIN32_EMULATION:.c=.o) $(SRCS_QTX_EMULATION:.S=.o) libavutil/libavutil.a osdep/mmap_anon.o cpudetect.o $(TEST_OBJS) @@ -1031,8 +1018,7 @@ mp3lib/test$(EXESUF) mp3lib/test2$(EXESUF): $(SRCS_MP3LIB:.c=.o) libvo/aclib.o cpudetect.o $(TEST_OBJS) -TESTS = codecs2html codec-cfg-test liba52/test libvo/aspecttest \ - mp3lib/test mp3lib/test2 +TESTS = codecs2html codec-cfg-test libvo/aspecttest mp3lib/test mp3lib/test2 ifdef ARCH_X86 TESTS += loader/qtx/list loader/qtx/qtxload diff -r 459227551819 -r 1aece15222b5 configure --- a/configure Sun May 09 12:28:15 2010 +0000 +++ b/configure Sun May 09 14:45:29 2010 +0000 @@ -335,7 +335,6 @@ --enable-libdca enable libdca support [autodetect] --disable-mp3lib disable builtin mp3lib [autodetect] --disable-liba52 disable liba52 [autodetect] - --enable-liba52-internal enable builtin liba52 [disabled] --disable-libmpeg2 disable builtin libmpeg2 [autodetect] --disable-musepack disable musepack support [autodetect] --disable-libopencore_amrnb disable libopencore_amr narrowband [autodetect] @@ -631,7 +630,6 @@ _theora=auto _mp3lib=auto _liba52=auto -_liba52_internal=no _libdca=auto _libmpeg2=auto _faad=auto @@ -1028,8 +1026,6 @@ --disable-theora) _theora=no ;; --enable-mp3lib) _mp3lib=yes ;; --disable-mp3lib) _mp3lib=no ;; - --enable-liba52-internal) _liba52_internal=yes ;; - --disable-liba52-internal) _liba52_internal=no ;; --enable-liba52) _liba52=yes ;; --disable-liba52) _liba52=no ;; --enable-libdca) _libdca=yes ;; @@ -6796,27 +6792,19 @@ echores "$_mp3lib" echocheck "liba52 support" -if test "$_liba52_internal" = auto ; then - test "$cc_vendor" = intel && test "$_cc_major" -le 10 -o "$_cc_major" -eq 11 -a "$_cc_minor" -eq 0 && _liba52_internal=no || _liba52_internal=yes -fi def_liba52='#undef CONFIG_LIBA52' -def_liba52_internal="#undef CONFIG_LIBA52_INTERNAL" -if test "$_liba52_internal" = yes ; then - _liba52=yes - def_liba52_internal="#define CONFIG_LIBA52_INTERNAL 1" - res_comment="internal" -elif test "$_liba52_internal" = no && test "$_liba52" = auto ; then +if test "$_liba52" = auto ; then _liba52=no cat > $TMPC << EOF #include #include int main(void) { a52_state_t *testHand; testHand=a52_init(0); return 0; } EOF - cc_check -la52 && _liba52=yes && res_comment="external" && extra_ldflags="$extra_ldflags -la52" + cc_check -la52 && _liba52=yes && extra_ldflags="$extra_ldflags -la52" fi if test "$_liba52" = yes ; then def_liba52='#define CONFIG_LIBA52 1' - codecmodules="liba52($res_comment) $codecmodules" + codecmodules="liba52 $codecmodules" else nocodecmodules="liba52 $nocodecmodules" fi @@ -8582,7 +8570,6 @@ KVA = $_kva LADSPA = $_ladspa LIBA52 = $_liba52 -LIBA52_INTERNAL = $_liba52_internal LIBASS = $_ass LIBASS_INTERNAL = $ass_internal LIBBS2B = $_libbs2b @@ -8965,7 +8952,6 @@ $def_faad $def_faad_internal $def_liba52 -$def_liba52_internal $def_libdca $def_libdv $def_liblzo diff -r 459227551819 -r 1aece15222b5 liba52/a52.h --- a/liba52/a52.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -/* - * a52.h - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. 
- * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef A52_H -#define A52_H - -#include -#include "mm_accel.h" - -#ifndef LIBA52_DOUBLE -typedef float sample_t; -#else -typedef double sample_t; -#endif - -typedef struct a52_state_s a52_state_t; - -#define A52_CHANNEL 0 -#define A52_MONO 1 -#define A52_STEREO 2 -#define A52_3F 3 -#define A52_2F1R 4 -#define A52_3F1R 5 -#define A52_2F2R 6 -#define A52_3F2R 7 -#define A52_CHANNEL1 8 -#define A52_CHANNEL2 9 -#define A52_DOLBY 10 -#define A52_CHANNEL_MASK 15 - -#define A52_LFE 16 -#define A52_ADJUST_LEVEL 32 - -a52_state_t * a52_init (uint32_t mm_accel); -sample_t * a52_samples (a52_state_t * state); -int a52_syncinfo (uint8_t * buf, int * flags, - int * sample_rate, int * bit_rate); -int a52_frame (a52_state_t * state, uint8_t * buf, int * flags, - sample_t * level, sample_t bias); -void a52_dynrng (a52_state_t * state, - sample_t (* call) (sample_t, void *), void * data); -int a52_block (a52_state_t * state); -void a52_free (a52_state_t * state); - -void* a52_resample_init(uint32_t mm_accel,int flags,int chans); -extern int (* a52_resample) (float * _f, int16_t * s16); - -uint16_t crc16_block(uint8_t *data,uint32_t num_bytes); - -#endif /* A52_H */ diff -r 459227551819 -r 1aece15222b5 liba52/a52_internal.h --- a/liba52/a52_internal.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,140 +0,0 @@ -/* - * a52_internal.h - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. - * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -typedef struct { - uint8_t bai; /* fine SNR offset, fast gain */ - uint8_t deltbae; /* delta bit allocation exists */ - int8_t deltba[50]; /* per-band delta bit allocation */ -} ba_t; - -typedef struct { - uint8_t exp[256]; /* decoded channel exponents */ - int8_t bap[256]; /* derived channel bit allocation */ -} expbap_t; - -struct a52_state_s { - uint8_t fscod; /* sample rate */ - uint8_t halfrate; /* halfrate factor */ - uint8_t acmod; /* coded channels */ - uint8_t lfeon; /* coded lfe channel */ - sample_t clev; /* centre channel mix level */ - sample_t slev; /* surround channels mix level */ - - int output; /* type of output */ - sample_t level; /* output level */ - sample_t bias; /* output bias */ - - int dynrnge; /* apply dynamic range */ - sample_t dynrng; /* dynamic range */ - void * dynrngdata; /* dynamic range callback funtion and data */ - sample_t (* dynrngcall) (sample_t range, void * dynrngdata); - - uint8_t chincpl; /* channel coupled */ - uint8_t phsflginu; /* phase flags in use (stereo only) */ - uint8_t cplstrtmant; /* coupling channel start mantissa */ - uint8_t cplendmant; /* coupling channel end mantissa */ - uint32_t cplbndstrc; /* coupling band structure */ - sample_t cplco[5][18]; /* coupling coordinates */ - - /* derived information */ - uint8_t cplstrtbnd; /* coupling start band (for bit allocation) */ - uint8_t ncplbnd; /* number of coupling bands */ - - uint8_t rematflg; /* stereo rematrixing */ - - uint8_t endmant[5]; /* channel end mantissa */ - - uint16_t bai; /* bit allocation information */ - - uint32_t * buffer_start; - uint16_t lfsr_state; /* dither state */ - uint32_t bits_left; - uint32_t current_word; - - uint8_t csnroffst; /* coarse SNR offset */ - ba_t cplba; /* coupling bit allocation parameters */ - ba_t ba[5]; /* channel bit allocation parameters */ - ba_t lfeba; /* lfe bit allocation parameters */ - - uint8_t cplfleak; /* coupling fast leak init */ - uint8_t cplsleak; /* coupling slow leak init */ - - expbap_t cpl_expbap; - expbap_t fbw_expbap[5]; - expbap_t lfe_expbap; - - sample_t * samples; - int downmixed; -}; - -#define LEVEL_PLUS6DB 2.0 -#define LEVEL_PLUS3DB 1.4142135623730951 -#define LEVEL_3DB 0.7071067811865476 -#define LEVEL_45DB 0.5946035575013605 -#define LEVEL_6DB 0.5 - -#define EXP_REUSE (0) -#define EXP_D15 (1) -#define EXP_D25 (2) -#define EXP_D45 (3) - -#define DELTA_BIT_REUSE (0) -#define DELTA_BIT_NEW (1) -#define DELTA_BIT_NONE (2) -#define DELTA_BIT_RESERVED (3) - -#if ARCH_X86_64 -# define REG_a "rax" -# define REG_d "rdx" -# define REG_S "rsi" -# define REG_D "rdi" -# define REG_BP "rbp" -#else -# define REG_a "eax" -# define REG_d "edx" -# define REG_S "esi" -# define REG_D "edi" -# define REG_BP "ebp" -#endif - -void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart, - int start, int end, int fastleak, int slowleak, - expbap_t * expbap); - -int a52_downmix_init (int input, int flags, sample_t * level, - sample_t clev, sample_t slev); -void downmix_accel_init(uint32_t mm_accel); -int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, - sample_t clev, sample_t slev); -extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev); -extern void (*a52_upmix) (sample_t * samples, int acmod, int output); - -void 
a52_imdct_init (uint32_t mm_accel); -void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias); -extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); -void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias); diff -r 459227551819 -r 1aece15222b5 liba52/bit_allocate.c --- a/liba52/bit_allocate.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,265 +0,0 @@ -/* - * bit_allocate.c - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include "config.h" - -#include - -#include "a52.h" -#include "a52_internal.h" - -static int hthtab[3][50] = { - {0x730, 0x730, 0x7c0, 0x800, 0x820, 0x840, 0x850, 0x850, 0x860, 0x860, - 0x860, 0x860, 0x860, 0x870, 0x870, 0x870, 0x880, 0x880, 0x890, 0x890, - 0x8a0, 0x8a0, 0x8b0, 0x8b0, 0x8c0, 0x8c0, 0x8d0, 0x8e0, 0x8f0, 0x900, - 0x910, 0x910, 0x910, 0x910, 0x900, 0x8f0, 0x8c0, 0x870, 0x820, 0x7e0, - 0x7a0, 0x770, 0x760, 0x7a0, 0x7c0, 0x7c0, 0x6e0, 0x400, 0x3c0, 0x3c0}, - {0x710, 0x710, 0x7a0, 0x7f0, 0x820, 0x830, 0x840, 0x850, 0x850, 0x860, - 0x860, 0x860, 0x860, 0x860, 0x870, 0x870, 0x870, 0x880, 0x880, 0x880, - 0x890, 0x890, 0x8a0, 0x8a0, 0x8b0, 0x8b0, 0x8c0, 0x8c0, 0x8e0, 0x8f0, - 0x900, 0x910, 0x910, 0x910, 0x910, 0x900, 0x8e0, 0x8b0, 0x870, 0x820, - 0x7e0, 0x7b0, 0x760, 0x770, 0x7a0, 0x7c0, 0x780, 0x5d0, 0x3c0, 0x3c0}, - {0x680, 0x680, 0x750, 0x7b0, 0x7e0, 0x810, 0x820, 0x830, 0x840, 0x850, - 0x850, 0x850, 0x860, 0x860, 0x860, 0x860, 0x860, 0x860, 0x860, 0x860, - 0x870, 0x870, 0x870, 0x870, 0x880, 0x880, 0x880, 0x890, 0x8a0, 0x8b0, - 0x8c0, 0x8d0, 0x8e0, 0x8f0, 0x900, 0x910, 0x910, 0x910, 0x900, 0x8f0, - 0x8d0, 0x8b0, 0x840, 0x7f0, 0x790, 0x760, 0x7a0, 0x7c0, 0x7b0, 0x720} -}; - -static int8_t baptab[305] = { - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, /* 93 padding elems */ - - 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 14, 14, 14, 14, 14, 14, - 14, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, - 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, - 5, 4, 4, -3, -3, 3, 3, 3, -2, -2, -1, -1, -1, -1, -1, 0, - - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 /* 148 padding elems */ -}; - -static int bndtab[30] = {21, 22, 23, 24, 25, 26, 27, 28, 31, 34, - 37, 40, 43, 46, 49, 55, 61, 67, 73, 79, - 85, 97, 109, 121, 133, 157, 181, 205, 229, 253}; - -static int8_t latab[256] = { - -64, -63, -62, -61, -60, -59, -58, -57, -56, -55, -54, -53, - -52, -52, -51, -50, -49, -48, -47, -47, -46, -45, -44, -44, - -43, -42, -41, -41, -40, -39, -38, -38, -37, -36, -36, -35, - -35, -34, -33, -33, -32, -32, -31, -30, -30, -29, -29, -28, - -28, -27, -27, -26, -26, -25, -25, -24, -24, -23, -23, -22, - -22, -21, -21, -21, -20, -20, -19, -19, -19, -18, -18, -18, - -17, -17, -17, -16, -16, -16, -15, -15, -15, -14, -14, -14, - -13, -13, -13, -13, -12, -12, -12, -12, -11, -11, -11, -11, - -10, -10, -10, -10, -10, -9, -9, -9, -9, -9, -8, -8, - -8, -8, -8, -8, -7, -7, -7, -7, -7, -7, -6, -6, - -6, -6, -6, -6, -6, -6, -5, -5, -5, -5, -5, -5, - -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, - -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, - -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 -}; - -#define UPDATE_LEAK() \ -do { \ - fastleak += fdecay; \ - if (fastleak > psd + fgain) \ - fastleak = psd + fgain; \ - slowleak += sdecay; \ - if (slowleak > psd + sgain) \ - slowleak = psd + sgain; \ -} while (0) - -#define COMPUTE_MASK() \ -do { \ - if (psd > dbknee) \ - mask -= (psd - dbknee) >> 2; \ - if (mask > hth [i >> halfrate]) \ - mask = hth [i >> halfrate]; \ - mask -= snroffset + 128 * deltba[i]; \ - mask = (mask > 0) ? 0 : ((-mask) >> 5); \ - mask -= floor; \ -} while (0) - -void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart, - int start, int end, int fastleak, int slowleak, - expbap_t * expbap) -{ - static int slowgain[4] = {0x540, 0x4d8, 0x478, 0x410}; - static int dbpbtab[4] = {0xc00, 0x500, 0x300, 0x100}; - static int floortab[8] = {0x910, 0x950, 0x990, 0x9d0, - 0xa10, 0xa90, 0xb10, 0x1400}; - - int i, j; - uint8_t * exp; - int8_t * bap; - int fdecay, fgain, sdecay, sgain, dbknee, floor, snroffset; - int psd, mask; - int8_t * deltba; - int * hth; - int halfrate; - - halfrate = state->halfrate; - fdecay = (63 + 20 * ((state->bai >> 7) & 3)) >> halfrate; /* fdcycod */ - fgain = 128 + 128 * (ba->bai & 7); /* fgaincod */ - sdecay = (15 + 2 * (state->bai >> 9)) >> halfrate; /* sdcycod */ - sgain = slowgain[(state->bai >> 5) & 3]; /* sgaincod */ - dbknee = dbpbtab[(state->bai >> 3) & 3]; /* dbpbcod */ - hth = hthtab[state->fscod]; - /* - * if there is no delta bit allocation, make deltba point to an area - * known to contain zeroes. baptab+156 here. - */ - deltba = (ba->deltbae == DELTA_BIT_NONE) ? 
baptab + 156 : ba->deltba; - floor = floortab[state->bai & 7]; /* floorcod */ - snroffset = 960 - 64 * state->csnroffst - 4 * (ba->bai >> 3) + floor; - floor >>= 5; - - exp = expbap->exp; - bap = expbap->bap; - - i = bndstart; - j = start; - if (start == 0) { /* not the coupling channel */ - int lowcomp; - - lowcomp = 0; - j = end - 1; - do { - if (i < j) { - if (exp[i+1] == exp[i] - 2) - lowcomp = 384; - else if (lowcomp && (exp[i+1] > exp[i])) - lowcomp -= 64; - } - psd = 128 * exp[i]; - mask = psd + fgain + lowcomp; - COMPUTE_MASK (); - bap[i] = (baptab+156)[mask + 4 * exp[i]]; - i++; - } while ((i < 3) || ((i < 7) && (exp[i] > exp[i-1]))); - fastleak = psd + fgain; - slowleak = psd + sgain; - - while (i < 7) { - if (i < j) { - if (exp[i+1] == exp[i] - 2) - lowcomp = 384; - else if (lowcomp && (exp[i+1] > exp[i])) - lowcomp -= 64; - } - psd = 128 * exp[i]; - UPDATE_LEAK (); - mask = ((fastleak + lowcomp < slowleak) ? - fastleak + lowcomp : slowleak); - COMPUTE_MASK (); - bap[i] = (baptab+156)[mask + 4 * exp[i]]; - i++; - } - - if (end == 7) /* lfe channel */ - return; - - do { - if (exp[i+1] == exp[i] - 2) - lowcomp = 320; - else if (lowcomp && (exp[i+1] > exp[i])) - lowcomp -= 64; - psd = 128 * exp[i]; - UPDATE_LEAK (); - mask = ((fastleak + lowcomp < slowleak) ? - fastleak + lowcomp : slowleak); - COMPUTE_MASK (); - bap[i] = (baptab+156)[mask + 4 * exp[i]]; - i++; - } while (i < 20); - - while (lowcomp > 128) { /* two iterations maximum */ - lowcomp -= 128; - psd = 128 * exp[i]; - UPDATE_LEAK (); - mask = ((fastleak + lowcomp < slowleak) ? - fastleak + lowcomp : slowleak); - COMPUTE_MASK (); - bap[i] = (baptab+156)[mask + 4 * exp[i]]; - i++; - } - j = i; - } - - do { - int startband, endband; - - startband = j; - endband = ((bndtab-20)[i] < end) ? (bndtab-20)[i] : end; - psd = 128 * exp[j++]; - while (j < endband) { - int next, delta; - - next = 128 * exp[j++]; - delta = next - psd; - switch (delta >> 9) { - case -6: case -5: case -4: case -3: case -2: - psd = next; - break; - case -1: - psd = next + latab[(-delta) >> 1]; - break; - case 0: - psd += latab[delta >> 1]; - break; - } - } - /* minpsd = -289 */ - UPDATE_LEAK (); - mask = (fastleak < slowleak) ? fastleak : slowleak; - COMPUTE_MASK (); - i++; - j = startband; - do { - /* max(mask+4*exp)=147=-(minpsd+fgain-deltba-snroffset)>>5+4*exp */ - /* min(mask+4*exp)=-156=-(sgain-deltba-snroffset)>>5 */ - bap[j] = (baptab+156)[mask + 4 * exp[j]]; - } while (++j < endband); - } while (j < end); -} diff -r 459227551819 -r 1aece15222b5 liba52/bitstream.c --- a/liba52/bitstream.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -/* - * bitstream.c - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. - * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include "config.h" - -#include - -#include "a52.h" -#include "a52_internal.h" -#include "bitstream.h" - -#define BUFFER_SIZE 4096 - -#ifdef ALT_BITSTREAM_READER -int indx=0; -#endif - -void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf) -{ - int align; - - align = (long)buf & 3; - state->buffer_start = (uint32_t *) (buf - align); - state->bits_left = 0; -#ifdef ALT_BITSTREAM_READER - indx=0; -#endif - bitstream_get (state, align * 8); -} - -static inline void bitstream_fill_current (a52_state_t * state) -{ - uint32_t tmp; - - tmp = *(state->buffer_start++); - state->current_word = swab32 (tmp); -} - -/* - * The fast paths for _get is in the - * bitstream.h header file so it can be inlined. - * - * The "bottom half" of this routine is suffixed _bh - * - * -ah - */ - -uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits) -{ - uint32_t result; - - num_bits -= state->bits_left; - result = ((state->current_word << (32 - state->bits_left)) >> - (32 - state->bits_left)); - - bitstream_fill_current (state); - - if (num_bits != 0) - result = (result << num_bits) | (state->current_word >> (32 - num_bits)); - - state->bits_left = 32 - num_bits; - - return result; -} - -int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits) -{ - int32_t result; - - num_bits -= state->bits_left; - result = ((((int32_t)state->current_word) << (32 - state->bits_left)) >> - (32 - state->bits_left)); - - bitstream_fill_current(state); - - if (num_bits != 0) - result = (result << num_bits) | (state->current_word >> (32 - num_bits)); - - state->bits_left = 32 - num_bits; - - return result; -} diff -r 459227551819 -r 1aece15222b5 liba52/bitstream.h --- a/liba52/bitstream.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,152 +0,0 @@ -/* - * bitstream.h - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. - * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -/* code from ffmpeg/libavcodec */ -#if defined(__sparc__) || defined(hpux) -/* - * the alt bitstream reader performs unaligned memory accesses; that doesn't work - * on sparc/hpux. For now, disable ALT_BITSTREAM_READER. 
- */ -#undef ALT_BITSTREAM_READER -#else -// alternative (faster) bitstram reader (reades upto 3 bytes over the end of the input) -#define ALT_BITSTREAM_READER - -/* used to avoid misaligned exceptions on some archs (alpha, ...) */ -#if ARCH_X86 || HAVE_ARMV6 -# define unaligned32(a) (*(uint32_t*)(a)) -#else -# ifdef __GNUC__ -static inline uint32_t unaligned32(const void *v) { - struct Unaligned { - uint32_t i; - } __attribute__((packed)); - - return ((const struct Unaligned *) v)->i; -} -# elif defined(__DECC) -static inline uint32_t unaligned32(const void *v) { - return *(const __unaligned uint32_t *) v; -} -# else -static inline uint32_t unaligned32(const void *v) { - return *(const uint32_t *) v; -} -# endif -#endif //!ARCH_X86 - -#endif - -/* (stolen from the kernel) */ -#if HAVE_BIGENDIAN - -# define swab32(x) (x) - -#else - -# if defined (__i386__) - -# define swab32(x) __i386_swab32(x) - static inline const uint32_t __i386_swab32(uint32_t x) - { - __asm__("bswap %0" : "=r" (x) : "0" (x)); - return x; - } - -# else - -# define swab32(x) __generic_swab32(x) - static inline const uint32_t __generic_swab32(uint32_t x) - { - return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | - (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3])); - } -# endif -#endif - -#ifdef ALT_BITSTREAM_READER -extern int indx; -#endif - -void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf); -uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits); -int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits); - -static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits) -{ -#ifdef ALT_BITSTREAM_READER - uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); - - result<<= (indx&0x07); - result>>= 32 - num_bits; - indx+= num_bits; - - return result; -#else - uint32_t result; - - if (num_bits < state->bits_left) { - result = (state->current_word << (32 - state->bits_left)) >> (32 - num_bits); - state->bits_left -= num_bits; - return result; - } - - return a52_bitstream_get_bh (state, num_bits); -#endif -} - -static inline void bitstream_skip(a52_state_t * state, int num_bits) -{ -#ifdef ALT_BITSTREAM_READER - indx+= num_bits; -#else - bitstream_get(state, num_bits); -#endif -} - -static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits) -{ -#ifdef ALT_BITSTREAM_READER - int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); - - result<<= (indx&0x07); - result>>= 32 - num_bits; - indx+= num_bits; - - return result; -#else - int32_t result; - - if (num_bits < state->bits_left) { - result = (((int32_t)state->current_word) << (32 - state->bits_left)) >> (32 - num_bits); - state->bits_left -= num_bits; - return result; - } - - return a52_bitstream_get_bh_2 (state, num_bits); -#endif -} diff -r 459227551819 -r 1aece15222b5 liba52/crc.c --- a/liba52/crc.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -/* - * crc.c - * - * Copyright (C) Aaron Holtzman - May 1999 - * - * This file is part of ac3dec, a free Dolby AC-3 stream decoder. - * - * ac3dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. 
- * - * ac3dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include -#include -#include - -static const uint16_t crc_lut[256] = -{ - 0x0000,0x8005,0x800f,0x000a,0x801b,0x001e,0x0014,0x8011, - 0x8033,0x0036,0x003c,0x8039,0x0028,0x802d,0x8027,0x0022, - 0x8063,0x0066,0x006c,0x8069,0x0078,0x807d,0x8077,0x0072, - 0x0050,0x8055,0x805f,0x005a,0x804b,0x004e,0x0044,0x8041, - 0x80c3,0x00c6,0x00cc,0x80c9,0x00d8,0x80dd,0x80d7,0x00d2, - 0x00f0,0x80f5,0x80ff,0x00fa,0x80eb,0x00ee,0x00e4,0x80e1, - 0x00a0,0x80a5,0x80af,0x00aa,0x80bb,0x00be,0x00b4,0x80b1, - 0x8093,0x0096,0x009c,0x8099,0x0088,0x808d,0x8087,0x0082, - 0x8183,0x0186,0x018c,0x8189,0x0198,0x819d,0x8197,0x0192, - 0x01b0,0x81b5,0x81bf,0x01ba,0x81ab,0x01ae,0x01a4,0x81a1, - 0x01e0,0x81e5,0x81ef,0x01ea,0x81fb,0x01fe,0x01f4,0x81f1, - 0x81d3,0x01d6,0x01dc,0x81d9,0x01c8,0x81cd,0x81c7,0x01c2, - 0x0140,0x8145,0x814f,0x014a,0x815b,0x015e,0x0154,0x8151, - 0x8173,0x0176,0x017c,0x8179,0x0168,0x816d,0x8167,0x0162, - 0x8123,0x0126,0x012c,0x8129,0x0138,0x813d,0x8137,0x0132, - 0x0110,0x8115,0x811f,0x011a,0x810b,0x010e,0x0104,0x8101, - 0x8303,0x0306,0x030c,0x8309,0x0318,0x831d,0x8317,0x0312, - 0x0330,0x8335,0x833f,0x033a,0x832b,0x032e,0x0324,0x8321, - 0x0360,0x8365,0x836f,0x036a,0x837b,0x037e,0x0374,0x8371, - 0x8353,0x0356,0x035c,0x8359,0x0348,0x834d,0x8347,0x0342, - 0x03c0,0x83c5,0x83cf,0x03ca,0x83db,0x03de,0x03d4,0x83d1, - 0x83f3,0x03f6,0x03fc,0x83f9,0x03e8,0x83ed,0x83e7,0x03e2, - 0x83a3,0x03a6,0x03ac,0x83a9,0x03b8,0x83bd,0x83b7,0x03b2, - 0x0390,0x8395,0x839f,0x039a,0x838b,0x038e,0x0384,0x8381, - 0x0280,0x8285,0x828f,0x028a,0x829b,0x029e,0x0294,0x8291, - 0x82b3,0x02b6,0x02bc,0x82b9,0x02a8,0x82ad,0x82a7,0x02a2, - 0x82e3,0x02e6,0x02ec,0x82e9,0x02f8,0x82fd,0x82f7,0x02f2, - 0x02d0,0x82d5,0x82df,0x02da,0x82cb,0x02ce,0x02c4,0x82c1, - 0x8243,0x0246,0x024c,0x8249,0x0258,0x825d,0x8257,0x0252, - 0x0270,0x8275,0x827f,0x027a,0x826b,0x026e,0x0264,0x8261, - 0x0220,0x8225,0x822f,0x022a,0x823b,0x023e,0x0234,0x8231, - 0x8213,0x0216,0x021c,0x8219,0x0208,0x820d,0x8207,0x0202 -}; - -uint16_t crc16_block(uint8_t *data,uint32_t num_bytes) -{ - uint32_t i; - uint16_t state=0; - - for(i=0;i>8)] ^ (state<<8); - - return state; -} diff -r 459227551819 -r 1aece15222b5 liba52/downmix.c --- a/liba52/downmix.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1788 +0,0 @@ -/* - * downmix.c - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. - * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) - */ - -#include "config.h" - -#include -#include - -#include "a52.h" -#include "a52_internal.h" -#include "mm_accel.h" - -#define CONVERT(acmod,output) (((output) << 3) + (acmod)) - - -void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev)= NULL; -void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL; - -static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev); -static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev); -static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev); -static void upmix_MMX (sample_t * samples, int acmod, int output); -static void upmix_C (sample_t * samples, int acmod, int output); - -void downmix_accel_init(uint32_t mm_accel) -{ - a52_upmix= upmix_C; - a52_downmix= downmix_C; -#if ARCH_X86 || ARCH_X86_64 - if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX; - if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE; - if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow; -#endif -} - -int a52_downmix_init (int input, int flags, sample_t * level, - sample_t clev, sample_t slev) -{ - static uint8_t table[11][8] = { - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, - A52_STEREO, A52_STEREO, A52_STEREO, A52_STEREO}, - {A52_MONO, A52_MONO, A52_MONO, A52_MONO, - A52_MONO, A52_MONO, A52_MONO, A52_MONO}, - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, - A52_STEREO, A52_STEREO, A52_STEREO, A52_STEREO}, - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_3F, - A52_STEREO, A52_3F, A52_STEREO, A52_3F}, - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, - A52_2F1R, A52_2F1R, A52_2F1R, A52_2F1R}, - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, - A52_2F1R, A52_3F1R, A52_2F1R, A52_3F1R}, - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_3F, - A52_2F2R, A52_2F2R, A52_2F2R, A52_2F2R}, - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_3F, - A52_2F2R, A52_3F2R, A52_2F2R, A52_3F2R}, - {A52_CHANNEL1, A52_MONO, A52_MONO, A52_MONO, - A52_MONO, A52_MONO, A52_MONO, A52_MONO}, - {A52_CHANNEL2, A52_MONO, A52_MONO, A52_MONO, - A52_MONO, A52_MONO, A52_MONO, A52_MONO}, - {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_DOLBY, - A52_DOLBY, A52_DOLBY, A52_DOLBY, A52_DOLBY} - }; - int output; - - output = flags & A52_CHANNEL_MASK; - if (output > A52_DOLBY) - return -1; - - output = table[output][input & 7]; - - if ((output == A52_STEREO) && - ((input == A52_DOLBY) || ((input == A52_3F) && (clev == LEVEL_3DB)))) - output = A52_DOLBY; - - if (flags & A52_ADJUST_LEVEL) - switch (CONVERT (input & 7, output)) { - - case CONVERT (A52_3F, A52_MONO): - *level *= LEVEL_3DB / (1 + clev); - break; - - case CONVERT (A52_STEREO, A52_MONO): - case CONVERT (A52_2F2R, A52_2F1R): - case CONVERT (A52_3F2R, A52_3F1R): - level_3db: - *level *= LEVEL_3DB; - break; - - case CONVERT (A52_3F2R, A52_2F1R): - if (clev < LEVEL_PLUS3DB - 1) - goto level_3db; - /* break thru */ 
- case CONVERT (A52_3F, A52_STEREO): - case CONVERT (A52_3F1R, A52_2F1R): - case CONVERT (A52_3F1R, A52_2F2R): - case CONVERT (A52_3F2R, A52_2F2R): - *level /= 1 + clev; - break; - - case CONVERT (A52_2F1R, A52_MONO): - *level *= LEVEL_PLUS3DB / (2 + slev); - break; - - case CONVERT (A52_2F1R, A52_STEREO): - case CONVERT (A52_3F1R, A52_3F): - *level /= 1 + slev * LEVEL_3DB; - break; - - case CONVERT (A52_3F1R, A52_MONO): - *level *= LEVEL_3DB / (1 + clev + 0.5 * slev); - break; - - case CONVERT (A52_3F1R, A52_STEREO): - *level /= 1 + clev + slev * LEVEL_3DB; - break; - - case CONVERT (A52_2F2R, A52_MONO): - *level *= LEVEL_3DB / (1 + slev); - break; - - case CONVERT (A52_2F2R, A52_STEREO): - case CONVERT (A52_3F2R, A52_3F): - *level /= 1 + slev; - break; - - case CONVERT (A52_3F2R, A52_MONO): - *level *= LEVEL_3DB / (1 + clev + slev); - break; - - case CONVERT (A52_3F2R, A52_STEREO): - *level /= 1 + clev + slev; - break; - - case CONVERT (A52_MONO, A52_DOLBY): - *level *= LEVEL_PLUS3DB; - break; - - case CONVERT (A52_3F, A52_DOLBY): - case CONVERT (A52_2F1R, A52_DOLBY): - *level *= 1 / (1 + LEVEL_3DB); - break; - - case CONVERT (A52_3F1R, A52_DOLBY): - case CONVERT (A52_2F2R, A52_DOLBY): - *level *= 1 / (1 + 2 * LEVEL_3DB); - break; - - case CONVERT (A52_3F2R, A52_DOLBY): - *level *= 1 / (1 + 3 * LEVEL_3DB); - break; - } - - return output; -} - -int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, - sample_t clev, sample_t slev) -{ - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { - - case CONVERT (A52_CHANNEL, A52_CHANNEL): - case CONVERT (A52_MONO, A52_MONO): - case CONVERT (A52_STEREO, A52_STEREO): - case CONVERT (A52_3F, A52_3F): - case CONVERT (A52_2F1R, A52_2F1R): - case CONVERT (A52_3F1R, A52_3F1R): - case CONVERT (A52_2F2R, A52_2F2R): - case CONVERT (A52_3F2R, A52_3F2R): - case CONVERT (A52_STEREO, A52_DOLBY): - coeff[0] = coeff[1] = coeff[2] = coeff[3] = coeff[4] = level; - return 0; - - case CONVERT (A52_CHANNEL, A52_MONO): - coeff[0] = coeff[1] = level * LEVEL_6DB; - return 3; - - case CONVERT (A52_STEREO, A52_MONO): - coeff[0] = coeff[1] = level * LEVEL_3DB; - return 3; - - case CONVERT (A52_3F, A52_MONO): - coeff[0] = coeff[2] = level * LEVEL_3DB; - coeff[1] = level * clev * LEVEL_PLUS3DB; - return 7; - - case CONVERT (A52_2F1R, A52_MONO): - coeff[0] = coeff[1] = level * LEVEL_3DB; - coeff[2] = level * slev * LEVEL_3DB; - return 7; - - case CONVERT (A52_2F2R, A52_MONO): - coeff[0] = coeff[1] = level * LEVEL_3DB; - coeff[2] = coeff[3] = level * slev * LEVEL_3DB; - return 15; - - case CONVERT (A52_3F1R, A52_MONO): - coeff[0] = coeff[2] = level * LEVEL_3DB; - coeff[1] = level * clev * LEVEL_PLUS3DB; - coeff[3] = level * slev * LEVEL_3DB; - return 15; - - case CONVERT (A52_3F2R, A52_MONO): - coeff[0] = coeff[2] = level * LEVEL_3DB; - coeff[1] = level * clev * LEVEL_PLUS3DB; - coeff[3] = coeff[4] = level * slev * LEVEL_3DB; - return 31; - - case CONVERT (A52_MONO, A52_DOLBY): - coeff[0] = level * LEVEL_3DB; - return 0; - - case CONVERT (A52_3F, A52_DOLBY): - clev = LEVEL_3DB; - case CONVERT (A52_3F, A52_STEREO): - case CONVERT (A52_3F1R, A52_2F1R): - case CONVERT (A52_3F2R, A52_2F2R): - coeff[0] = coeff[2] = coeff[3] = coeff[4] = level; - coeff[1] = level * clev; - return 7; - - case CONVERT (A52_2F1R, A52_DOLBY): - slev = 1; - case CONVERT (A52_2F1R, A52_STEREO): - coeff[0] = coeff[1] = level; - coeff[2] = level * slev * LEVEL_3DB; - return 7; - - case CONVERT (A52_3F1R, A52_DOLBY): - clev = LEVEL_3DB; - slev = 1; - case CONVERT (A52_3F1R, 
A52_STEREO): - coeff[0] = coeff[2] = level; - coeff[1] = level * clev; - coeff[3] = level * slev * LEVEL_3DB; - return 15; - - case CONVERT (A52_2F2R, A52_DOLBY): - slev = LEVEL_3DB; - case CONVERT (A52_2F2R, A52_STEREO): - coeff[0] = coeff[1] = level; - coeff[2] = coeff[3] = level * slev; - return 15; - - case CONVERT (A52_3F2R, A52_DOLBY): - clev = LEVEL_3DB; - case CONVERT (A52_3F2R, A52_2F1R): - slev = LEVEL_3DB; - case CONVERT (A52_3F2R, A52_STEREO): - coeff[0] = coeff[2] = level; - coeff[1] = level * clev; - coeff[3] = coeff[4] = level * slev; - return 31; - - case CONVERT (A52_3F1R, A52_3F): - coeff[0] = coeff[1] = coeff[2] = level; - coeff[3] = level * slev * LEVEL_3DB; - return 13; - - case CONVERT (A52_3F2R, A52_3F): - coeff[0] = coeff[1] = coeff[2] = level; - coeff[3] = coeff[4] = level * slev; - return 29; - - case CONVERT (A52_2F2R, A52_2F1R): - coeff[0] = coeff[1] = level; - coeff[2] = coeff[3] = level * LEVEL_3DB; - return 12; - - case CONVERT (A52_3F2R, A52_3F1R): - coeff[0] = coeff[1] = coeff[2] = level; - coeff[3] = coeff[4] = level * LEVEL_3DB; - return 24; - - case CONVERT (A52_2F1R, A52_2F2R): - coeff[0] = coeff[1] = level; - coeff[2] = level * LEVEL_3DB; - return 0; - - case CONVERT (A52_3F1R, A52_2F2R): - coeff[0] = coeff[2] = level; - coeff[1] = level * clev; - coeff[3] = level * LEVEL_3DB; - return 7; - - case CONVERT (A52_3F1R, A52_3F2R): - coeff[0] = coeff[1] = coeff[2] = level; - coeff[3] = level * LEVEL_3DB; - return 0; - - case CONVERT (A52_CHANNEL, A52_CHANNEL1): - coeff[0] = level; - coeff[1] = 0; - return 0; - - case CONVERT (A52_CHANNEL, A52_CHANNEL2): - coeff[0] = 0; - coeff[1] = level; - return 0; - } - - return -1; /* NOTREACHED */ -} - -static void mix2to1 (sample_t * dest, sample_t * src, sample_t bias) -{ - int i; - - for (i = 0; i < 256; i++) - dest[i] += src[i] + bias; -} - -static void mix3to1 (sample_t * samples, sample_t bias) -{ - int i; - - for (i = 0; i < 256; i++) - samples[i] += samples[i + 256] + samples[i + 512] + bias; -} - -static void mix4to1 (sample_t * samples, sample_t bias) -{ - int i; - - for (i = 0; i < 256; i++) - samples[i] += (samples[i + 256] + samples[i + 512] + - samples[i + 768] + bias); -} - -static void mix5to1 (sample_t * samples, sample_t bias) -{ - int i; - - for (i = 0; i < 256; i++) - samples[i] += (samples[i + 256] + samples[i + 512] + - samples[i + 768] + samples[i + 1024] + bias); -} - -static void mix3to2 (sample_t * samples, sample_t bias) -{ - int i; - sample_t common; - - for (i = 0; i < 256; i++) { - common = samples[i + 256] + bias; - samples[i] += common; - samples[i + 256] = samples[i + 512] + common; - } -} - -static void mix21to2 (sample_t * left, sample_t * right, sample_t bias) -{ - int i; - sample_t common; - - for (i = 0; i < 256; i++) { - common = right[i + 256] + bias; - left[i] += common; - right[i] += common; - } -} - -static void mix21toS (sample_t * samples, sample_t bias) -{ - int i; - sample_t surround; - - for (i = 0; i < 256; i++) { - surround = samples[i + 512]; - samples[i] += bias - surround; - samples[i + 256] += bias + surround; - } -} - -static void mix31to2 (sample_t * samples, sample_t bias) -{ - int i; - sample_t common; - - for (i = 0; i < 256; i++) { - common = samples[i + 256] + samples[i + 768] + bias; - samples[i] += common; - samples[i + 256] = samples[i + 512] + common; - } -} - -static void mix31toS (sample_t * samples, sample_t bias) -{ - int i; - sample_t common, surround; - - for (i = 0; i < 256; i++) { - common = samples[i + 256] + bias; - surround = samples[i + 768]; 
- samples[i] += common - surround; - samples[i + 256] = samples[i + 512] + common + surround; - } -} - -static void mix22toS (sample_t * samples, sample_t bias) -{ - int i; - sample_t surround; - - for (i = 0; i < 256; i++) { - surround = samples[i + 512] + samples[i + 768]; - samples[i] += bias - surround; - samples[i + 256] += bias + surround; - } -} - -static void mix32to2 (sample_t * samples, sample_t bias) -{ - int i; - sample_t common; - - for (i = 0; i < 256; i++) { - common = samples[i + 256] + bias; - samples[i] += common + samples[i + 768]; - samples[i + 256] = common + samples[i + 512] + samples[i + 1024]; - } -} - -static void mix32toS (sample_t * samples, sample_t bias) -{ - int i; - sample_t common, surround; - - for (i = 0; i < 256; i++) { - common = samples[i + 256] + bias; - surround = samples[i + 768] + samples[i + 1024]; - samples[i] += common - surround; - samples[i + 256] = samples[i + 512] + common + surround; - } -} - -static void move2to1 (sample_t * src, sample_t * dest, sample_t bias) -{ - int i; - - for (i = 0; i < 256; i++) - dest[i] = src[i] + src[i + 256] + bias; -} - -static void zero (sample_t * samples) -{ - int i; - - for (i = 0; i < 256; i++) - samples[i] = 0; -} - -void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev) -{ - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { - - case CONVERT (A52_CHANNEL, A52_CHANNEL2): - memcpy (samples, samples + 256, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_CHANNEL, A52_MONO): - case CONVERT (A52_STEREO, A52_MONO): - mix_2to1: - mix2to1 (samples, samples + 256, bias); - break; - - case CONVERT (A52_2F1R, A52_MONO): - if (slev == 0) - goto mix_2to1; - case CONVERT (A52_3F, A52_MONO): - mix_3to1: - mix3to1 (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_MONO): - if (slev == 0) - goto mix_3to1; - case CONVERT (A52_2F2R, A52_MONO): - if (slev == 0) - goto mix_2to1; - mix4to1 (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_MONO): - if (slev == 0) - goto mix_3to1; - mix5to1 (samples, bias); - break; - - case CONVERT (A52_MONO, A52_DOLBY): - memcpy (samples + 256, samples, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F, A52_STEREO): - case CONVERT (A52_3F, A52_DOLBY): - mix_3to2: - mix3to2 (samples, bias); - break; - - case CONVERT (A52_2F1R, A52_STEREO): - if (slev == 0) - break; - mix21to2 (samples, samples + 256, bias); - break; - - case CONVERT (A52_2F1R, A52_DOLBY): - mix21toS (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_STEREO): - if (slev == 0) - goto mix_3to2; - mix31to2 (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_DOLBY): - mix31toS (samples, bias); - break; - - case CONVERT (A52_2F2R, A52_STEREO): - if (slev == 0) - break; - mix2to1 (samples, samples + 512, bias); - mix2to1 (samples + 256, samples + 768, bias); - break; - - case CONVERT (A52_2F2R, A52_DOLBY): - mix22toS (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_STEREO): - if (slev == 0) - goto mix_3to2; - mix32to2 (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_DOLBY): - mix32toS (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_3F): - if (slev == 0) - break; - mix21to2 (samples, samples + 512, bias); - break; - - case CONVERT (A52_3F2R, A52_3F): - if (slev == 0) - break; - mix2to1 (samples, samples + 768, bias); - mix2to1 (samples + 512, samples + 1024, bias); - break; - - case CONVERT (A52_3F1R, A52_2F1R): - mix3to2 (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - 
break; - - case CONVERT (A52_2F2R, A52_2F1R): - mix2to1 (samples + 512, samples + 768, bias); - break; - - case CONVERT (A52_3F2R, A52_2F1R): - mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used) - move2to1 (samples + 768, samples + 512, bias); - break; - - case CONVERT (A52_3F2R, A52_3F1R): - mix2to1 (samples + 768, samples + 1024, bias); - break; - - case CONVERT (A52_2F1R, A52_2F2R): - memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F1R, A52_2F2R): - mix3to2 (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F2R, A52_2F2R): - mix3to2 (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F1R, A52_3F2R): - memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); - break; - } -} - -void upmix_C (sample_t * samples, int acmod, int output) -{ - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { - - case CONVERT (A52_CHANNEL, A52_CHANNEL2): - memcpy (samples + 256, samples, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F2R, A52_MONO): - zero (samples + 1024); - case CONVERT (A52_3F1R, A52_MONO): - case CONVERT (A52_2F2R, A52_MONO): - zero (samples + 768); - case CONVERT (A52_3F, A52_MONO): - case CONVERT (A52_2F1R, A52_MONO): - zero (samples + 512); - case CONVERT (A52_CHANNEL, A52_MONO): - case CONVERT (A52_STEREO, A52_MONO): - zero (samples + 256); - break; - - case CONVERT (A52_3F2R, A52_STEREO): - case CONVERT (A52_3F2R, A52_DOLBY): - zero (samples + 1024); - case CONVERT (A52_3F1R, A52_STEREO): - case CONVERT (A52_3F1R, A52_DOLBY): - zero (samples + 768); - case CONVERT (A52_3F, A52_STEREO): - case CONVERT (A52_3F, A52_DOLBY): - mix_3to2: - memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); - zero (samples + 256); - break; - - case CONVERT (A52_2F2R, A52_STEREO): - case CONVERT (A52_2F2R, A52_DOLBY): - zero (samples + 768); - case CONVERT (A52_2F1R, A52_STEREO): - case CONVERT (A52_2F1R, A52_DOLBY): - zero (samples + 512); - break; - - case CONVERT (A52_3F2R, A52_3F): - zero (samples + 1024); - case CONVERT (A52_3F1R, A52_3F): - case CONVERT (A52_2F2R, A52_2F1R): - zero (samples + 768); - break; - - case CONVERT (A52_3F2R, A52_3F1R): - zero (samples + 1024); - break; - - case CONVERT (A52_3F2R, A52_2F1R): - zero (samples + 1024); - case CONVERT (A52_3F1R, A52_2F1R): - mix_31to21: - memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); - goto mix_3to2; - - case CONVERT (A52_3F2R, A52_2F2R): - memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); - goto mix_31to21; - } -} - -#if ARCH_X86 || ARCH_X86_64 -static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) -{ - __asm__ volatile( - "movlps %2, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps (%0, %%"REG_S"), %%xmm0 \n\t" - "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" - "addps (%1, %%"REG_S"), %%xmm0 \n\t" - "addps 16(%1, %%"REG_S"), %%xmm1\n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps %%xmm7, %%xmm1 \n\t" - "movaps %%xmm0, (%1, %%"REG_S") \n\t" - "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" - "add $32, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (src+256), "r" (dest+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix3to1_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" 
- "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps (%0, %%"REG_S"), %%xmm0 \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" - "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" - "addps %%xmm7, %%xmm1 \n\t" - "addps %%xmm1, %%xmm0 \n\t" - "movaps %%xmm0, (%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix4to1_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps (%0, %%"REG_S"), %%xmm0 \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" - "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" - "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps %%xmm1, %%xmm0 \n\t" - "movaps %%xmm0, (%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix5to1_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps (%0, %%"REG_S"), %%xmm0 \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" - "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" - "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" - "addps %%xmm1, %%xmm0 \n\t" - "movaps %%xmm0, (%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix3to2_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" - "addps %%xmm7, %%xmm0 \n\t" //common - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%"REG_S") \n\t" - "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias) -{ - __asm__ volatile( - "movlps %2, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" - "addps %%xmm7, %%xmm0 \n\t" //common - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "movaps (%1, %%"REG_S"), %%xmm2 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%"REG_S") \n\t" - "movaps %%xmm2, (%1, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (left+256), "r" (right+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix21toS_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" - "addps %%xmm7, %%xmm1 \n\t" - "addps %%xmm7, %%xmm2 \n\t" - "subps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%"REG_S") \n\t" - "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix31to2_SSE (sample_t * samples, sample_t bias) -{ - 
__asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" - "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" - "addps %%xmm7, %%xmm0 \n\t" // common - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%"REG_S") \n\t" - "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix31toS_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" - "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround - "addps %%xmm7, %%xmm0 \n\t" // common - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "subps %%xmm3, %%xmm1 \n\t" - "addps %%xmm3, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%"REG_S") \n\t" - "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix22toS_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" - "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" - "addps %%xmm7, %%xmm1 \n\t" - "addps %%xmm7, %%xmm2 \n\t" - "subps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%"REG_S") \n\t" - "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix32to2_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" - "addps %%xmm7, %%xmm0 \n\t" // common - "movaps %%xmm0, %%xmm1 \n\t" // common - "addps (%0, %%"REG_S"), %%xmm0 \n\t" - "addps 2048(%0, %%"REG_S"), %%xmm1\n\t" - "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" - "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" - "movaps %%xmm0, (%0, %%"REG_S") \n\t" - "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix32toS_SSE (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" - "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" - "addps %%xmm7, %%xmm0 \n\t" // common - "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t" - "subps %%xmm2, %%xmm1 \n\t" - "addps %%xmm2, %%xmm3 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm3 \n\t" - "movaps %%xmm1, (%0, %%"REG_S") \n\t" - "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias) 
-{ - __asm__ volatile( - "movlps %2, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps (%0, %%"REG_S"), %%xmm0 \n\t" - "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" - "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" - "addps 1040(%0, %%"REG_S"), %%xmm1\n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps %%xmm7, %%xmm1 \n\t" - "movaps %%xmm0, (%1, %%"REG_S") \n\t" - "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" - "add $32, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (src+256), "r" (dest+256), "m" (bias) - : "%"REG_S - ); -} - -static void zero_MMX(sample_t * samples) -{ - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "pxor %%mm0, %%mm0 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq %%mm0, (%0, %%"REG_S") \n\t" - "movq %%mm0, 8(%0, %%"REG_S") \n\t" - "movq %%mm0, 16(%0, %%"REG_S") \n\t" - "movq %%mm0, 24(%0, %%"REG_S") \n\t" - "add $32, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms" - :: "r" (samples+256) - : "%"REG_S - ); -} - -static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev) -{ - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { - - case CONVERT (A52_CHANNEL, A52_CHANNEL2): - memcpy (samples, samples + 256, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_CHANNEL, A52_MONO): - case CONVERT (A52_STEREO, A52_MONO): - mix_2to1_SSE: - mix2to1_SSE (samples, samples + 256, bias); - break; - - case CONVERT (A52_2F1R, A52_MONO): - if (slev == 0) - goto mix_2to1_SSE; - case CONVERT (A52_3F, A52_MONO): - mix_3to1_SSE: - mix3to1_SSE (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_MONO): - if (slev == 0) - goto mix_3to1_SSE; - case CONVERT (A52_2F2R, A52_MONO): - if (slev == 0) - goto mix_2to1_SSE; - mix4to1_SSE (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_MONO): - if (slev == 0) - goto mix_3to1_SSE; - mix5to1_SSE (samples, bias); - break; - - case CONVERT (A52_MONO, A52_DOLBY): - memcpy (samples + 256, samples, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F, A52_STEREO): - case CONVERT (A52_3F, A52_DOLBY): - mix_3to2_SSE: - mix3to2_SSE (samples, bias); - break; - - case CONVERT (A52_2F1R, A52_STEREO): - if (slev == 0) - break; - mix21to2_SSE (samples, samples + 256, bias); - break; - - case CONVERT (A52_2F1R, A52_DOLBY): - mix21toS_SSE (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_STEREO): - if (slev == 0) - goto mix_3to2_SSE; - mix31to2_SSE (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_DOLBY): - mix31toS_SSE (samples, bias); - break; - - case CONVERT (A52_2F2R, A52_STEREO): - if (slev == 0) - break; - mix2to1_SSE (samples, samples + 512, bias); - mix2to1_SSE (samples + 256, samples + 768, bias); - break; - - case CONVERT (A52_2F2R, A52_DOLBY): - mix22toS_SSE (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_STEREO): - if (slev == 0) - goto mix_3to2_SSE; - mix32to2_SSE (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_DOLBY): - mix32toS_SSE (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_3F): - if (slev == 0) - break; - mix21to2_SSE (samples, samples + 512, bias); - break; - - case CONVERT (A52_3F2R, A52_3F): - if (slev == 0) - break; - mix2to1_SSE (samples, samples + 768, bias); - mix2to1_SSE (samples + 512, samples + 1024, bias); - break; - - case CONVERT (A52_3F1R, A52_2F1R): - mix3to2_SSE (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_2F2R, A52_2F1R): - mix2to1_SSE (samples + 512, samples + 768, bias); - break; - - case CONVERT (A52_3F2R, 
A52_2F1R): - mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesnt seem to be used) - move2to1_SSE (samples + 768, samples + 512, bias); - break; - - case CONVERT (A52_3F2R, A52_3F1R): - mix2to1_SSE (samples + 768, samples + 1024, bias); - break; - - case CONVERT (A52_2F1R, A52_2F2R): - memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F1R, A52_2F2R): - mix3to2_SSE (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F2R, A52_2F2R): - mix3to2_SSE (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F1R, A52_3F2R): - memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); - break; - } -} - -static void upmix_MMX (sample_t * samples, int acmod, int output) -{ - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { - - case CONVERT (A52_CHANNEL, A52_CHANNEL2): - memcpy (samples + 256, samples, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F2R, A52_MONO): - zero_MMX (samples + 1024); - case CONVERT (A52_3F1R, A52_MONO): - case CONVERT (A52_2F2R, A52_MONO): - zero_MMX (samples + 768); - case CONVERT (A52_3F, A52_MONO): - case CONVERT (A52_2F1R, A52_MONO): - zero_MMX (samples + 512); - case CONVERT (A52_CHANNEL, A52_MONO): - case CONVERT (A52_STEREO, A52_MONO): - zero_MMX (samples + 256); - break; - - case CONVERT (A52_3F2R, A52_STEREO): - case CONVERT (A52_3F2R, A52_DOLBY): - zero_MMX (samples + 1024); - case CONVERT (A52_3F1R, A52_STEREO): - case CONVERT (A52_3F1R, A52_DOLBY): - zero_MMX (samples + 768); - case CONVERT (A52_3F, A52_STEREO): - case CONVERT (A52_3F, A52_DOLBY): - mix_3to2_MMX: - memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); - zero_MMX (samples + 256); - break; - - case CONVERT (A52_2F2R, A52_STEREO): - case CONVERT (A52_2F2R, A52_DOLBY): - zero_MMX (samples + 768); - case CONVERT (A52_2F1R, A52_STEREO): - case CONVERT (A52_2F1R, A52_DOLBY): - zero_MMX (samples + 512); - break; - - case CONVERT (A52_3F2R, A52_3F): - zero_MMX (samples + 1024); - case CONVERT (A52_3F1R, A52_3F): - case CONVERT (A52_2F2R, A52_2F1R): - zero_MMX (samples + 768); - break; - - case CONVERT (A52_3F2R, A52_3F1R): - zero_MMX (samples + 1024); - break; - - case CONVERT (A52_3F2R, A52_2F1R): - zero_MMX (samples + 1024); - case CONVERT (A52_3F1R, A52_2F1R): - mix_31to21_MMX: - memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); - goto mix_3to2_MMX; - - case CONVERT (A52_3F2R, A52_2F2R): - memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); - goto mix_31to21_MMX; - } -} - -static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias) -{ - __asm__ volatile( - "movd %2, %%mm7 \n\t" - "punpckldq %2, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_S"), %%mm0 \n\t" - "movq 8(%0, %%"REG_S"), %%mm1 \n\t" - "movq 16(%0, %%"REG_S"), %%mm2 \n\t" - "movq 24(%0, %%"REG_S"), %%mm3 \n\t" - "pfadd (%1, %%"REG_S"), %%mm0 \n\t" - "pfadd 8(%1, %%"REG_S"), %%mm1 \n\t" - "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t" - "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t" - "pfadd %%mm7, %%mm0 \n\t" - "pfadd %%mm7, %%mm1 \n\t" - "pfadd %%mm7, %%mm2 \n\t" - "pfadd %%mm7, %%mm3 \n\t" - "movq %%mm0, (%1, %%"REG_S") \n\t" - "movq %%mm1, 8(%1, %%"REG_S") \n\t" - "movq %%mm2, 16(%1, %%"REG_S") \n\t" - "movq %%mm3, 24(%1, %%"REG_S") \n\t" - "add $32, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" 
(src+256), "r" (dest+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix3to1_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_S"), %%mm0 \n\t" - "movq 8(%0, %%"REG_S"), %%mm1 \n\t" - "movq 1024(%0, %%"REG_S"), %%mm2\n\t" - "movq 1032(%0, %%"REG_S"), %%mm3\n\t" - "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" - "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" - "pfadd %%mm7, %%mm0 \n\t" - "pfadd %%mm7, %%mm1 \n\t" - "pfadd %%mm2, %%mm0 \n\t" - "pfadd %%mm3, %%mm1 \n\t" - "movq %%mm0, (%0, %%"REG_S") \n\t" - "movq %%mm1, 8(%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix4to1_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_S"), %%mm0 \n\t" - "movq 8(%0, %%"REG_S"), %%mm1 \n\t" - "movq 1024(%0, %%"REG_S"), %%mm2\n\t" - "movq 1032(%0, %%"REG_S"), %%mm3\n\t" - "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" - "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" - "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" - "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" - "pfadd %%mm7, %%mm0 \n\t" - "pfadd %%mm7, %%mm1 \n\t" - "pfadd %%mm2, %%mm0 \n\t" - "pfadd %%mm3, %%mm1 \n\t" - "movq %%mm0, (%0, %%"REG_S") \n\t" - "movq %%mm1, 8(%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix5to1_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_S"), %%mm0 \n\t" - "movq 8(%0, %%"REG_S"), %%mm1 \n\t" - "movq 1024(%0, %%"REG_S"), %%mm2\n\t" - "movq 1032(%0, %%"REG_S"), %%mm3\n\t" - "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" - "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" - "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" - "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" - "pfadd %%mm7, %%mm0 \n\t" - "pfadd %%mm7, %%mm1 \n\t" - "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" - "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" - "pfadd %%mm2, %%mm0 \n\t" - "pfadd %%mm3, %%mm1 \n\t" - "movq %%mm0, (%0, %%"REG_S") \n\t" - "movq %%mm1, 8(%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix3to2_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq 1024(%0, %%"REG_S"), %%mm0\n\t" - "movq 1032(%0, %%"REG_S"), %%mm1\n\t" - "pfadd %%mm7, %%mm0 \n\t" //common - "pfadd %%mm7, %%mm1 \n\t" //common - "movq (%0, %%"REG_S"), %%mm2 \n\t" - "movq 8(%0, %%"REG_S"), %%mm3 \n\t" - "movq 2048(%0, %%"REG_S"), %%mm4\n\t" - "movq 2056(%0, %%"REG_S"), %%mm5\n\t" - "pfadd %%mm0, %%mm2 \n\t" - "pfadd %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm4 \n\t" - "pfadd %%mm1, %%mm5 \n\t" - "movq %%mm2, (%0, %%"REG_S") \n\t" - "movq %%mm3, 8(%0, %%"REG_S") \n\t" - "movq %%mm4, 1024(%0, %%"REG_S")\n\t" - "movq %%mm5, 1032(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias) -{ - __asm__ volatile( - "movd %2, %%mm7 \n\t" - "punpckldq %2, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - 
"1: \n\t" - "movq 1024(%1, %%"REG_S"), %%mm0\n\t" - "movq 1032(%1, %%"REG_S"), %%mm1\n\t" - "pfadd %%mm7, %%mm0 \n\t" //common - "pfadd %%mm7, %%mm1 \n\t" //common - "movq (%0, %%"REG_S"), %%mm2 \n\t" - "movq 8(%0, %%"REG_S"), %%mm3 \n\t" - "movq (%1, %%"REG_S"), %%mm4 \n\t" - "movq 8(%1, %%"REG_S"), %%mm5 \n\t" - "pfadd %%mm0, %%mm2 \n\t" - "pfadd %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm4 \n\t" - "pfadd %%mm1, %%mm5 \n\t" - "movq %%mm2, (%0, %%"REG_S") \n\t" - "movq %%mm3, 8(%0, %%"REG_S") \n\t" - "movq %%mm4, (%1, %%"REG_S") \n\t" - "movq %%mm5, 8(%1, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (left+256), "r" (right+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix21toS_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround - "movq 2056(%0, %%"REG_S"), %%mm1\n\t" // surround - "movq (%0, %%"REG_S"), %%mm2 \n\t" - "movq 8(%0, %%"REG_S"), %%mm3 \n\t" - "movq 1024(%0, %%"REG_S"), %%mm4\n\t" - "movq 1032(%0, %%"REG_S"), %%mm5\n\t" - "pfadd %%mm7, %%mm2 \n\t" - "pfadd %%mm7, %%mm3 \n\t" - "pfadd %%mm7, %%mm4 \n\t" - "pfadd %%mm7, %%mm5 \n\t" - "pfsub %%mm0, %%mm2 \n\t" - "pfsub %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm4 \n\t" - "pfadd %%mm1, %%mm5 \n\t" - "movq %%mm2, (%0, %%"REG_S") \n\t" - "movq %%mm3, 8(%0, %%"REG_S") \n\t" - "movq %%mm4, 1024(%0, %%"REG_S")\n\t" - "movq %%mm5, 1032(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix31to2_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq 1024(%0, %%"REG_S"), %%mm0\n\t" - "movq 1032(%0, %%"REG_S"), %%mm1\n\t" - "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" - "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" - "pfadd %%mm7, %%mm0 \n\t" // common - "pfadd %%mm7, %%mm1 \n\t" // common - "movq (%0, %%"REG_S"), %%mm2 \n\t" - "movq 8(%0, %%"REG_S"), %%mm3 \n\t" - "movq 2048(%0, %%"REG_S"), %%mm4\n\t" - "movq 2056(%0, %%"REG_S"), %%mm5\n\t" - "pfadd %%mm0, %%mm2 \n\t" - "pfadd %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm4 \n\t" - "pfadd %%mm1, %%mm5 \n\t" - "movq %%mm2, (%0, %%"REG_S") \n\t" - "movq %%mm3, 8(%0, %%"REG_S") \n\t" - "movq %%mm4, 1024(%0, %%"REG_S")\n\t" - "movq %%mm5, 1032(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix31toS_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq 1024(%0, %%"REG_S"), %%mm0\n\t" - "movq 1032(%0, %%"REG_S"), %%mm1\n\t" - "pfadd %%mm7, %%mm0 \n\t" // common - "pfadd %%mm7, %%mm1 \n\t" // common - "movq (%0, %%"REG_S"), %%mm2 \n\t" - "movq 8(%0, %%"REG_S"), %%mm3 \n\t" - "movq 2048(%0, %%"REG_S"), %%mm4\n\t" - "movq 2056(%0, %%"REG_S"), %%mm5\n\t" - "pfadd %%mm0, %%mm2 \n\t" - "pfadd %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm4 \n\t" - "pfadd %%mm1, %%mm5 \n\t" - "movq 3072(%0, %%"REG_S"), %%mm0\n\t" // surround - "movq 3080(%0, %%"REG_S"), %%mm1\n\t" // surround - "pfsub %%mm0, %%mm2 \n\t" - "pfsub %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm4 \n\t" - "pfadd %%mm1, %%mm5 \n\t" - "movq %%mm2, (%0, %%"REG_S") \n\t" - "movq %%mm3, 8(%0, %%"REG_S") \n\t" - "movq %%mm4, 1024(%0, %%"REG_S")\n\t" - "movq 
%%mm5, 1032(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix22toS_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq 2048(%0, %%"REG_S"), %%mm0\n\t" - "movq 2056(%0, %%"REG_S"), %%mm1\n\t" - "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround - "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround - "movq (%0, %%"REG_S"), %%mm2 \n\t" - "movq 8(%0, %%"REG_S"), %%mm3 \n\t" - "movq 1024(%0, %%"REG_S"), %%mm4\n\t" - "movq 1032(%0, %%"REG_S"), %%mm5\n\t" - "pfadd %%mm7, %%mm2 \n\t" - "pfadd %%mm7, %%mm3 \n\t" - "pfadd %%mm7, %%mm4 \n\t" - "pfadd %%mm7, %%mm5 \n\t" - "pfsub %%mm0, %%mm2 \n\t" - "pfsub %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm4 \n\t" - "pfadd %%mm1, %%mm5 \n\t" - "movq %%mm2, (%0, %%"REG_S") \n\t" - "movq %%mm3, 8(%0, %%"REG_S") \n\t" - "movq %%mm4, 1024(%0, %%"REG_S")\n\t" - "movq %%mm5, 1032(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void mix32to2_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq 1024(%0, %%"REG_S"), %%mm0\n\t" - "movq 1032(%0, %%"REG_S"), %%mm1\n\t" - "pfadd %%mm7, %%mm0 \n\t" // common - "pfadd %%mm7, %%mm1 \n\t" // common - "movq %%mm0, %%mm2 \n\t" // common - "movq %%mm1, %%mm3 \n\t" // common - "pfadd (%0, %%"REG_S"), %%mm0 \n\t" - "pfadd 8(%0, %%"REG_S"), %%mm1 \n\t" - "pfadd 2048(%0, %%"REG_S"), %%mm2\n\t" - "pfadd 2056(%0, %%"REG_S"), %%mm3\n\t" - "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" - "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" - "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" - "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" - "movq %%mm0, (%0, %%"REG_S") \n\t" - "movq %%mm1, 8(%0, %%"REG_S") \n\t" - "movq %%mm2, 1024(%0, %%"REG_S")\n\t" - "movq %%mm3, 1032(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -/* todo: should be optimized better */ -static void mix32toS_3dnow (sample_t * samples, sample_t bias) -{ - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movd %1, %%mm7 \n\t" - "punpckldq %1, %%mm7 \n\t" - "movq 1024(%0, %%"REG_S"), %%mm0\n\t" - "movq 1032(%0, %%"REG_S"), %%mm1\n\t" - "movq 3072(%0, %%"REG_S"), %%mm4\n\t" - "movq 3080(%0, %%"REG_S"), %%mm5\n\t" - "pfadd %%mm7, %%mm0 \n\t" // common - "pfadd %%mm7, %%mm1 \n\t" // common - "pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround - "pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround - "movq (%0, %%"REG_S"), %%mm2 \n\t" - "movq 8(%0, %%"REG_S"), %%mm3 \n\t" - "movq 2048(%0, %%"REG_S"), %%mm6\n\t" - "movq 2056(%0, %%"REG_S"), %%mm7\n\t" - "pfsub %%mm4, %%mm2 \n\t" - "pfsub %%mm5, %%mm3 \n\t" - "pfadd %%mm4, %%mm6 \n\t" - "pfadd %%mm5, %%mm7 \n\t" - "pfadd %%mm0, %%mm2 \n\t" - "pfadd %%mm1, %%mm3 \n\t" - "pfadd %%mm0, %%mm6 \n\t" - "pfadd %%mm1, %%mm7 \n\t" - "movq %%mm2, (%0, %%"REG_S") \n\t" - "movq %%mm3, 8(%0, %%"REG_S") \n\t" - "movq %%mm6, 1024(%0, %%"REG_S")\n\t" - "movq %%mm7, 1032(%0, %%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%"REG_S - ); -} - -static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias) -{ - __asm__ volatile( - "movd %2, %%mm7 \n\t" - "punpckldq %2, %%mm7 \n\t" - "mov $-1024, 
%%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_S"), %%mm0 \n\t" - "movq 8(%0, %%"REG_S"), %%mm1 \n\t" - "movq 16(%0, %%"REG_S"), %%mm2 \n\t" - "movq 24(%0, %%"REG_S"), %%mm3 \n\t" - "pfadd 1024(%0, %%"REG_S"), %%mm0\n\t" - "pfadd 1032(%0, %%"REG_S"), %%mm1\n\t" - "pfadd 1040(%0, %%"REG_S"), %%mm2\n\t" - "pfadd 1048(%0, %%"REG_S"), %%mm3\n\t" - "pfadd %%mm7, %%mm0 \n\t" - "pfadd %%mm7, %%mm1 \n\t" - "pfadd %%mm7, %%mm2 \n\t" - "pfadd %%mm7, %%mm3 \n\t" - "movq %%mm0, (%1, %%"REG_S") \n\t" - "movq %%mm1, 8(%1, %%"REG_S") \n\t" - "movq %%mm2, 16(%1, %%"REG_S") \n\t" - "movq %%mm3, 24(%1, %%"REG_S") \n\t" - "add $32, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (src+256), "r" (dest+256), "m" (bias) - : "%"REG_S - ); -} - -static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev) -{ - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { - - case CONVERT (A52_CHANNEL, A52_CHANNEL2): - memcpy (samples, samples + 256, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_CHANNEL, A52_MONO): - case CONVERT (A52_STEREO, A52_MONO): - mix_2to1_3dnow: - mix2to1_3dnow (samples, samples + 256, bias); - break; - - case CONVERT (A52_2F1R, A52_MONO): - if (slev == 0) - goto mix_2to1_3dnow; - case CONVERT (A52_3F, A52_MONO): - mix_3to1_3dnow: - mix3to1_3dnow (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_MONO): - if (slev == 0) - goto mix_3to1_3dnow; - case CONVERT (A52_2F2R, A52_MONO): - if (slev == 0) - goto mix_2to1_3dnow; - mix4to1_3dnow (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_MONO): - if (slev == 0) - goto mix_3to1_3dnow; - mix5to1_3dnow (samples, bias); - break; - - case CONVERT (A52_MONO, A52_DOLBY): - memcpy (samples + 256, samples, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F, A52_STEREO): - case CONVERT (A52_3F, A52_DOLBY): - mix_3to2_3dnow: - mix3to2_3dnow (samples, bias); - break; - - case CONVERT (A52_2F1R, A52_STEREO): - if (slev == 0) - break; - mix21to2_3dnow (samples, samples + 256, bias); - break; - - case CONVERT (A52_2F1R, A52_DOLBY): - mix21toS_3dnow (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_STEREO): - if (slev == 0) - goto mix_3to2_3dnow; - mix31to2_3dnow (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_DOLBY): - mix31toS_3dnow (samples, bias); - break; - - case CONVERT (A52_2F2R, A52_STEREO): - if (slev == 0) - break; - mix2to1_3dnow (samples, samples + 512, bias); - mix2to1_3dnow (samples + 256, samples + 768, bias); - break; - - case CONVERT (A52_2F2R, A52_DOLBY): - mix22toS_3dnow (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_STEREO): - if (slev == 0) - goto mix_3to2_3dnow; - mix32to2_3dnow (samples, bias); - break; - - case CONVERT (A52_3F2R, A52_DOLBY): - mix32toS_3dnow (samples, bias); - break; - - case CONVERT (A52_3F1R, A52_3F): - if (slev == 0) - break; - mix21to2_3dnow (samples, samples + 512, bias); - break; - - case CONVERT (A52_3F2R, A52_3F): - if (slev == 0) - break; - mix2to1_3dnow (samples, samples + 768, bias); - mix2to1_3dnow (samples + 512, samples + 1024, bias); - break; - - case CONVERT (A52_3F1R, A52_2F1R): - mix3to2_3dnow (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_2F2R, A52_2F1R): - mix2to1_3dnow (samples + 512, samples + 768, bias); - break; - - case CONVERT (A52_3F2R, A52_2F1R): - mix3to2_3dnow (samples, bias); //FIXME possible bug? 
(output doesnt seem to be used) - move2to1_3dnow (samples + 768, samples + 512, bias); - break; - - case CONVERT (A52_3F2R, A52_3F1R): - mix2to1_3dnow (samples + 768, samples + 1024, bias); - break; - - case CONVERT (A52_2F1R, A52_2F2R): - memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F1R, A52_2F2R): - mix3to2_3dnow (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F2R, A52_2F2R): - mix3to2_3dnow (samples, bias); - memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); - memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); - break; - - case CONVERT (A52_3F1R, A52_3F2R): - memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); - break; - } - __asm__ volatile("femms":::"memory"); -} - -#endif // ARCH_X86 || ARCH_X86_64 diff -r 459227551819 -r 1aece15222b5 liba52/imdct.c --- a/liba52/imdct.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1304 +0,0 @@ -/* - * imdct.c - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * The ifft algorithms in this file have been largely inspired by Dan - * Bernstein's work, djbfft, available at http://cr.yp.to/djbfft.html - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. - * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) - * 3DNOW optimizations from Nick Kurshev - * michael did port them from libac3 (untested, perhaps totally broken) - * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) - */ - -#include "config.h" - -#include -#include -#ifdef LIBA52_DJBFFT -#include -#endif -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795029 -#endif -#include - -#include "a52.h" -#include "a52_internal.h" -#include "mm_accel.h" -#include "mangle.h" - -void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); - -#if CONFIG_RUNTIME_CPUDETECT -#undef HAVE_AMD3DNOWEXT -#define HAVE_AMD3DNOWEXT 0 -#endif - -typedef struct complex_s { - sample_t real; - sample_t imag; -} complex_t; - -static const int pm128[128] attribute_used __attribute__((aligned(16))) = -{ - 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, - 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, - 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, - 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, - 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, - 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, - 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, - 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 -}; - -static uint8_t attribute_used bit_reverse_512[] = { - 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, - 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, - 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, - 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c, - 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72, - 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a, - 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76, - 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e, - 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71, - 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79, - 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75, - 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d, - 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73, - 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b, - 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77, - 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f}; - -static uint8_t fftorder[] = { - 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176, - 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88, - 4,132, 68,196, 36,164,228,100, 20,148, 84,212,244,116, 52,180, - 252,124, 60,188, 28,156,220, 92, 12,140, 76,204,236,108, 44,172, - 2,130, 66,194, 34,162,226, 98, 18,146, 82,210,242,114, 50,178, - 10,138, 74,202, 42,170,234,106,250,122, 58,186, 26,154,218, 90, - 254,126, 62,190, 30,158,222, 94, 14,142, 78,206,238,110, 46,174, - 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86 -}; - -static complex_t __attribute__((aligned(16))) buf[128]; - -/* Twiddle factor LUT */ -static complex_t __attribute__((aligned(16))) w_1[1]; -static complex_t __attribute__((aligned(16))) w_2[2]; -static complex_t __attribute__((aligned(16))) w_4[4]; -static complex_t __attribute__((aligned(16))) w_8[8]; -static complex_t __attribute__((aligned(16))) w_16[16]; -static complex_t __attribute__((aligned(16))) w_32[32]; -static complex_t __attribute__((aligned(16))) w_64[64]; -static complex_t __attribute__((aligned(16))) * 
w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; - -/* Twiddle factors for IMDCT */ -static sample_t __attribute__((aligned(16))) xcos1[128]; -static sample_t __attribute__((aligned(16))) xsin1[128]; - -#if ARCH_X86 || ARCH_X86_64 -// NOTE: SSE needs 16byte alignment or it will segfault -// -static float __attribute__((aligned(16))) sseSinCos1c[256]; -static float __attribute__((aligned(16))) sseSinCos1d[256]; -static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; -//static float __attribute__((aligned(16))) sseW0[4]; -static float __attribute__((aligned(16))) sseW1[8]; -static float __attribute__((aligned(16))) sseW2[16]; -static float __attribute__((aligned(16))) sseW3[32]; -static float __attribute__((aligned(16))) sseW4[64]; -static float __attribute__((aligned(16))) sseW5[128]; -static float __attribute__((aligned(16))) sseW6[256]; -static float __attribute__((aligned(16))) *sseW[7]= - {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; -static float __attribute__((aligned(16))) sseWindow[512]; -#endif - -/* Root values for IFFT */ -static sample_t roots16[3]; -static sample_t roots32[7]; -static sample_t roots64[15]; -static sample_t roots128[31]; - -/* Twiddle factors for IMDCT */ -static complex_t pre1[128]; -static complex_t post1[64]; -static complex_t pre2[64]; -static complex_t post2[32]; - -static sample_t a52_imdct_window[256]; - -static void (* ifft128) (complex_t * buf); -static void (* ifft64) (complex_t * buf); - -static inline void ifft2 (complex_t * buf) -{ - double r, i; - - r = buf[0].real; - i = buf[0].imag; - buf[0].real += buf[1].real; - buf[0].imag += buf[1].imag; - buf[1].real = r - buf[1].real; - buf[1].imag = i - buf[1].imag; -} - -static inline void ifft4 (complex_t * buf) -{ - double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - - tmp1 = buf[0].real + buf[1].real; - tmp2 = buf[3].real + buf[2].real; - tmp3 = buf[0].imag + buf[1].imag; - tmp4 = buf[2].imag + buf[3].imag; - tmp5 = buf[0].real - buf[1].real; - tmp6 = buf[0].imag - buf[1].imag; - tmp7 = buf[2].imag - buf[3].imag; - tmp8 = buf[3].real - buf[2].real; - - buf[0].real = tmp1 + tmp2; - buf[0].imag = tmp3 + tmp4; - buf[2].real = tmp1 - tmp2; - buf[2].imag = tmp3 - tmp4; - buf[1].real = tmp5 + tmp7; - buf[1].imag = tmp6 + tmp8; - buf[3].real = tmp5 - tmp7; - buf[3].imag = tmp6 - tmp8; -} - -/* the basic split-radix ifft butterfly */ - -#define BUTTERFLY(a0,a1,a2,a3,wr,wi) do { \ - tmp5 = a2.real * wr + a2.imag * wi; \ - tmp6 = a2.imag * wr - a2.real * wi; \ - tmp7 = a3.real * wr - a3.imag * wi; \ - tmp8 = a3.imag * wr + a3.real * wi; \ - tmp1 = tmp5 + tmp7; \ - tmp2 = tmp6 + tmp8; \ - tmp3 = tmp6 - tmp8; \ - tmp4 = tmp7 - tmp5; \ - a2.real = a0.real - tmp1; \ - a2.imag = a0.imag - tmp2; \ - a3.real = a1.real - tmp3; \ - a3.imag = a1.imag - tmp4; \ - a0.real += tmp1; \ - a0.imag += tmp2; \ - a1.real += tmp3; \ - a1.imag += tmp4; \ -} while (0) - -/* split-radix ifft butterfly, specialized for wr=1 wi=0 */ - -#define BUTTERFLY_ZERO(a0,a1,a2,a3) do { \ - tmp1 = a2.real + a3.real; \ - tmp2 = a2.imag + a3.imag; \ - tmp3 = a2.imag - a3.imag; \ - tmp4 = a3.real - a2.real; \ - a2.real = a0.real - tmp1; \ - a2.imag = a0.imag - tmp2; \ - a3.real = a1.real - tmp3; \ - a3.imag = a1.imag - tmp4; \ - a0.real += tmp1; \ - a0.imag += tmp2; \ - a1.real += tmp3; \ - a1.imag += tmp4; \ -} while (0) - -/* split-radix ifft butterfly, specialized for wr=wi */ - -#define BUTTERFLY_HALF(a0,a1,a2,a3,w) do { \ - tmp5 = (a2.real + a2.imag) * w; \ - tmp6 = (a2.imag - a2.real) * w; \ - tmp7 = (a3.real 
- a3.imag) * w; \ - tmp8 = (a3.imag + a3.real) * w; \ - tmp1 = tmp5 + tmp7; \ - tmp2 = tmp6 + tmp8; \ - tmp3 = tmp6 - tmp8; \ - tmp4 = tmp7 - tmp5; \ - a2.real = a0.real - tmp1; \ - a2.imag = a0.imag - tmp2; \ - a3.real = a1.real - tmp3; \ - a3.imag = a1.imag - tmp4; \ - a0.real += tmp1; \ - a0.imag += tmp2; \ - a1.real += tmp3; \ - a1.imag += tmp4; \ -} while (0) - -static inline void ifft8 (complex_t * buf) -{ - double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - - ifft4 (buf); - ifft2 (buf + 4); - ifft2 (buf + 6); - BUTTERFLY_ZERO (buf[0], buf[2], buf[4], buf[6]); - BUTTERFLY_HALF (buf[1], buf[3], buf[5], buf[7], roots16[1]); -} - -static void ifft_pass (complex_t * buf, sample_t * weight, int n) -{ - complex_t * buf1; - complex_t * buf2; - complex_t * buf3; - double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - int i; - - buf++; - buf1 = buf + n; - buf2 = buf + 2 * n; - buf3 = buf + 3 * n; - - BUTTERFLY_ZERO (buf[-1], buf1[-1], buf2[-1], buf3[-1]); - - i = n - 1; - - do { - BUTTERFLY (buf[0], buf1[0], buf2[0], buf3[0], weight[n], weight[2*i]); - buf++; - buf1++; - buf2++; - buf3++; - weight++; - } while (--i); -} - -static void ifft16 (complex_t * buf) -{ - ifft8 (buf); - ifft4 (buf + 8); - ifft4 (buf + 12); - ifft_pass (buf, roots16 - 4, 4); -} - -static void ifft32 (complex_t * buf) -{ - ifft16 (buf); - ifft8 (buf + 16); - ifft8 (buf + 24); - ifft_pass (buf, roots32 - 8, 8); -} - -static void ifft64_c (complex_t * buf) -{ - ifft32 (buf); - ifft16 (buf + 32); - ifft16 (buf + 48); - ifft_pass (buf, roots64 - 16, 16); -} - -static void ifft128_c (complex_t * buf) -{ - ifft32 (buf); - ifft16 (buf + 32); - ifft16 (buf + 48); - ifft_pass (buf, roots64 - 16, 16); - - ifft32 (buf + 64); - ifft32 (buf + 96); - ifft_pass (buf, roots128 - 32, 32); -} - -void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias) -{ - int i, k; - sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2; - const sample_t * window = a52_imdct_window; - complex_t buf[128]; - - for (i = 0; i < 128; i++) { - k = fftorder[i]; - t_r = pre1[i].real; - t_i = pre1[i].imag; - - buf[i].real = t_i * data[255-k] + t_r * data[k]; - buf[i].imag = t_r * data[255-k] - t_i * data[k]; - } - - ifft128 (buf); - - /* Post IFFT complex multiply plus IFFT complex conjugate*/ - /* Window and convert to real valued signal */ - for (i = 0; i < 64; i++) { - /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ - t_r = post1[i].real; - t_i = post1[i].imag; - - a_r = t_r * buf[i].real + t_i * buf[i].imag; - a_i = t_i * buf[i].real - t_r * buf[i].imag; - b_r = t_i * buf[127-i].real + t_r * buf[127-i].imag; - b_i = t_r * buf[127-i].real - t_i * buf[127-i].imag; - - w_1 = window[2*i]; - w_2 = window[255-2*i]; - data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias; - data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias; - delay[2*i] = a_i; - - w_1 = window[2*i+1]; - w_2 = window[254-2*i]; - data[2*i+1] = delay[2*i+1] * w_2 + b_r * w_1 + bias; - data[254-2*i] = delay[2*i+1] * w_1 - b_r * w_2 + bias; - delay[2*i+1] = b_i; - } -} - -#if HAVE_ALTIVEC - -#ifdef HAVE_ALTIVEC_H -#include -#endif - -// used to build registers permutation vectors (vcprm) -// the 's' are for words in the _s_econd vector -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, 
WORD_ ## b, WORD_ ## c, WORD_ ## d} -#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} - -#define FOUROF(a) {a,a,a,a} - -// vcprmle is used to keep the same index as in the SSE version. -// it's the same as vcprm, with the index inversed -// ('le' is Little Endian) -#define vcprmle(a,b,c,d) vcprm(d,c,b,a) - -// used to build inverse/identity vectors (vcii) -// n is _n_egative, p is _p_ositive -#define FLOAT_n -1. -#define FLOAT_p 1. - - -void -imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias) -{ - int i; - int k; - int p,q; - int m; - long two_m; - long two_m_plus_one; - - sample_t tmp_b_i; - sample_t tmp_b_r; - sample_t tmp_a_i; - sample_t tmp_a_r; - - sample_t *data_ptr; - sample_t *delay_ptr; - sample_t *window_ptr; - - /* 512 IMDCT with source and dest data in 'data' */ - - /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ - for( i=0; i < 128; i++) { - /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ - int j= bit_reverse_512[i]; - buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); - buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); - } - - /* 1. iteration */ - for(i = 0; i < 128; i += 2) { -#if 0 - tmp_a_r = buf[i].real; - tmp_a_i = buf[i].imag; - tmp_b_r = buf[i+1].real; - tmp_b_i = buf[i+1].imag; - buf[i].real = tmp_a_r + tmp_b_r; - buf[i].imag = tmp_a_i + tmp_b_i; - buf[i+1].real = tmp_a_r - tmp_b_r; - buf[i+1].imag = tmp_a_i - tmp_b_i; -#else - vector float temp, bufv; - - bufv = vec_ld(i << 3, (float*)buf); - temp = vec_perm(bufv, bufv, vcprm(2,3,0,1)); - bufv = vec_madd(bufv, vcii(p,p,n,n), temp); - vec_st(bufv, i << 3, (float*)buf); -#endif - } - - /* 2. iteration */ - // Note w[1]={{1,0}, {0,-1}} - for(i = 0; i < 128; i += 4) { -#if 0 - tmp_a_r = buf[i].real; - tmp_a_i = buf[i].imag; - tmp_b_r = buf[i+2].real; - tmp_b_i = buf[i+2].imag; - buf[i].real = tmp_a_r + tmp_b_r; - buf[i].imag = tmp_a_i + tmp_b_i; - buf[i+2].real = tmp_a_r - tmp_b_r; - buf[i+2].imag = tmp_a_i - tmp_b_i; - tmp_a_r = buf[i+1].real; - tmp_a_i = buf[i+1].imag; - /* WARNING: im <-> re here ! */ - tmp_b_r = buf[i+3].imag; - tmp_b_i = buf[i+3].real; - buf[i+1].real = tmp_a_r + tmp_b_r; - buf[i+1].imag = tmp_a_i - tmp_b_i; - buf[i+3].real = tmp_a_r - tmp_b_r; - buf[i+3].imag = tmp_a_i + tmp_b_i; -#else - vector float buf01, buf23, temp1, temp2; - - buf01 = vec_ld((i + 0) << 3, (float*)buf); - buf23 = vec_ld((i + 2) << 3, (float*)buf); - buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2)); - - temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01); - temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01); - - vec_st(temp1, (i + 0) << 3, (float*)buf); - vec_st(temp2, (i + 2) << 3, (float*)buf); -#endif - } - - /* 3. 
iteration */ - for(i = 0; i < 128; i += 8) { -#if 0 - tmp_a_r = buf[i].real; - tmp_a_i = buf[i].imag; - tmp_b_r = buf[i+4].real; - tmp_b_i = buf[i+4].imag; - buf[i].real = tmp_a_r + tmp_b_r; - buf[i].imag = tmp_a_i + tmp_b_i; - buf[i+4].real = tmp_a_r - tmp_b_r; - buf[i+4].imag = tmp_a_i - tmp_b_i; - tmp_a_r = buf[1+i].real; - tmp_a_i = buf[1+i].imag; - tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; - tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; - buf[1+i].real = tmp_a_r + tmp_b_r; - buf[1+i].imag = tmp_a_i + tmp_b_i; - buf[i+5].real = tmp_a_r - tmp_b_r; - buf[i+5].imag = tmp_a_i - tmp_b_i; - tmp_a_r = buf[i+2].real; - tmp_a_i = buf[i+2].imag; - /* WARNING re <-> im & sign */ - tmp_b_r = buf[i+6].imag; - tmp_b_i = - buf[i+6].real; - buf[i+2].real = tmp_a_r + tmp_b_r; - buf[i+2].imag = tmp_a_i + tmp_b_i; - buf[i+6].real = tmp_a_r - tmp_b_r; - buf[i+6].imag = tmp_a_i - tmp_b_i; - tmp_a_r = buf[i+3].real; - tmp_a_i = buf[i+3].imag; - tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; - tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; - buf[i+3].real = tmp_a_r + tmp_b_r; - buf[i+3].imag = tmp_a_i + tmp_b_i; - buf[i+7].real = tmp_a_r - tmp_b_r; - buf[i+7].imag = tmp_a_i - tmp_b_i; -#else - vector float buf01, buf23, buf45, buf67; - - buf01 = vec_ld((i + 0) << 3, (float*)buf); - buf23 = vec_ld((i + 2) << 3, (float*)buf); - - tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; - tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; - buf[i+5].real = tmp_b_r; - buf[i+5].imag = tmp_b_i; - tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; - tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; - buf[i+7].real = tmp_b_r; - buf[i+7].imag = tmp_b_i; - - buf23 = vec_ld((i + 2) << 3, (float*)buf); - buf45 = vec_ld((i + 4) << 3, (float*)buf); - buf67 = vec_ld((i + 6) << 3, (float*)buf); - buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3)); - - vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf); - vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf); - vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf); - vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf); -#endif - } - - /* 4-7. 
iterations */ - for (m=3; m < 7; m++) { - two_m = (1 << m); - - two_m_plus_one = two_m<<1; - - for(i = 0; i < 128; i += two_m_plus_one) { - for(k = 0; k < two_m; k+=2) { -#if 0 - int p = k + i; - int q = p + two_m; - tmp_a_r = buf[p].real; - tmp_a_i = buf[p].imag; - tmp_b_r = - buf[q].real * w[m][k].real - - buf[q].imag * w[m][k].imag; - tmp_b_i = - buf[q].imag * w[m][k].real + - buf[q].real * w[m][k].imag; - buf[p].real = tmp_a_r + tmp_b_r; - buf[p].imag = tmp_a_i + tmp_b_i; - buf[q].real = tmp_a_r - tmp_b_r; - buf[q].imag = tmp_a_i - tmp_b_i; - - tmp_a_r = buf[(p + 1)].real; - tmp_a_i = buf[(p + 1)].imag; - tmp_b_r = - buf[(q + 1)].real * w[m][(k + 1)].real - - buf[(q + 1)].imag * w[m][(k + 1)].imag; - tmp_b_i = - buf[(q + 1)].imag * w[m][(k + 1)].real + - buf[(q + 1)].real * w[m][(k + 1)].imag; - buf[(p + 1)].real = tmp_a_r + tmp_b_r; - buf[(p + 1)].imag = tmp_a_i + tmp_b_i; - buf[(q + 1)].real = tmp_a_r - tmp_b_r; - buf[(q + 1)].imag = tmp_a_i - tmp_b_i; -#else - int p = k + i; - int q = p + two_m; - vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4; - const vector float vczero = (const vector float)FOUROF(0.); - // first compute buf[q] and buf[q+1] - vecq = vec_ld(q << 3, (float*)buf); - vecw = vec_ld(0, (float*)&(w[m][k])); - temp1 = vec_madd(vecq, vecw, vczero); - temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2)); - temp2 = vec_madd(temp2, vecw, vczero); - temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2)); - temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3)); - vecq = vec_madd(temp4, vcii(n,p,n,p), temp3); - // then butterfly with buf[p] and buf[p+1] - vecp = vec_ld(p << 3, (float*)buf); - - temp1 = vec_add(vecp, vecq); - temp2 = vec_sub(vecp, vecq); - - vec_st(temp1, p << 3, (float*)buf); - vec_st(temp2, q << 3, (float*)buf); -#endif - } - } - } - - /* Post IFFT complex multiply plus IFFT complex conjugate*/ - for( i=0; i < 128; i+=4) { - /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ -#if 0 - tmp_a_r = buf[(i + 0)].real; - tmp_a_i = -1.0 * buf[(i + 0)].imag; - buf[(i + 0)].real = - (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]); - buf[(i + 0)].imag = - (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]); - - tmp_a_r = buf[(i + 1)].real; - tmp_a_i = -1.0 * buf[(i + 1)].imag; - buf[(i + 1)].real = - (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]); - buf[(i + 1)].imag = - (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]); - - tmp_a_r = buf[(i + 2)].real; - tmp_a_i = -1.0 * buf[(i + 2)].imag; - buf[(i + 2)].real = - (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]); - buf[(i + 2)].imag = - (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]); - - tmp_a_r = buf[(i + 3)].real; - tmp_a_i = -1.0 * buf[(i + 3)].imag; - buf[(i + 3)].real = - (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]); - buf[(i + 3)].imag = - (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]); -#else - vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2; - vector float temp0022, temp1133, tempCS01; - const vector float vczero = (const vector float)FOUROF(0.); - - bufv_0 = vec_ld((i + 0) << 3, (float*)buf); - bufv_2 = vec_ld((i + 2) << 3, (float*)buf); - - cosv = vec_ld(i << 2, xcos1); - sinv = vec_ld(i << 2, xsin1); - - temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2)); - temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3)); - tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1)); - temp1 = vec_madd(temp0022, tempCS01, vczero); - tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1)); - temp2 = vec_madd(temp1133, tempCS01, vczero); - bufv_0 = vec_madd(temp2, vcii(p,n,p,n), 
temp1); - - vec_st(bufv_0, (i + 0) << 3, (float*)buf); - - /* idem with bufv_2 and high-order cosv/sinv */ - - temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2)); - temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3)); - tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3)); - temp1 = vec_madd(temp0022, tempCS01, vczero); - tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3)); - temp2 = vec_madd(temp1133, tempCS01, vczero); - bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1); - - vec_st(bufv_2, (i + 2) << 3, (float*)buf); - -#endif - } - - data_ptr = data; - delay_ptr = delay; - window_ptr = a52_imdct_window; - - /* Window and convert to real valued signal */ - for(i=0; i< 64; i++) { - *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; - *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; - } - - for(i=0; i< 64; i++) { - *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; - *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; - } - - /* The trailing edge of the window goes into the delay line */ - delay_ptr = delay; - - for(i=0; i< 64; i++) { - *delay_ptr++ = -buf[64+i].real * *--window_ptr; - *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; - } - - for(i=0; i<64; i++) { - *delay_ptr++ = buf[i].imag * *--window_ptr; - *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; - } -} -#endif - - -// Stuff below this line is borrowed from libac3 -#include "srfftp.h" -#if ARCH_X86 || ARCH_X86_64 -#undef HAVE_AMD3DNOW -#define HAVE_AMD3DNOW 1 -#include "srfftp_3dnow.h" - -const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; -const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; -const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; - -#undef HAVE_AMD3DNOWEXT -#define HAVE_AMD3DNOWEXT 0 -#include "imdct_3dnow.h" -#undef HAVE_AMD3DNOWEXT -#define HAVE_AMD3DNOWEXT 1 -#include "imdct_3dnow.h" - -#if !ARCH_X86_64 || !defined(PIC) -void -imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) -{ -/* int i,k; - int p,q;*/ - int m; - long two_m; - long two_m_plus_one; - long two_m_plus_one_shl3; - complex_t *buf_offset; - -/* sample_t tmp_a_i; - sample_t tmp_a_r; - sample_t tmp_b_i; - sample_t tmp_b_r;*/ - - sample_t *data_ptr; - sample_t *delay_ptr; - sample_t *window_ptr; - - /* 512 IMDCT with source and dest data in 'data' */ - /* see the c version (dct_do_512()), its allmost identical, just in C */ - - /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ - /* Bit reversed shuffling */ - __asm__ volatile( - "xor %%"REG_S", %%"REG_S" \n\t" - "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t" - "mov $1008, %%"REG_D" \n\t" - "push %%"REG_BP" \n\t" //use ebp without telling gcc - ASMALIGN(4) - "1: \n\t" - "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI - "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI - "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi - "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi - "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR - "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI - "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" - "subps %%xmm0, %%xmm2 \n\t" - "movzb (%%"REG_a"), %%"REG_d" \n\t" - "movzb 1(%%"REG_a"), %%"REG_BP" \n\t" - "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t" - "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t" - "add $16, %%"REG_S" \n\t" - "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap 
- "sub $16, %%"REG_D" \n\t" - "jnc 1b \n\t" - "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g* - :: "b" (data), "c" (buf) - : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d - ); - - - /* FFT Merge */ -/* unoptimized variant - for (m=1; m < 7; m++) { - if(m) - two_m = (1 << m); - else - two_m = 1; - - two_m_plus_one = (1 << (m+1)); - - for(i = 0; i < 128; i += two_m_plus_one) { - for(k = 0; k < two_m; k++) { - p = k + i; - q = p + two_m; - tmp_a_r = buf[p].real; - tmp_a_i = buf[p].imag; - tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; - tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; - buf[p].real = tmp_a_r + tmp_b_r; - buf[p].imag = tmp_a_i + tmp_b_i; - buf[q].real = tmp_a_r - tmp_b_r; - buf[q].imag = tmp_a_i - tmp_b_i; - } - } - } -*/ - - /* 1. iteration */ - // Note w[0][0]={1,0} - __asm__ volatile( - "xorps %%xmm1, %%xmm1 \n\t" - "xorps %%xmm2, %%xmm2 \n\t" - "mov %0, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p] - "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q] - "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p] - "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q] - "addps %%xmm1, %%xmm0 \n\t" - "subps %%xmm2, %%xmm0 \n\t" - "movaps %%xmm0, (%%"REG_S")\n\t" - "add $16, %%"REG_S" \n\t" - "cmp %1, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "g" (buf), "r" (buf + 128) - : "%"REG_S - ); - - /* 2. iteration */ - // Note w[1]={{1,0}, {0,-1}} - __asm__ volatile( - "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 - "mov %0, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3 - "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 - "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 - "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 - "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1 - "addps %%xmm2, %%xmm0 \n\t" - "subps %%xmm2, %%xmm1 \n\t" - "movaps %%xmm0, (%%"REG_S") \n\t" - "movaps %%xmm1, 16(%%"REG_S") \n\t" - "add $32, %%"REG_S" \n\t" - "cmp %1, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "g" (buf), "r" (buf + 128) - : "%"REG_S - ); - - /* 3. iteration */ -/* - Note sseW2+0={1,1,sqrt(2),sqrt(2)) - Note sseW2+16={0,0,sqrt(2),-sqrt(2)) - Note sseW2+32={0,0,-sqrt(2),-sqrt(2)) - Note sseW2+48={1,-1,sqrt(2),-sqrt(2)) -*/ - __asm__ volatile( - "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" - "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" - "xorps %%xmm5, %%xmm5 \n\t" - "xorps %%xmm2, %%xmm2 \n\t" - "mov %0, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5 - "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7 - "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 - "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 - "mulps %%xmm2, %%xmm4 \n\t" - "mulps %%xmm3, %%xmm5 \n\t" - "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 - "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 - "mulps %%xmm6, %%xmm3 \n\t" - "mulps %%xmm7, %%xmm2 \n\t" - "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 - "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3 - "addps %%xmm4, %%xmm2 \n\t" - "addps %%xmm5, %%xmm3 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "movaps %%xmm3, %%xmm5 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "addps %%xmm1, %%xmm3 \n\t" - "subps %%xmm4, %%xmm0 \n\t" - "subps %%xmm5, %%xmm1 \n\t" - "movaps %%xmm2, (%%"REG_S") \n\t" - "movaps %%xmm3, 16(%%"REG_S") \n\t" - "movaps %%xmm0, 32(%%"REG_S") \n\t" - "movaps %%xmm1, 48(%%"REG_S") \n\t" - "add $64, %%"REG_S" \n\t" - "cmp %1, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "g" (buf), "r" (buf + 128) - : "%"REG_S - ); - - /* 4-7. 
iterations */ - for (m=3; m < 7; m++) { - two_m = (1 << m); - two_m_plus_one = two_m<<1; - two_m_plus_one_shl3 = (two_m_plus_one<<3); - buf_offset = buf+128; - __asm__ volatile( - "mov %0, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "xor %%"REG_D", %%"REG_D" \n\t" // k - "lea (%%"REG_S", %3), %%"REG_d" \n\t" - "2: \n\t" - "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t" - "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t" - "mulps %%xmm1, %%xmm2 \n\t" - "shufps $0xB1, %%xmm1, %%xmm1 \n\t" - "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t" - "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t" - "addps %%xmm2, %%xmm1 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "subps %%xmm2, %%xmm0 \n\t" - "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t" - "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t" - "add $16, %%"REG_D" \n\t" - "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0 - "jb 2b \n\t" - "add %2, %%"REG_S" \n\t" - "cmp %1, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3), - "r" (sseW[m]) - : "%"REG_S, "%"REG_D, "%"REG_d - ); - } - - /* Post IFFT complex multiply plus IFFT complex conjugate*/ - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - ASMALIGN(4) - "1: \n\t" - "movaps (%0, %%"REG_S"), %%xmm0 \n\t" - "movaps (%0, %%"REG_S"), %%xmm1 \n\t" - "shufps $0xB1, %%xmm0, %%xmm0 \n\t" - "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t" - "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" - "addps %%xmm1, %%xmm0 \n\t" - "movaps %%xmm0, (%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - :: "r" (buf+128) - : "%"REG_S - ); - - - data_ptr = data; - delay_ptr = delay; - window_ptr = a52_imdct_window; - - /* Window and convert to real valued signal */ - __asm__ volatile( - "xor %%"REG_D", %%"REG_D" \n\t" // 0 - "xor %%"REG_S", %%"REG_S" \n\t" // 0 - "movss %3, %%xmm2 \n\t" // bias - "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... - ASMALIGN(4) - "1: \n\t" - "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? - "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? - "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? - "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? - "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A - "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" - "addps (%2, %%"REG_S"), %%xmm0 \n\t" - "addps %%xmm2, %%xmm0 \n\t" - "movaps %%xmm0, (%1, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - "sub $16, %%"REG_D" \n\t" - "cmp $512, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) - : "%"REG_S, "%"REG_D - ); - data_ptr+=128; - delay_ptr+=128; -// window_ptr+=128; - - __asm__ volatile( - "mov $1024, %%"REG_D" \n\t" // 512 - "xor %%"REG_S", %%"REG_S" \n\t" // 0 - "movss %3, %%xmm2 \n\t" // bias - "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... - ASMALIGN(4) - "1: \n\t" - "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A - "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C - "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C - "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? 
A - "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A - "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" - "addps (%2, %%"REG_S"), %%xmm0 \n\t" - "addps %%xmm2, %%xmm0 \n\t" - "movaps %%xmm0, (%1, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - "sub $16, %%"REG_D" \n\t" - "cmp $512, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) - : "%"REG_S, "%"REG_D - ); - data_ptr+=128; -// window_ptr+=128; - - /* The trailing edge of the window goes into the delay line */ - delay_ptr = delay; - - __asm__ volatile( - "xor %%"REG_D", %%"REG_D" \n\t" // 0 - "xor %%"REG_S", %%"REG_S" \n\t" // 0 - ASMALIGN(4) - "1: \n\t" - "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A - "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C - "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C - "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A - "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A - "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" - "movaps %%xmm0, (%1, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - "sub $16, %%"REG_D" \n\t" - "cmp $512, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "r" (buf+64), "r" (delay_ptr) - : "%"REG_S, "%"REG_D - ); - delay_ptr+=128; -// window_ptr-=128; - - __asm__ volatile( - "mov $1024, %%"REG_D" \n\t" // 1024 - "xor %%"REG_S", %%"REG_S" \n\t" // 0 - ASMALIGN(4) - "1: \n\t" - "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? - "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? - "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? - "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? - "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A - "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" - "movaps %%xmm0, (%1, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - "sub $16, %%"REG_D" \n\t" - "cmp $512, %%"REG_S" \n\t" - " jb 1b \n\t" - :: "r" (buf), "r" (delay_ptr) - : "%"REG_S, "%"REG_D - ); -} -#endif -#endif // ARCH_X86 || ARCH_X86_64 - -void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias) -{ - int i, k; - sample_t t_r, t_i, a_r, a_i, b_r, b_i, c_r, c_i, d_r, d_i, w_1, w_2; - const sample_t * window = a52_imdct_window; - complex_t buf1[64], buf2[64]; - - /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ - for (i = 0; i < 64; i++) { - k = fftorder[i]; - t_r = pre2[i].real; - t_i = pre2[i].imag; - - buf1[i].real = t_i * data[254-k] + t_r * data[k]; - buf1[i].imag = t_r * data[254-k] - t_i * data[k]; - - buf2[i].real = t_i * data[255-k] + t_r * data[k+1]; - buf2[i].imag = t_r * data[255-k] - t_i * data[k+1]; - } - - ifft64 (buf1); - ifft64 (buf2); - - /* Post IFFT complex multiply */ - /* Window and convert to real valued signal */ - for (i = 0; i < 32; i++) { - /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */ - t_r = post2[i].real; - t_i = post2[i].imag; - - a_r = t_r * buf1[i].real + t_i * buf1[i].imag; - a_i = t_i * buf1[i].real - t_r * buf1[i].imag; - b_r = t_i * buf1[63-i].real + t_r * buf1[63-i].imag; - b_i = t_r * buf1[63-i].real - t_i * buf1[63-i].imag; - - c_r = t_r * buf2[i].real + t_i * buf2[i].imag; - c_i = t_i * buf2[i].real - t_r * buf2[i].imag; - d_r = t_i * buf2[63-i].real + t_r * buf2[63-i].imag; - d_i = t_r * buf2[63-i].real - t_i * buf2[63-i].imag; - - w_1 = window[2*i]; - w_2 = window[255-2*i]; - data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias; - data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias; - delay[2*i] = c_i; - - w_1 = window[128+2*i]; - w_2 = window[127-2*i]; - data[128+2*i] = delay[127-2*i] * w_2 + a_i * w_1 + bias; - data[127-2*i] = delay[127-2*i] * w_1 - a_i * w_2 + bias; - delay[127-2*i] = c_r; 
- - w_1 = window[2*i+1]; - w_2 = window[254-2*i]; - data[2*i+1] = delay[2*i+1] * w_2 - b_i * w_1 + bias; - data[254-2*i] = delay[2*i+1] * w_1 + b_i * w_2 + bias; - delay[2*i+1] = d_r; - - w_1 = window[129+2*i]; - w_2 = window[126-2*i]; - data[129+2*i] = delay[126-2*i] * w_2 + b_r * w_1 + bias; - data[126-2*i] = delay[126-2*i] * w_1 - b_r * w_2 + bias; - delay[126-2*i] = d_i; - } -} - -static double besselI0 (double x) -{ - double bessel = 1; - int i = 100; - - do - bessel = bessel * x / (i * i) + 1; - while (--i); - return bessel; -} - -void a52_imdct_init (uint32_t mm_accel) -{ - int i, j, k; - double sum; - - /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */ - sum = 0; - for (i = 0; i < 256; i++) { - sum += besselI0 (i * (256 - i) * (5 * M_PI / 256) * (5 * M_PI / 256)); - a52_imdct_window[i] = sum; - } - sum++; - for (i = 0; i < 256; i++) - a52_imdct_window[i] = sqrt (a52_imdct_window[i] / sum); - - for (i = 0; i < 3; i++) - roots16[i] = cos ((M_PI / 8) * (i + 1)); - - for (i = 0; i < 7; i++) - roots32[i] = cos ((M_PI / 16) * (i + 1)); - - for (i = 0; i < 15; i++) - roots64[i] = cos ((M_PI / 32) * (i + 1)); - - for (i = 0; i < 31; i++) - roots128[i] = cos ((M_PI / 64) * (i + 1)); - - for (i = 0; i < 64; i++) { - k = fftorder[i] / 2 + 64; - pre1[i].real = cos ((M_PI / 256) * (k - 0.25)); - pre1[i].imag = sin ((M_PI / 256) * (k - 0.25)); - } - - for (i = 64; i < 128; i++) { - k = fftorder[i] / 2 + 64; - pre1[i].real = -cos ((M_PI / 256) * (k - 0.25)); - pre1[i].imag = -sin ((M_PI / 256) * (k - 0.25)); - } - - for (i = 0; i < 64; i++) { - post1[i].real = cos ((M_PI / 256) * (i + 0.5)); - post1[i].imag = sin ((M_PI / 256) * (i + 0.5)); - } - - for (i = 0; i < 64; i++) { - k = fftorder[i] / 4; - pre2[i].real = cos ((M_PI / 128) * (k - 0.25)); - pre2[i].imag = sin ((M_PI / 128) * (k - 0.25)); - } - - for (i = 0; i < 32; i++) { - post2[i].real = cos ((M_PI / 128) * (i + 0.5)); - post2[i].imag = sin ((M_PI / 128) * (i + 0.5)); - } - for (i = 0; i < 128; i++) { - xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); - xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); - } - for (i = 0; i < 7; i++) { - j = 1 << i; - for (k = 0; k < j; k++) { - w[i][k].real = cos (-M_PI * k / j); - w[i][k].imag = sin (-M_PI * k / j); - } - } -#if ARCH_X86 || ARCH_X86_64 - for (i = 0; i < 128; i++) { - sseSinCos1c[2*i+0]= xcos1[i]; - sseSinCos1c[2*i+1]= -xcos1[i]; - sseSinCos1d[2*i+0]= xsin1[i]; - sseSinCos1d[2*i+1]= xsin1[i]; - } - for (i = 1; i < 7; i++) { - j = 1 << i; - for (k = 0; k < j; k+=2) { - - sseW[i][4*k + 0] = w[i][k+0].real; - sseW[i][4*k + 1] = w[i][k+0].real; - sseW[i][4*k + 2] = w[i][k+1].real; - sseW[i][4*k + 3] = w[i][k+1].real; - - sseW[i][4*k + 4] = -w[i][k+0].imag; - sseW[i][4*k + 5] = w[i][k+0].imag; - sseW[i][4*k + 6] = -w[i][k+1].imag; - sseW[i][4*k + 7] = w[i][k+1].imag; - - //we multiply more or less uninitalized numbers so we need to use exactly 0.0 - if(k==0) - { -// sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0; - sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0; - } - - if(2*k == j) - { - sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0; -// sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0); - } - } - } - - for(i=0; i<128; i++) - { - sseWindow[2*i+0]= -a52_imdct_window[2*i+0]; - sseWindow[2*i+1]= a52_imdct_window[2*i+1]; - } - - for(i=0; i<64; i++) - { - sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1]; - sseWindow[256 + 2*i+1]= a52_imdct_window[254 - 2*i+0]; - sseWindow[384 + 2*i+0]= a52_imdct_window[126 - 2*i+1]; - sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0]; - } -#endif - a52_imdct_512 
= imdct_do_512; - ifft128 = ifft128_c; - ifft64 = ifft64_c; - -#if ARCH_X86 || ARCH_X86_64 -#if !ARCH_X86_64 || !defined(PIC) - if(mm_accel & MM_ACCEL_X86_SSE) - { - fprintf (stderr, "Using SSE optimized IMDCT transform\n"); - a52_imdct_512 = imdct_do_512_sse; - } - else -#endif - if(mm_accel & MM_ACCEL_X86_3DNOWEXT) - { - fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n"); - a52_imdct_512 = imdct_do_512_3dnowex; - } - else - if(mm_accel & MM_ACCEL_X86_3DNOW) - { - fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); - a52_imdct_512 = imdct_do_512_3dnow; - } - else -#endif // ARCH_X86 || ARCH_X86_64 -#if HAVE_ALTIVEC - if (mm_accel & MM_ACCEL_PPC_ALTIVEC) - { - fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); - a52_imdct_512 = imdct_do_512_altivec; - } - else -#endif - -#ifdef LIBA52_DJBFFT - if (mm_accel & MM_ACCEL_DJBFFT) { - fprintf (stderr, "Using djbfft for IMDCT transform\n"); - ifft128 = (void (*) (complex_t *)) fftc4_un128; - ifft64 = (void (*) (complex_t *)) fftc4_un64; - } else -#endif - { - fprintf (stderr, "No accelerated IMDCT transform found\n"); - } -} diff -r 459227551819 -r 1aece15222b5 liba52/imdct_3dnow.h --- a/liba52/imdct_3dnow.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,581 +0,0 @@ -/* - * 3DNOW and 3DNOWEX optimized IMDCT - * Copyright (C) 2002 Nick Kurshev - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#undef FFT_4_3DNOW -#undef FFT_8_3DNOW -#undef FFT_ASMB_3DNOW -#undef FFT_ASMB16_3DNOW -#undef FFT_128P_3DNOW - -#if HAVE_AMD3DNOWEXT -#define FFT_4_3DNOW fft_4_3dnowex -#define FFT_8_3DNOW fft_8_3dnowex -#define FFT_ASMB_3DNOW fft_asmb_3dnowex -#define FFT_ASMB16_3DNOW fft_asmb16_3dnowex -#define FFT_128P_3DNOW fft_128p_3dnowex -#else -#define FFT_4_3DNOW fft_4_3dnow -#define FFT_8_3DNOW fft_8_3dnow -#define FFT_ASMB_3DNOW fft_asmb_3dnow -#define FFT_ASMB16_3DNOW fft_asmb16_3dnow -#define FFT_128P_3DNOW fft_128p_3dnow -#endif - -static void FFT_4_3DNOW(complex_t *x) -{ - /* delta_p = 1 here */ - /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} - */ - __asm__ volatile( - "movq 24(%1), %%mm3\n\t" - "movq 8(%1), %%mm1\n\t" - "pxor %2, %%mm3\n\t" /* mm3.re | -mm3.im */ - "pxor %3, %%mm1\n\t" /* -mm1.re | mm1.im */ - "pfadd %%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */ - "movq %%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */ -#if HAVE_AMD3DNOWEXT - "pswapd %%mm4, %%mm4\n\t" -#else - "punpckldq %%mm4, %%mm5\n\t" - "punpckhdq %%mm5, %%mm4\n\t" -#endif - "movq (%1), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */ - "movq (%1), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */ - "movq 24(%1), %%mm7\n\t" /* u.re = x[3].re + x[1].re; */ - "pfsub 16(%1), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */ - "pfadd 16(%1), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */ - "pfadd 8(%1), %%mm7\n\t" /* u.im = x[3].im + x[1].im; mm7 = u */ - - "movq %%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */ - "movq %%mm5, %%mm1\n\t" /* x[1].re = yb.re + vi.re; */ - "pfadd %%mm7, %%mm0\n\t" /*x[0].im = yt.im + u.im; */ - "pfadd %%mm4, %%mm1\n\t" /* x[1].im = yb.im + vi.im; */ - "movq %%mm0, (%0)\n\t" - "movq %%mm1, 8(%0)\n\t" - - "pfsub %%mm7, %%mm6\n\t" /* x[2].re = yt.re - u.re; */ - "pfsub %%mm4, %%mm5\n\t" /* x[3].re = yb.re - vi.re; */ - "movq %%mm6, 16(%0)\n\t" /* x[2].im = yt.im - u.im; */ - "movq %%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */ - :"=r"(x) - :"0"(x), - "m"(x_plus_minus_3dnow), - "m"(x_minus_plus_3dnow) - :"memory"); -} - -static void FFT_8_3DNOW(complex_t *x) -{ - /* delta_p = diag{1, sqrt(i)} here */ - /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} - */ - complex_t wT1, wB1, wB2; - - __asm__ volatile( - "movq 8(%2), %%mm0\n\t" - "movq 24(%2), %%mm1\n\t" - "movq %%mm0, %0\n\t" /* wT1 = x[1]; */ - "movq %%mm1, %1\n\t" /* wB1 = x[3]; */ - :"=m"(wT1), "=m"(wB1) - :"r"(x) - :"memory"); - - __asm__ volatile( - "movq 16(%0), %%mm2\n\t" - "movq 32(%0), %%mm3\n\t" - "movq %%mm2, 8(%0)\n\t" /* x[1] = x[2]; */ - "movq 48(%0), %%mm4\n\t" - "movq %%mm3, 16(%0)\n\t" /* x[2] = x[4]; */ - "movq %%mm4, 24(%0)\n\t" /* x[3] = x[6]; */ - :"=r"(x) - :"0"(x) - :"memory"); - - fft_4_3dnow(&x[0]); - - /* x[0] x[4] x[2] x[6] */ - - __asm__ volatile( - "movq 40(%1), %%mm0\n\t" - "movq %%mm0, %%mm3\n\t" - "movq 56(%1), %%mm1\n\t" - "pfadd %%mm1, %%mm0\n\t" - "pfsub %%mm1, %%mm3\n\t" - "movq (%2), %%mm2\n\t" - "pfadd %%mm2, %%mm0\n\t" - "pfadd %%mm2, %%mm3\n\t" - "movq (%3), %%mm1\n\t" - "pfadd %%mm1, %%mm0\n\t" - "pfsub %%mm1, %%mm3\n\t" - "movq (%1), %%mm1\n\t" - "movq 16(%1), %%mm4\n\t" - "movq %%mm1, %%mm2\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd %%mm3, %%mm3\n\t" -#else - "punpckldq %%mm3, %%mm6\n\t" - "punpckhdq %%mm6, %%mm3\n\t" -#endif - "pfadd %%mm0, %%mm1\n\t" - "movq %%mm4, 
%%mm5\n\t" - "pfsub %%mm0, %%mm2\n\t" - "pfadd %%mm3, %%mm4\n\t" - "movq %%mm1, (%0)\n\t" - "pfsub %%mm3, %%mm5\n\t" - "movq %%mm2, 32(%0)\n\t" - "movd %%mm4, 16(%0)\n\t" - "movd %%mm5, 48(%0)\n\t" - "psrlq $32, %%mm4\n\t" - "psrlq $32, %%mm5\n\t" - "movd %%mm4, 52(%0)\n\t" - "movd %%mm5, 20(%0)" - :"=r"(x) - :"0"(x), "r"(&wT1), "r"(&wB1) - :"memory"); - - /* x[1] x[5] */ - __asm__ volatile ( - "movq %6, %%mm6\n\t" - "movq %5, %%mm7\n\t" - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 56(%3), %%mm3\n\t" - "pfsub 40(%3), %%mm0\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd %%mm1, %%mm1\n\t" -#else - "punpckldq %%mm1, %%mm2\n\t" - "punpckhdq %%mm2, %%mm1\n\t" -#endif - "pxor %%mm7, %%mm1\n\t" - "pfadd %%mm1, %%mm0\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd %%mm3, %%mm3\n\t" -#else - "punpckldq %%mm3, %%mm2\n\t" - "punpckhdq %%mm2, %%mm3\n\t" -#endif - "pxor %%mm6, %%mm3\n\t" - "pfadd %%mm3, %%mm0\n\t" - "movq %%mm0, %%mm1\n\t" - "pxor %%mm6, %%mm1\n\t" - "pfacc %%mm1, %%mm0\n\t" - "pfmul %4, %%mm0\n\t" - - "movq 40(%3), %%mm5\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd %%mm5, %%mm5\n\t" -#else - "punpckldq %%mm5, %%mm1\n\t" - "punpckhdq %%mm1, %%mm5\n\t" -#endif - "movq %%mm5, %0\n\t" - - "movq 8(%3), %%mm1\n\t" - "movq %%mm1, %%mm2\n\t" - "pfsub %%mm0, %%mm1\n\t" - "pfadd %%mm0, %%mm2\n\t" - "movq %%mm1, 40(%3)\n\t" - "movq %%mm2, 8(%3)\n\t" - :"=m"(wB2) - :"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW), - "m"(x_plus_minus_3dnow), "m"(x_minus_plus_3dnow) - :"memory"); - - - /* x[3] x[7] */ - __asm__ volatile( - "movq %1, %%mm0\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd %3, %%mm1\n\t" -#else - "movq %3, %%mm1\n\t" - "punpckldq %%mm1, %%mm2\n\t" - "punpckhdq %%mm2, %%mm1\n\t" -#endif - "pxor %%mm6, %%mm1\n\t" - "pfadd %%mm1, %%mm0\n\t" - "movq %2, %%mm2\n\t" - "movq 56(%4), %%mm3\n\t" - "pxor %%mm7, %%mm3\n\t" - "pfadd %%mm3, %%mm2\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd %%mm2, %%mm2\n\t" -#else - "punpckldq %%mm2, %%mm5\n\t" - "punpckhdq %%mm5, %%mm2\n\t" -#endif - "movq 24(%4), %%mm3\n\t" - "pfsub %%mm2, %%mm0\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm0, %%mm1\n\t" - "pxor %%mm6, %%mm0\n\t" - "pfacc %%mm1, %%mm0\n\t" - "pfmul %5, %%mm0\n\t" - "movq %%mm0, %%mm1\n\t" - "pxor %%mm6, %%mm1\n\t" - "pxor %%mm7, %%mm0\n\t" - "pfadd %%mm1, %%mm3\n\t" - "pfadd %%mm0, %%mm4\n\t" - "movq %%mm4, 24(%0)\n\t" - "movq %%mm3, 56(%0)\n\t" - :"=r"(x) - :"m"(wT1), "m"(wB2), "m"(wB1), "0"(x), "m"(HSQRT2_3DNOW) - :"memory"); -} - -static void FFT_ASMB_3DNOW(int k, complex_t *x, complex_t *wTB, - const complex_t *d, const complex_t *d_3) -{ - register complex_t *x2k, *x3k, *x4k, *wB; - - TRANS_FILL_MM6_MM7_3DNOW(); - x2k = x + 2 * k; - x3k = x2k + 2 * k; - x4k = x3k + 2 * k; - wB = wTB + 2 * k; - - TRANSZERO_3DNOW(x[0],x2k[0],x3k[0],x4k[0]); - TRANS_3DNOW(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); - - --k; - for(;;) { - TRANS_3DNOW(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); - TRANS_3DNOW(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); - if (!--k) break; - x += 2; - x2k += 2; - x3k += 2; - x4k += 2; - d += 2; - d_3 += 2; - wTB += 2; - wB += 2; - } - -} - -void FFT_ASMB16_3DNOW(complex_t *x, complex_t *wTB) -{ - int k = 2; - - TRANS_FILL_MM6_MM7_3DNOW(); - /* transform x[0], x[8], x[4], x[12] */ - TRANSZERO_3DNOW(x[0],x[4],x[8],x[12]); - - /* transform x[1], x[9], x[5], x[13] */ - TRANS_3DNOW(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); - - /* transform x[2], x[10], x[6], x[14] */ - TRANSHALF_16_3DNOW(x[2],x[6],x[10],x[14]); - - /* transform x[3], x[11], x[7], x[15] */ - 
TRANS_3DNOW(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); - -} - -static void FFT_128P_3DNOW(complex_t *a) -{ - FFT_8_3DNOW(&a[0]); FFT_4_3DNOW(&a[8]); FFT_4_3DNOW(&a[12]); - FFT_ASMB16_3DNOW(&a[0], &a[8]); - - FFT_8_3DNOW(&a[16]), FFT_8_3DNOW(&a[24]); - FFT_ASMB_3DNOW(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); - - FFT_8_3DNOW(&a[32]); FFT_4_3DNOW(&a[40]); FFT_4_3DNOW(&a[44]); - FFT_ASMB16_3DNOW(&a[32], &a[40]); - - FFT_8_3DNOW(&a[48]); FFT_4_3DNOW(&a[56]); FFT_4_3DNOW(&a[60]); - FFT_ASMB16_3DNOW(&a[48], &a[56]); - - FFT_ASMB_3DNOW(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); - - FFT_8_3DNOW(&a[64]); FFT_4_3DNOW(&a[72]); FFT_4_3DNOW(&a[76]); - /* FFT_16(&a[64]); */ - FFT_ASMB16_3DNOW(&a[64], &a[72]); - - FFT_8_3DNOW(&a[80]); FFT_8_3DNOW(&a[88]); - - /* FFT_32(&a[64]); */ - FFT_ASMB_3DNOW(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); - - FFT_8_3DNOW(&a[96]); FFT_4_3DNOW(&a[104]), FFT_4_3DNOW(&a[108]); - /* FFT_16(&a[96]); */ - FFT_ASMB16_3DNOW(&a[96], &a[104]); - - FFT_8_3DNOW(&a[112]), FFT_8_3DNOW(&a[120]); - /* FFT_32(&a[96]); */ - FFT_ASMB_3DNOW(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); - - /* FFT_128(&a[0]); */ - FFT_ASMB_3DNOW(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); -} - -static void -#if HAVE_AMD3DNOWEXT -imdct_do_512_3dnowex -#else -imdct_do_512_3dnow -#endif -(sample_t data[],sample_t delay[], sample_t bias) -{ - int i; -/* int k; - int p,q; - int m; - int two_m; - int two_m_plus_one; - - sample_t tmp_a_i; - sample_t tmp_a_r; - sample_t tmp_b_i; - sample_t tmp_b_r;*/ - - sample_t *data_ptr; - sample_t *delay_ptr; - sample_t *window_ptr; - - /* 512 IMDCT with source and dest data in 'data' */ - - /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ -#if 1 - __asm__ volatile ( - "movq %0, %%mm7\n\t" - ::"m"(x_plus_minus_3dnow) - :"memory"); - for( i=0; i < 128; i++) { - int j = pm128[i]; - __asm__ volatile ( - "movd %1, %%mm0\n\t" - "movd %3, %%mm1\n\t" - "punpckldq %2, %%mm0\n\t" /* mm0 = data[256-2*j-1] | data[2*j]*/ - "punpckldq %4, %%mm1\n\t" /* mm1 = xcos[j] | xsin[j] */ - "movq %%mm0, %%mm2\n\t" - "pfmul %%mm1, %%mm0\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd %%mm1, %%mm1\n\t" -#else - "punpckldq %%mm1, %%mm5\n\t" - "punpckhdq %%mm5, %%mm1\n\t" -#endif - "pfmul %%mm1, %%mm2\n\t" -#if HAVE_AMD3DNOWEXT - "pfpnacc %%mm2, %%mm0\n\t" -#else - "pxor %%mm7, %%mm0\n\t" - "pfacc %%mm2, %%mm0\n\t" -#endif - "pxor %%mm7, %%mm0\n\t" - "movq %%mm0, %0" - :"=m"(buf[i]) - :"m"(data[256-2*j-1]), "m"(data[2*j]), "m"(xcos1[j]), "m"(xsin1[j]) - :"memory" - ); -/* buf[i].re = (data[256-2*j-1] * xcos1[j] - data[2*j] * xsin1[j]); - buf[i].im = (data[256-2*j-1] * xsin1[j] + data[2*j] * xcos1[j])*(-1.0);*/ - } -#else - __asm__ volatile ("femms":::"memory"); - for( i=0; i < 128; i++) { - /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ - int j= pm128[i]; - buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); - buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); - } -#endif - - /* FFT Merge */ -/* unoptimized variant - for (m=1; m < 7; m++) { - if(m) - two_m = (1 << m); - else - two_m = 1; - - two_m_plus_one = (1 << (m+1)); - - for(i = 0; i < 128; i += two_m_plus_one) { - for(k = 0; k < two_m; k++) { - p = k + i; - q = p + two_m; - tmp_a_r = buf[p].real; - tmp_a_i = buf[p].imag; - tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; - tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; - buf[p].real = tmp_a_r + tmp_b_r; - buf[p].imag = tmp_a_i + tmp_b_i; - 
buf[q].real = tmp_a_r - tmp_b_r; - buf[q].imag = tmp_a_i - tmp_b_i; - } - } - } -*/ - - FFT_128P_3DNOW (&buf[0]); -// __asm__ volatile ("femms \n\t":::"memory"); - - /* Post IFFT complex multiply plus IFFT complex conjugate*/ -#if 1 - __asm__ volatile ( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(x_plus_minus_3dnow), - "m"(x_minus_plus_3dnow) - :"eax","memory"); - for (i=0; i < 128; i++) { - __asm__ volatile ( - "movq %1, %%mm0\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ - "movq %%mm0, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ -#if !HAVE_AMD3DNOWEXT - "punpckldq %%mm1, %%mm2\n\t" - "punpckhdq %%mm2, %%mm1\n\t" -#else - "pswapd %%mm1, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ -#endif - "movd %3, %%mm3\n\t" /* ac3_xsin[i] */ - "punpckldq %2, %%mm3\n\t" /* ac3_xsin[i] | ac3_xcos[i] */ - "pfmul %%mm3, %%mm0\n\t" - "pfmul %%mm3, %%mm1\n\t" -#if !HAVE_AMD3DNOWEXT - "pxor %%mm7, %%mm0\n\t" - "pfacc %%mm1, %%mm0\n\t" - "punpckldq %%mm0, %%mm1\n\t" - "punpckhdq %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" -#else - "pfpnacc %%mm1, %%mm0\n\t" /* mm0 = mm0[0] - mm0[1] | mm1[0] + mm1[1] */ - "pswapd %%mm0, %%mm0\n\t" - "movq %%mm0, %0" -#endif - :"=m"(buf[i]) - :"m"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i]) - :"memory"); -/* ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i]) + (tmp_a_i * ac3_xsin1[i]); - ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i]) - (tmp_a_i * ac3_xcos1[i]);*/ - } -#else - __asm__ volatile ("femms":::"memory"); - for( i=0; i < 128; i++) { - /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ - tmp_a_r = buf[i].real; - tmp_a_i = -1.0 * buf[i].imag; - buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]); - buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]); - } -#endif - - data_ptr = data; - delay_ptr = delay; - window_ptr = a52_imdct_window; - - /* Window and convert to real valued signal */ -#if 1 - __asm__ volatile ( - "movd (%0), %%mm3 \n\t" - "punpckldq %%mm3, %%mm3 \n\t" - :: "r" (&bias) - ); - for (i=0; i< 64; i++) { -/* merge two loops in one to enable working of 2 decoders */ - __asm__ volatile ( - "movd 516(%1), %%mm0\n\t" - "movd (%1), %%mm1\n\t" /**data_ptr++=-buf[64+i].im**window_ptr+++*delay_ptr++;*/ - "punpckldq (%2), %%mm0\n\t"/*data_ptr[128]=-buf[i].re*window_ptr[128]+delay_ptr[128];*/ - "punpckldq 516(%2), %%mm1\n\t" - "pfmul (%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/ - "pfmul 512(%3), %%mm1\n\t" - "pxor %%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/ - "pxor %%mm6, %%mm1\n\t" - "pfadd (%4), %%mm0\n\t" - "pfadd 512(%4), %%mm1\n\t" - "pfadd %%mm3, %%mm0\n\t" - "pfadd %%mm3, %%mm1\n\t" - "movq %%mm0, (%0)\n\t" - "movq %%mm1, 512(%0)" - :"=r"(data_ptr) - :"r"(&buf[i].real), "r"(&buf[64-i-1].real), "r"(window_ptr), "r"(delay_ptr), "0"(data_ptr) - :"memory"); - data_ptr += 2; - window_ptr += 2; - delay_ptr += 2; - } - window_ptr += 128; -#else - __asm__ volatile ("femms":::"memory"); - for(i=0; i< 64; i++) { - *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; - *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; - } - - for(i=0; i< 64; i++) { - *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; - *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; - } -#endif - - /* The trailing edge of the window goes into the delay line */ - delay_ptr = delay; -#if 1 - for(i=0; i< 64; i++) { -/* merge two loops in one to enable working of 2 decoders */ - window_ptr -=2; - __asm__ volatile( - "movd 508(%1), %%mm0\n\t" - "movd (%1), %%mm1\n\t" - "punpckldq 
(%2), %%mm0\n\t" - "punpckldq 508(%2), %%mm1\n\t" -#if HAVE_AMD3DNOWEXT - "pswapd (%3), %%mm3\n\t" - "pswapd -512(%3), %%mm4\n\t" -#else - "movq (%3), %%mm3\n\t" - "punpckldq %%mm3, %%mm2\n\t" - "punpckhdq %%mm2, %%mm3\n\t" - "movq -512(%3), %%mm4\n\t" - "punpckldq %%mm4, %%mm2\n\t" - "punpckhdq %%mm2, %%mm4\n\t" -#endif - "pfmul %%mm3, %%mm0\n\t" - "pfmul %%mm4, %%mm1\n\t" - "pxor %%mm6, %%mm0\n\t" - "pxor %%mm7, %%mm1\n\t" - "movq %%mm0, (%0)\n\t" - "movq %%mm1, 512(%0)" - :"=r"(delay_ptr) - :"r"(&buf[i].imag), "r"(&buf[64-i-1].imag), "r"(window_ptr), "0"(delay_ptr) - :"memory"); - delay_ptr += 2; - } - __asm__ volatile ("femms":::"memory"); -#else - __asm__ volatile ("femms":::"memory"); - for(i=0; i< 64; i++) { - *delay_ptr++ = -buf[64+i].real * *--window_ptr; - *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; - } - - for(i=0; i<64; i++) { - *delay_ptr++ = buf[i].imag * *--window_ptr; - *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; - } -#endif -} diff -r 459227551819 -r 1aece15222b5 liba52/liba52.txt --- a/liba52/liba52.txt Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,208 +0,0 @@ -Using the liba52 API --------------------- - -liba52 provides a low-level interface to decoding audio frames encoded -using ATSC standard A/52 aka AC-3. liba52 provides downmixing and -dynamic range compression for the following output configurations: - -A52_CHANNEL : Dual mono. Two independant mono channels. -A52_CHANNEL1 : First of the two mono channels above. -A52_CHANNEL2 : Second of the two mono channels above. -A52_MONO : Mono. -A52_STEREO : Stereo. -A52_DOLBY : Dolby surround compatible stereo. -A52_3F : 3 front channels (left, center, right) -A52_2F1R : 2 front, 1 rear surround channel (L, R, S) -A52_3F1R : 3 front, 1 rear surround channel (L, C, R, S) -A52_2F2R : 2 front, 2 rear surround channels (L, R, LS, RS) -A52_3F2R : 3 front, 2 rear surround channels (L, C, R, LS, RS) - -A52_LFE : Low frequency effects channel. Normally used to connect a - subwoofer. Can be combined with any of the above channels. - For example: A52_3F2R | A52_LFE -> 3 front, 2 rear, 1 LFE (5.1) - - -Initialization --------------- - -sample_t * a52_init (uint32_t mm_accel); - -Initializes the A/52 library. Takes as a parameter the acceptable -optimizations which may be used, such as MMX. These are found in the -included header file 'mm_accel', along with an autodetection function -(mm_accel()). Currently, the only accelleration implemented is -MM_ACCEL_MLIB, which uses the 'mlib' library if installed. mlib is -only available on some Sun Microsystems platforms. - -The return value is a pointer to a properly-aligned sample buffer used -for output samples. - - -Probing the bitstream ---------------------- - -int a52_syncinfo (uint8_t * buf, int * flags, - int * sample_rate, int * bit_rate); - -The A/52 bitstream is composed of several a52 frames concatenated one -after each other. An a52 frame is the smallest independantly decodable -unit in the stream. - -buf must contain at least 7 bytes from the input stream. If these look -like the start of a valid a52 frame, a52_syncinfo() returns the size -of the coded frame in bytes, and fills flags, sample_rate and bit_rate -with the information encoded in the stream. The returned size is -guaranteed to be an even number between 128 and 3840. 
sample_rate will -be the sampling frequency in Hz, bit_rate is for the compressed stream -and is in bits per second, and flags is a description of the coded -channels: the A52_LFE bit is set if there is an LFE channel coded in -this stream, and by masking flags with A52_CHANNEL_MASK you will get a -value that describes the full-bandwidth channels, as one of the -A52_CHANNEL...A52_3F2R flags. - -If this can not possibly be a valid frame, then the function returns -0. You should then try to re-synchronize with the a52 stream - one way -to try this would be to advance buf by one byte until its contents -looks like a valid frame, but there might be better -application-specific ways to synchronize. - -It is recommended to call this function for each frame, for several -reasons: this function detects errors that the other functions will -not double-check, consecutive frames might have different lengths, and -it helps you re-sync with the stream if you get de-synchronized. - - -Starting to decode a frame --------------------------- - -int a52_frame (a52_state_t * state, uint8_t * buf, int * flags, - sample_t * level, sample_t bias); - -This starts the work of decoding the A/52 frame (to be completed using -a52_block()). buf should point to the beginning of the complete frame -of the full size returned by a52_syncinfo(). - -You should pass in the flags the speaker configuration that you -support, and liba52 will return the speaker configuration it will use -for its output, based on what is coded in the stream and what you -asked for. For example, if the stream contains 2+2 channels -(a52_syncinfo() returned A52_2F2R in the flags), and you have 3+1 -speakers (you passed A52_3F1R), then liba52 will choose do downmix to -2+1 speakers, since there is no center channel to send to your center -speaker. So in that case the left and right channels will be -essentially unmodified by the downmix, and the two surround channels -will be added together and sent to your surround speaker. liba52 will -return A52_2F1R to indicate this. - -The good news is that when you downmix to stereo you dont have to -worry about this, you will ALWAYS get a stereo output no matter what -was coded in the stream. For more complex output configurations you -will have to handle the case where liba52 couldnt give you what you -wanted because some of the channels were not encoded in the stream -though. - -Level, bias, and A52_ADJUST_LEVEL: - -Before downmixing, samples are floating point values with a range of -[-1,1]. Most types of downmixing will combine channels together, which -will potentially result in a larger range for the output -samples. liba52 provides two methods of controlling the range of the -output, either before or after the downmix stage. - -If you do not set A52_ADJUST_LEVEL, liba52 will multiply the samples -by your level value, so that they fit in the [-level,level] -range. Then it will apply the standardized downmix equations, -potentially making the samples go out of that interval again. The -level parameter is not modified. - -Setting the A52_ADJUST_LEVEL flag will instruct liba52 to treat your -level value as the intended range interval after downmixing. It will -then figure out what level to use before the downmix (what you should -have passed if you hadnt used the A52_ADJUST_LEVEL flag), and -overwrite the level value you gave it with that new level value. 
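As a minimal sketch of the probing and frame-setup sequence described above (assuming the prototypes quoted in this file, an external liba52 installed as <a52dec/a52.h>, and an already-initialised a52_state_t *state; buffer refilling and error reporting are omitted):

#include <inttypes.h>    /* liba52's a52.h expects the fixed-width types to be defined first */
#include <a52dec/a52.h>  /* assumed header location of an external liba52 install */

/* Probe buf (at least 7 bytes available) and, if it starts a valid A/52
   frame, hand the complete frame to liba52, requesting a stereo downmix.
   Returns the coded frame length in bytes, or 0 on no sync / corrupt frame. */
static int setup_frame(a52_state_t *state, uint8_t *buf, sample_t bias)
{
    int flags, sample_rate, bit_rate;
    sample_t level = 1.0;
    int length = a52_syncinfo(buf, &flags, &sample_rate, &bit_rate);

    if (!length)
        return 0;            /* no sync point here; advance buf by one byte and retry */

    /* Ask for a stereo downmix; A52_ADJUST_LEVEL makes `level` describe the
       range wanted *after* downmixing, and liba52 rewrites it accordingly. */
    flags = A52_STEREO | A52_ADJUST_LEVEL;
    if (a52_frame(state, buf, &flags, &level, bias))
        return 0;            /* corrupt frame */

    /* flags now holds the output configuration liba52 actually chose;
       the six a52_block() calls for this frame come next. */
    return length;
}

The caller is expected to have buffered the full frame length returned by a52_syncinfo() before calling a52_frame(), and to advance by one byte and retry whenever a52_syncinfo() returns 0, as recommended earlier.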
- -The bias represents a value which should be added to the result -regardless: - -output_sample = (input_sample * level) + bias; - -For example, a bias of 384 and a level of 1 tells liba52 you want -samples between 383 and 385 instead of -1 and 1. This is what the -sample program a52dec does, as it makes it faster to convert the -samples to integer format, using a trick based on the IEEE -floating-point format. - -This function also initialises the state for that frame, which will be -reused next when decoding blocks. - - -Dynamic range compression -------------------------- - -void a52_dynrng (a52_state_t * state, - sample_t (* call) (sample_t, void *), void * data); - -This function is purely optional. If you dont call it, liba52 will -provide the default behaviour, which is to apply the full dynamic -range compression as specified in the A/52 stream. This basically -makes the loud sounds softer, and the soft sounds louder, so you can -more easily listen to the stream in a noisy environment without -disturbing anyone. - -If you do call this function and set a NULL callback, this will -totally disable the dynamic range compression and provide a playback -more adapted to a movie theater or a listening room. - -If you call this function and specify a callback function, this -callback might be called up to once for each block, with two -arguments: the compression factor 'c' recommended by the bitstream, -and the private data pointer you specified in a52_dynrng(). The -callback will then return the amount of compression to actually use - -typically pow(c,x) where x is somewhere between 0 and 1. More -elaborate compression functions might want to use a different value -for 'x' depending wether c>1 or c<1 - or even something more complex -if this is what you want. - - -Decoding blocks ---------------- - -int a52_block (a52_state_t * state, sample_t * samples); - -Every A/52 frame is composed of 6 blocks, each with an output of 256 -samples for each channel. The a52_block() function decodes the next -block in the frame, and should be called 6 times to decode all of the -audio in the frame. After each call, you should extract the audio data -from the sample buffer. - -The sample pointer given should be the one a52_init() returned. - -After this function returns, the samples buuffer will contain 256 -samples for the first channel, followed by 256 samples for the second -channel, etc... the channel order is LFE, left, center, right, left -surround, right surround. If one of the channels is not present in the -liba52 output, as indicated by the flags returned by a52_frame(), then -this channel is skipped and the following channels are shifted so -liba52 does not leave an empty space between channels. - - -Pseudocode example ------------------- - -sample_t * samples = a52_init (mm_accel()); - -loop on input bytes: - if at least 7 bytes in the buffer: - - bytes_to_get = a52_syncinfo (...) - - if bytes_to_get == 0: - goto loop to keep looking for sync point - else - get rest of bytes - - a52_frame (state, buf, ...) - [a52_dynrng (state, ...); this is only optional] - for i = 1 ... 
6: - a52_block (state, samples) - convert samples to integer and queue to soundcard diff -r 459227551819 -r 1aece15222b5 liba52/liba52_changes.diff --- a/liba52/liba52_changes.diff Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2473 +0,0 @@ ---- include/a52.h 2006-06-12 15:04:57.000000000 +0200 -+++ liba52/a52.h 2006-06-05 02:23:02.000000000 +0200 -@@ -59,4 +66,9 @@ - int a52_block (a52_state_t * state); - void a52_free (a52_state_t * state); - -+void* a52_resample_init(uint32_t mm_accel,int flags,int chans); -+extern int (* a52_resample) (float * _f, int16_t * s16); -+ -+uint16_t crc16_block(uint8_t *data,uint32_t num_bytes); -+ - #endif /* A52_H */ ---- liba52/a52_internal.h 2006-06-12 15:05:07.000000000 +0200 -+++ liba52/a52_internal.h 2006-06-05 02:23:02.000000000 +0200 -@@ -103,18 +107,34 @@ - #define DELTA_BIT_NONE (2) - #define DELTA_BIT_RESERVED (3) - -+#if ARCH_X86_64 -+# define REG_a "rax" -+# define REG_d "rdx" -+# define REG_S "rsi" -+# define REG_D "rdi" -+# define REG_BP "rbp" -+#else -+# define REG_a "eax" -+# define REG_d "edx" -+# define REG_S "esi" -+# define REG_D "edi" -+# define REG_BP "ebp" -+#endif -+ - void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart, - int start, int end, int fastleak, int slowleak, - expbap_t * expbap); - - int a52_downmix_init (int input, int flags, sample_t * level, - sample_t clev, sample_t slev); -+void downmix_accel_init(uint32_t mm_accel); - int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, - sample_t clev, sample_t slev); --void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, -+extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev); --void a52_upmix (sample_t * samples, int acmod, int output); -+extern void (*a52_upmix) (sample_t * samples, int acmod, int output); - - void a52_imdct_init (uint32_t mm_accel); - void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias); --void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias); -+extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); -+void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias); ---- liba52/bitstream.c 2006-06-12 15:05:07.000000000 +0200 -+++ liba52/bitstream.c 2006-06-05 02:23:02.000000000 +0200 -@@ -31,6 +35,10 @@ - - #define BUFFER_SIZE 4096 - -+#ifdef ALT_BITSTREAM_READER -+int indx=0; -+#endif -+ - void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf) - { - int align; -@@ -38,6 +46,9 @@ - align = (long)buf & 3; - state->buffer_start = (uint32_t *) (buf - align); - state->bits_left = 0; -+#ifdef ALT_BITSTREAM_READER -+ indx=0; -+#endif - bitstream_get (state, align * 8); - } - ---- liba52/bitstream.h 2006-06-12 15:05:07.000000000 +0200 -+++ liba52/bitstream.h 2006-06-05 02:23:02.000000000 +0200 -@@ -21,6 +25,42 @@ - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -+/* code from ffmpeg/libavcodec */ -+#if defined(__sparc__) || defined(hpux) -+/* -+ * the alt bitstream reader performs unaligned memory accesses; that doesn't work -+ * on sparc/hpux. For now, disable ALT_BITSTREAM_READER. -+ */ -+#undef ALT_BITSTREAM_READER -+#else -+// alternative (faster) bitstram reader (reades upto 3 bytes over the end of the input) -+#define ALT_BITSTREAM_READER -+ -+/* used to avoid misaligned exceptions on some archs (alpha, ...) 
*/ -+#if ARCH_X86 || HAVE_ARMV6 -+# define unaligned32(a) (*(uint32_t*)(a)) -+#else -+# ifdef __GNUC__ -+static inline uint32_t unaligned32(const void *v) { -+ struct Unaligned { -+ uint32_t i; -+ } __attribute__((packed)); -+ -+ return ((const struct Unaligned *) v)->i; -+} -+# elif defined(__DECC) -+static inline uint32_t unaligned32(const void *v) { -+ return *(const __unaligned uint32_t *) v; -+} -+# else -+static inline uint32_t unaligned32(const void *v) { -+ return *(const uint32_t *) v; -+} -+# endif -+#endif //!ARCH_X86 -+ -+#endif -+ - /* (stolen from the kernel) */ - #if HAVE_BIGENDIAN - -@@ -28,7 +68,7 @@ - - #else - --# if 0 && defined (__i386__) -+# if defined (__i386__) - - # define swab32(x) __i386_swab32(x) - static inline const uint32_t __i386_swab32(uint32_t x) -@@ -39,19 +79,34 @@ - - # else - --# define swab32(x)\ --((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | \ -- (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3])) -- -+# define swab32(x) __generic_swab32(x) -+ static inline const uint32_t __generic_swab32(uint32_t x) -+ { -+ return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | -+ (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3])); -+ } - # endif - #endif - -+#ifdef ALT_BITSTREAM_READER -+extern int indx; -+#endif -+ - void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf); - uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits); - int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits); - - static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits) - { -+#ifdef ALT_BITSTREAM_READER -+ uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); -+ -+ result<<= (indx&0x07); -+ result>>= 32 - num_bits; -+ indx+= num_bits; -+ -+ return result; -+#else - uint32_t result; - - if (num_bits < state->bits_left) { -@@ -61,10 +116,29 @@ - } - - return a52_bitstream_get_bh (state, num_bits); -+#endif -+} -+ -+static inline void bitstream_skip(a52_state_t * state, int num_bits) -+{ -+#ifdef ALT_BITSTREAM_READER -+ indx+= num_bits; -+#else -+ bitstream_get(state, num_bits); -+#endif - } - - static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits) - { -+#ifdef ALT_BITSTREAM_READER -+ int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); -+ -+ result<<= (indx&0x07); -+ result>>= 32 - num_bits; -+ indx+= num_bits; -+ -+ return result; -+#else - int32_t result; - - if (num_bits < state->bits_left) { -@@ -74,4 +148,5 @@ - } - - return a52_bitstream_get_bh_2 (state, num_bits); -+#endif - } ---- liba52/downmix.c 2006-06-12 15:17:53.000000000 +0200 -+++ liba52/downmix.c 2006-06-05 02:23:02.000000000 +0200 -@@ -19,18 +23,46 @@ - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ * -+ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) - */ - - #include "config.h" - - #include - #include - - #include "a52.h" - #include "a52_internal.h" -+#include "mm_accel.h" - - #define CONVERT(acmod,output) (((output) << 3) + (acmod)) - -+ -+void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias, -+ sample_t clev, sample_t slev)= NULL; -+void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL; -+ -+static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, -+ sample_t clev, sample_t slev); -+static void downmix_3dnow (sample_t * 
samples, int acmod, int output, sample_t bias, -+ sample_t clev, sample_t slev); -+static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, -+ sample_t clev, sample_t slev); -+static void upmix_MMX (sample_t * samples, int acmod, int output); -+static void upmix_C (sample_t * samples, int acmod, int output); -+ -+void downmix_accel_init(uint32_t mm_accel) -+{ -+ a52_upmix= upmix_C; -+ a52_downmix= downmix_C; -+#if ARCH_X86 || ARCH_X86_64 -+ if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX; -+ if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE; -+ if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow; -+#endif -+} -+ - int a52_downmix_init (int input, int flags, sample_t * level, - sample_t clev, sample_t slev) - { -@@ -447,7 +479,7 @@ - samples[i] = 0; - } - --void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, -+void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, - sample_t clev, sample_t slev) - { - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { -@@ -559,7 +591,7 @@ - break; - - case CONVERT (A52_3F2R, A52_2F1R): -- mix3to2 (samples, bias); -+ mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used) - move2to1 (samples + 768, samples + 512, bias); - break; - -@@ -583,12 +615,12 @@ - break; - - case CONVERT (A52_3F1R, A52_3F2R): -- memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t)); -+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); - break; - } - } - --void a52_upmix (sample_t * samples, int acmod, int output) -+void upmix_C (sample_t * samples, int acmod, int output) - { - switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { - -@@ -653,3 +685,1104 @@ - goto mix_31to21; - } - } -+ -+#if ARCH_X86 || ARCH_X86_64 -+static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %2, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" -+ "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps (%1, %%"REG_S"), %%xmm0 \n\t" -+ "addps 16(%1, %%"REG_S"), %%xmm1\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" -+ "addps %%xmm7, %%xmm1 \n\t" -+ "movaps %%xmm0, (%1, %%"REG_S") \n\t" -+ "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" -+ "add $32, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (src+256), "r" (dest+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix3to1_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps %%xmm7, %%xmm1 \n\t" -+ "addps %%xmm1, %%xmm0 \n\t" -+ "movaps %%xmm0, (%0, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix4to1_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" -+ "addps %%xmm1, %%xmm0 \n\t" -+ "movaps %%xmm0, (%0, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" 
(samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix5to1_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" -+ "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps %%xmm1, %%xmm0 \n\t" -+ "movaps %%xmm0, (%0, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix3to2_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" //common -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" -+ "addps %%xmm0, %%xmm1 \n\t" -+ "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %2, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" //common -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "movaps (%1, %%"REG_S"), %%xmm2 \n\t" -+ "addps %%xmm0, %%xmm1 \n\t" -+ "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm2, (%1, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (left+256), "r" (right+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix21toS_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" -+ "addps %%xmm7, %%xmm1 \n\t" -+ "addps %%xmm7, %%xmm2 \n\t" -+ "subps %%xmm0, %%xmm1 \n\t" -+ "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix31to2_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" // common -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" -+ "addps %%xmm0, %%xmm1 \n\t" -+ "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix31toS_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ 
ASMALIGN(4) -+ "1: \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" -+ "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround -+ "addps %%xmm7, %%xmm0 \n\t" // common -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" -+ "addps %%xmm0, %%xmm1 \n\t" -+ "addps %%xmm0, %%xmm2 \n\t" -+ "subps %%xmm3, %%xmm1 \n\t" -+ "addps %%xmm3, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix22toS_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" -+ "addps %%xmm7, %%xmm1 \n\t" -+ "addps %%xmm7, %%xmm2 \n\t" -+ "subps %%xmm0, %%xmm1 \n\t" -+ "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix32to2_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" // common -+ "movaps %%xmm0, %%xmm1 \n\t" // common -+ "addps (%0, %%"REG_S"), %%xmm0 \n\t" -+ "addps 2048(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" -+ "movaps %%xmm0, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix32toS_SSE (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %1, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" -+ "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" // common -+ "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t" -+ "subps %%xmm2, %%xmm1 \n\t" -+ "addps %%xmm2, %%xmm3 \n\t" -+ "addps %%xmm0, %%xmm1 \n\t" -+ "addps %%xmm0, %%xmm3 \n\t" -+ "movaps %%xmm1, (%0, %%"REG_S") \n\t" -+ "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias) -+{ -+ __asm__ volatile( -+ "movlps %2, %%xmm7 \n\t" -+ "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" -+ "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" -+ "addps 1040(%0, %%"REG_S"), %%xmm1\n\t" -+ "addps %%xmm7, %%xmm0 \n\t" -+ "addps %%xmm7, %%xmm1 \n\t" -+ "movaps %%xmm0, (%1, %%"REG_S") \n\t" -+ "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" -+ "add $32, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (src+256), "r" (dest+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void zero_MMX(sample_t * samples) -+{ -+ __asm__ volatile( -+ "mov $-1024, %%"REG_S" 
\n\t" -+ "pxor %%mm0, %%mm0 \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq %%mm0, (%0, %%"REG_S") \n\t" -+ "movq %%mm0, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm0, 16(%0, %%"REG_S") \n\t" -+ "movq %%mm0, 24(%0, %%"REG_S") \n\t" -+ "add $32, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ "emms" -+ :: "r" (samples+256) -+ : "%"REG_S -+ ); -+} -+ -+static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, -+ sample_t clev, sample_t slev) -+{ -+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { -+ -+ case CONVERT (A52_CHANNEL, A52_CHANNEL2): -+ memcpy (samples, samples + 256, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_CHANNEL, A52_MONO): -+ case CONVERT (A52_STEREO, A52_MONO): -+ mix_2to1_SSE: -+ mix2to1_SSE (samples, samples + 256, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_MONO): -+ if (slev == 0) -+ goto mix_2to1_SSE; -+ case CONVERT (A52_3F, A52_MONO): -+ mix_3to1_SSE: -+ mix3to1_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_MONO): -+ if (slev == 0) -+ goto mix_3to1_SSE; -+ case CONVERT (A52_2F2R, A52_MONO): -+ if (slev == 0) -+ goto mix_2to1_SSE; -+ mix4to1_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_MONO): -+ if (slev == 0) -+ goto mix_3to1_SSE; -+ mix5to1_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_MONO, A52_DOLBY): -+ memcpy (samples + 256, samples, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F, A52_STEREO): -+ case CONVERT (A52_3F, A52_DOLBY): -+ mix_3to2_SSE: -+ mix3to2_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_STEREO): -+ if (slev == 0) -+ break; -+ mix21to2_SSE (samples, samples + 256, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_DOLBY): -+ mix21toS_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_STEREO): -+ if (slev == 0) -+ goto mix_3to2_SSE; -+ mix31to2_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_DOLBY): -+ mix31toS_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_2F2R, A52_STEREO): -+ if (slev == 0) -+ break; -+ mix2to1_SSE (samples, samples + 512, bias); -+ mix2to1_SSE (samples + 256, samples + 768, bias); -+ break; -+ -+ case CONVERT (A52_2F2R, A52_DOLBY): -+ mix22toS_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_STEREO): -+ if (slev == 0) -+ goto mix_3to2_SSE; -+ mix32to2_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_DOLBY): -+ mix32toS_SSE (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_3F): -+ if (slev == 0) -+ break; -+ mix21to2_SSE (samples, samples + 512, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_3F): -+ if (slev == 0) -+ break; -+ mix2to1_SSE (samples, samples + 768, bias); -+ mix2to1_SSE (samples + 512, samples + 1024, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_2F1R): -+ mix3to2_SSE (samples, bias); -+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_2F2R, A52_2F1R): -+ mix2to1_SSE (samples + 512, samples + 768, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_2F1R): -+ mix3to2_SSE (samples, bias); //FIXME possible bug? 
(output doesnt seem to be used) -+ move2to1_SSE (samples + 768, samples + 512, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_3F1R): -+ mix2to1_SSE (samples + 768, samples + 1024, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_2F2R): -+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_2F2R): -+ mix3to2_SSE (samples, bias); -+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_2F2R): -+ mix3to2_SSE (samples, bias); -+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); -+ memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_3F2R): -+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); -+ break; -+ } -+} -+ -+static void upmix_MMX (sample_t * samples, int acmod, int output) -+{ -+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { -+ -+ case CONVERT (A52_CHANNEL, A52_CHANNEL2): -+ memcpy (samples + 256, samples, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_MONO): -+ zero_MMX (samples + 1024); -+ case CONVERT (A52_3F1R, A52_MONO): -+ case CONVERT (A52_2F2R, A52_MONO): -+ zero_MMX (samples + 768); -+ case CONVERT (A52_3F, A52_MONO): -+ case CONVERT (A52_2F1R, A52_MONO): -+ zero_MMX (samples + 512); -+ case CONVERT (A52_CHANNEL, A52_MONO): -+ case CONVERT (A52_STEREO, A52_MONO): -+ zero_MMX (samples + 256); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_STEREO): -+ case CONVERT (A52_3F2R, A52_DOLBY): -+ zero_MMX (samples + 1024); -+ case CONVERT (A52_3F1R, A52_STEREO): -+ case CONVERT (A52_3F1R, A52_DOLBY): -+ zero_MMX (samples + 768); -+ case CONVERT (A52_3F, A52_STEREO): -+ case CONVERT (A52_3F, A52_DOLBY): -+ mix_3to2_MMX: -+ memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); -+ zero_MMX (samples + 256); -+ break; -+ -+ case CONVERT (A52_2F2R, A52_STEREO): -+ case CONVERT (A52_2F2R, A52_DOLBY): -+ zero_MMX (samples + 768); -+ case CONVERT (A52_2F1R, A52_STEREO): -+ case CONVERT (A52_2F1R, A52_DOLBY): -+ zero_MMX (samples + 512); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_3F): -+ zero_MMX (samples + 1024); -+ case CONVERT (A52_3F1R, A52_3F): -+ case CONVERT (A52_2F2R, A52_2F1R): -+ zero_MMX (samples + 768); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_3F1R): -+ zero_MMX (samples + 1024); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_2F1R): -+ zero_MMX (samples + 1024); -+ case CONVERT (A52_3F1R, A52_2F1R): -+ mix_31to21_MMX: -+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); -+ goto mix_3to2_MMX; -+ -+ case CONVERT (A52_3F2R, A52_2F2R): -+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); -+ goto mix_31to21_MMX; -+ } -+} -+ -+static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %2, %%mm7 \n\t" -+ "punpckldq %2, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq (%0, %%"REG_S"), %%mm0 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" -+ "movq 16(%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 24(%0, %%"REG_S"), %%mm3 \n\t" -+ "pfadd (%1, %%"REG_S"), %%mm0 \n\t" -+ "pfadd 8(%1, %%"REG_S"), %%mm1 \n\t" -+ "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t" -+ "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t" -+ "pfadd %%mm7, %%mm0 \n\t" -+ "pfadd %%mm7, %%mm1 \n\t" -+ "pfadd %%mm7, %%mm2 \n\t" -+ "pfadd %%mm7, %%mm3 \n\t" -+ "movq %%mm0, (%1, %%"REG_S") \n\t" -+ "movq %%mm1, 8(%1, %%"REG_S") \n\t" -+ "movq %%mm2, 16(%1, %%"REG_S") \n\t" -+ "movq %%mm3, 24(%1, %%"REG_S") \n\t" -+ "add 
$32, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (src+256), "r" (dest+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix3to1_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq (%0, %%"REG_S"), %%mm0 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm2\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" -+ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" -+ "pfadd %%mm7, %%mm1 \n\t" -+ "pfadd %%mm2, %%mm0 \n\t" -+ "pfadd %%mm3, %%mm1 \n\t" -+ "movq %%mm0, (%0, %%"REG_S") \n\t" -+ "movq %%mm1, 8(%0, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix4to1_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq (%0, %%"REG_S"), %%mm0 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm2\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" -+ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" -+ "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" -+ "pfadd %%mm7, %%mm1 \n\t" -+ "pfadd %%mm2, %%mm0 \n\t" -+ "pfadd %%mm3, %%mm1 \n\t" -+ "movq %%mm0, (%0, %%"REG_S") \n\t" -+ "movq %%mm1, 8(%0, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix5to1_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq (%0, %%"REG_S"), %%mm0 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm2\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" -+ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" -+ "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" -+ "pfadd %%mm7, %%mm1 \n\t" -+ "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" -+ "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd %%mm2, %%mm0 \n\t" -+ "pfadd %%mm3, %%mm1 \n\t" -+ "movq %%mm0, (%0, %%"REG_S") \n\t" -+ "movq %%mm1, 8(%0, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix3to2_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm0\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" //common -+ "pfadd %%mm7, %%mm1 \n\t" //common -+ "movq (%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" -+ "movq 2048(%0, %%"REG_S"), %%mm4\n\t" -+ "movq 2056(%0, %%"REG_S"), %%mm5\n\t" -+ "pfadd %%mm0, %%mm2 \n\t" -+ "pfadd %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm4 \n\t" -+ "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%"REG_S") \n\t" -+ "movq %%mm3, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm4, 1024(%0, %%"REG_S")\n\t" -+ "movq %%mm5, 1032(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix21to2_3dnow 
(sample_t * left, sample_t * right, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %2, %%mm7 \n\t" -+ "punpckldq %2, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq 1024(%1, %%"REG_S"), %%mm0\n\t" -+ "movq 1032(%1, %%"REG_S"), %%mm1\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" //common -+ "pfadd %%mm7, %%mm1 \n\t" //common -+ "movq (%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" -+ "movq (%1, %%"REG_S"), %%mm4 \n\t" -+ "movq 8(%1, %%"REG_S"), %%mm5 \n\t" -+ "pfadd %%mm0, %%mm2 \n\t" -+ "pfadd %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm4 \n\t" -+ "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%"REG_S") \n\t" -+ "movq %%mm3, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm4, (%1, %%"REG_S") \n\t" -+ "movq %%mm5, 8(%1, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (left+256), "r" (right+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix21toS_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround -+ "movq 2056(%0, %%"REG_S"), %%mm1\n\t" // surround -+ "movq (%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm4\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm5\n\t" -+ "pfadd %%mm7, %%mm2 \n\t" -+ "pfadd %%mm7, %%mm3 \n\t" -+ "pfadd %%mm7, %%mm4 \n\t" -+ "pfadd %%mm7, %%mm5 \n\t" -+ "pfsub %%mm0, %%mm2 \n\t" -+ "pfsub %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm4 \n\t" -+ "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%"REG_S") \n\t" -+ "movq %%mm3, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm4, 1024(%0, %%"REG_S")\n\t" -+ "movq %%mm5, 1032(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix31to2_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm0\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" -+ "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" // common -+ "pfadd %%mm7, %%mm1 \n\t" // common -+ "movq (%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" -+ "movq 2048(%0, %%"REG_S"), %%mm4\n\t" -+ "movq 2056(%0, %%"REG_S"), %%mm5\n\t" -+ "pfadd %%mm0, %%mm2 \n\t" -+ "pfadd %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm4 \n\t" -+ "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%"REG_S") \n\t" -+ "movq %%mm3, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm4, 1024(%0, %%"REG_S")\n\t" -+ "movq %%mm5, 1032(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix31toS_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm0\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" // common -+ "pfadd %%mm7, %%mm1 \n\t" // common -+ "movq (%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" -+ "movq 2048(%0, %%"REG_S"), %%mm4\n\t" -+ "movq 2056(%0, %%"REG_S"), %%mm5\n\t" -+ "pfadd %%mm0, %%mm2 \n\t" -+ "pfadd %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm4 \n\t" -+ "pfadd %%mm1, %%mm5 \n\t" -+ "movq 3072(%0, %%"REG_S"), %%mm0\n\t" // surround 
-+ "movq 3080(%0, %%"REG_S"), %%mm1\n\t" // surround -+ "pfsub %%mm0, %%mm2 \n\t" -+ "pfsub %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm4 \n\t" -+ "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%"REG_S") \n\t" -+ "movq %%mm3, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm4, 1024(%0, %%"REG_S")\n\t" -+ "movq %%mm5, 1032(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix22toS_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq 2048(%0, %%"REG_S"), %%mm0\n\t" -+ "movq 2056(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround -+ "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround -+ "movq (%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm4\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm5\n\t" -+ "pfadd %%mm7, %%mm2 \n\t" -+ "pfadd %%mm7, %%mm3 \n\t" -+ "pfadd %%mm7, %%mm4 \n\t" -+ "pfadd %%mm7, %%mm5 \n\t" -+ "pfsub %%mm0, %%mm2 \n\t" -+ "pfsub %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm4 \n\t" -+ "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%"REG_S") \n\t" -+ "movq %%mm3, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm4, 1024(%0, %%"REG_S")\n\t" -+ "movq %%mm5, 1032(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void mix32to2_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm0\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" // common -+ "pfadd %%mm7, %%mm1 \n\t" // common -+ "movq %%mm0, %%mm2 \n\t" // common -+ "movq %%mm1, %%mm3 \n\t" // common -+ "pfadd (%0, %%"REG_S"), %%mm0 \n\t" -+ "pfadd 8(%0, %%"REG_S"), %%mm1 \n\t" -+ "pfadd 2048(%0, %%"REG_S"), %%mm2\n\t" -+ "pfadd 2056(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" -+ "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" -+ "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" -+ "movq %%mm0, (%0, %%"REG_S") \n\t" -+ "movq %%mm1, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm2, 1024(%0, %%"REG_S")\n\t" -+ "movq %%mm3, 1032(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+/* todo: should be optimized better */ -+static void mix32toS_3dnow (sample_t * samples, sample_t bias) -+{ -+ __asm__ volatile( -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movd %1, %%mm7 \n\t" -+ "punpckldq %1, %%mm7 \n\t" -+ "movq 1024(%0, %%"REG_S"), %%mm0\n\t" -+ "movq 1032(%0, %%"REG_S"), %%mm1\n\t" -+ "movq 3072(%0, %%"REG_S"), %%mm4\n\t" -+ "movq 3080(%0, %%"REG_S"), %%mm5\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" // common -+ "pfadd %%mm7, %%mm1 \n\t" // common -+ "pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround -+ "pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround -+ "movq (%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" -+ "movq 2048(%0, %%"REG_S"), %%mm6\n\t" -+ "movq 2056(%0, %%"REG_S"), %%mm7\n\t" -+ "pfsub %%mm4, %%mm2 \n\t" -+ "pfsub %%mm5, %%mm3 \n\t" -+ "pfadd %%mm4, %%mm6 \n\t" -+ "pfadd %%mm5, %%mm7 \n\t" -+ "pfadd %%mm0, %%mm2 \n\t" -+ "pfadd %%mm1, %%mm3 \n\t" -+ "pfadd %%mm0, %%mm6 \n\t" -+ "pfadd %%mm1, %%mm7 \n\t" -+ "movq %%mm2, (%0, %%"REG_S") 
\n\t" -+ "movq %%mm3, 8(%0, %%"REG_S") \n\t" -+ "movq %%mm6, 1024(%0, %%"REG_S")\n\t" -+ "movq %%mm7, 1032(%0, %%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (samples+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias) -+{ -+ __asm__ volatile( -+ "movd %2, %%mm7 \n\t" -+ "punpckldq %2, %%mm7 \n\t" -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movq (%0, %%"REG_S"), %%mm0 \n\t" -+ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" -+ "movq 16(%0, %%"REG_S"), %%mm2 \n\t" -+ "movq 24(%0, %%"REG_S"), %%mm3 \n\t" -+ "pfadd 1024(%0, %%"REG_S"), %%mm0\n\t" -+ "pfadd 1032(%0, %%"REG_S"), %%mm1\n\t" -+ "pfadd 1040(%0, %%"REG_S"), %%mm2\n\t" -+ "pfadd 1048(%0, %%"REG_S"), %%mm3\n\t" -+ "pfadd %%mm7, %%mm0 \n\t" -+ "pfadd %%mm7, %%mm1 \n\t" -+ "pfadd %%mm7, %%mm2 \n\t" -+ "pfadd %%mm7, %%mm3 \n\t" -+ "movq %%mm0, (%1, %%"REG_S") \n\t" -+ "movq %%mm1, 8(%1, %%"REG_S") \n\t" -+ "movq %%mm2, 16(%1, %%"REG_S") \n\t" -+ "movq %%mm3, 24(%1, %%"REG_S") \n\t" -+ "add $32, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (src+256), "r" (dest+256), "m" (bias) -+ : "%"REG_S -+ ); -+} -+ -+static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, -+ sample_t clev, sample_t slev) -+{ -+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { -+ -+ case CONVERT (A52_CHANNEL, A52_CHANNEL2): -+ memcpy (samples, samples + 256, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_CHANNEL, A52_MONO): -+ case CONVERT (A52_STEREO, A52_MONO): -+ mix_2to1_3dnow: -+ mix2to1_3dnow (samples, samples + 256, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_MONO): -+ if (slev == 0) -+ goto mix_2to1_3dnow; -+ case CONVERT (A52_3F, A52_MONO): -+ mix_3to1_3dnow: -+ mix3to1_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_MONO): -+ if (slev == 0) -+ goto mix_3to1_3dnow; -+ case CONVERT (A52_2F2R, A52_MONO): -+ if (slev == 0) -+ goto mix_2to1_3dnow; -+ mix4to1_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_MONO): -+ if (slev == 0) -+ goto mix_3to1_3dnow; -+ mix5to1_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_MONO, A52_DOLBY): -+ memcpy (samples + 256, samples, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F, A52_STEREO): -+ case CONVERT (A52_3F, A52_DOLBY): -+ mix_3to2_3dnow: -+ mix3to2_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_STEREO): -+ if (slev == 0) -+ break; -+ mix21to2_3dnow (samples, samples + 256, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_DOLBY): -+ mix21toS_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_STEREO): -+ if (slev == 0) -+ goto mix_3to2_3dnow; -+ mix31to2_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_DOLBY): -+ mix31toS_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_2F2R, A52_STEREO): -+ if (slev == 0) -+ break; -+ mix2to1_3dnow (samples, samples + 512, bias); -+ mix2to1_3dnow (samples + 256, samples + 768, bias); -+ break; -+ -+ case CONVERT (A52_2F2R, A52_DOLBY): -+ mix22toS_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_STEREO): -+ if (slev == 0) -+ goto mix_3to2_3dnow; -+ mix32to2_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_DOLBY): -+ mix32toS_3dnow (samples, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_3F): -+ if (slev == 0) -+ break; -+ mix21to2_3dnow (samples, samples + 512, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_3F): -+ if (slev == 0) -+ break; -+ mix2to1_3dnow (samples, 
samples + 768, bias); -+ mix2to1_3dnow (samples + 512, samples + 1024, bias); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_2F1R): -+ mix3to2_3dnow (samples, bias); -+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_2F2R, A52_2F1R): -+ mix2to1_3dnow (samples + 512, samples + 768, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_2F1R): -+ mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesnt seem to be used) -+ move2to1_3dnow (samples + 768, samples + 512, bias); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_3F1R): -+ mix2to1_3dnow (samples + 768, samples + 1024, bias); -+ break; -+ -+ case CONVERT (A52_2F1R, A52_2F2R): -+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_2F2R): -+ mix3to2_3dnow (samples, bias); -+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F2R, A52_2F2R): -+ mix3to2_3dnow (samples, bias); -+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); -+ memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); -+ break; -+ -+ case CONVERT (A52_3F1R, A52_3F2R): -+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); -+ break; -+ } -+ __asm__ volatile("femms":::"memory"); -+} -+ -+#endif // ARCH_X86 || ARCH_X86_64 ---- liba52/imdct.c 2008-02-19 00:18:33.000000000 +0100 -+++ liba52/imdct.c 2008-02-19 00:16:40.000000000 +0100 -@@ -22,6 +26,11 @@ - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ * -+ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) -+ * 3DNOW optimizations from Nick Kurshev -+ * michael did port them from libac3 (untested, perhaps totally broken) -+ * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) - */ - - #include "config.h" -@@ -39,12 +48,50 @@ - #include "a52.h" - #include "a52_internal.h" - #include "mm_accel.h" -+#include "mangle.h" -+ -+void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); -+ -+#if CONFIG_RUNTIME_CPUDETECT -+#undef HAVE_AMD3DNOWEXT -+#define HAVE_AMD3DNOWEXT 0 -+#endif - - typedef struct complex_s { - sample_t real; - sample_t imag; - } complex_t; - -+static const int pm128[128] attribute_used __attribute__((aligned(16))) = -+{ -+ 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, -+ 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, -+ 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, -+ 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, -+ 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, -+ 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, -+ 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, -+ 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 -+}; -+ -+static uint8_t attribute_used bit_reverse_512[] = { -+ 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, -+ 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, -+ 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, -+ 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c, -+ 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72, -+ 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a, -+ 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76, -+ 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e, -+ 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71, -+ 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79, -+ 
0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75, -+ 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d, -+ 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73, -+ 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b, -+ 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77, -+ 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f}; -+ - static uint8_t fftorder[] = { - 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176, - 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88, -@@ -56,6 +103,40 @@ - 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86 - }; - -+static complex_t __attribute__((aligned(16))) buf[128]; -+ -+/* Twiddle factor LUT */ -+static complex_t __attribute__((aligned(16))) w_1[1]; -+static complex_t __attribute__((aligned(16))) w_2[2]; -+static complex_t __attribute__((aligned(16))) w_4[4]; -+static complex_t __attribute__((aligned(16))) w_8[8]; -+static complex_t __attribute__((aligned(16))) w_16[16]; -+static complex_t __attribute__((aligned(16))) w_32[32]; -+static complex_t __attribute__((aligned(16))) w_64[64]; -+static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; -+ -+/* Twiddle factors for IMDCT */ -+static sample_t __attribute__((aligned(16))) xcos1[128]; -+static sample_t __attribute__((aligned(16))) xsin1[128]; -+ -+#if ARCH_X86 || ARCH_X86_64 -+// NOTE: SSE needs 16byte alignment or it will segfault -+// -+static float __attribute__((aligned(16))) sseSinCos1c[256]; -+static float __attribute__((aligned(16))) sseSinCos1d[256]; -+static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; -+//static float __attribute__((aligned(16))) sseW0[4]; -+static float __attribute__((aligned(16))) sseW1[8]; -+static float __attribute__((aligned(16))) sseW2[16]; -+static float __attribute__((aligned(16))) sseW3[32]; -+static float __attribute__((aligned(16))) sseW4[64]; -+static float __attribute__((aligned(16))) sseW5[128]; -+static float __attribute__((aligned(16))) sseW6[256]; -+static float __attribute__((aligned(16))) *sseW[7]= -+ {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; -+static float __attribute__((aligned(16))) sseWindow[512]; -+#endif -+ - /* Root values for IFFT */ - static sample_t roots16[3]; - static sample_t roots32[7]; -@@ -241,7 +322,7 @@ - ifft_pass (buf, roots128 - 32, 32); - } - --void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias) -+void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias) - { - int i, k; - sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2; -@@ -285,6 +366,704 @@ - } - } - -+#if HAVE_ALTIVEC -+ -+#ifdef HAVE_ALTIVEC_H -+#include -+#endif -+ -+// used to build registers permutation vectors (vcprm) -+// the 's' are for words in the _s_econd vector -+#define WORD_0 0x00,0x01,0x02,0x03 -+#define WORD_1 0x04,0x05,0x06,0x07 -+#define WORD_2 0x08,0x09,0x0a,0x0b -+#define WORD_3 0x0c,0x0d,0x0e,0x0f -+#define WORD_s0 0x10,0x11,0x12,0x13 -+#define WORD_s1 0x14,0x15,0x16,0x17 -+#define WORD_s2 0x18,0x19,0x1a,0x1b -+#define WORD_s3 0x1c,0x1d,0x1e,0x1f -+ -+#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} -+#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} -+ -+#define FOUROF(a) {a,a,a,a} -+ -+// vcprmle is used to keep the same index as in the SSE version. 
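
[Editor's reference sketch] The bit_reverse_512[] table above is simply the 7-bit bit-reversal permutation used to reorder the 128-point IFFT input. A small stand-alone generator (illustrative only, not liba52 code) that reproduces it:

#include <stdint.h>
#include <stdio.h>

/* Prints the 128-entry 7-bit bit-reversal table; output matches bit_reverse_512[] above. */
int main(void)
{
    for (int i = 0; i < 128; i++) {
        uint8_t r = 0;
        for (int b = 0; b < 7; b++)        /* reverse the 7 low bits of i */
            if (i & (1 << b))
                r |= 1 << (6 - b);
        printf("0x%02x,%c", r, (i % 8 == 7) ? '\n' : ' ');
    }
    return 0;
}
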
-+// it's the same as vcprm, with the index inversed -+// ('le' is Little Endian) -+#define vcprmle(a,b,c,d) vcprm(d,c,b,a) -+ -+// used to build inverse/identity vectors (vcii) -+// n is _n_egative, p is _p_ositive -+#define FLOAT_n -1. -+#define FLOAT_p 1. -+ -+ -+void -+imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias) -+{ -+ int i; -+ int k; -+ int p,q; -+ int m; -+ long two_m; -+ long two_m_plus_one; -+ -+ sample_t tmp_b_i; -+ sample_t tmp_b_r; -+ sample_t tmp_a_i; -+ sample_t tmp_a_r; -+ -+ sample_t *data_ptr; -+ sample_t *delay_ptr; -+ sample_t *window_ptr; -+ -+ /* 512 IMDCT with source and dest data in 'data' */ -+ -+ /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ -+ for( i=0; i < 128; i++) { -+ /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ -+ int j= bit_reverse_512[i]; -+ buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); -+ buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); -+ } -+ -+ /* 1. iteration */ -+ for(i = 0; i < 128; i += 2) { -+#if 0 -+ tmp_a_r = buf[i].real; -+ tmp_a_i = buf[i].imag; -+ tmp_b_r = buf[i+1].real; -+ tmp_b_i = buf[i+1].imag; -+ buf[i].real = tmp_a_r + tmp_b_r; -+ buf[i].imag = tmp_a_i + tmp_b_i; -+ buf[i+1].real = tmp_a_r - tmp_b_r; -+ buf[i+1].imag = tmp_a_i - tmp_b_i; -+#else -+ vector float temp, bufv; -+ -+ bufv = vec_ld(i << 3, (float*)buf); -+ temp = vec_perm(bufv, bufv, vcprm(2,3,0,1)); -+ bufv = vec_madd(bufv, vcii(p,p,n,n), temp); -+ vec_st(bufv, i << 3, (float*)buf); -+#endif -+ } -+ -+ /* 2. iteration */ -+ // Note w[1]={{1,0}, {0,-1}} -+ for(i = 0; i < 128; i += 4) { -+#if 0 -+ tmp_a_r = buf[i].real; -+ tmp_a_i = buf[i].imag; -+ tmp_b_r = buf[i+2].real; -+ tmp_b_i = buf[i+2].imag; -+ buf[i].real = tmp_a_r + tmp_b_r; -+ buf[i].imag = tmp_a_i + tmp_b_i; -+ buf[i+2].real = tmp_a_r - tmp_b_r; -+ buf[i+2].imag = tmp_a_i - tmp_b_i; -+ tmp_a_r = buf[i+1].real; -+ tmp_a_i = buf[i+1].imag; -+ /* WARNING: im <-> re here ! */ -+ tmp_b_r = buf[i+3].imag; -+ tmp_b_i = buf[i+3].real; -+ buf[i+1].real = tmp_a_r + tmp_b_r; -+ buf[i+1].imag = tmp_a_i - tmp_b_i; -+ buf[i+3].real = tmp_a_r - tmp_b_r; -+ buf[i+3].imag = tmp_a_i + tmp_b_i; -+#else -+ vector float buf01, buf23, temp1, temp2; -+ -+ buf01 = vec_ld((i + 0) << 3, (float*)buf); -+ buf23 = vec_ld((i + 2) << 3, (float*)buf); -+ buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2)); -+ -+ temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01); -+ temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01); -+ -+ vec_st(temp1, (i + 0) << 3, (float*)buf); -+ vec_st(temp2, (i + 2) << 3, (float*)buf); -+#endif -+ } -+ -+ /* 3. 
iteration */ -+ for(i = 0; i < 128; i += 8) { -+#if 0 -+ tmp_a_r = buf[i].real; -+ tmp_a_i = buf[i].imag; -+ tmp_b_r = buf[i+4].real; -+ tmp_b_i = buf[i+4].imag; -+ buf[i].real = tmp_a_r + tmp_b_r; -+ buf[i].imag = tmp_a_i + tmp_b_i; -+ buf[i+4].real = tmp_a_r - tmp_b_r; -+ buf[i+4].imag = tmp_a_i - tmp_b_i; -+ tmp_a_r = buf[1+i].real; -+ tmp_a_i = buf[1+i].imag; -+ tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; -+ tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; -+ buf[1+i].real = tmp_a_r + tmp_b_r; -+ buf[1+i].imag = tmp_a_i + tmp_b_i; -+ buf[i+5].real = tmp_a_r - tmp_b_r; -+ buf[i+5].imag = tmp_a_i - tmp_b_i; -+ tmp_a_r = buf[i+2].real; -+ tmp_a_i = buf[i+2].imag; -+ /* WARNING re <-> im & sign */ -+ tmp_b_r = buf[i+6].imag; -+ tmp_b_i = - buf[i+6].real; -+ buf[i+2].real = tmp_a_r + tmp_b_r; -+ buf[i+2].imag = tmp_a_i + tmp_b_i; -+ buf[i+6].real = tmp_a_r - tmp_b_r; -+ buf[i+6].imag = tmp_a_i - tmp_b_i; -+ tmp_a_r = buf[i+3].real; -+ tmp_a_i = buf[i+3].imag; -+ tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; -+ tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; -+ buf[i+3].real = tmp_a_r + tmp_b_r; -+ buf[i+3].imag = tmp_a_i + tmp_b_i; -+ buf[i+7].real = tmp_a_r - tmp_b_r; -+ buf[i+7].imag = tmp_a_i - tmp_b_i; -+#else -+ vector float buf01, buf23, buf45, buf67; -+ -+ buf01 = vec_ld((i + 0) << 3, (float*)buf); -+ buf23 = vec_ld((i + 2) << 3, (float*)buf); -+ -+ tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; -+ tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; -+ buf[i+5].real = tmp_b_r; -+ buf[i+5].imag = tmp_b_i; -+ tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; -+ tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; -+ buf[i+7].real = tmp_b_r; -+ buf[i+7].imag = tmp_b_i; -+ -+ buf23 = vec_ld((i + 2) << 3, (float*)buf); -+ buf45 = vec_ld((i + 4) << 3, (float*)buf); -+ buf67 = vec_ld((i + 6) << 3, (float*)buf); -+ buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3)); -+ -+ vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf); -+ vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf); -+ vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf); -+ vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf); -+#endif -+ } -+ -+ /* 4-7. 
iterations */ -+ for (m=3; m < 7; m++) { -+ two_m = (1 << m); -+ -+ two_m_plus_one = two_m<<1; -+ -+ for(i = 0; i < 128; i += two_m_plus_one) { -+ for(k = 0; k < two_m; k+=2) { -+#if 0 -+ int p = k + i; -+ int q = p + two_m; -+ tmp_a_r = buf[p].real; -+ tmp_a_i = buf[p].imag; -+ tmp_b_r = -+ buf[q].real * w[m][k].real - -+ buf[q].imag * w[m][k].imag; -+ tmp_b_i = -+ buf[q].imag * w[m][k].real + -+ buf[q].real * w[m][k].imag; -+ buf[p].real = tmp_a_r + tmp_b_r; -+ buf[p].imag = tmp_a_i + tmp_b_i; -+ buf[q].real = tmp_a_r - tmp_b_r; -+ buf[q].imag = tmp_a_i - tmp_b_i; -+ -+ tmp_a_r = buf[(p + 1)].real; -+ tmp_a_i = buf[(p + 1)].imag; -+ tmp_b_r = -+ buf[(q + 1)].real * w[m][(k + 1)].real - -+ buf[(q + 1)].imag * w[m][(k + 1)].imag; -+ tmp_b_i = -+ buf[(q + 1)].imag * w[m][(k + 1)].real + -+ buf[(q + 1)].real * w[m][(k + 1)].imag; -+ buf[(p + 1)].real = tmp_a_r + tmp_b_r; -+ buf[(p + 1)].imag = tmp_a_i + tmp_b_i; -+ buf[(q + 1)].real = tmp_a_r - tmp_b_r; -+ buf[(q + 1)].imag = tmp_a_i - tmp_b_i; -+#else -+ int p = k + i; -+ int q = p + two_m; -+ vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4; -+ const vector float vczero = (const vector float)FOUROF(0.); -+ // first compute buf[q] and buf[q+1] -+ vecq = vec_ld(q << 3, (float*)buf); -+ vecw = vec_ld(0, (float*)&(w[m][k])); -+ temp1 = vec_madd(vecq, vecw, vczero); -+ temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2)); -+ temp2 = vec_madd(temp2, vecw, vczero); -+ temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2)); -+ temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3)); -+ vecq = vec_madd(temp4, vcii(n,p,n,p), temp3); -+ // then butterfly with buf[p] and buf[p+1] -+ vecp = vec_ld(p << 3, (float*)buf); -+ -+ temp1 = vec_add(vecp, vecq); -+ temp2 = vec_sub(vecp, vecq); -+ -+ vec_st(temp1, p << 3, (float*)buf); -+ vec_st(temp2, q << 3, (float*)buf); -+#endif -+ } -+ } -+ } -+ -+ /* Post IFFT complex multiply plus IFFT complex conjugate*/ -+ for( i=0; i < 128; i+=4) { -+ /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ -+#if 0 -+ tmp_a_r = buf[(i + 0)].real; -+ tmp_a_i = -1.0 * buf[(i + 0)].imag; -+ buf[(i + 0)].real = -+ (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]); -+ buf[(i + 0)].imag = -+ (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]); -+ -+ tmp_a_r = buf[(i + 1)].real; -+ tmp_a_i = -1.0 * buf[(i + 1)].imag; -+ buf[(i + 1)].real = -+ (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]); -+ buf[(i + 1)].imag = -+ (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]); -+ -+ tmp_a_r = buf[(i + 2)].real; -+ tmp_a_i = -1.0 * buf[(i + 2)].imag; -+ buf[(i + 2)].real = -+ (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]); -+ buf[(i + 2)].imag = -+ (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]); -+ -+ tmp_a_r = buf[(i + 3)].real; -+ tmp_a_i = -1.0 * buf[(i + 3)].imag; -+ buf[(i + 3)].real = -+ (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]); -+ buf[(i + 3)].imag = -+ (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]); -+#else -+ vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2; -+ vector float temp0022, temp1133, tempCS01; -+ const vector float vczero = (const vector float)FOUROF(0.); -+ -+ bufv_0 = vec_ld((i + 0) << 3, (float*)buf); -+ bufv_2 = vec_ld((i + 2) << 3, (float*)buf); -+ -+ cosv = vec_ld(i << 2, xcos1); -+ sinv = vec_ld(i << 2, xsin1); -+ -+ temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2)); -+ temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3)); -+ tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1)); -+ temp1 = vec_madd(temp0022, tempCS01, vczero); -+ tempCS01 = vec_perm(cosv, sinv, 
vcprm(s0,0,s1,1)); -+ temp2 = vec_madd(temp1133, tempCS01, vczero); -+ bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1); -+ -+ vec_st(bufv_0, (i + 0) << 3, (float*)buf); -+ -+ /* idem with bufv_2 and high-order cosv/sinv */ -+ -+ temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2)); -+ temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3)); -+ tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3)); -+ temp1 = vec_madd(temp0022, tempCS01, vczero); -+ tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3)); -+ temp2 = vec_madd(temp1133, tempCS01, vczero); -+ bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1); -+ -+ vec_st(bufv_2, (i + 2) << 3, (float*)buf); -+ -+#endif -+ } -+ -+ data_ptr = data; -+ delay_ptr = delay; -+ window_ptr = a52_imdct_window; -+ -+ /* Window and convert to real valued signal */ -+ for(i=0; i< 64; i++) { -+ *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; -+ *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; -+ } -+ -+ for(i=0; i< 64; i++) { -+ *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; -+ *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; -+ } -+ -+ /* The trailing edge of the window goes into the delay line */ -+ delay_ptr = delay; -+ -+ for(i=0; i< 64; i++) { -+ *delay_ptr++ = -buf[64+i].real * *--window_ptr; -+ *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; -+ } -+ -+ for(i=0; i<64; i++) { -+ *delay_ptr++ = buf[i].imag * *--window_ptr; -+ *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; -+ } -+} -+#endif -+ -+ -+// Stuff below this line is borrowed from libac3 -+#include "srfftp.h" -+#if ARCH_X86 || ARCH_X86_64 -+#undef HAVE_AMD3DNOW -+#define HAVE_AMD3DNOW 1 -+#include "srfftp_3dnow.h" -+ -+const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; -+const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; -+const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; -+ -+#undef HAVE_AMD3DNOWEXT -+#define HAVE_AMD3DNOWEXT 0 -+#include "imdct_3dnow.h" -+#undef HAVE_AMD3DNOWEXT -+#define HAVE_AMD3DNOWEXT 1 -+#include "imdct_3dnow.h" -+ -+#if !ARCH_X86_64 || !defined(PIC) -+void -+imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) -+{ -+/* int i,k; -+ int p,q;*/ -+ int m; -+ long two_m; -+ long two_m_plus_one; -+ long two_m_plus_one_shl3; -+ complex_t *buf_offset; -+ -+/* sample_t tmp_a_i; -+ sample_t tmp_a_r; -+ sample_t tmp_b_i; -+ sample_t tmp_b_r;*/ -+ -+ sample_t *data_ptr; -+ sample_t *delay_ptr; -+ sample_t *window_ptr; -+ -+ /* 512 IMDCT with source and dest data in 'data' */ -+ /* see the c version (dct_do_512()), its allmost identical, just in C */ -+ -+ /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ -+ /* Bit reversed shuffling */ -+ __asm__ volatile( -+ "xor %%"REG_S", %%"REG_S" \n\t" -+ "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t" -+ "mov $1008, %%"REG_D" \n\t" -+ "push %%"REG_BP" \n\t" //use ebp without telling gcc -+ ASMALIGN(4) -+ "1: \n\t" -+ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI -+ "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI -+ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi -+ "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi -+ "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR -+ "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t" -+ "mulps %%xmm0, %%xmm2 \n\t" -+ "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI -+ "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" -+ "subps %%xmm0, %%xmm2 \n\t" -+ "movzb (%%"REG_a"), %%"REG_d" \n\t" 
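
[Editor's reference sketch] The x_plus_minus_3dnow / x_minus_plus_3dnow constants above are 0x00000000/0x80000000 pairs because XORing an IEEE-754 single with 0x80000000 flips only its sign bit, which lets the FFT butterflies negate one half of a complex pair without a multiply. A short, hypothetical stand-alone demonstration of that trick (not liba52 code):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static float flip_sign(float x)
{
    uint32_t u;
    memcpy(&u, &x, sizeof u);   /* bit-exact copy, avoids aliasing issues */
    u ^= 0x80000000u;           /* toggle the sign bit only */
    memcpy(&x, &u, sizeof x);
    return x;
}

int main(void)
{
    printf("%g -> %g\n", 1.5, (double)flip_sign(1.5f));   /* prints 1.5 -> -1.5 */
    return 0;
}
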
-+ "movzb 1(%%"REG_a"), %%"REG_BP" \n\t" -+ "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t" -+ "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t" -+ "add $16, %%"REG_S" \n\t" -+ "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap -+ "sub $16, %%"REG_D" \n\t" -+ "jnc 1b \n\t" -+ "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g* -+ :: "b" (data), "c" (buf) -+ : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d -+ ); -+ -+ -+ /* FFT Merge */ -+/* unoptimized variant -+ for (m=1; m < 7; m++) { -+ if(m) -+ two_m = (1 << m); -+ else -+ two_m = 1; -+ -+ two_m_plus_one = (1 << (m+1)); -+ -+ for(i = 0; i < 128; i += two_m_plus_one) { -+ for(k = 0; k < two_m; k++) { -+ p = k + i; -+ q = p + two_m; -+ tmp_a_r = buf[p].real; -+ tmp_a_i = buf[p].imag; -+ tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; -+ tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; -+ buf[p].real = tmp_a_r + tmp_b_r; -+ buf[p].imag = tmp_a_i + tmp_b_i; -+ buf[q].real = tmp_a_r - tmp_b_r; -+ buf[q].imag = tmp_a_i - tmp_b_i; -+ } -+ } -+ } -+*/ -+ -+ /* 1. iteration */ -+ // Note w[0][0]={1,0} -+ __asm__ volatile( -+ "xorps %%xmm1, %%xmm1 \n\t" -+ "xorps %%xmm2, %%xmm2 \n\t" -+ "mov %0, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p] -+ "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q] -+ "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p] -+ "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q] -+ "addps %%xmm1, %%xmm0 \n\t" -+ "subps %%xmm2, %%xmm0 \n\t" -+ "movaps %%xmm0, (%%"REG_S")\n\t" -+ "add $16, %%"REG_S" \n\t" -+ "cmp %1, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "g" (buf), "r" (buf + 128) -+ : "%"REG_S -+ ); -+ -+ /* 2. iteration */ -+ // Note w[1]={{1,0}, {0,-1}} -+ __asm__ volatile( -+ "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 -+ "mov %0, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3 -+ "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 -+ "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 -+ "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 -+ "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1 -+ "addps %%xmm2, %%xmm0 \n\t" -+ "subps %%xmm2, %%xmm1 \n\t" -+ "movaps %%xmm0, (%%"REG_S") \n\t" -+ "movaps %%xmm1, 16(%%"REG_S") \n\t" -+ "add $32, %%"REG_S" \n\t" -+ "cmp %1, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "g" (buf), "r" (buf + 128) -+ : "%"REG_S -+ ); -+ -+ /* 3. 
iteration */ -+/* -+ Note sseW2+0={1,1,sqrt(2),sqrt(2)) -+ Note sseW2+16={0,0,sqrt(2),-sqrt(2)) -+ Note sseW2+32={0,0,-sqrt(2),-sqrt(2)) -+ Note sseW2+48={1,-1,sqrt(2),-sqrt(2)) -+*/ -+ __asm__ volatile( -+ "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" -+ "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" -+ "xorps %%xmm5, %%xmm5 \n\t" -+ "xorps %%xmm2, %%xmm2 \n\t" -+ "mov %0, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5 -+ "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7 -+ "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 -+ "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 -+ "mulps %%xmm2, %%xmm4 \n\t" -+ "mulps %%xmm3, %%xmm5 \n\t" -+ "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 -+ "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 -+ "mulps %%xmm6, %%xmm3 \n\t" -+ "mulps %%xmm7, %%xmm2 \n\t" -+ "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 -+ "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3 -+ "addps %%xmm4, %%xmm2 \n\t" -+ "addps %%xmm5, %%xmm3 \n\t" -+ "movaps %%xmm2, %%xmm4 \n\t" -+ "movaps %%xmm3, %%xmm5 \n\t" -+ "addps %%xmm0, %%xmm2 \n\t" -+ "addps %%xmm1, %%xmm3 \n\t" -+ "subps %%xmm4, %%xmm0 \n\t" -+ "subps %%xmm5, %%xmm1 \n\t" -+ "movaps %%xmm2, (%%"REG_S") \n\t" -+ "movaps %%xmm3, 16(%%"REG_S") \n\t" -+ "movaps %%xmm0, 32(%%"REG_S") \n\t" -+ "movaps %%xmm1, 48(%%"REG_S") \n\t" -+ "add $64, %%"REG_S" \n\t" -+ "cmp %1, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "g" (buf), "r" (buf + 128) -+ : "%"REG_S -+ ); -+ -+ /* 4-7. iterations */ -+ for (m=3; m < 7; m++) { -+ two_m = (1 << m); -+ two_m_plus_one = two_m<<1; -+ two_m_plus_one_shl3 = (two_m_plus_one<<3); -+ buf_offset = buf+128; -+ __asm__ volatile( -+ "mov %0, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "xor %%"REG_D", %%"REG_D" \n\t" // k -+ "lea (%%"REG_S", %3), %%"REG_d" \n\t" -+ "2: \n\t" -+ "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t" -+ "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t" -+ "mulps %%xmm1, %%xmm2 \n\t" -+ "shufps $0xB1, %%xmm1, %%xmm1 \n\t" -+ "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t" -+ "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t" -+ "addps %%xmm2, %%xmm1 \n\t" -+ "movaps %%xmm1, %%xmm2 \n\t" -+ "addps %%xmm0, %%xmm1 \n\t" -+ "subps %%xmm2, %%xmm0 \n\t" -+ "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t" -+ "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t" -+ "add $16, %%"REG_D" \n\t" -+ "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0 -+ "jb 2b \n\t" -+ "add %2, %%"REG_S" \n\t" -+ "cmp %1, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3), -+ "r" (sseW[m]) -+ : "%"REG_S, "%"REG_D, "%"REG_d -+ ); -+ } -+ -+ /* Post IFFT complex multiply plus IFFT complex conjugate*/ -+ __asm__ volatile( -+ "mov $-1024, %%"REG_S" \n\t" -+ ASMALIGN(4) -+ "1: \n\t" -+ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" -+ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" -+ "shufps $0xB1, %%xmm0, %%xmm0 \n\t" -+ "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t" -+ "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" -+ "addps %%xmm1, %%xmm0 \n\t" -+ "movaps %%xmm0, (%0, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ " jnz 1b \n\t" -+ :: "r" (buf+128) -+ : "%"REG_S -+ ); -+ -+ -+ data_ptr = data; -+ delay_ptr = delay; -+ window_ptr = a52_imdct_window; -+ -+ /* Window and convert to real valued signal */ -+ __asm__ volatile( -+ "xor %%"REG_D", %%"REG_D" \n\t" // 0 -+ "xor %%"REG_S", %%"REG_S" \n\t" // 0 -+ "movss %3, %%xmm2 \n\t" // bias -+ "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... 
-+ ASMALIGN(4) -+ "1: \n\t" -+ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? -+ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? -+ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? -+ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? -+ "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A -+ "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" -+ "addps (%2, %%"REG_S"), %%xmm0 \n\t" -+ "addps %%xmm2, %%xmm0 \n\t" -+ "movaps %%xmm0, (%1, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ "sub $16, %%"REG_D" \n\t" -+ "cmp $512, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) -+ : "%"REG_S, "%"REG_D -+ ); -+ data_ptr+=128; -+ delay_ptr+=128; -+// window_ptr+=128; -+ -+ __asm__ volatile( -+ "mov $1024, %%"REG_D" \n\t" // 512 -+ "xor %%"REG_S", %%"REG_S" \n\t" // 0 -+ "movss %3, %%xmm2 \n\t" // bias -+ "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... -+ ASMALIGN(4) -+ "1: \n\t" -+ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A -+ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C -+ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C -+ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A -+ "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A -+ "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" -+ "addps (%2, %%"REG_S"), %%xmm0 \n\t" -+ "addps %%xmm2, %%xmm0 \n\t" -+ "movaps %%xmm0, (%1, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ "sub $16, %%"REG_D" \n\t" -+ "cmp $512, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) -+ : "%"REG_S, "%"REG_D -+ ); -+ data_ptr+=128; -+// window_ptr+=128; -+ -+ /* The trailing edge of the window goes into the delay line */ -+ delay_ptr = delay; -+ -+ __asm__ volatile( -+ "xor %%"REG_D", %%"REG_D" \n\t" // 0 -+ "xor %%"REG_S", %%"REG_S" \n\t" // 0 -+ ASMALIGN(4) -+ "1: \n\t" -+ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A -+ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C -+ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C -+ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A -+ "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A -+ "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" -+ "movaps %%xmm0, (%1, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ "sub $16, %%"REG_D" \n\t" -+ "cmp $512, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "r" (buf+64), "r" (delay_ptr) -+ : "%"REG_S, "%"REG_D -+ ); -+ delay_ptr+=128; -+// window_ptr-=128; -+ -+ __asm__ volatile( -+ "mov $1024, %%"REG_D" \n\t" // 1024 -+ "xor %%"REG_S", %%"REG_S" \n\t" // 0 -+ ASMALIGN(4) -+ "1: \n\t" -+ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? -+ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? -+ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? -+ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? 
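
[Editor's reference sketch] The SSE windowing loops above (like their AltiVec and C counterparts earlier) implement the usual IMDCT overlap-add: the leading half of the windowed block plus the previous delay line gives 256 output samples, and the trailing half becomes the next delay line. A simplified scalar sketch of that structure, with hypothetical names, assuming a 256-tap half window as in a52_imdct_window and leaving out the sign flips and real/imaginary interleaving the real code folds in:

typedef float sample_t;   /* assumption, as in the SSE build */

static void window_overlap_add(const sample_t block[512],   /* time-domain IMDCT output */
                               const sample_t window[256],  /* half window */
                               sample_t delay[256],
                               sample_t out[256],
                               sample_t bias)
{
    for (int i = 0; i < 256; i++)                       /* leading half + old delay */
        out[i] = block[i] * window[i] + delay[i] + bias;

    for (int i = 0; i < 256; i++)                       /* trailing half, reversed window */
        delay[i] = block[256 + i] * window[255 - i];
}
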
-+ "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A -+ "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" -+ "movaps %%xmm0, (%1, %%"REG_S") \n\t" -+ "add $16, %%"REG_S" \n\t" -+ "sub $16, %%"REG_D" \n\t" -+ "cmp $512, %%"REG_S" \n\t" -+ " jb 1b \n\t" -+ :: "r" (buf), "r" (delay_ptr) -+ : "%"REG_S, "%"REG_D -+ ); -+} -+#endif -+#endif // ARCH_X86 || ARCH_X86_64 -+ - void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias) - { - int i, k; -@@ -364,7 +1143,7 @@ - - void a52_imdct_init (uint32_t mm_accel) - { -- int i, k; -+ int i, j, k; - double sum; - - /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */ -@@ -416,6 +1195,101 @@ - post2[i].real = cos ((M_PI / 128) * (i + 0.5)); - post2[i].imag = sin ((M_PI / 128) * (i + 0.5)); - } -+ for (i = 0; i < 128; i++) { -+ xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); -+ xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); -+ } -+ for (i = 0; i < 7; i++) { -+ j = 1 << i; -+ for (k = 0; k < j; k++) { -+ w[i][k].real = cos (-M_PI * k / j); -+ w[i][k].imag = sin (-M_PI * k / j); -+ } -+ } -+#if ARCH_X86 || ARCH_X86_64 -+ for (i = 0; i < 128; i++) { -+ sseSinCos1c[2*i+0]= xcos1[i]; -+ sseSinCos1c[2*i+1]= -xcos1[i]; -+ sseSinCos1d[2*i+0]= xsin1[i]; -+ sseSinCos1d[2*i+1]= xsin1[i]; -+ } -+ for (i = 1; i < 7; i++) { -+ j = 1 << i; -+ for (k = 0; k < j; k+=2) { -+ -+ sseW[i][4*k + 0] = w[i][k+0].real; -+ sseW[i][4*k + 1] = w[i][k+0].real; -+ sseW[i][4*k + 2] = w[i][k+1].real; -+ sseW[i][4*k + 3] = w[i][k+1].real; -+ -+ sseW[i][4*k + 4] = -w[i][k+0].imag; -+ sseW[i][4*k + 5] = w[i][k+0].imag; -+ sseW[i][4*k + 6] = -w[i][k+1].imag; -+ sseW[i][4*k + 7] = w[i][k+1].imag; -+ -+ //we multiply more or less uninitalized numbers so we need to use exactly 0.0 -+ if(k==0) -+ { -+// sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0; -+ sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0; -+ } -+ -+ if(2*k == j) -+ { -+ sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0; -+// sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0); -+ } -+ } -+ } -+ -+ for(i=0; i<128; i++) -+ { -+ sseWindow[2*i+0]= -a52_imdct_window[2*i+0]; -+ sseWindow[2*i+1]= a52_imdct_window[2*i+1]; -+ } -+ -+ for(i=0; i<64; i++) -+ { -+ sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1]; -+ sseWindow[256 + 2*i+1]= a52_imdct_window[254 - 2*i+0]; -+ sseWindow[384 + 2*i+0]= a52_imdct_window[126 - 2*i+1]; -+ sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0]; -+ } -+#endif -+ a52_imdct_512 = imdct_do_512; -+ ifft128 = ifft128_c; -+ ifft64 = ifft64_c; -+ -+#if ARCH_X86 || ARCH_X86_64 -+#if !ARCH_X86_64 || !defined(PIC) -+ if(mm_accel & MM_ACCEL_X86_SSE) -+ { -+ fprintf (stderr, "Using SSE optimized IMDCT transform\n"); -+ a52_imdct_512 = imdct_do_512_sse; -+ } -+ else -+#endif -+ if(mm_accel & MM_ACCEL_X86_3DNOWEXT) -+ { -+ fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n"); -+ a52_imdct_512 = imdct_do_512_3dnowex; -+ } -+ else -+ if(mm_accel & MM_ACCEL_X86_3DNOW) -+ { -+ fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); -+ a52_imdct_512 = imdct_do_512_3dnow; -+ } -+ else -+#endif // ARCH_X86 || ARCH_X86_64 -+#if HAVE_ALTIVEC -+ if (mm_accel & MM_ACCEL_PPC_ALTIVEC) -+ { -+ fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); -+ a52_imdct_512 = imdct_do_512_altivec; -+ } -+ else -+#endif - - #ifdef LIBA52_DJBFFT - if (mm_accel & MM_ACCEL_DJBFFT) { -@@ -426,7 +1300,5 @@ - #endif - { - fprintf (stderr, "No accelerated IMDCT transform found\n"); -- ifft128 = ifft128_c; -- ifft64 = ifft64_c; - } - } ---- include/mm_accel.h 2006-06-12 15:05:00.000000000 +0200 -+++ liba52/mm_accel.h 
2006-06-05 02:23:04.000000000 +0200 -@@ -30,7 +34,12 @@ - /* x86 accelerations */ - #define MM_ACCEL_X86_MMX 0x80000000 - #define MM_ACCEL_X86_3DNOW 0x40000000 -+#define MM_ACCEL_X86_3DNOWEXT 0x08000000 - #define MM_ACCEL_X86_MMXEXT 0x20000000 -+#define MM_ACCEL_X86_SSE 0x10000000 -+ -+/* PPC accelerations */ -+#define MM_ACCEL_PPC_ALTIVEC 0x00010000 - - uint32_t mm_accel (void); - ---- liba52/parse.c 2006-12-05 08:08:01.000000000 +0100 -+++ liba52/parse.c 2006-12-05 08:08:44.000000000 +0100 -@@ -24,6 +28,7 @@ - #include "config.h" - - #include -+#include - #include - #include - -@@ -31,13 +36,12 @@ - #include "a52_internal.h" - #include "bitstream.h" - #include "tables.h" -+#include "mm_accel.h" -+#include "libavutil/avutil.h" - --#ifdef HAVE_MEMALIGN -+#if HAVE_MEMALIGN - /* some systems have memalign() but no declaration for it */ - void * memalign (size_t align, size_t size); --#else --/* assume malloc alignment is sufficient */ --#define memalign(align,size) malloc (size) - #endif - - typedef struct { -@@ -60,7 +64,16 @@ - if (state == NULL) - return NULL; - -+#if defined(__MINGW32__) && defined(HAVE_SSE) -+ state->samples = av_malloc(256 * 12 * sizeof (sample_t)); -+#else - state->samples = memalign (16, 256 * 12 * sizeof (sample_t)); -+#endif -+ if(((int)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){ -+ mm_accel &=~MM_ACCEL_X86_SSE; -+ fprintf(stderr, "liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n"); -+ } -+ - if (state->samples == NULL) { - free (state); - return NULL; -@@ -74,6 +87,7 @@ - state->lfsr_state = 1; - - a52_imdct_init (mm_accel); -+ downmix_accel_init(mm_accel); - - return state; - } -@@ -141,7 +155,7 @@ - state->acmod = acmod = buf[6] >> 5; - - a52_bitstream_set_ptr (state, buf + 6); -- bitstream_get (state, 3); /* skip acmod we already parsed */ -+ bitstream_skip (state, 3); /* skip acmod we already parsed */ - - if ((acmod == 2) && (bitstream_get (state, 2) == 2)) /* dsurmod */ - acmod = A52_DOLBY; -@@ -172,28 +186,28 @@ - - chaninfo = !acmod; - do { -- bitstream_get (state, 5); /* dialnorm */ -+ bitstream_skip (state, 5); /* dialnorm */ - if (bitstream_get (state, 1)) /* compre */ -- bitstream_get (state, 8); /* compr */ -+ bitstream_skip (state, 8); /* compr */ - if (bitstream_get (state, 1)) /* langcode */ -- bitstream_get (state, 8); /* langcod */ -+ bitstream_skip (state, 8); /* langcod */ - if (bitstream_get (state, 1)) /* audprodie */ -- bitstream_get (state, 7); /* mixlevel + roomtyp */ -+ bitstream_skip (state, 7); /* mixlevel + roomtyp */ - } while (chaninfo--); - -- bitstream_get (state, 2); /* copyrightb + origbs */ -+ bitstream_skip (state, 2); /* copyrightb + origbs */ - - if (bitstream_get (state, 1)) /* timecod1e */ -- bitstream_get (state, 14); /* timecod1 */ -+ bitstream_skip (state, 14); /* timecod1 */ - if (bitstream_get (state, 1)) /* timecod2e */ -- bitstream_get (state, 14); /* timecod2 */ -+ bitstream_skip (state, 14); /* timecod2 */ - - if (bitstream_get (state, 1)) { /* addbsie */ - int addbsil; - - addbsil = bitstream_get (state, 6); - do { -- bitstream_get (state, 8); /* addbsi */ -+ bitstream_skip (state, 8); /* addbsi */ - } while (addbsil--); - } - -@@ -680,7 +694,7 @@ - state->fbw_expbap[i].exp[0], - state->fbw_expbap[i].exp + 1)) - return 1; -- bitstream_get (state, 2); /* gainrng */ -+ bitstream_skip (state, 2); /* gainrng */ - } - if (lfeexpstr != EXP_REUSE) { - do_bit_alloc |= 32; -@@ -755,7 +769,7 @@ - if (bitstream_get (state, 1)) { /* skiple */ - i = bitstream_get (state, 9); /* skipl 
*/ - while (i--) -- bitstream_get (state, 8); -+ bitstream_skip (state, 8); - } - - samples = state->samples; -@@ -896,6 +910,10 @@ - - void a52_free (a52_state_t * state) - { -- free (state->samples); -+#if defined(__MINGW32__) && defined(HAVE_SSE) -+ av_free (state->samples); -+#else -+ free (state->samples); -+#endif - free (state); - } diff -r 459227551819 -r 1aece15222b5 liba52/mm_accel.h --- a/liba52/mm_accel.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -/* - * mm_accel.h - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. - * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef MM_ACCEL_H -#define MM_ACCEL_H - -/* generic accelerations */ -#define MM_ACCEL_DJBFFT 0x00000001 - -/* x86 accelerations */ -#define MM_ACCEL_X86_MMX 0x80000000 -#define MM_ACCEL_X86_3DNOW 0x40000000 -#define MM_ACCEL_X86_3DNOWEXT 0x08000000 -#define MM_ACCEL_X86_MMXEXT 0x20000000 -#define MM_ACCEL_X86_SSE 0x10000000 - -/* PPC accelerations */ -#define MM_ACCEL_PPC_ALTIVEC 0x00010000 - -uint32_t mm_accel (void); - -#endif /* MM_ACCEL_H */ diff -r 459227551819 -r 1aece15222b5 liba52/parse.c --- a/liba52/parse.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,919 +0,0 @@ -/* - * parse.c - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * Modified for use with MPlayer, changes contained in liba52_changes.diff. - * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ - * $Id$ - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
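
[Editor's reference sketch] The mm_accel.h shown above only defines capability bits; the CPU probe ORs together what it finds, and a52_imdct_init() tests individual bits when picking an IMDCT implementation. A hedged sketch of that usage, with the flag values copied from the header and the probe result faked:

#include <stdint.h>
#include <stdio.h>

#define MM_ACCEL_DJBFFT        0x00000001
#define MM_ACCEL_X86_MMX       0x80000000
#define MM_ACCEL_X86_3DNOW     0x40000000
#define MM_ACCEL_X86_MMXEXT    0x20000000
#define MM_ACCEL_X86_SSE       0x10000000
#define MM_ACCEL_X86_3DNOWEXT  0x08000000
#define MM_ACCEL_PPC_ALTIVEC   0x00010000

int main(void)
{
    uint32_t accel = MM_ACCEL_X86_MMX | MM_ACCEL_X86_MMXEXT | MM_ACCEL_X86_SSE;  /* pretend probe */

    if (accel & MM_ACCEL_X86_SSE)              /* same x86 precedence as a52_imdct_init() */
        puts("would select the SSE IMDCT");
    else if (accel & MM_ACCEL_X86_3DNOWEXT)
        puts("would select the 3DNowEx IMDCT");
    else if (accel & MM_ACCEL_X86_3DNOW)
        puts("would select the 3DNow IMDCT");
    else
        puts("would fall back to the C IMDCT");
    return 0;
}
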
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include "config.h" - -#include -#include -#include -#include - -#include "a52.h" -#include "a52_internal.h" -#include "bitstream.h" -#include "tables.h" -#include "mm_accel.h" -#include "libavutil/avutil.h" - -#if HAVE_MEMALIGN -/* some systems have memalign() but no declaration for it */ -void * memalign (size_t align, size_t size); -#endif - -typedef struct { - sample_t q1[2]; - sample_t q2[2]; - sample_t q4; - int q1_ptr; - int q2_ptr; - int q4_ptr; -} quantizer_t; - -static uint8_t halfrate[12] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3}; - -a52_state_t * a52_init (uint32_t mm_accel) -{ - a52_state_t * state; - int i; - - state = malloc (sizeof (a52_state_t)); - if (state == NULL) - return NULL; - -#if defined(__MINGW32__) && defined(HAVE_SSE) - state->samples = av_malloc(256 * 12 * sizeof (sample_t)); -#else - state->samples = memalign (16, 256 * 12 * sizeof (sample_t)); -#endif - if(((int)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){ - mm_accel &=~MM_ACCEL_X86_SSE; - fprintf(stderr, "liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n"); - } - - if (state->samples == NULL) { - free (state); - return NULL; - } - - for (i = 0; i < 256 * 12; i++) - state->samples[i] = 0; - - state->downmixed = 1; - - state->lfsr_state = 1; - - a52_imdct_init (mm_accel); - downmix_accel_init(mm_accel); - - return state; -} - -sample_t * a52_samples (a52_state_t * state) -{ - return state->samples; -} - -int a52_syncinfo (uint8_t * buf, int * flags, - int * sample_rate, int * bit_rate) -{ - static int rate[] = { 32, 40, 48, 56, 64, 80, 96, 112, - 128, 160, 192, 224, 256, 320, 384, 448, - 512, 576, 640}; - static uint8_t lfeon[8] = {0x10, 0x10, 0x04, 0x04, 0x04, 0x01, 0x04, 0x01}; - int frmsizecod; - int bitrate; - int half; - int acmod; - - if ((buf[0] != 0x0b) || (buf[1] != 0x77)) /* syncword */ - return 0; - - if (buf[5] >= 0x60) /* bsid >= 12 */ - return 0; - half = halfrate[buf[5] >> 3]; - - /* acmod, dsurmod and lfeon */ - acmod = buf[6] >> 5; - *flags = ((((buf[6] & 0xf8) == 0x50) ? A52_DOLBY : acmod) | - ((buf[6] & lfeon[acmod]) ? 
A52_LFE : 0)); - - frmsizecod = buf[4] & 63; - if (frmsizecod >= 38) - return 0; - bitrate = rate [frmsizecod >> 1]; - *bit_rate = (bitrate * 1000) >> half; - - switch (buf[4] & 0xc0) { - case 0: - *sample_rate = 48000 >> half; - return 4 * bitrate; - case 0x40: - *sample_rate = 44100 >> half; - return 2 * (320 * bitrate / 147 + (frmsizecod & 1)); - case 0x80: - *sample_rate = 32000 >> half; - return 6 * bitrate; - default: - return 0; - } -} - -int a52_frame (a52_state_t * state, uint8_t * buf, int * flags, - sample_t * level, sample_t bias) -{ - static sample_t clev[4] = {LEVEL_3DB, LEVEL_45DB, LEVEL_6DB, LEVEL_45DB}; - static sample_t slev[4] = {LEVEL_3DB, LEVEL_6DB, 0, LEVEL_6DB}; - int chaninfo; - int acmod; - - state->fscod = buf[4] >> 6; - state->halfrate = halfrate[buf[5] >> 3]; - state->acmod = acmod = buf[6] >> 5; - - a52_bitstream_set_ptr (state, buf + 6); - bitstream_skip (state, 3); /* skip acmod we already parsed */ - - if ((acmod == 2) && (bitstream_get (state, 2) == 2)) /* dsurmod */ - acmod = A52_DOLBY; - - if ((acmod & 1) && (acmod != 1)) - state->clev = clev[bitstream_get (state, 2)]; /* cmixlev */ - - if (acmod & 4) - state->slev = slev[bitstream_get (state, 2)]; /* surmixlev */ - - state->lfeon = bitstream_get (state, 1); - - state->output = a52_downmix_init (acmod, *flags, level, - state->clev, state->slev); - if (state->output < 0) - return 1; - if (state->lfeon && (*flags & A52_LFE)) - state->output |= A52_LFE; - *flags = state->output; - /* the 2* compensates for differences in imdct */ - state->dynrng = state->level = 2 * *level; - state->bias = bias; - state->dynrnge = 1; - state->dynrngcall = NULL; - state->cplba.deltbae = DELTA_BIT_NONE; - state->ba[0].deltbae = state->ba[1].deltbae = state->ba[2].deltbae = - state->ba[3].deltbae = state->ba[4].deltbae = DELTA_BIT_NONE; - - chaninfo = !acmod; - do { - bitstream_skip (state, 5); /* dialnorm */ - if (bitstream_get (state, 1)) /* compre */ - bitstream_skip (state, 8); /* compr */ - if (bitstream_get (state, 1)) /* langcode */ - bitstream_skip (state, 8); /* langcod */ - if (bitstream_get (state, 1)) /* audprodie */ - bitstream_skip (state, 7); /* mixlevel + roomtyp */ - } while (chaninfo--); - - bitstream_skip (state, 2); /* copyrightb + origbs */ - - if (bitstream_get (state, 1)) /* timecod1e */ - bitstream_skip (state, 14); /* timecod1 */ - if (bitstream_get (state, 1)) /* timecod2e */ - bitstream_skip (state, 14); /* timecod2 */ - - if (bitstream_get (state, 1)) { /* addbsie */ - int addbsil; - - addbsil = bitstream_get (state, 6); - do { - bitstream_skip (state, 8); /* addbsi */ - } while (addbsil--); - } - - return 0; -} - -void a52_dynrng (a52_state_t * state, - sample_t (* call) (sample_t, void *), void * data) -{ - state->dynrnge = 0; - if (call) { - state->dynrnge = 1; - state->dynrngcall = call; - state->dynrngdata = data; - } -} - -static int parse_exponents (a52_state_t * state, int expstr, int ngrps, - uint8_t exponent, uint8_t * dest) -{ - int exps; - - while (ngrps--) { - exps = bitstream_get (state, 7); - - exponent += exp_1[exps]; - if (exponent > 24) - return 1; - - switch (expstr) { - case EXP_D45: - *(dest++) = exponent; - *(dest++) = exponent; - case EXP_D25: - *(dest++) = exponent; - case EXP_D15: - *(dest++) = exponent; - } - - exponent += exp_2[exps]; - if (exponent > 24) - return 1; - - switch (expstr) { - case EXP_D45: - *(dest++) = exponent; - *(dest++) = exponent; - case EXP_D25: - *(dest++) = exponent; - case EXP_D15: - *(dest++) = exponent; - } - - exponent += exp_3[exps]; - if 
(exponent > 24) - return 1; - - switch (expstr) { - case EXP_D45: - *(dest++) = exponent; - *(dest++) = exponent; - case EXP_D25: - *(dest++) = exponent; - case EXP_D15: - *(dest++) = exponent; - } - } - - return 0; -} - -static int parse_deltba (a52_state_t * state, int8_t * deltba) -{ - int deltnseg, deltlen, delta, j; - - memset (deltba, 0, 50); - - deltnseg = bitstream_get (state, 3); - j = 0; - do { - j += bitstream_get (state, 5); - deltlen = bitstream_get (state, 4); - delta = bitstream_get (state, 3); - delta -= (delta >= 4) ? 3 : 4; - if (!deltlen) - continue; - if (j + deltlen >= 50) - return 1; - while (deltlen--) - deltba[j++] = delta; - } while (deltnseg--); - - return 0; -} - -static inline int zero_snr_offsets (int nfchans, a52_state_t * state) -{ - int i; - - if ((state->csnroffst) || - (state->chincpl && state->cplba.bai >> 3) || /* cplinu, fsnroffst */ - (state->lfeon && state->lfeba.bai >> 3)) /* fsnroffst */ - return 0; - for (i = 0; i < nfchans; i++) - if (state->ba[i].bai >> 3) /* fsnroffst */ - return 0; - return 1; -} - -static inline int16_t dither_gen (a52_state_t * state) -{ - int16_t nstate; - - nstate = dither_lut[state->lfsr_state >> 8] ^ (state->lfsr_state << 8); - - state->lfsr_state = (uint16_t) nstate; - - return nstate; -} - -static void coeff_get (a52_state_t * state, sample_t * coeff, - expbap_t * expbap, quantizer_t * quantizer, - sample_t level, int dither, int end) -{ - int i; - uint8_t * exp; - int8_t * bap; - sample_t factor[25]; - - for (i = 0; i <= 24; i++) - factor[i] = scale_factor[i] * level; - - exp = expbap->exp; - bap = expbap->bap; - - for (i = 0; i < end; i++) { - int bapi; - - bapi = bap[i]; - switch (bapi) { - case 0: - if (dither) { - coeff[i] = dither_gen (state) * LEVEL_3DB * factor[exp[i]]; - continue; - } else { - coeff[i] = 0; - continue; - } - - case -1: - if (quantizer->q1_ptr >= 0) { - coeff[i] = quantizer->q1[quantizer->q1_ptr--] * factor[exp[i]]; - continue; - } else { - int code; - - code = bitstream_get (state, 5); - - quantizer->q1_ptr = 1; - quantizer->q1[0] = q_1_2[code]; - quantizer->q1[1] = q_1_1[code]; - coeff[i] = q_1_0[code] * factor[exp[i]]; - continue; - } - - case -2: - if (quantizer->q2_ptr >= 0) { - coeff[i] = quantizer->q2[quantizer->q2_ptr--] * factor[exp[i]]; - continue; - } else { - int code; - - code = bitstream_get (state, 7); - - quantizer->q2_ptr = 1; - quantizer->q2[0] = q_2_2[code]; - quantizer->q2[1] = q_2_1[code]; - coeff[i] = q_2_0[code] * factor[exp[i]]; - continue; - } - - case 3: - coeff[i] = q_3[bitstream_get (state, 3)] * factor[exp[i]]; - continue; - - case -3: - if (quantizer->q4_ptr == 0) { - quantizer->q4_ptr = -1; - coeff[i] = quantizer->q4 * factor[exp[i]]; - continue; - } else { - int code; - - code = bitstream_get (state, 7); - - quantizer->q4_ptr = 0; - quantizer->q4 = q_4_1[code]; - coeff[i] = q_4_0[code] * factor[exp[i]]; - continue; - } - - case 4: - coeff[i] = q_5[bitstream_get (state, 4)] * factor[exp[i]]; - continue; - - default: - coeff[i] = ((bitstream_get_2 (state, bapi) << (16 - bapi)) * - factor[exp[i]]); - } - } -} - -static void coeff_get_coupling (a52_state_t * state, int nfchans, - sample_t * coeff, sample_t (* samples)[256], - quantizer_t * quantizer, uint8_t dithflag[5]) -{ - int cplbndstrc, bnd, i, i_end, ch; - uint8_t * exp; - int8_t * bap; - sample_t cplco[5]; - - exp = state->cpl_expbap.exp; - bap = state->cpl_expbap.bap; - bnd = 0; - cplbndstrc = state->cplbndstrc; - i = state->cplstrtmant; - while (i < state->cplendmant) { - i_end = i + 12; - while (cplbndstrc & 
1) { - cplbndstrc >>= 1; - i_end += 12; - } - cplbndstrc >>= 1; - for (ch = 0; ch < nfchans; ch++) - cplco[ch] = state->cplco[ch][bnd] * coeff[ch]; - bnd++; - - while (i < i_end) { - sample_t cplcoeff; - int bapi; - - bapi = bap[i]; - switch (bapi) { - case 0: - cplcoeff = LEVEL_3DB * scale_factor[exp[i]]; - for (ch = 0; ch < nfchans; ch++) - if ((state->chincpl >> ch) & 1) { - if (dithflag[ch]) - samples[ch][i] = (cplcoeff * cplco[ch] * - dither_gen (state)); - else - samples[ch][i] = 0; - } - i++; - continue; - - case -1: - if (quantizer->q1_ptr >= 0) { - cplcoeff = quantizer->q1[quantizer->q1_ptr--]; - break; - } else { - int code; - - code = bitstream_get (state, 5); - - quantizer->q1_ptr = 1; - quantizer->q1[0] = q_1_2[code]; - quantizer->q1[1] = q_1_1[code]; - cplcoeff = q_1_0[code]; - break; - } - - case -2: - if (quantizer->q2_ptr >= 0) { - cplcoeff = quantizer->q2[quantizer->q2_ptr--]; - break; - } else { - int code; - - code = bitstream_get (state, 7); - - quantizer->q2_ptr = 1; - quantizer->q2[0] = q_2_2[code]; - quantizer->q2[1] = q_2_1[code]; - cplcoeff = q_2_0[code]; - break; - } - - case 3: - cplcoeff = q_3[bitstream_get (state, 3)]; - break; - - case -3: - if (quantizer->q4_ptr == 0) { - quantizer->q4_ptr = -1; - cplcoeff = quantizer->q4; - break; - } else { - int code; - - code = bitstream_get (state, 7); - - quantizer->q4_ptr = 0; - quantizer->q4 = q_4_1[code]; - cplcoeff = q_4_0[code]; - break; - } - - case 4: - cplcoeff = q_5[bitstream_get (state, 4)]; - break; - - default: - cplcoeff = bitstream_get_2 (state, bapi) << (16 - bapi); - } - - cplcoeff *= scale_factor[exp[i]]; - for (ch = 0; ch < nfchans; ch++) - if ((state->chincpl >> ch) & 1) - samples[ch][i] = cplcoeff * cplco[ch]; - i++; - } - } -} - -int a52_block (a52_state_t * state) -{ - static const uint8_t nfchans_tbl[] = {2, 1, 2, 3, 3, 4, 4, 5, 1, 1, 2}; - static int rematrix_band[4] = {25, 37, 61, 253}; - int i, nfchans, chaninfo; - uint8_t cplexpstr, chexpstr[5], lfeexpstr, do_bit_alloc, done_cpl; - uint8_t blksw[5], dithflag[5]; - sample_t coeff[5]; - int chanbias; - quantizer_t quantizer; - sample_t * samples; - - nfchans = nfchans_tbl[state->acmod]; - - for (i = 0; i < nfchans; i++) - blksw[i] = bitstream_get (state, 1); - - for (i = 0; i < nfchans; i++) - dithflag[i] = bitstream_get (state, 1); - - chaninfo = !state->acmod; - do { - if (bitstream_get (state, 1)) { /* dynrnge */ - int dynrng; - - dynrng = bitstream_get_2 (state, 8); - if (state->dynrnge) { - sample_t range; - - range = ((((dynrng & 0x1f) | 0x20) << 13) * - scale_factor[3 - (dynrng >> 5)]); - if (state->dynrngcall) - range = state->dynrngcall (range, state->dynrngdata); - state->dynrng = state->level * range; - } - } - } while (chaninfo--); - - if (bitstream_get (state, 1)) { /* cplstre */ - state->chincpl = 0; - if (bitstream_get (state, 1)) { /* cplinu */ - static uint8_t bndtab[16] = {31, 35, 37, 39, 41, 42, 43, 44, - 45, 45, 46, 46, 47, 47, 48, 48}; - int cplbegf; - int cplendf; - int ncplsubnd; - - for (i = 0; i < nfchans; i++) - state->chincpl |= bitstream_get (state, 1) << i; - switch (state->acmod) { - case 0: case 1: - return 1; - case 2: - state->phsflginu = bitstream_get (state, 1); - } - cplbegf = bitstream_get (state, 4); - cplendf = bitstream_get (state, 4); - - if (cplendf + 3 - cplbegf < 0) - return 1; - state->ncplbnd = ncplsubnd = cplendf + 3 - cplbegf; - state->cplstrtbnd = bndtab[cplbegf]; - state->cplstrtmant = cplbegf * 12 + 37; - state->cplendmant = cplendf * 12 + 73; - - state->cplbndstrc = 0; - for (i = 0; i < 
ncplsubnd - 1; i++) - if (bitstream_get (state, 1)) { - state->cplbndstrc |= 1 << i; - state->ncplbnd--; - } - } - } - - if (state->chincpl) { /* cplinu */ - int j, cplcoe; - - cplcoe = 0; - for (i = 0; i < nfchans; i++) - if ((state->chincpl) >> i & 1) - if (bitstream_get (state, 1)) { /* cplcoe */ - int mstrcplco, cplcoexp, cplcomant; - - cplcoe = 1; - mstrcplco = 3 * bitstream_get (state, 2); - for (j = 0; j < state->ncplbnd; j++) { - cplcoexp = bitstream_get (state, 4); - cplcomant = bitstream_get (state, 4); - if (cplcoexp == 15) - cplcomant <<= 14; - else - cplcomant = (cplcomant | 0x10) << 13; - state->cplco[i][j] = - cplcomant * scale_factor[cplcoexp + mstrcplco]; - } - } - if ((state->acmod == 2) && state->phsflginu && cplcoe) - for (j = 0; j < state->ncplbnd; j++) - if (bitstream_get (state, 1)) /* phsflg */ - state->cplco[1][j] = -state->cplco[1][j]; - } - - if ((state->acmod == 2) && (bitstream_get (state, 1))) { /* rematstr */ - int end; - - state->rematflg = 0; - end = (state->chincpl) ? state->cplstrtmant : 253; /* cplinu */ - i = 0; - do - state->rematflg |= bitstream_get (state, 1) << i; - while (rematrix_band[i++] < end); - } - - cplexpstr = EXP_REUSE; - lfeexpstr = EXP_REUSE; - if (state->chincpl) /* cplinu */ - cplexpstr = bitstream_get (state, 2); - for (i = 0; i < nfchans; i++) - chexpstr[i] = bitstream_get (state, 2); - if (state->lfeon) - lfeexpstr = bitstream_get (state, 1); - - for (i = 0; i < nfchans; i++) - if (chexpstr[i] != EXP_REUSE) { - if ((state->chincpl >> i) & 1) - state->endmant[i] = state->cplstrtmant; - else { - int chbwcod; - - chbwcod = bitstream_get (state, 6); - if (chbwcod > 60) - return 1; - state->endmant[i] = chbwcod * 3 + 73; - } - } - - do_bit_alloc = 0; - - if (cplexpstr != EXP_REUSE) { - int cplabsexp, ncplgrps; - - do_bit_alloc = 64; - ncplgrps = ((state->cplendmant - state->cplstrtmant) / - (3 << (cplexpstr - 1))); - cplabsexp = bitstream_get (state, 4) << 1; - if (parse_exponents (state, cplexpstr, ncplgrps, cplabsexp, - state->cpl_expbap.exp + state->cplstrtmant)) - return 1; - } - for (i = 0; i < nfchans; i++) - if (chexpstr[i] != EXP_REUSE) { - int grp_size, nchgrps; - - do_bit_alloc |= 1 << i; - grp_size = 3 << (chexpstr[i] - 1); - nchgrps = (state->endmant[i] + grp_size - 4) / grp_size; - state->fbw_expbap[i].exp[0] = bitstream_get (state, 4); - if (parse_exponents (state, chexpstr[i], nchgrps, - state->fbw_expbap[i].exp[0], - state->fbw_expbap[i].exp + 1)) - return 1; - bitstream_skip (state, 2); /* gainrng */ - } - if (lfeexpstr != EXP_REUSE) { - do_bit_alloc |= 32; - state->lfe_expbap.exp[0] = bitstream_get (state, 4); - if (parse_exponents (state, lfeexpstr, 2, state->lfe_expbap.exp[0], - state->lfe_expbap.exp + 1)) - return 1; - } - - if (bitstream_get (state, 1)) { /* baie */ - do_bit_alloc = -1; - state->bai = bitstream_get (state, 11); - } - if (bitstream_get (state, 1)) { /* snroffste */ - do_bit_alloc = -1; - state->csnroffst = bitstream_get (state, 6); - if (state->chincpl) /* cplinu */ - state->cplba.bai = bitstream_get (state, 7); - for (i = 0; i < nfchans; i++) - state->ba[i].bai = bitstream_get (state, 7); - if (state->lfeon) - state->lfeba.bai = bitstream_get (state, 7); - } - if ((state->chincpl) && (bitstream_get (state, 1))) { /* cplleake */ - do_bit_alloc |= 64; - state->cplfleak = 9 - bitstream_get (state, 3); - state->cplsleak = 9 - bitstream_get (state, 3); - } - - if (bitstream_get (state, 1)) { /* deltbaie */ - do_bit_alloc = -1; - if (state->chincpl) /* cplinu */ - state->cplba.deltbae = bitstream_get 
(state, 2); - for (i = 0; i < nfchans; i++) - state->ba[i].deltbae = bitstream_get (state, 2); - if (state->chincpl && /* cplinu */ - (state->cplba.deltbae == DELTA_BIT_NEW) && - parse_deltba (state, state->cplba.deltba)) - return 1; - for (i = 0; i < nfchans; i++) - if ((state->ba[i].deltbae == DELTA_BIT_NEW) && - parse_deltba (state, state->ba[i].deltba)) - return 1; - } - - if (do_bit_alloc) { - if (zero_snr_offsets (nfchans, state)) { - memset (state->cpl_expbap.bap, 0, sizeof (state->cpl_expbap.bap)); - for (i = 0; i < nfchans; i++) - memset (state->fbw_expbap[i].bap, 0, - sizeof (state->fbw_expbap[i].bap)); - memset (state->lfe_expbap.bap, 0, sizeof (state->lfe_expbap.bap)); - } else { - if (state->chincpl && (do_bit_alloc & 64)) /* cplinu */ - a52_bit_allocate (state, &state->cplba, state->cplstrtbnd, - state->cplstrtmant, state->cplendmant, - state->cplfleak << 8, state->cplsleak << 8, - &state->cpl_expbap); - for (i = 0; i < nfchans; i++) - if (do_bit_alloc & (1 << i)) - a52_bit_allocate (state, state->ba + i, 0, 0, - state->endmant[i], 0, 0, - state->fbw_expbap +i); - if (state->lfeon && (do_bit_alloc & 32)) { - state->lfeba.deltbae = DELTA_BIT_NONE; - a52_bit_allocate (state, &state->lfeba, 0, 0, 7, 0, 0, - &state->lfe_expbap); - } - } - } - - if (bitstream_get (state, 1)) { /* skiple */ - i = bitstream_get (state, 9); /* skipl */ - while (i--) - bitstream_skip (state, 8); - } - - samples = state->samples; - if (state->output & A52_LFE) - samples += 256; /* shift for LFE channel */ - - chanbias = a52_downmix_coeff (coeff, state->acmod, state->output, - state->dynrng, state->clev, state->slev); - - quantizer.q1_ptr = quantizer.q2_ptr = quantizer.q4_ptr = -1; - done_cpl = 0; - - for (i = 0; i < nfchans; i++) { - int j; - - coeff_get (state, samples + 256 * i, state->fbw_expbap +i, &quantizer, - coeff[i], dithflag[i], state->endmant[i]); - - if ((state->chincpl >> i) & 1) { - if (!done_cpl) { - done_cpl = 1; - coeff_get_coupling (state, nfchans, coeff, - (sample_t (*)[256])samples, &quantizer, - dithflag); - } - j = state->cplendmant; - } else - j = state->endmant[i]; - do - (samples + 256 * i)[j] = 0; - while (++j < 256); - } - - if (state->acmod == 2) { - int j, end, band, rematflg; - - end = ((state->endmant[0] < state->endmant[1]) ? - state->endmant[0] : state->endmant[1]); - - i = 0; - j = 13; - rematflg = state->rematflg; - do { - if (! 
(rematflg & 1)) { - rematflg >>= 1; - j = rematrix_band[i++]; - continue; - } - rematflg >>= 1; - band = rematrix_band[i++]; - if (band > end) - band = end; - do { - sample_t tmp0, tmp1; - - tmp0 = samples[j]; - tmp1 = (samples+256)[j]; - samples[j] = tmp0 + tmp1; - (samples+256)[j] = tmp0 - tmp1; - } while (++j < band); - } while (j < end); - } - - if (state->lfeon) { - if (state->output & A52_LFE) { - coeff_get (state, samples - 256, &state->lfe_expbap, &quantizer, - state->dynrng, 0, 7); - for (i = 7; i < 256; i++) - (samples-256)[i] = 0; - a52_imdct_512 (samples - 256, samples + 1536 - 256, state->bias); - } else { - /* just skip the LFE coefficients */ - coeff_get (state, samples + 1280, &state->lfe_expbap, &quantizer, - 0, 0, 7); - } - } - - i = 0; - if (nfchans_tbl[state->output & A52_CHANNEL_MASK] < nfchans) - for (i = 1; i < nfchans; i++) - if (blksw[i] != blksw[0]) - break; - - if (i < nfchans) { - if (state->downmixed) { - state->downmixed = 0; - a52_upmix (samples + 1536, state->acmod, state->output); - } - - for (i = 0; i < nfchans; i++) { - sample_t bias; - - bias = 0; - if (!(chanbias & (1 << i))) - bias = state->bias; - - if (coeff[i]) { - if (blksw[i]) - a52_imdct_256 (samples + 256 * i, samples + 1536 + 256 * i, - bias); - else - a52_imdct_512 (samples + 256 * i, samples + 1536 + 256 * i, - bias); - } else { - int j; - - for (j = 0; j < 256; j++) - (samples + 256 * i)[j] = bias; - } - } - - a52_downmix (samples, state->acmod, state->output, state->bias, - state->clev, state->slev); - } else { - nfchans = nfchans_tbl[state->output & A52_CHANNEL_MASK]; - - a52_downmix (samples, state->acmod, state->output, 0, - state->clev, state->slev); - - if (!state->downmixed) { - state->downmixed = 1; - a52_downmix (samples + 1536, state->acmod, state->output, 0, - state->clev, state->slev); - } - - if (blksw[0]) - for (i = 0; i < nfchans; i++) - a52_imdct_256 (samples + 256 * i, samples + 1536 + 256 * i, - state->bias); - else - for (i = 0; i < nfchans; i++) - a52_imdct_512 (samples + 256 * i, samples + 1536 + 256 * i, - state->bias); - } - - return 0; -} - -void a52_free (a52_state_t * state) -{ -#if defined(__MINGW32__) && defined(HAVE_SSE) - av_free (state->samples); -#else - free (state->samples); -#endif - free (state); -} diff -r 459227551819 -r 1aece15222b5 liba52/resample.c --- a/liba52/resample.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -/* - * resample.c - * Copyright (C) 2001 Árpád Gereöffy - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * File added for use with MPlayer and not part of original a52dec. - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -// a52_resample_init should find the requested converter (from type flags -> -// given number of channels) and set up some function pointers... - -// a52_resample() should do the conversion. - -#include -#include -#include "a52.h" -#include "mm_accel.h" -#include "config.h" -#include "mangle.h" - -int (* a52_resample) (float * _f, int16_t * s16)=NULL; - -#include "resample_c.c" - -#if ARCH_X86 || ARCH_X86_64 -#include "resample_mmx.c" -#endif - -#if HAVE_ALTIVEC -#include "resample_altivec.c" -#endif - -void* a52_resample_init(uint32_t mm_accel,int flags,int chans){ -void* tmp; - -#if ARCH_X86 || ARCH_X86_64 - if(mm_accel&MM_ACCEL_X86_MMX){ - tmp=a52_resample_MMX(flags,chans); - if(tmp){ - if(a52_resample==NULL) fprintf(stderr, "Using MMX optimized resampler\n"); - a52_resample=tmp; - return tmp; - } - } -#endif -#if HAVE_ALTIVEC - if(mm_accel&MM_ACCEL_PPC_ALTIVEC){ - tmp=a52_resample_altivec(flags,chans); - if(tmp){ - if(a52_resample==NULL) fprintf(stderr, "Using AltiVec optimized resampler\n"); - a52_resample=tmp; - return tmp; - } - } -#endif - - tmp=a52_resample_C(flags,chans); - if(tmp){ - if(a52_resample==NULL) fprintf(stderr, "No accelerated resampler found\n"); - a52_resample=tmp; - return tmp; - } - - fprintf(stderr, "Unimplemented resampler for mode 0x%X -> %d channels conversion - Contact MPlayer developers!\n", flags, chans); - return NULL; -} diff -r 459227551819 -r 1aece15222b5 liba52/resample_altivec.c --- a/liba52/resample_altivec.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ -/* - * resample.c - * Copyright (C) 2004 Romain Dolbeau - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * File added for use with MPlayer and not part of original a52dec. - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifdef HAVE_ALTIVEC_H -#include -#endif - -const vector signed int magic = {0x43c00000,0x43c00000,0x43c00000,0x43c00000}; - -static inline vector signed short convert16_altivec(vector signed int v1, vector signed int v2) -{ - register vector signed short result; - v1 = vec_subs(v1, magic); - v2 = vec_subs(v2, magic); - result = vec_packs(v1, v2); - - return result; -} - -static void unaligned_store(vector signed short value, int off, int16_t *dst) -{ - register vector unsigned char align = vec_lvsr(0, dst), - mask = vec_lvsl(0, dst); - register vector signed short t0,t1, edges; - - t0 = vec_ld(0+off, dst); - t1 = vec_ld(15+off, dst); - edges = vec_perm(t1 ,t0, mask); - t1 = vec_perm(value, edges, align); - t0 = vec_perm(edges, value, align); - vec_st(t1, 15+off, dst); - vec_st(t0, 0+off, dst); -} - -static int a52_resample_STEREO_to_2_altivec(float * _f, int16_t * s16){ -#if 0 - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[2*i] = convert (f[i]); - s16[2*i+1] = convert (f[i+256]); - } - return 2*256; -#else - int i = 0; - int32_t * f = (int32_t *) _f; - register vector signed int f0, f4, f256, f260; - register vector signed short reven, rodd, r0, r1; - - for (i = 0; i < 256; i+= 8) { - f0 = vec_ld(0, f); - f4 = vec_ld(16, f); - - f256 = vec_ld(1024, f); - f260 = vec_ld(1040, f); - - reven = convert16_altivec(f0, f4); - rodd = convert16_altivec(f256, f260); - - r0 = vec_mergeh(reven, rodd); - r1 = vec_mergel(reven, rodd); - // FIXME can be merged to spare some I/O - unaligned_store(r0, 0, s16); - unaligned_store(r1, 16, s16); - - f += 8; - s16 += 16; - } - return(2*256); -#endif -} - -static void* a52_resample_altivec(int flags, int ch){ -fprintf(stderr, "Checking for AltiVec resampler : 0x%08x, %d\n", flags, ch); - - switch (flags) { - case A52_CHANNEL: - case A52_STEREO: - case A52_DOLBY: - if(ch==2) return a52_resample_STEREO_to_2_altivec; - break; - - default: - fprintf(stderr, "Unsupported flags: 0x%08x (%d channels)\n", flags, ch); - break; - } - return NULL; -} - diff -r 459227551819 -r 1aece15222b5 liba52/resample_c.c --- a/liba52/resample_c.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,205 +0,0 @@ -/* - * resample_c.c - * Copyright (C) 2001 Árpád Gereöffy - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * File added for use with MPlayer and not part of original a52dec. - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -static inline int16_t convert (int32_t i) -{ - if (i > 0x43c07fff) - return 32767; - else if (i < 0x43bf8000) - return -32768; - else - return i - 0x43c00000; -} - -static int a52_resample_MONO_to_5_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0; - s16[5*i+4] = convert (f[i]); - } - return 5*256; -} - -static int a52_resample_MONO_to_1_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[i] = convert (f[i]); - } - return 1*256; -} - -static int a52_resample_STEREO_to_2_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[2*i] = convert (f[i]); - s16[2*i+1] = convert (f[i+256]); - } - return 2*256; -} - -static int a52_resample_3F_to_5_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[5*i] = convert (f[i]); - s16[5*i+1] = convert (f[i+512]); - s16[5*i+2] = s16[5*i+3] = 0; - s16[5*i+4] = convert (f[i+256]); - } - return 5*256; -} - -static int a52_resample_2F_2R_to_4_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[4*i] = convert (f[i]); - s16[4*i+1] = convert (f[i+256]); - s16[4*i+2] = convert (f[i+512]); - s16[4*i+3] = convert (f[i+768]); - } - return 4*256; -} - -static int a52_resample_3F_2R_to_5_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[5*i] = convert (f[i]); - s16[5*i+1] = convert (f[i+512]); - s16[5*i+2] = convert (f[i+768]); - s16[5*i+3] = convert (f[i+1024]); - s16[5*i+4] = convert (f[i+256]); - } - return 5*256; -} - -static int a52_resample_MONO_LFE_to_6_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0; - s16[6*i+4] = convert (f[i+256]); - s16[6*i+5] = convert (f[i]); - } - return 6*256; -} - -static int a52_resample_STEREO_LFE_to_6_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[6*i] = convert (f[i+256]); - s16[6*i+1] = convert (f[i+512]); - s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0; - s16[6*i+5] = convert (f[i]); - } - return 6*256; -} - -static int a52_resample_3F_LFE_to_6_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[6*i] = convert (f[i+256]); - s16[6*i+1] = convert (f[i+768]); - s16[6*i+2] = s16[6*i+3] = 0; - s16[6*i+4] = convert (f[i+512]); - s16[6*i+5] = convert (f[i]); - } - return 6*256; -} - -static int a52_resample_2F_2R_LFE_to_6_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[6*i] = convert (f[i+256]); - s16[6*i+1] = convert (f[i+512]); - s16[6*i+2] = convert (f[i+768]); - s16[6*i+3] = convert (f[i+1024]); - s16[6*i+4] = 0; - s16[6*i+5] = convert (f[i]); - } - return 6*256; -} - -static int a52_resample_3F_2R_LFE_to_6_C(float * _f, int16_t * s16){ - int i; - int32_t * f = (int32_t *) _f; - for (i = 0; i < 256; i++) { - s16[6*i] = convert (f[i+256]); - s16[6*i+1] = convert (f[i+768]); - s16[6*i+2] = convert (f[i+1024]); - s16[6*i+3] = convert (f[i+1280]); - s16[6*i+4] = convert 
(f[i+512]); - s16[6*i+5] = convert (f[i]); - } - return 6*256; -} - - -static void* a52_resample_C(int flags, int ch){ - switch (flags) { - case A52_MONO: - if(ch==5) return a52_resample_MONO_to_5_C; - if(ch==1) return a52_resample_MONO_to_1_C; - break; - case A52_CHANNEL: - case A52_STEREO: - case A52_DOLBY: - if(ch==2) return a52_resample_STEREO_to_2_C; - break; - case A52_3F: - if(ch==5) return a52_resample_3F_to_5_C; - break; - case A52_2F2R: - if(ch==4) return a52_resample_2F_2R_to_4_C; - break; - case A52_3F2R: - if(ch==5) return a52_resample_3F_2R_to_5_C; - break; - case A52_MONO | A52_LFE: - if(ch==6) return a52_resample_MONO_LFE_to_6_C; - break; - case A52_CHANNEL | A52_LFE: - case A52_STEREO | A52_LFE: - case A52_DOLBY | A52_LFE: - if(ch==6) return a52_resample_STEREO_LFE_to_6_C; - break; - case A52_3F | A52_LFE: - if(ch==6) return a52_resample_3F_LFE_to_6_C; - break; - case A52_2F2R | A52_LFE: - if(ch==6) return a52_resample_2F_2R_LFE_to_6_C; - break; - case A52_3F2R | A52_LFE: - if(ch==6) return a52_resample_3F_2R_LFE_to_6_C; - break; - } - return NULL; -} diff -r 459227551819 -r 1aece15222b5 liba52/resample_mmx.c --- a/liba52/resample_mmx.c Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,541 +0,0 @@ -/* - * resample_mmx.c - * Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * File added for use with MPlayer and not part of original a52dec. - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -/* optimization TODO / NOTES - * movntq is slightly faster (0.5% with the current test.c benchmark) - * (but that is just test.c so that needs to be tested in reality) - * and it would mean (C / MMX2 / MMX / 3DNOW) versions. 
- */ - -#include "a52_internal.h" - - -static uint64_t attribute_used __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; -static uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; -static uint64_t attribute_used __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; -static uint64_t attribute_used __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; - -static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-512, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "movq "MANGLE(wm1100)", %%mm3 \n\t" - "movq "MANGLE(wm0101)", %%mm4 \n\t" - "movq "MANGLE(wm1010)", %%mm5 \n\t" - "pxor %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq (%1, %%"REG_S", 2), %%mm0 \n\t" - "movq 8(%1, %%"REG_S", 2), %%mm1\n\t" - "lea (%%"REG_S", %%"REG_S", 4), %%"REG_D"\n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "pand %%mm4, %%mm0 \n\t" - "pand %%mm5, %%mm1 \n\t" - "movq %%mm6, (%0, %%"REG_D") \n\t" // 0 0 0 0 - "movd %%mm0, 8(%0, %%"REG_D") \n\t" // A 0 - "pand %%mm3, %%mm0 \n\t" - "movd %%mm6, 12(%0, %%"REG_D") \n\t" // 0 0 - "movd %%mm1, 16(%0, %%"REG_D") \n\t" // 0 B - "pand %%mm3, %%mm1 \n\t" - "movd %%mm6, 20(%0, %%"REG_D") \n\t" // 0 0 - "movq %%mm0, 24(%0, %%"REG_D") \n\t" // 0 0 C 0 - "movq %%mm1, 32(%0, %%"REG_D") \n\t" // 0 0 0 B - "add $8, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1280), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 5*256; -} - -static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; -/* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it -#if HAVE_SSE - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "1: \n\t" - "cvtps2pi (%1, %%"REG_S"), %%mm0\n\t" - "cvtps2pi 1024(%1, %%"REG_S"), %%mm2\n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "movq %%mm0, (%0, %%"REG_S") \n\t" - "movq %%mm1, 8(%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+512), "r" (f+256) - :"%"REG_S, "memory" - );*/ - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "1: \n\t" - "movq (%1, %%"REG_S"), %%mm0 \n\t" - "movq 8(%1, %%"REG_S"), %%mm1 \n\t" - "movq 1024(%1, %%"REG_S"), %%mm2\n\t" - "movq 1032(%1, %%"REG_S"), %%mm3\n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "movq %%mm0, (%0, %%"REG_S") \n\t" - "movq %%mm1, 8(%0, %%"REG_S") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+512), "r" (f+256) - :"%"REG_S, "memory" - ); - return 2*256; -} - -static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "pxor %%mm6, %%mm6 \n\t" - "movq %%mm7, %%mm5 \n\t" - "punpckldq %%mm6, %%mm5 \n\t" - "1: \n\t" - "movd (%1, %%"REG_S"), %%mm0 \n\t" - "punpckldq 2048(%1, %%"REG_S"), %%mm0\n\t" - "movd 1024(%1, %%"REG_S"), %%mm1\n\t" - "punpckldq 4(%1, %%"REG_S"), %%mm1\n\t" - "movd 2052(%1, %%"REG_S"), %%mm2\n\t" - "movq %%mm7, %%mm3 \n\t" - "punpckldq 1028(%1, %%"REG_S"), %%mm3\n\t" - "movd 8(%1, %%"REG_S"), %%mm4 
\n\t" - "punpckldq 2056(%1, %%"REG_S"), %%mm4\n\t" - "lea (%%"REG_S", %%"REG_S", 4), %%"REG_D"\n\t" - "sar $1, %%"REG_D" \n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm5, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "psubd %%mm7, %%mm4 \n\t" - "packssdw %%mm6, %%mm0 \n\t" - "packssdw %%mm2, %%mm1 \n\t" - "packssdw %%mm4, %%mm3 \n\t" - "movq %%mm0, (%0, %%"REG_D") \n\t" - "movq %%mm1, 8(%0, %%"REG_D") \n\t" - "movq %%mm3, 16(%0, %%"REG_D") \n\t" - "movd 1032(%1, %%"REG_S"), %%mm1\n\t" - "punpckldq 12(%1, %%"REG_S"), %%mm1\n\t" - "movd 2060(%1, %%"REG_S"), %%mm2\n\t" - "movq %%mm7, %%mm3 \n\t" - "punpckldq 1036(%1, %%"REG_S"), %%mm3\n\t" - "pxor %%mm0, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm5, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "movq %%mm0, 24(%0, %%"REG_D") \n\t" - "movq %%mm2, 32(%0, %%"REG_D") \n\t" - - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1280), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 5*256; -} - -static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "1: \n\t" - "movq (%1, %%"REG_S"), %%mm0 \n\t" - "movq 8(%1, %%"REG_S"), %%mm1 \n\t" - "movq 1024(%1, %%"REG_S"), %%mm2\n\t" - "movq 1032(%1, %%"REG_S"), %%mm3\n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "movq 2048(%1, %%"REG_S"), %%mm3\n\t" - "movq 2056(%1, %%"REG_S"), %%mm4\n\t" - "movq 3072(%1, %%"REG_S"), %%mm5\n\t" - "movq 3080(%1, %%"REG_S"), %%mm6\n\t" - "psubd %%mm7, %%mm3 \n\t" - "psubd %%mm7, %%mm4 \n\t" - "psubd %%mm7, %%mm5 \n\t" - "psubd %%mm7, %%mm6 \n\t" - "packssdw %%mm4, %%mm3 \n\t" - "packssdw %%mm6, %%mm5 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "punpcklwd %%mm5, %%mm3 \n\t" - "punpckhwd %%mm5, %%mm4 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm5 \n\t" - "punpckldq %%mm3, %%mm0 \n\t" - "punpckhdq %%mm3, %%mm2 \n\t" - "punpckldq %%mm4, %%mm1 \n\t" - "punpckhdq %%mm4, %%mm5 \n\t" - "movq %%mm0, (%0, %%"REG_S",2) \n\t" - "movq %%mm2, 8(%0, %%"REG_S",2) \n\t" - "movq %%mm1, 16(%0, %%"REG_S",2)\n\t" - "movq %%mm5, 24(%0, %%"REG_S",2)\n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1024), "r" (f+256) - :"%"REG_S, "memory" - ); - return 4*256; -} - -static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "1: \n\t" - "movd (%1, %%"REG_S"), %%mm0 \n\t" - "punpckldq 2048(%1, %%"REG_S"), %%mm0\n\t" - "movd 3072(%1, %%"REG_S"), %%mm1\n\t" - "punpckldq 4096(%1, %%"REG_S"), %%mm1\n\t" - "movd 1024(%1, %%"REG_S"), %%mm2\n\t" - "punpckldq 4(%1, %%"REG_S"), %%mm2\n\t" - "movd 2052(%1, %%"REG_S"), %%mm3\n\t" - "punpckldq 3076(%1, %%"REG_S"), %%mm3\n\t" - "movd 4100(%1, %%"REG_S"), %%mm4\n\t" - "punpckldq 1028(%1, %%"REG_S"), %%mm4\n\t" - "movd 8(%1, %%"REG_S"), %%mm5 \n\t" - "punpckldq 2056(%1, %%"REG_S"), %%mm5\n\t" - "lea (%%"REG_S", %%"REG_S", 4), %%"REG_D"\n\t" - "sar $1, %%"REG_D" \n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "psubd %%mm7, %%mm4 \n\t" - "psubd %%mm7, %%mm5 \n\t" - 
"packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "packssdw %%mm5, %%mm4 \n\t" - "movq %%mm0, (%0, %%"REG_D") \n\t" - "movq %%mm2, 8(%0, %%"REG_D") \n\t" - "movq %%mm4, 16(%0, %%"REG_D") \n\t" - - "movd 3080(%1, %%"REG_S"), %%mm0\n\t" - "punpckldq 4104(%1, %%"REG_S"), %%mm0\n\t" - "movd 1032(%1, %%"REG_S"), %%mm1\n\t" - "punpckldq 12(%1, %%"REG_S"), %%mm1\n\t" - "movd 2060(%1, %%"REG_S"), %%mm2\n\t" - "punpckldq 3084(%1, %%"REG_S"), %%mm2\n\t" - "movd 4108(%1, %%"REG_S"), %%mm3\n\t" - "punpckldq 1036(%1, %%"REG_S"), %%mm3\n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "movq %%mm0, 24(%0, %%"REG_D") \n\t" - "movq %%mm2, 32(%0, %%"REG_D") \n\t" - - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1280), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 5*256; -} - -static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "pxor %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq 1024(%1, %%"REG_S"), %%mm0\n\t" - "movq 1032(%1, %%"REG_S"), %%mm1\n\t" - "movq (%1, %%"REG_S"), %%mm2 \n\t" - "movq 8(%1, %%"REG_S"), %%mm3 \n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" - "movq %%mm6, (%0, %%"REG_D") \n\t" - "movd %%mm0, 8(%0, %%"REG_D") \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movq %%mm6, 12(%0, %%"REG_D") \n\t" - "movd %%mm0, 20(%0, %%"REG_D") \n\t" - "movq %%mm6, 24(%0, %%"REG_D") \n\t" - "movd %%mm1, 32(%0, %%"REG_D") \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm6, 36(%0, %%"REG_D") \n\t" - "movd %%mm1, 44(%0, %%"REG_D") \n\t" - "add $16, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1536), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 6*256; -} - -static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "pxor %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq 1024(%1, %%"REG_S"), %%mm0\n\t" - "movq 2048(%1, %%"REG_S"), %%mm1\n\t" - "movq (%1, %%"REG_S"), %%mm5 \n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm5 \n\t" - "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" - - "pxor %%mm4, %%mm4 \n\t" - "packssdw %%mm5, %%mm0 \n\t" // FfAa - "packssdw %%mm4, %%mm1 \n\t" // 00Bb - "punpckhwd %%mm0, %%mm4 \n\t" // F0f0 - "punpcklwd %%mm1, %%mm0 \n\t" // BAba - "movq %%mm0, %%mm1 \n\t" // BAba - "punpckldq %%mm4, %%mm3 \n\t" // f0XX - "punpckldq %%mm6, %%mm0 \n\t" // 00ba - "punpckhdq %%mm1, %%mm3 \n\t" // BAf0 - - "movq %%mm0, (%0, %%"REG_D") \n\t" // 00ba - "punpckhdq %%mm4, %%mm0 \n\t" // F000 - "movq %%mm3, 8(%0, %%"REG_D") \n\t" // BAf0 - "movq %%mm0, 16(%0, %%"REG_D") \n\t" // F000 - "add $8, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1536), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 6*256; -} - -static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "pxor %%mm6, %%mm6 \n\t" - "1: 
\n\t" - "movq 1024(%1, %%"REG_S"), %%mm0\n\t" - "movq 3072(%1, %%"REG_S"), %%mm1\n\t" - "movq 2048(%1, %%"REG_S"), %%mm4\n\t" - "movq (%1, %%"REG_S"), %%mm5 \n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm4 \n\t" - "psubd %%mm7, %%mm5 \n\t" - "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" - - "packssdw %%mm4, %%mm0 \n\t" // EeAa - "packssdw %%mm5, %%mm1 \n\t" // FfBb - "movq %%mm0, %%mm2 \n\t" // EeAa - "punpcklwd %%mm1, %%mm0 \n\t" // BAba - "punpckhwd %%mm1, %%mm2 \n\t" // FEfe - "movq %%mm0, %%mm1 \n\t" // BAba - "punpckldq %%mm6, %%mm0 \n\t" // 00ba - "punpckhdq %%mm1, %%mm1 \n\t" // BABA - - "movq %%mm0, (%0, %%"REG_D") \n\t" - "punpckhdq %%mm2, %%mm0 \n\t" // FE00 - "punpckldq %%mm1, %%mm2 \n\t" // BAfe - "movq %%mm2, 8(%0, %%"REG_D") \n\t" - "movq %%mm0, 16(%0, %%"REG_D") \n\t" - "add $8, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1536), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 6*256; -} - -static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" -// "pxor %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq 1024(%1, %%"REG_S"), %%mm0\n\t" - "movq 2048(%1, %%"REG_S"), %%mm1\n\t" - "movq 3072(%1, %%"REG_S"), %%mm2\n\t" - "movq 4096(%1, %%"REG_S"), %%mm3\n\t" - "movq (%1, %%"REG_S"), %%mm5 \n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "psubd %%mm7, %%mm5 \n\t" - "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" - - "packssdw %%mm2, %%mm0 \n\t" // CcAa - "packssdw %%mm3, %%mm1 \n\t" // DdBb - "packssdw %%mm5, %%mm5 \n\t" // FfFf - "movq %%mm0, %%mm2 \n\t" // CcAa - "punpcklwd %%mm1, %%mm0 \n\t" // BAba - "punpckhwd %%mm1, %%mm2 \n\t" // DCdc - "pxor %%mm4, %%mm4 \n\t" // 0000 - "punpcklwd %%mm5, %%mm4 \n\t" // F0f0 - "movq %%mm0, %%mm1 \n\t" // BAba - "movq %%mm4, %%mm3 \n\t" // F0f0 - "punpckldq %%mm2, %%mm0 \n\t" // dcba - "punpckhdq %%mm1, %%mm1 \n\t" // BABA - "punpckldq %%mm1, %%mm4 \n\t" // BAf0 - "punpckhdq %%mm3, %%mm2 \n\t" // F0DC - - "movq %%mm0, (%0, %%"REG_D") \n\t" - "movq %%mm4, 8(%0, %%"REG_D") \n\t" - "movq %%mm2, 16(%0, %%"REG_D") \n\t" - "add $8, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1536), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 6*256; -} - -static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ - int32_t * f = (int32_t *) _f; - __asm__ volatile( - "mov $-1024, %%"REG_S" \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" -// "pxor %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq 1024(%1, %%"REG_S"), %%mm0\n\t" - "movq 3072(%1, %%"REG_S"), %%mm1\n\t" - "movq 4096(%1, %%"REG_S"), %%mm2\n\t" - "movq 5120(%1, %%"REG_S"), %%mm3\n\t" - "movq 2048(%1, %%"REG_S"), %%mm4\n\t" - "movq (%1, %%"REG_S"), %%mm5 \n\t" - "psubd %%mm7, %%mm0 \n\t" - "psubd %%mm7, %%mm1 \n\t" - "psubd %%mm7, %%mm2 \n\t" - "psubd %%mm7, %%mm3 \n\t" - "psubd %%mm7, %%mm4 \n\t" - "psubd %%mm7, %%mm5 \n\t" - "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" - - "packssdw %%mm2, %%mm0 \n\t" // CcAa - "packssdw %%mm3, %%mm1 \n\t" // DdBb - "packssdw %%mm4, %%mm4 \n\t" // EeEe - "packssdw %%mm5, %%mm5 \n\t" // FfFf - "movq %%mm0, %%mm2 \n\t" // CcAa - "punpcklwd %%mm1, %%mm0 \n\t" // BAba - "punpckhwd %%mm1, %%mm2 \n\t" // DCdc - "punpcklwd %%mm5, %%mm4 \n\t" // FEfe - "movq %%mm0, %%mm1 \n\t" // BAba - "movq %%mm4, %%mm3 \n\t" // FEfe - "punpckldq %%mm2, %%mm0 \n\t" // dcba - "punpckhdq %%mm1, %%mm1 \n\t" // 
BABA - "punpckldq %%mm1, %%mm4 \n\t" // BAfe - "punpckhdq %%mm3, %%mm2 \n\t" // FEDC - - "movq %%mm0, (%0, %%"REG_D") \n\t" - "movq %%mm4, 8(%0, %%"REG_D") \n\t" - "movq %%mm2, 16(%0, %%"REG_D") \n\t" - "add $8, %%"REG_S" \n\t" - " jnz 1b \n\t" - "emms \n\t" - :: "r" (s16+1536), "r" (f+256) - :"%"REG_S, "%"REG_D, "memory" - ); - return 6*256; -} - - -static void* a52_resample_MMX(int flags, int ch){ - switch (flags) { - case A52_MONO: - if(ch==5) return a52_resample_MONO_to_5_MMX; - break; - case A52_CHANNEL: - case A52_STEREO: - case A52_DOLBY: - if(ch==2) return a52_resample_STEREO_to_2_MMX; - break; - case A52_3F: - if(ch==5) return a52_resample_3F_to_5_MMX; - break; - case A52_2F2R: - if(ch==4) return a52_resample_2F_2R_to_4_MMX; - break; - case A52_3F2R: - if(ch==5) return a52_resample_3F_2R_to_5_MMX; - break; - case A52_MONO | A52_LFE: - if(ch==6) return a52_resample_MONO_LFE_to_6_MMX; - break; - case A52_CHANNEL | A52_LFE: - case A52_STEREO | A52_LFE: - case A52_DOLBY | A52_LFE: - if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX; - break; - case A52_3F | A52_LFE: - if(ch==6) return a52_resample_3F_LFE_to_6_MMX; - break; - case A52_2F2R | A52_LFE: - if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX; - break; - case A52_3F2R | A52_LFE: - if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX; - break; - } - return NULL; -} - - diff -r 459227551819 -r 1aece15222b5 liba52/srfftp.h --- a/liba52/srfftp.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,303 +0,0 @@ - -/* - * srfftp.h - * - * Copyright (C) Yuqing Deng - April 2000 - * - * 64 and 128 point split radix fft for ac3dec - * - * The algorithm is desribed in the book: - * "Computational Frameworks of the Fast Fourier Transform". - * - * The ideas and the the organization of code borrowed from djbfft written by - * D. J. Bernstein . djbff can be found at - * http://cr.yp.to/djbfft.html. - * - * srfftp.h is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * srfftp.h is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#ifndef SRFFTP_H__ -#define SRFFTP_H__ - -static complex_t delta16[4] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.92387953251129, -0.38268343236509}, - {0.70710678118655, -0.70710678118655}, - {0.38268343236509, -0.92387953251129}}; - -static complex_t delta16_3[4] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.38268343236509, -0.92387953251129}, - {-0.70710678118655, -0.70710678118655}, - {-0.92387953251129, 0.38268343236509}}; - -static complex_t delta32[8] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.98078528040323, -0.19509032201613}, - {0.92387953251129, -0.38268343236509}, - {0.83146961230255, -0.55557023301960}, - {0.70710678118655, -0.70710678118655}, - {0.55557023301960, -0.83146961230255}, - {0.38268343236509, -0.92387953251129}, - {0.19509032201613, -0.98078528040323}}; - -static complex_t delta32_3[8] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.83146961230255, -0.55557023301960}, - {0.38268343236509, -0.92387953251129}, - {-0.19509032201613, -0.98078528040323}, - {-0.70710678118655, -0.70710678118655}, - {-0.98078528040323, -0.19509032201613}, - {-0.92387953251129, 0.38268343236509}, - {-0.55557023301960, 0.83146961230255}}; - -static complex_t delta64[16] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.99518472667220, -0.09801714032956}, - {0.98078528040323, -0.19509032201613}, - {0.95694033573221, -0.29028467725446}, - {0.92387953251129, -0.38268343236509}, - {0.88192126434836, -0.47139673682600}, - {0.83146961230255, -0.55557023301960}, - {0.77301045336274, -0.63439328416365}, - {0.70710678118655, -0.70710678118655}, - {0.63439328416365, -0.77301045336274}, - {0.55557023301960, -0.83146961230255}, - {0.47139673682600, -0.88192126434835}, - {0.38268343236509, -0.92387953251129}, - {0.29028467725446, -0.95694033573221}, - {0.19509032201613, -0.98078528040323}, - {0.09801714032956, -0.99518472667220}}; - -static complex_t delta64_3[16] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.95694033573221, -0.29028467725446}, - {0.83146961230255, -0.55557023301960}, - {0.63439328416365, -0.77301045336274}, - {0.38268343236509, -0.92387953251129}, - {0.09801714032956, -0.99518472667220}, - {-0.19509032201613, -0.98078528040323}, - {-0.47139673682600, -0.88192126434836}, - {-0.70710678118655, -0.70710678118655}, - {-0.88192126434835, -0.47139673682600}, - {-0.98078528040323, -0.19509032201613}, - {-0.99518472667220, 0.09801714032956}, - {-0.92387953251129, 0.38268343236509}, - {-0.77301045336274, 0.63439328416365}, - {-0.55557023301960, 0.83146961230255}, - {-0.29028467725446, 0.95694033573221}}; - -static complex_t delta128[32] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.99879545620517, -0.04906767432742}, - {0.99518472667220, -0.09801714032956}, - {0.98917650996478, -0.14673047445536}, - {0.98078528040323, -0.19509032201613}, - {0.97003125319454, -0.24298017990326}, - {0.95694033573221, -0.29028467725446}, - {0.94154406518302, -0.33688985339222}, - {0.92387953251129, -0.38268343236509}, - {0.90398929312344, -0.42755509343028}, - {0.88192126434836, -0.47139673682600}, - {0.85772861000027, -0.51410274419322}, - {0.83146961230255, -0.55557023301960}, - {0.80320753148064, -0.59569930449243}, - {0.77301045336274, -0.63439328416365}, - {0.74095112535496, -0.67155895484702}, - {0.70710678118655, -0.70710678118655}, - {0.67155895484702, -0.74095112535496}, - 
{0.63439328416365, -0.77301045336274}, - {0.59569930449243, -0.80320753148064}, - {0.55557023301960, -0.83146961230255}, - {0.51410274419322, -0.85772861000027}, - {0.47139673682600, -0.88192126434835}, - {0.42755509343028, -0.90398929312344}, - {0.38268343236509, -0.92387953251129}, - {0.33688985339222, -0.94154406518302}, - {0.29028467725446, -0.95694033573221}, - {0.24298017990326, -0.97003125319454}, - {0.19509032201613, -0.98078528040323}, - {0.14673047445536, -0.98917650996478}, - {0.09801714032956, -0.99518472667220}, - {0.04906767432742, -0.99879545620517}}; - -static complex_t delta128_3[32] __attribute__((aligned(16))) = - { {1.00000000000000, 0.00000000000000}, - {0.98917650996478, -0.14673047445536}, - {0.95694033573221, -0.29028467725446}, - {0.90398929312344, -0.42755509343028}, - {0.83146961230255, -0.55557023301960}, - {0.74095112535496, -0.67155895484702}, - {0.63439328416365, -0.77301045336274}, - {0.51410274419322, -0.85772861000027}, - {0.38268343236509, -0.92387953251129}, - {0.24298017990326, -0.97003125319454}, - {0.09801714032956, -0.99518472667220}, - {-0.04906767432742, -0.99879545620517}, - {-0.19509032201613, -0.98078528040323}, - {-0.33688985339222, -0.94154406518302}, - {-0.47139673682600, -0.88192126434836}, - {-0.59569930449243, -0.80320753148065}, - {-0.70710678118655, -0.70710678118655}, - {-0.80320753148065, -0.59569930449243}, - {-0.88192126434835, -0.47139673682600}, - {-0.94154406518302, -0.33688985339222}, - {-0.98078528040323, -0.19509032201613}, - {-0.99879545620517, -0.04906767432742}, - {-0.99518472667220, 0.09801714032956}, - {-0.97003125319454, 0.24298017990326}, - {-0.92387953251129, 0.38268343236509}, - {-0.85772861000027, 0.51410274419322}, - {-0.77301045336274, 0.63439328416365}, - {-0.67155895484702, 0.74095112535496}, - {-0.55557023301960, 0.83146961230255}, - {-0.42755509343028, 0.90398929312344}, - {-0.29028467725446, 0.95694033573221}, - {-0.14673047445536, 0.98917650996478}}; - -#define HSQRT2 0.707106781188; - -#define TRANSZERO(A0,A4,A8,A12) { \ - u_r = wTB[0].real; \ - v_i = u_r - wTB[k*2].real; \ - u_r += wTB[k*2].real; \ - u_i = wTB[0].imag; \ - v_r = wTB[k*2].imag - u_i; \ - u_i += wTB[k*2].imag; \ - a_r = A0.real; \ - a_i = A0.imag; \ - a1_r = a_r; \ - a1_r += u_r; \ - A0.real = a1_r; \ - a_r -= u_r; \ - A8.real = a_r; \ - a1_i = a_i; \ - a1_i += u_i; \ - A0.imag = a1_i; \ - a_i -= u_i; \ - A8.imag = a_i; \ - a1_r = A4.real; \ - a1_i = A4.imag; \ - a_r = a1_r; \ - a_r -= v_r; \ - A4.real = a_r; \ - a1_r += v_r; \ - A12.real = a1_r; \ - a_i = a1_i; \ - a_i -= v_i; \ - A4.imag = a_i; \ - a1_i += v_i; \ - A12.imag = a1_i; \ - } - -#define TRANSHALF_16(A2,A6,A10,A14) {\ - u_r = wTB[2].real; \ - a_r = u_r; \ - u_i = wTB[2].imag; \ - u_r += u_i; \ - u_i -= a_r; \ - a_r = wTB[6].real; \ - a1_r = a_r; \ - a_i = wTB[6].imag; \ - a_r = a_i - a_r; \ - a_i += a1_r; \ - v_i = u_r - a_r; \ - u_r += a_r; \ - v_r = u_i + a_i; \ - u_i -= a_i; \ - v_i *= HSQRT2; \ - v_r *= HSQRT2; \ - u_r *= HSQRT2; \ - u_i *= HSQRT2; \ - a_r = A2.real; \ - a_i = A2.imag; \ - a1_r = a_r; \ - a1_r += u_r; \ - A2.real = a1_r; \ - a_r -= u_r; \ - A10.real = a_r; \ - a1_i = a_i; \ - a1_i += u_i; \ - A2.imag = a1_i; \ - a_i -= u_i; \ - A10.imag = a_i; \ - a1_r = A6.real; \ - a1_i = A6.imag; \ - a_r = a1_r; \ - a1_r += v_r; \ - A6.real = a1_r; \ - a_r -= v_r; \ - A14.real = a_r; \ - a_i = a1_i; \ - a1_i -= v_i; \ - A6.imag = a1_i; \ - a_i += v_i; \ - A14.imag = a_i; \ - } - -#define TRANS(A1,A5,A9,A13,WT,WB,D,D3) { \ - u_r = WT.real; \ - a_r = u_r; \ - a_r *= D.imag; \ 
- u_r *= D.real; \ - a_i = WT.imag; \ - a1_i = a_i; \ - a1_i *= D.real; \ - a_i *= D.imag; \ - u_r -= a_i; \ - u_i = a_r; \ - u_i += a1_i; \ - a_r = WB.real; \ - a1_r = a_r; \ - a1_r *= D3.real; \ - a_r *= D3.imag; \ - a_i = WB.imag; \ - a1_i = a_i; \ - a_i *= D3.real; \ - a1_i *= D3.imag; \ - a1_r -= a1_i; \ - a_r += a_i; \ - v_i = u_r - a1_r; \ - u_r += a1_r; \ - v_r = a_r - u_i; \ - u_i += a_r; \ - a_r = A1.real; \ - a_i = A1.imag; \ - a1_r = a_r; \ - a1_r += u_r; \ - A1.real = a1_r; \ - a_r -= u_r; \ - A9.real = a_r; \ - a1_i = a_i; \ - a1_i += u_i; \ - A1.imag = a1_i; \ - a_i -= u_i; \ - A9.imag = a_i; \ - a1_r = A5.real; \ - a1_i = A5.imag; \ - a_r = a1_r; \ - a1_r -= v_r; \ - A5.real = a1_r; \ - a_r += v_r; \ - A13.real = a_r; \ - a_i = a1_i; \ - a1_i -= v_i; \ - A5.imag = a1_i; \ - a_i += v_i; \ - A13.imag = a_i; \ - } - -#endif diff -r 459227551819 -r 1aece15222b5 liba52/srfftp_3dnow.h --- a/liba52/srfftp_3dnow.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,187 +0,0 @@ -/* - * srfftp.h - * - * Copyright (C) Yuqing Deng - April 2000 - * - * 64 and 128 point split radix fft for ac3dec - * - * The algorithm is desribed in the book: - * "Computational Frameworks of the Fast Fourier Transform". - * - * The ideas and the the organization of code borrowed from djbfft written by - * D. J. Bernstein . djbff can be found at - * http://cr.yp.to/djbfft.html. - * - * srfftp.h is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * srfftp.h is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Modified for using AMD's 3DNow! - 3DNowEx(DSP)! 
SIMD operations - * by Nick Kurshev - */ - -#ifndef SRFFTP_3DNOW_H__ -#define SRFFTP_3DNOW_H__ - -typedef struct -{ - unsigned long val[2]; -}i_cmplx_t; - -#define TRANS_FILL_MM6_MM7_3DNOW()\ - __asm__ volatile(\ - "movq %1, %%mm7\n\t"\ - "movq %0, %%mm6\n\t"\ - ::"m"(x_plus_minus_3dnow),\ - "m"(x_minus_plus_3dnow)\ - :"memory"); - -#if HAVE_AMD3DNOWEXT -#define PSWAP_MM(mm_base,mm_hlp) "pswapd "mm_base","mm_base"\n\t" -#else -#define PSWAP_MM(mm_base,mm_hlp)\ - "movq "mm_base","mm_hlp"\n\t"\ - "psrlq $32, "mm_base"\n\t"\ - "punpckldq "mm_hlp","mm_base"\n\t" -#endif -#if HAVE_AMD3DNOWEXT -#define PFNACC_MM(mm_base,mm_hlp) "pfnacc "mm_base","mm_base"\n\t" -#else -#define PFNACC_MM(mm_base,mm_hlp)\ - "movq "mm_base","mm_hlp"\n\t"\ - "psrlq $32,"mm_hlp"\n\t"\ - "punpckldq "mm_hlp","mm_hlp"\n\t"\ - "pfsub "mm_hlp","mm_base"\n\t" -#endif - -#define TRANSZERO_3DNOW(A0,A4,A8,A12) \ -{ \ - __asm__ volatile(\ - "movq %4, %%mm0\n\t" /* mm0 = wTB[0]*/\ - "movq %5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \ - "movq %%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\ - "pfadd %%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\ - "pxor %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\ - "pxor %%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\ - "pfadd %%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\ - "movq %%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\ - PSWAP_MM("%%mm4","%%mm2")/* mm4 = v*/\ - "movq %6, %%mm0\n\t" /* a1 = A0;*/\ - "movq %7, %%mm2\n\t" /* a1 = A4;*/\ - "movq %%mm0, %%mm1\n\t"\ - "movq %%mm2, %%mm3\n\t"\ - "pfadd %%mm5, %%mm0\n\t" /*A0 = a1 + u;*/\ - "pfadd %%mm4, %%mm2\n\t" /*A12 = a1 + v;*/\ - "movq %%mm0, %0\n\t"\ - "pfsub %%mm5, %%mm1\n\t" /*A1 = a1 - u;*/\ - "movq %%mm2, %3\n\t"\ - "pfsub %%mm4, %%mm3\n\t" /*A4 = a1 - v;*/\ - "movq %%mm1, %1\n\t"\ - "movq %%mm3, %2"\ - :"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\ - :"m"(wTB[0]), "m"(wTB[k*2]), "m"(A0), "m"(A4)\ - :"memory");\ -} - -#define TRANSHALF_16_3DNOW(A2,A6,A10,A14)\ -{\ - __asm__ volatile(\ - "movq %4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\ - "movq %%mm0, %%mm1\n\t"\ - "pxor %%mm7, %%mm1\n\t"\ - "pfacc %%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\ - "movq %5, %%mm1\n\t" /*a.re = wTB[6].im - wTB[6].re; */\ - "movq %%mm1, %%mm2\n\t"\ - "pxor %%mm7, %%mm1\n\t"\ - "pfacc %%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re; mm1 = a*/\ - "movq %%mm1, %%mm2\n\t"\ - "pxor %%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\ - "movq %%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\ - "pfadd %%mm2, %%mm3\n\t"\ - PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\ - "pxor %%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\ - "pfadd %%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\ - "movq %8, %%mm2\n\t"\ - "pfmul %%mm2, %%mm3\n\t" /* v *= HSQRT2_3DNOW; */\ - "pfmul %%mm2, %%mm0\n\t" /* u *= HSQRT2_3DNOW; */\ - "movq %6, %%mm1\n\t" /* a1 = A2;*/\ - "movq %7, %%mm5\n\t" /* a1 = A6;*/\ - "movq %%mm1, %%mm2\n\t"\ - "movq %%mm3, %%mm4\n\t"\ - "pfadd %%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\ - "pxor %%mm6, %%mm4\n\t"/*A6.re = a1.re + v.re;*/\ - "pfsub %%mm0, %%mm2\n\t" /*A2 = a1 - u;*/\ - "pxor %%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\ - "movq %%mm1, %0\n\t"\ - "movq %%mm2, %1\n\t"\ - "movq %%mm5, %%mm2\n\t"\ - "pfadd %%mm4, %%mm5\n\t"/*A6.im = a1.im - v.im;*/\ - "pfadd %%mm3, %%mm2\n\t"/*A14.im = a1.im + v.im;*/\ - "movq %%mm5, %2\n\t"\ - "movq %%mm2, %3"\ - :"=m"(A2), "=m"(A10), "=m"(A6), "=m"(A14)\ - :"m"(wTB[2]), "m"(wTB[6]), "m"(A2), "m"(A6), "m"(HSQRT2_3DNOW)\ - :"memory");\ -} - -#define TRANS_3DNOW(A1,A5,A9,A13,WT,WB,D,D3)\ -{ \ - __asm__ 
volatile(\ - "movq %1, %%mm4\n\t"\ - "movq %%mm4, %%mm5\n\t"\ - "punpckldq %%mm4, %%mm4\n\t"/*mm4 = D.re | D.re */\ - "punpckhdq %%mm5, %%mm5\n\t"/*mm5 = D.im | D.im */\ - "movq %0, %%mm0\n\t"\ - "pfmul %%mm0, %%mm4\n\t"/* mm4 =u.re | u.im */\ - "pfmul %%mm0, %%mm5\n\t"/* mm5 = a.re | a.im */\ - PSWAP_MM("%%mm5","%%mm3")\ - "pxor %%mm7, %%mm5\n\t"\ - "pfadd %%mm5, %%mm4\n\t"/* mm4 = u*/\ - "movq %3, %%mm1\n\t"\ - "movq %2, %%mm0\n\t"\ - PSWAP_MM("%%mm1","%%mm3")\ - "movq %%mm0, %%mm2\n\t"\ - "pfmul %%mm1, %%mm0\n\t"/* mm0 = a*/\ - "pfmul %3, %%mm2\n\t"/* mm2 = v*/\ - PFNACC_MM("%%mm2","%%mm3")\ - "pfacc %%mm0, %%mm0\n\t"\ - "movq %%mm4, %%mm5\n\t"\ - "punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\ - "pxor %%mm6, %%mm5\n\t"\ - "movq %%mm2, %%mm3\n\t"\ - "pxor %%mm7, %%mm3\n\t"\ - "pfadd %%mm3, %%mm5\n\t"\ - PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\ - "pfadd %%mm2, %%mm4\n\t"\ - :\ - :"m"(WT), "m"(D), "m"(WB), "m"(D3)\ - :"memory");\ - __asm__ volatile(\ - "movq %4, %%mm0\n\t"/* a1 = A1*/\ - "movq %5, %%mm2\n\t"/* a1 = A5*/\ - "movq %%mm0, %%mm1\n\t"\ - "movq %%mm2, %%mm3\n\t"\ - "pfadd %%mm4, %%mm0\n\t"/*A1 = a1 + u*/\ - "pfsub %%mm5, %%mm2\n\t"/*A5 = a1 - v*/\ - "movq %%mm0, %0\n\t"\ - "pfsub %%mm4, %%mm1\n\t"/*A9 = a1 - u*/\ - "movq %%mm2, %2\n\t"\ - "pfadd %%mm5, %%mm3\n\t"/*A9 = a1 + v*/\ - "movq %%mm1, %1\n\t"\ - "movq %%mm3, %3"\ - :"=m"(A1), "=m"(A9), "=m"(A5), "=m"(A13)\ - :"m"(A1), "m"(A5)\ - :"memory");\ -} - -#endif diff -r 459227551819 -r 1aece15222b5 liba52/tables.h --- a/liba52/tables.h Sun May 09 12:28:15 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,246 +0,0 @@ -/* - * tables.h - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * - * This file is part of a52dec, a free ATSC A-52 stream decoder. - * See http://liba52.sourceforge.net/ for updates. - * - * a52dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * a52dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -static const int8_t exp_1[128] = { - -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, - -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 25,25,25 -}; -static const int8_t exp_2[128] = { - -2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - -2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - -2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - -2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - -2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 25,25,25 -}; -static const int8_t exp_3[128] = { - -2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2, - -2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2, - -2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2, - -2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2, - -2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2, - 25,25,25 -}; - -#define Q0 ((-2 << 15) / 3.0) -#define Q1 (0) -#define Q2 ((2 << 15) / 3.0) - -static const sample_t q_1_0[32] = { - Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0, - Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1, - Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2, - 0,0,0,0,0 -}; - -static const sample_t q_1_1[32] = { - Q0,Q0,Q0,Q1,Q1,Q1,Q2,Q2,Q2, - Q0,Q0,Q0,Q1,Q1,Q1,Q2,Q2,Q2, - Q0,Q0,Q0,Q1,Q1,Q1,Q2,Q2,Q2, - 0,0,0,0,0 -}; - -static const sample_t q_1_2[32] = { - Q0,Q1,Q2,Q0,Q1,Q2,Q0,Q1,Q2, - Q0,Q1,Q2,Q0,Q1,Q2,Q0,Q1,Q2, - Q0,Q1,Q2,Q0,Q1,Q2,Q0,Q1,Q2, - 0,0,0,0,0 -}; - -#undef Q0 -#undef Q1 -#undef Q2 - -#define Q0 ((-4 << 15) / 5.0) -#define Q1 ((-2 << 15) / 5.0) -#define Q2 (0) -#define Q3 ((2 << 15) / 5.0) -#define Q4 ((4 << 15) / 5.0) - -static const sample_t q_2_0[128] = { - Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0, - Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1, - Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2, - Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3, - Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4, - 0,0,0 -}; - -static const sample_t q_2_1[128] = { - Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,Q4,Q4,Q4,Q4,Q4, - Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,Q4,Q4,Q4,Q4,Q4, - Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,Q4,Q4,Q4,Q4,Q4, - Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,Q4,Q4,Q4,Q4,Q4, - Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,Q4,Q4,Q4,Q4,Q4, - 0,0,0 -}; - -static const sample_t q_2_2[128] = { - Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4, - Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4, - Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4, - Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4, - Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4, - 0,0,0 
-}; - -#undef Q0 -#undef Q1 -#undef Q2 -#undef Q3 -#undef Q4 - -static const sample_t q_3[8] = { - (-6 << 15)/7.0, (-4 << 15)/7.0, (-2 << 15)/7.0, 0, - ( 2 << 15)/7.0, ( 4 << 15)/7.0, ( 6 << 15)/7.0, 0 -}; - -#define Q0 ((-10 << 15) / 11.0) -#define Q1 ((-8 << 15) / 11.0) -#define Q2 ((-6 << 15) / 11.0) -#define Q3 ((-4 << 15) / 11.0) -#define Q4 ((-2 << 15) / 11.0) -#define Q5 (0) -#define Q6 ((2 << 15) / 11.0) -#define Q7 ((4 << 15) / 11.0) -#define Q8 ((6 << 15) / 11.0) -#define Q9 ((8 << 15) / 11.0) -#define QA ((10 << 15) / 11.0) - -static const sample_t q_4_0[128] = { - Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0, - Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1, - Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2, - Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3, - Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4, - Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5, - Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6, - Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7, - Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8, - Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9, - QA, QA, QA, QA, QA, QA, QA, QA, QA, QA, QA, - 0, 0, 0, 0, 0, 0, 0 -}; - -static const sample_t q_4_1[128] = { - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA, - 0, 0, 0, 0, 0, 0, 0 -}; - -#undef Q0 -#undef Q1 -#undef Q2 -#undef Q3 -#undef Q4 -#undef Q5 -#undef Q6 -#undef Q7 -#undef Q8 -#undef Q9 -#undef QA - -static const sample_t q_5[16] = { - (-14 << 15)/15.0,(-12 << 15)/15.0,(-10 << 15)/15.0, - ( -8 << 15)/15.0,( -6 << 15)/15.0,( -4 << 15)/15.0, - ( -2 << 15)/15.0, 0 ,( 2 << 15)/15.0, - ( 4 << 15)/15.0,( 6 << 15)/15.0,( 8 << 15)/15.0, - ( 10 << 15)/15.0,( 12 << 15)/15.0,( 14 << 15)/15.0, - 0 -}; - -static const sample_t scale_factor[25] = { - 0.000030517578125, - 0.0000152587890625, - 0.00000762939453125, - 0.000003814697265625, - 0.0000019073486328125, - 0.00000095367431640625, - 0.000000476837158203125, - 0.0000002384185791015625, - 0.00000011920928955078125, - 0.000000059604644775390625, - 0.0000000298023223876953125, - 0.00000001490116119384765625, - 0.000000007450580596923828125, - 0.0000000037252902984619140625, - 0.00000000186264514923095703125, - 0.000000000931322574615478515625, - 0.0000000004656612873077392578125, - 0.00000000023283064365386962890625, - 0.000000000116415321826934814453125, - 0.0000000000582076609134674072265625, - 0.00000000002910383045673370361328125, - 0.000000000014551915228366851806640625, - 0.0000000000072759576141834259033203125, - 0.00000000000363797880709171295166015625, - 0.000000000001818989403545856475830078125 -}; - -static const uint16_t dither_lut[256] = { - 0x0000, 0xa011, 0xe033, 0x4022, 0x6077, 0xc066, 0x8044, 0x2055, - 0xc0ee, 0x60ff, 0x20dd, 0x80cc, 0xa099, 0x0088, 0x40aa, 0xe0bb, - 0x21cd, 0x81dc, 0xc1fe, 0x61ef, 0x41ba, 0xe1ab, 0xa189, 0x0198, - 0xe123, 0x4132, 0x0110, 0xa101, 0x8154, 0x2145, 0x6167, 0xc176, - 0x439a, 0xe38b, 0xa3a9, 0x03b8, 0x23ed, 0x83fc, 0xc3de, 0x63cf, - 0x8374, 0x2365, 0x6347, 0xc356, 0xe303, 0x4312, 0x0330, 0xa321, - 0x6257, 0xc246, 0x8264, 0x2275, 0x0220, 0xa231, 0xe213, 0x4202, - 0xa2b9, 0x02a8, 0x428a, 0xe29b, 0xc2ce, 0x62df, 0x22fd, 0x82ec, - 
-    0x8734, 0x2725, 0x6707, 0xc716, 0xe743, 0x4752, 0x0770, 0xa761,
-    0x47da, 0xe7cb, 0xa7e9, 0x07f8, 0x27ad, 0x87bc, 0xc79e, 0x678f,
-    0xa6f9, 0x06e8, 0x46ca, 0xe6db, 0xc68e, 0x669f, 0x26bd, 0x86ac,
-    0x6617, 0xc606, 0x8624, 0x2635, 0x0660, 0xa671, 0xe653, 0x4642,
-    0xc4ae, 0x64bf, 0x249d, 0x848c, 0xa4d9, 0x04c8, 0x44ea, 0xe4fb,
-    0x0440, 0xa451, 0xe473, 0x4462, 0x6437, 0xc426, 0x8404, 0x2415,
-    0xe563, 0x4572, 0x0550, 0xa541, 0x8514, 0x2505, 0x6527, 0xc536,
-    0x258d, 0x859c, 0xc5be, 0x65af, 0x45fa, 0xe5eb, 0xa5c9, 0x05d8,
-    0xae79, 0x0e68, 0x4e4a, 0xee5b, 0xce0e, 0x6e1f, 0x2e3d, 0x8e2c,
-    0x6e97, 0xce86, 0x8ea4, 0x2eb5, 0x0ee0, 0xaef1, 0xeed3, 0x4ec2,
-    0x8fb4, 0x2fa5, 0x6f87, 0xcf96, 0xefc3, 0x4fd2, 0x0ff0, 0xafe1,
-    0x4f5a, 0xef4b, 0xaf69, 0x0f78, 0x2f2d, 0x8f3c, 0xcf1e, 0x6f0f,
-    0xede3, 0x4df2, 0x0dd0, 0xadc1, 0x8d94, 0x2d85, 0x6da7, 0xcdb6,
-    0x2d0d, 0x8d1c, 0xcd3e, 0x6d2f, 0x4d7a, 0xed6b, 0xad49, 0x0d58,
-    0xcc2e, 0x6c3f, 0x2c1d, 0x8c0c, 0xac59, 0x0c48, 0x4c6a, 0xec7b,
-    0x0cc0, 0xacd1, 0xecf3, 0x4ce2, 0x6cb7, 0xcca6, 0x8c84, 0x2c95,
-    0x294d, 0x895c, 0xc97e, 0x696f, 0x493a, 0xe92b, 0xa909, 0x0918,
-    0xe9a3, 0x49b2, 0x0990, 0xa981, 0x89d4, 0x29c5, 0x69e7, 0xc9f6,
-    0x0880, 0xa891, 0xe8b3, 0x48a2, 0x68f7, 0xc8e6, 0x88c4, 0x28d5,
-    0xc86e, 0x687f, 0x285d, 0x884c, 0xa819, 0x0808, 0x482a, 0xe83b,
-    0x6ad7, 0xcac6, 0x8ae4, 0x2af5, 0x0aa0, 0xaab1, 0xea93, 0x4a82,
-    0xaa39, 0x0a28, 0x4a0a, 0xea1b, 0xca4e, 0x6a5f, 0x2a7d, 0x8a6c,
-    0x4b1a, 0xeb0b, 0xab29, 0x0b38, 0x2b6d, 0x8b7c, 0xcb5e, 0x6b4f,
-    0x8bf4, 0x2be5, 0x6bc7, 0xcbd6, 0xeb83, 0x4b92, 0x0bb0, 0xaba1
-};
diff -r 459227551819 -r 1aece15222b5 liba52/test.c
--- a/liba52/test.c	Sun May 09 12:28:15 2010 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,154 +0,0 @@
-/*
- * liba52 sample by A'rpi/ESP-team
- * Reads an AC-3 stream from stdin, decodes and downmixes to s16 stereo PCM
- * and writes it to stdout. The resulting stream is playable with sox:
- * play -c2 -r48000 -sw -fs out.sw
- *
- * Copyright (C) 2001 Árpád Gereöffy
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-//#define TIMING //needs Pentium or newer
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-
-#include "a52.h"
-#include "mm_accel.h"
-#include "cpudetect.h"
-
-static a52_state_t *state;
-static uint8_t buf[3840];
-static int buf_size=0;
-
-static int16_t out_buf[6*256*6];
-
-void mp_msg( int x, const char *format, ... ) // stub for cpudetect.c
-{
-}
-
-#ifdef TIMING
-static inline long long rdtsc(void)
-{
-    long long l;
-    __asm__ volatile("rdtsc\n\t"
-        : "=A" (l)
-    );
-//  printf("%d\n", int(l/1000));
-    return l;
-}
-
-#define STARTTIMING t=rdtsc();
-#define ENDTIMING sum+=rdtsc()-t; t=rdtsc();
-#else
-#define STARTTIMING ;
-#define ENDTIMING ;
-#endif
-
-
-int main(void){
-int accel=0;
-int sample_rate=0;
-int bit_rate=0;
-#ifdef TIMING
-long long t, sum=0, min=256*256*256*64;
-#endif
-
-    FILE *temp= stdout;
-    stdout= stderr; //EVIL HACK FIXME
-    GetCpuCaps(&gCpuCaps);
-    stdout= temp;
-//  gCpuCaps.hasMMX=0;
-//  gCpuCaps.hasSSE=0;
-    if(gCpuCaps.hasMMX) accel |= MM_ACCEL_X86_MMX;
-    if(gCpuCaps.hasMMX2) accel |= MM_ACCEL_X86_MMXEXT;
-    if(gCpuCaps.hasSSE) accel |= MM_ACCEL_X86_SSE;
-    if(gCpuCaps.has3DNow) accel |= MM_ACCEL_X86_3DNOW;
-//  if(gCpuCaps.has3DNowExt) accel |= MM_ACCEL_X86_3DNOWEXT;
-
-    state = a52_init (accel);
-    if (state == NULL) {
-        fprintf (stderr, "A52 init failed\n");
-        return 1;
-    }
-
-while(1){
-    int length,i;
-    int16_t *s16;
-    sample_t level=1, bias=384;
-    int flags=0;
-    int channels=0;
-
-    while(buf_size<7){
-        int c=getchar();
-        if(c<0) goto eof;
-        buf[buf_size++]=c;
-    }
-STARTTIMING
-    length = a52_syncinfo (buf, &flags, &sample_rate, &bit_rate);
-ENDTIMING
-    if(!length){
-        // bad file => resync
-        memcpy(buf,buf+1,6);
-        --buf_size;
-        continue;
-    }
-    fprintf(stderr,"sync. %d bytes 0x%X %d Hz %d kbit\n",length,flags,sample_rate,bit_rate);
-    while(buf_size<length){
-        int c=getchar();
-        if(c<0) goto eof;
-        buf[buf_size++]=c;
-    }
-        // float->int + channels interleaving:
-        s16+=a52_resample(a52_samples(state),s16);
-ENDTIMING
-    }
-#ifdef TIMING
-    if(sum<min) min=sum;
diff -r 459227551819 -r 1aece15222b5 libmpcodecs/ad_liba52.c
--- a/libmpcodecs/ad_liba52.c	Sun May 09 12:28:15 2010 +0000
+++ b/libmpcodecs/ad_liba52.c	Sun May 09 14:45:29 2010 +0000
-#ifdef CONFIG_LIBA52_INTERNAL
-#include "liba52/a52.h"
-#include "liba52/mm_accel.h"
-#else
 #include <a52dec/a52.h>
 #include <a52dec/mm_accel.h>
 int (* a52_resample) (float * _f, int16_t * s16);
-#endif
 
 static a52_state_t *a52_state;
 static uint32_t a52_flags=0;
@@ -150,11 +145,7 @@
 {
   /* Dolby AC3 audio: */
   /* however many channels, 2 bytes in a word, 256 samples in a block, 6 blocks in a frame */
-#ifdef CONFIG_LIBA52_INTERNAL
-  if (sh->samplesize < 2) sh->samplesize = 2;
-#else
   if (sh->samplesize < 4) sh->samplesize = 4;
-#endif
   sh->audio_out_minsize=audio_output_channels*sh->samplesize*256*6;
   sh->audio_in_minsize=3840;
   a52_level = 1.0;
@@ -208,9 +199,7 @@
     mp_msg(MSGT_DECAUDIO,MSGL_ERR,"A52 init failed\n");
     return 0;
   }
-#ifndef CONFIG_LIBA52_INTERNAL
   sh_audio->sample_format = AF_FORMAT_FLOAT_NE;
-#endif
   if(a52_fillbuff(sh_audio)<0){
     mp_msg(MSGT_DECAUDIO,MSGL_ERR,"A52 sync failed\n");
     return 0;
   }
@@ -283,12 +272,7 @@
         break;
       }
     } else
-#ifdef CONFIG_LIBA52_INTERNAL
-      if(a52_resample_init(a52_accel,flags,sh_audio->channels)) break;
-      --sh_audio->channels; /* try to decrease no. of channels*/
-#else
       break;
-#endif
   }
   if(sh_audio->channels<=0){
     mp_msg(MSGT_DECAUDIO,MSGL_ERR,"a52: no resampler. try different channel setup!\n");
diff -r 459227551819 -r 1aece15222b5 libmpdemux/muxer_mpeg.c
--- a/libmpdemux/muxer_mpeg.c	Sun May 09 12:28:15 2010 +0000
+++ b/libmpdemux/muxer_mpeg.c	Sun May 09 14:45:29 2010 +0000
@@ -41,8 +41,6 @@
 
 #ifdef CONFIG_LIBA52
 #include <a52dec/a52.h>
-#else
-#include "liba52/a52.h"
 #endif
 
 #define PACK_HEADER_START_CODE 0x01ba
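
For reference, the call sequence that both the deleted liba52/test.c and the retained external-liba52 path in libmpcodecs/ad_liba52.c depend on is the stock a52dec API: a52_syncinfo() to locate a frame, a52_frame() to parse it, six a52_block() calls per frame, and a52_samples() to fetch the decoded floats. A minimal decode loop against an external a52dec 0.7.4 might look as follows; the liba52 calls are the library's real API, but the surrounding stdin buffering, constants and error handling are illustrative assumptions, not code quoted from the MPlayer tree:

/* Minimal AC-3 decode sketch against external liba52 (a52dec 0.7.4).
 * Reads a raw AC-3 stream from stdin and reports frame parameters;
 * decoded samples stay as floats in liba52's internal buffer. */
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <a52dec/a52.h>

int main(void)
{
    static uint8_t buf[3840];          /* 3840 bytes = largest possible A-52 frame */
    int buf_size = 0;
    a52_state_t *state = a52_init(0);  /* 0 = no CPU-specific acceleration */

    if (!state)
        return 1;

    for (;;) {
        int flags = 0, sample_rate = 0, bit_rate = 0;
        int length, i, c;
        sample_t level = 1, bias = 0;

        /* collect the 7-byte header that a52_syncinfo() needs */
        while (buf_size < 7) {
            if ((c = getchar()) < 0) goto done;
            buf[buf_size++] = c;
        }
        length = a52_syncinfo(buf, &flags, &sample_rate, &bit_rate);
        if (!length) {                 /* out of sync: drop one byte and retry */
            memmove(buf, buf + 1, --buf_size);
            continue;
        }
        /* read the rest of the frame */
        while (buf_size < length) {
            if ((c = getchar()) < 0) goto done;
            buf[buf_size++] = c;
        }
        buf_size = 0;

        flags = A52_STEREO;            /* request a stereo downmix */
        if (a52_frame(state, buf, &flags, &level, bias))
            continue;                  /* corrupt frame, resync on next pass */

        for (i = 0; i < 6; i++) {      /* 6 blocks of 256 samples per frame */
            sample_t *samples;
            if (a52_block(state))
                break;
            samples = a52_samples(state);  /* 256 floats per output channel */
            (void)samples;             /* a real decoder would convert/interleave here */
        }
        fprintf(stderr, "frame: %d bytes, %d Hz, %d bit/s\n",
                length, sample_rate, bit_rate);
    }
done:
    a52_free(state);
    return 0;
}

Note that a52_resample(), which the deleted test program used for float-to-s16 interleaving, is an MPlayer-side addition to the bundled copy rather than part of stock a52dec; this is why the external-liba52 path in ad_liba52.c above switches the output to AF_FORMAT_FLOAT_NE and drops the a52_resample_init() fallback.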