Mercurial > mplayer.hg
changeset 23464:1b1fdac4a68c
Align the output pointer so that movaps can be used instead of movups in dct64_sse;
this makes decoding about 1.5% faster.
author | zuxy |
---|---|
date | Wed, 06 Jun 2007 05:13:13 +0000 |
parents | b4214e05bb3f |
children | a1a699833dcf |
files | mp3lib/dct64_sse.c mp3lib/layer1.c mp3lib/layer2.c mp3lib/layer3.c |
diffstat | 4 files changed, 9 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/mp3lib/dct64_sse.c Tue Jun 05 23:30:37 2007 +0000 +++ b/mp3lib/dct64_sse.c Wed Jun 06 05:13:13 2007 +0000 @@ -5,17 +5,7 @@ * and mp3lib/dct64_MMX.c */ -/* NOTE: The following code is suboptimal! It can be improved (at least) by - - 1. Replace all movups by movaps. (Can Parameter c be always aligned on - a 16-byte boundary?) - - 2. Rewritten using intrinsics. (GCC generally optimizes intrinsics - better. However, when __m128 locals are involved, GCC may - produce bad code that uses movaps to access a stack not aligned - on a 16-byte boundary, which leads to run-time crashes.) - -*/ +#include <libavutil/mem.h> typedef float real; @@ -32,8 +22,8 @@ void dct64_sse(short *out0,short *out1,real *c) { - static real __attribute__ ((aligned(16))) b1[0x20]; - static real __attribute__ ((aligned(16))) b2[0x20]; + static DECLARE_ALIGNED(16, real, b1[0x20]); + static DECLARE_ALIGNED(16, real, b2[0x20]); static real const one = 1.f; { @@ -45,9 +35,9 @@ asm( "movaps %2, %%xmm3\n\t" "shufps $27, %%xmm3, %%xmm3\n\t" - "movups %3, %%xmm1\n\t" + "movaps %3, %%xmm1\n\t" "movaps %%xmm1, %%xmm4\n\t" - "movups %4, %%xmm2\n\t" + "movaps %4, %%xmm2\n\t" "shufps $27, %%xmm4, %%xmm4\n\t" "movaps %%xmm2, %%xmm0\n\t" "shufps $27, %%xmm0, %%xmm0\n\t"
--- a/mp3lib/layer1.c Tue Jun 05 23:30:37 2007 +0000 +++ b/mp3lib/layer1.c Wed Jun 06 05:13:13 2007 +0000 @@ -131,7 +131,7 @@ int i,stereo = fr->stereo; unsigned int balloc[2*SBLIMIT]; unsigned int scale_index[2][SBLIMIT]; - real fraction[2][SBLIMIT]; + DECLARE_ALIGNED(16, real, fraction[2][SBLIMIT]); // int single = fr->single; // printf("do_layer1(0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X )\n",
--- a/mp3lib/layer2.c Tue Jun 05 23:30:37 2007 +0000 +++ b/mp3lib/layer2.c Wed Jun 06 05:13:13 2007 +0000 @@ -285,7 +285,7 @@ int clip=0; int i,j; int stereo = fr->stereo; - real fraction[2][4][SBLIMIT]; /* pick_table clears unused subbands */ + DECLARE_ALIGNED(16, real, fraction[2][4][SBLIMIT]); /* pick_table clears unused subbands */ unsigned int bit_alloc[64]; int scale[192]; int single = fr->single;
--- a/mp3lib/layer3.c Tue Jun 05 23:30:37 2007 +0000 +++ b/mp3lib/layer3.c Wed Jun 06 05:13:13 2007 +0000 @@ -1260,8 +1260,8 @@ granules = (fr->lsf) ? 1 : 2; for (gr=0;gr<granules;gr++){ - static real hybridIn[2][SBLIMIT][SSLIMIT]; - static real hybridOut[2][SSLIMIT][SBLIMIT]; + static DECLARE_ALIGNED(16, real, hybridIn[2][SBLIMIT][SSLIMIT]); + static DECLARE_ALIGNED(16, real, hybridOut[2][SSLIMIT][SBLIMIT]); { struct gr_info_s *gr_info = &(sideinfo.ch[0].gr[gr]); int part2bits;