Mercurial > mplayer.hg
comparison mp3lib/dct64_sse.c @ 23464:1b1fdac4a68c
Align output pointer so that we can use movaps instead of movups in dct64_sse;
1.5% faster decode.
author | zuxy |
---|---|
date | Wed, 06 Jun 2007 05:13:13 +0000 |
parents | 1b739c2dc613 |
children | a1a699833dcf |
Comparison legend: equal, deleted, inserted, replaced
23463:b4214e05bb3f | 23464:1b1fdac4a68c |
---|---|
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com> | 3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com> |
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c | 4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c |
5 * and mp3lib/dct64_MMX.c | 5 * and mp3lib/dct64_MMX.c |
6 */ | 6 */ |
7 | 7 |
8 /* NOTE: The following code is suboptimal! It can be improved (at least) by | 8 #include <libavutil/mem.h> |
9 | |
10 1. Replace all movups by movaps. (Can Parameter c be always aligned on | |
11 a 16-byte boundary?) | |
12 | |
13 2. Rewritten using intrinsics. (GCC generally optimizes intrinsics | |
14 better. However, when __m128 locals are involved, GCC may | |
15 produce bad code that uses movaps to access a stack not aligned | |
16 on a 16-byte boundary, which leads to run-time crashes.) | |
17 | |
18 */ | |
19 | 9 |
20 typedef float real; | 10 typedef float real; |
21 | 11 |
22 extern float __attribute__((aligned(16))) costab_mmx[]; | 12 extern float __attribute__((aligned(16))) costab_mmx[]; |
23 | 13 |
30 static const int nnnn[4] __attribute__((aligned(16))) = | 20 static const int nnnn[4] __attribute__((aligned(16))) = |
31 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; | 21 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; |
32 | 22 |
33 void dct64_sse(short *out0,short *out1,real *c) | 23 void dct64_sse(short *out0,short *out1,real *c) |
34 { | 24 { |
35 static real __attribute__ ((aligned(16))) b1[0x20]; | 25 static DECLARE_ALIGNED(16, real, b1[0x20]); |
36 static real __attribute__ ((aligned(16))) b2[0x20]; | 26 static DECLARE_ALIGNED(16, real, b2[0x20]); |
37 static real const one = 1.f; | 27 static real const one = 1.f; |
38 | 28 |
39 { | 29 { |
40 real *costab = costab_mmx; | 30 real *costab = costab_mmx; |
41 int i; | 31 int i; |
43 for (i = 0; i < 0x20 / 2; i += 4) | 33 for (i = 0; i < 0x20 / 2; i += 4) |
44 { | 34 { |
45 asm( | 35 asm( |
46 "movaps %2, %%xmm3\n\t" | 36 "movaps %2, %%xmm3\n\t" |
47 "shufps $27, %%xmm3, %%xmm3\n\t" | 37 "shufps $27, %%xmm3, %%xmm3\n\t" |
48 "movups %3, %%xmm1\n\t" | 38 "movaps %3, %%xmm1\n\t" |
49 "movaps %%xmm1, %%xmm4\n\t" | 39 "movaps %%xmm1, %%xmm4\n\t" |
50 "movups %4, %%xmm2\n\t" | 40 "movaps %4, %%xmm2\n\t" |
51 "shufps $27, %%xmm4, %%xmm4\n\t" | 41 "shufps $27, %%xmm4, %%xmm4\n\t" |
52 "movaps %%xmm2, %%xmm0\n\t" | 42 "movaps %%xmm2, %%xmm0\n\t" |
53 "shufps $27, %%xmm0, %%xmm0\n\t" | 43 "shufps $27, %%xmm0, %%xmm0\n\t" |
54 "addps %%xmm0, %%xmm1\n\t" | 44 "addps %%xmm0, %%xmm1\n\t" |
55 "movaps %%xmm1, %0\n\t" | 45 "movaps %%xmm1, %0\n\t" |