comparison mp3lib/dct64_sse.c @ 23464:1b1fdac4a68c

Align output pointer so that we can use movaps instead of movups in dct64_sse; 1.5% faster decode.
author zuxy
date Wed, 06 Jun 2007 05:13:13 +0000
parents 1b739c2dc613
children a1a699833dcf
comparison
equal deleted inserted replaced
23463:b4214e05bb3f 23464:1b1fdac4a68c
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com> 3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c 4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
5 * and mp3lib/dct64_MMX.c 5 * and mp3lib/dct64_MMX.c
6 */ 6 */
7 7
8 /* NOTE: The following code is suboptimal! It can be improved (at least) by 8 #include <libavutil/mem.h>
9
10 1. Replace all movups by movaps. (Can Parameter c be always aligned on
11 a 16-byte boundary?)
12
13 2. Rewritten using intrinsics. (GCC generally optimizes intrinsics
14 better. However, when __m128 locals are involved, GCC may
15 produce bad code that uses movaps to access a stack not aligned
16 on a 16-byte boundary, which leads to run-time crashes.)
17
18 */
19 9
20 typedef float real; 10 typedef float real;
21 11
22 extern float __attribute__((aligned(16))) costab_mmx[]; 12 extern float __attribute__((aligned(16))) costab_mmx[];
23 13
30 static const int nnnn[4] __attribute__((aligned(16))) = 20 static const int nnnn[4] __attribute__((aligned(16))) =
31 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; 21 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
32 22
33 void dct64_sse(short *out0,short *out1,real *c) 23 void dct64_sse(short *out0,short *out1,real *c)
34 { 24 {
35 static real __attribute__ ((aligned(16))) b1[0x20]; 25 static DECLARE_ALIGNED(16, real, b1[0x20]);
36 static real __attribute__ ((aligned(16))) b2[0x20]; 26 static DECLARE_ALIGNED(16, real, b2[0x20]);
37 static real const one = 1.f; 27 static real const one = 1.f;
38 28
39 { 29 {
40 real *costab = costab_mmx; 30 real *costab = costab_mmx;
41 int i; 31 int i;
43 for (i = 0; i < 0x20 / 2; i += 4) 33 for (i = 0; i < 0x20 / 2; i += 4)
44 { 34 {
45 asm( 35 asm(
46 "movaps %2, %%xmm3\n\t" 36 "movaps %2, %%xmm3\n\t"
47 "shufps $27, %%xmm3, %%xmm3\n\t" 37 "shufps $27, %%xmm3, %%xmm3\n\t"
48 "movups %3, %%xmm1\n\t" 38 "movaps %3, %%xmm1\n\t"
49 "movaps %%xmm1, %%xmm4\n\t" 39 "movaps %%xmm1, %%xmm4\n\t"
50 "movups %4, %%xmm2\n\t" 40 "movaps %4, %%xmm2\n\t"
51 "shufps $27, %%xmm4, %%xmm4\n\t" 41 "shufps $27, %%xmm4, %%xmm4\n\t"
52 "movaps %%xmm2, %%xmm0\n\t" 42 "movaps %%xmm2, %%xmm0\n\t"
53 "shufps $27, %%xmm0, %%xmm0\n\t" 43 "shufps $27, %%xmm0, %%xmm0\n\t"
54 "addps %%xmm0, %%xmm1\n\t" 44 "addps %%xmm0, %%xmm1\n\t"
55 "movaps %%xmm1, %0\n\t" 45 "movaps %%xmm1, %0\n\t"