comparison i386/fft_sse.c @ 3166:ab1273ffe275 libavcodec

Use xorps instead of mulps to toggle the sign of a float, as suggested by the Software Optimization Guide for AMD64 Processors. Patch by Zuxy Meng < zuxy POIS meng AH gmail POIS com >, OKed by Michael. Original thread: Date: Mar 5, 2006 8:15 PM; Subject: [Ffmpeg-devel] [PATCH] Little optimization to fft_sse.c
author gpoirier
date Sun, 05 Mar 2006 20:25:18 +0000
parents 0b546eab515d
children a3d97c60ea07
comparison
equal deleted inserted replaced
3165:8b51e108cba6 3166:ab1273ffe275
21 21
22 #ifdef HAVE_BUILTIN_VECTOR 22 #ifdef HAVE_BUILTIN_VECTOR
23 23
24 #include <xmmintrin.h> 24 #include <xmmintrin.h>
25 25
26 static const float p1p1p1m1[4] __attribute__((aligned(16))) = 26 static const int p1p1p1m1[4] __attribute__((aligned(16))) =
27 { 1.0, 1.0, 1.0, -1.0 }; 27 { 0, 0, 0, 1 << 31 };
28 28
29 static const float p1p1m1p1[4] __attribute__((aligned(16))) = 29 static const int p1p1m1p1[4] __attribute__((aligned(16))) =
30 { 1.0, 1.0, -1.0, 1.0 }; 30 { 0, 0, 1 << 31, 0 };
31 31
32 static const float p1p1m1m1[4] __attribute__((aligned(16))) = 32 static const int p1p1m1m1[4] __attribute__((aligned(16))) =
33 { 1.0, 1.0, -1.0, -1.0 }; 33 { 0, 0, 1 << 31, 1 << 31 };
34 34
35 #if 0 35 #if 0
36 static void print_v4sf(const char *str, __m128 a) 36 static void print_v4sf(const char *str, __m128 a)
37 { 37 {
38 float *p = (float *)&a; 38 float *p = (float *)&a;
56 { 56 {
57 __m128 *r, a, b, a1, c1, c2; 57 __m128 *r, a, b, a1, c1, c2;
58 58
59 r = (__m128 *)&z[0]; 59 r = (__m128 *)&z[0];
60 c1 = *(__m128 *)p1p1m1m1; 60 c1 = *(__m128 *)p1p1m1m1;
61 c2 = *(__m128 *)p1p1p1m1;
62 if (s->inverse) 61 if (s->inverse)
63 c2 = *(__m128 *)p1p1m1p1; 62 c2 = *(__m128 *)p1p1m1p1;
64 else 63 else
65 c2 = *(__m128 *)p1p1p1m1; 64 c2 = *(__m128 *)p1p1p1m1;
66 65
67 j = (np >> 2); 66 j = (np >> 2);
68 do { 67 do {
69 a = r[0]; 68 a = r[0];
70 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); 69 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
71 a = _mm_mul_ps(a, c1); 70 a = _mm_xor_ps(a, c1);
72 /* do the pass 0 butterfly */ 71 /* do the pass 0 butterfly */
73 a = _mm_add_ps(a, b); 72 a = _mm_add_ps(a, b);
74 73
75 a1 = r[1]; 74 a1 = r[1];
76 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); 75 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2));
77 a1 = _mm_mul_ps(a1, c1); 76 a1 = _mm_xor_ps(a1, c1);
78 /* do the pass 0 butterfly */ 77 /* do the pass 0 butterfly */
79 b = _mm_add_ps(a1, b); 78 b = _mm_add_ps(a1, b);
80 79
81 /* multiply third by -i */ 80 /* multiply third by -i */
81 /* by toggling the sign bit */
82 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); 82 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0));
83 b = _mm_mul_ps(b, c2); 83 b = _mm_xor_ps(b, c2);
84 84
85 /* do the pass 1 butterfly */ 85 /* do the pass 1 butterfly */
86 r[0] = _mm_add_ps(a, b); 86 r[0] = _mm_add_ps(a, b);
87 r[1] = _mm_sub_ps(a, b); 87 r[1] = _mm_sub_ps(a, b);
88 r += 2; 88 r += 2;