Mercurial > libavcodec.hg
comparison i386/fft_sse.c @ 3166:ab1273ffe275 libavcodec
Use xorps instead of mulps to toggle the sign of a float, as suggested by the Software Optimization Guide for AMD64 Processors.
Patch by Zuxy Meng < zuxy POIS meng AH gmail POIS com > OKed by Michael
Original thread:
Date: Mar 5, 2006 8:15 PM
Subject: [Ffmpeg-devel] [PATCH] Little optimization to fft_sse.c
author | gpoirier |
---|---|
date | Sun, 05 Mar 2006 20:25:18 +0000 |
parents | 0b546eab515d |
children | a3d97c60ea07 |
comparison
equal
deleted
inserted
replaced
3165:8b51e108cba6 | 3166:ab1273ffe275 |
---|---|
21 | 21 |
22 #ifdef HAVE_BUILTIN_VECTOR | 22 #ifdef HAVE_BUILTIN_VECTOR |
23 | 23 |
24 #include <xmmintrin.h> | 24 #include <xmmintrin.h> |
25 | 25 |
26 static const float p1p1p1m1[4] __attribute__((aligned(16))) = | 26 static const int p1p1p1m1[4] __attribute__((aligned(16))) = |
27 { 1.0, 1.0, 1.0, -1.0 }; | 27 { 0, 0, 0, 1 << 31 }; |
28 | 28 |
29 static const float p1p1m1p1[4] __attribute__((aligned(16))) = | 29 static const int p1p1m1p1[4] __attribute__((aligned(16))) = |
30 { 1.0, 1.0, -1.0, 1.0 }; | 30 { 0, 0, 1 << 31, 0 }; |
31 | 31 |
32 static const float p1p1m1m1[4] __attribute__((aligned(16))) = | 32 static const int p1p1m1m1[4] __attribute__((aligned(16))) = |
33 { 1.0, 1.0, -1.0, -1.0 }; | 33 { 0, 0, 1 << 31, 1 << 31 }; |
34 | 34 |
35 #if 0 | 35 #if 0 |
36 static void print_v4sf(const char *str, __m128 a) | 36 static void print_v4sf(const char *str, __m128 a) |
37 { | 37 { |
38 float *p = (float *)&a; | 38 float *p = (float *)&a; |
56 { | 56 { |
57 __m128 *r, a, b, a1, c1, c2; | 57 __m128 *r, a, b, a1, c1, c2; |
58 | 58 |
59 r = (__m128 *)&z[0]; | 59 r = (__m128 *)&z[0]; |
60 c1 = *(__m128 *)p1p1m1m1; | 60 c1 = *(__m128 *)p1p1m1m1; |
61 c2 = *(__m128 *)p1p1p1m1; | |
62 if (s->inverse) | 61 if (s->inverse) |
63 c2 = *(__m128 *)p1p1m1p1; | 62 c2 = *(__m128 *)p1p1m1p1; |
64 else | 63 else |
65 c2 = *(__m128 *)p1p1p1m1; | 64 c2 = *(__m128 *)p1p1p1m1; |
66 | 65 |
67 j = (np >> 2); | 66 j = (np >> 2); |
68 do { | 67 do { |
69 a = r[0]; | 68 a = r[0]; |
70 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); | 69 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); |
71 a = _mm_mul_ps(a, c1); | 70 a = _mm_xor_ps(a, c1); |
72 /* do the pass 0 butterfly */ | 71 /* do the pass 0 butterfly */ |
73 a = _mm_add_ps(a, b); | 72 a = _mm_add_ps(a, b); |
74 | 73 |
75 a1 = r[1]; | 74 a1 = r[1]; |
76 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); | 75 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); |
77 a1 = _mm_mul_ps(a1, c1); | 76 a1 = _mm_xor_ps(a1, c1); |
78 /* do the pass 0 butterfly */ | 77 /* do the pass 0 butterfly */ |
79 b = _mm_add_ps(a1, b); | 78 b = _mm_add_ps(a1, b); |
80 | 79 |
81 /* multiply third by -i */ | 80 /* multiply third by -i */ |
 | 81 /* by toggling the sign bit */ |
82 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); | 82 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); |
83 b = _mm_mul_ps(b, c2); | 83 b = _mm_xor_ps(b, c2); |
84 | 84 |
85 /* do the pass 1 butterfly */ | 85 /* do the pass 1 butterfly */ |
86 r[0] = _mm_add_ps(a, b); | 86 r[0] = _mm_add_ps(a, b); |
87 r[1] = _mm_sub_ps(a, b); | 87 r[1] = _mm_sub_ps(a, b); |
88 r += 2; | 88 r += 2; |