annotate ppc/float_altivec.c @ 11032:01bd040f8607 libavcodec

Unroll main loop so the edge==0 case is separate. This allows many things to be simplified away. The h264 decoder is overall 1% faster with an mbaff sample and 0.1% slower with the cathedral sample, probably because the slow loop filter code must be loaded into the code cache for the first MB of each row but isn't used for the following MBs.
author michael
date Thu, 28 Jan 2010 01:24:25 +0000
parents 34a65026fa06
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
1 /*
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
2 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
3 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3659
diff changeset
4 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3659
diff changeset
5 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3659
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3659
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
10 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3659
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
14 * Lesser General Public License for more details.
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
15 *
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3659
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
19 */
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
20
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 5010
diff changeset
21 #include "libavcodec/dsputil.h"
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
22
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
23 #include "dsputil_altivec.h"
8542
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
24 #include "util_altivec.h"
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
25
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
26 static void vector_fmul_altivec(float *dst, const float *src, int len)
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
27 {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
28 int i;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
29 vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
30 for(i=0; i<len-7; i+=8) {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
31 d0 = vec_ld(0, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
32 s = vec_ld(0, src+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
33 d1 = vec_ld(16, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
34 d0 = vec_madd(d0, s, zero);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
35 d1 = vec_madd(d1, vec_ld(16,src+i), zero);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
36 vec_st(d0, 0, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
37 vec_st(d1, 16, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
38 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
39 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
40
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
41 static void vector_fmul_reverse_altivec(float *dst, const float *src0,
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
42 const float *src1, int len)
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
43 {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
44 int i;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
45 vector float d, s0, s1, h0, l0,
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
46 s2, s3, zero = (vector float)vec_splat_u32(0);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
47 src1 += len-4;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
48 for(i=0; i<len-7; i+=8) {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
49 s1 = vec_ld(0, src1-i); // [a,b,c,d]
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
50 s0 = vec_ld(0, src0+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
51 l0 = vec_mergel(s1, s1); // [c,c,d,d]
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
52 s3 = vec_ld(-16, src1-i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
53 h0 = vec_mergeh(s1, s1); // [a,a,b,b]
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
54 s2 = vec_ld(16, src0+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
55 s1 = vec_mergeh(vec_mergel(l0,h0), // [d,b,d,b]
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
56 vec_mergeh(l0,h0)); // [c,a,c,a]
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
57 // [d,c,b,a]
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
58 l0 = vec_mergel(s3, s3);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
59 d = vec_madd(s0, s1, zero);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
60 h0 = vec_mergeh(s3, s3);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
61 vec_st(d, 0, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
62 s3 = vec_mergeh(vec_mergel(l0,h0),
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
63 vec_mergeh(l0,h0));
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
64 d = vec_madd(s2, s3, zero);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
65 vec_st(d, 16, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
66 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
67 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
68
10300
4d1b9ca628fc Drop unused args from vector_fmul_add_add, simplify code, and rename
mru
parents: 9421
diff changeset
69 static void vector_fmul_add_altivec(float *dst, const float *src0,
4d1b9ca628fc Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents: 9421
diff changeset
70 const float *src1, const float *src2,
4d1b9ca628fc Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents: 9421
diff changeset
71 int len)
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
72 {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
73 int i;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
74 vector float d, s0, s1, s2, t0, t1, edges;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
75 vector unsigned char align = vec_lvsr(0,dst),
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
76 mask = vec_lvsl(0, dst);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
77
10301
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
78 for (i=0; i<len-3; i+=4) {
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
79 t0 = vec_ld(0, dst+i);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
80 t1 = vec_ld(15, dst+i);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
81 s0 = vec_ld(0, src0+i);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
82 s1 = vec_ld(0, src1+i);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
83 s2 = vec_ld(0, src2+i);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
84 edges = vec_perm(t1 ,t0, mask);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
85 d = vec_madd(s0,s1,s2);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
86 t1 = vec_perm(d, edges, align);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
87 t0 = vec_perm(edges, d, align);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
88 vec_st(t1, 15, dst+i);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
89 vec_st(t0, 0, dst+i);
02798c603744 cosmetics: fix indentation after previous commit
mru
parents: 10300
diff changeset
90 }
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
91 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
92
8542
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
93 static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
94 {
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
95 union {
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
96 vector float v;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
97 float s[4];
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
98 } vadd;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
99 vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
100 const vector unsigned char reverse = vcprm(3,2,1,0);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
101 int i,j;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
102
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
103 dst += len;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
104 win += len;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
105 src0+= len;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
106
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
107 vadd.s[0] = add_bias;
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
108 vadd_bias = vec_splat(vadd.v, 0);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
109 zero = (vector float)vec_splat_u32(0);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
110
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
111 for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
112 s0 = vec_ld(i, src0);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
113 s1 = vec_ld(j, src1);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
114 wi = vec_ld(i, win);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
115 wj = vec_ld(j, win);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
116
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
117 s1 = vec_perm(s1, s1, reverse);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
118 wj = vec_perm(wj, wj, reverse);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
119
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
120 t0 = vec_madd(s0, wj, vadd_bias);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
121 t0 = vec_nmsub(s1, wi, t0);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
122 t1 = vec_madd(s0, wi, vadd_bias);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
123 t1 = vec_madd(s1, wj, t1);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
124 t1 = vec_perm(t1, t1, reverse);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
125
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
126 vec_st(t0, i, dst);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
127 vec_st(t1, j, dst);
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
128 }
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
129 }
7674
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
130
8365
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
131 static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
132 {
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
133 union {
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
134 vector float v;
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
135 float s[4];
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
136 } mul_u;
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
137 int i;
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
138 vector float src1, src2, dst1, dst2, mul_v, zero;
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
139
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
140 zero = (vector float)vec_splat_u32(0);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
141 mul_u.s[0] = mul;
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
142 mul_v = vec_splat(mul_u.v, 0);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
143
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
144 for(i=0; i<len; i+=8) {
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
145 src1 = vec_ctf(vec_ld(0, src+i), 0);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
146 src2 = vec_ctf(vec_ld(16, src+i), 0);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
147 dst1 = vec_madd(src1, mul_v, zero);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
148 dst2 = vec_madd(src2, mul_v, zero);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
149 vec_st(dst1, 0, dst+i);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
150 vec_st(dst2, 16, dst+i);
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
151 }
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
152 }
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
153
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
154
7674
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
155 static vector signed short
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
156 float_to_int16_one_altivec(const float *src)
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
157 {
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
158 vector float s0 = vec_ld(0, src);
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
159 vector float s1 = vec_ld(16, src);
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
160 vector signed int t0 = vec_cts(s0, 0);
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
161 vector signed int t1 = vec_cts(s1, 0);
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
162 return vec_packs(t0,t1);
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
163 }
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
164
8537
93a3020d9636 Fix float_to_int16_altivec prototype to match float_to_int16's in dsputil.h
gpoirier
parents: 8365
diff changeset
165 static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
166 {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
167 int i;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
168 vector signed short d0, d1, d;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
169 vector unsigned char align;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
170 if(((long)dst)&15) //FIXME
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
171 for(i=0; i<len-7; i+=8) {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
172 d0 = vec_ld(0, dst+i);
7674
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
173 d = float_to_int16_one_altivec(src+i);
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
174 d1 = vec_ld(15, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
175 d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
176 align = vec_lvsr(0, dst+i);
3657
ff6720290478 Fix float_to_int16, unaligned case, broken by the previous commit
lu_zero
parents: 3583
diff changeset
177 d0 = vec_perm(d1, d, align);
ff6720290478 Fix float_to_int16, unaligned case, broken by the previous commit
lu_zero
parents: 3583
diff changeset
178 d1 = vec_perm(d, d1, align);
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
179 vec_st(d0, 0, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
180 vec_st(d1,15, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
181 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
182 else
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
183 for(i=0; i<len-7; i+=8) {
7674
e1876d3200ee Introduce float_to_int16_one_altivec
lu_zero
parents: 6763
diff changeset
184 d = float_to_int16_one_altivec(src+i);
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
185 vec_st(d, 0, dst+i);
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
186 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
187 }
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
188
7675
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
189 static void
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
190 float_to_int16_interleave_altivec(int16_t *dst, const float **src,
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
191 long len, int channels)
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
192 {
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
193 int i;
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
194 vector signed short d0, d1, d2, c0, c1, t0, t1;
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
195 vector unsigned char align;
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
196 if(channels == 1)
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
197 float_to_int16_altivec(dst, src[0], len);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
198 else
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
199 if (channels == 2) {
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
200 if(((long)dst)&15)
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
201 for(i=0; i<len-7; i+=8) {
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
202 d0 = vec_ld(0, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
203 t0 = float_to_int16_one_altivec(src[0] + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
204 d1 = vec_ld(31, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
205 t1 = float_to_int16_one_altivec(src[1] + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
206 c0 = vec_mergeh(t0, t1);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
207 c1 = vec_mergel(t0, t1);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
208 d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
209 align = vec_lvsr(0, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
210 d0 = vec_perm(d2, c0, align);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
211 d1 = vec_perm(c0, c1, align);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
212 vec_st(d0, 0, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
213 d0 = vec_perm(c1, d2, align);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
214 vec_st(d1, 15, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
215 vec_st(d0, 31, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
216 dst+=8;
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
217 }
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
218 else
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
219 for(i=0; i<len-7; i+=8) {
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
220 t0 = float_to_int16_one_altivec(src[0] + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
221 t1 = float_to_int16_one_altivec(src[1] + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
222 d0 = vec_mergeh(t0, t1);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
223 d1 = vec_mergel(t0, t1);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
224 vec_st(d0, 0, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
225 vec_st(d1, 16, dst + i);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
226 dst+=8;
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
227 }
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
228 } else {
10961
34a65026fa06 Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents: 10301
diff changeset
229 DECLARE_ALIGNED(16, int16_t, tmp)[len];
7675
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
230 int c, j;
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
231 for (c = 0; c < channels; c++) {
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
232 float_to_int16_altivec(tmp, src[c], len);
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
233 for (i = 0, j = c; i < len; i++, j+=channels) {
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
234 dst[j] = tmp[i];
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
235 }
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
236 }
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
237 }
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
238 }
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
239
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
240 void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
241 {
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
242 c->vector_fmul = vector_fmul_altivec;
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
243 c->vector_fmul_reverse = vector_fmul_reverse_altivec;
10300
4d1b9ca628fc Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents: 9421
diff changeset
244 c->vector_fmul_add = vector_fmul_add_altivec;
8365
d33b47d1f4c1 add AltiVec implementation of int32_to_float_fmul_scalar
gpoirier
parents: 7691
diff changeset
245 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
7675
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
246 if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
8542
5a923bd4f5c6 Add Altivec version of vector_fmul_window.
gpoirier
parents: 8537
diff changeset
247 c->vector_fmul_window = vector_fmul_window_altivec;
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
248 c->float_to_int16 = float_to_int16_altivec;
7675
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
249 c->float_to_int16_interleave = float_to_int16_interleave_altivec;
ad4bf45b9b63 Introduce float_to_int16_interleave_altivec, tested with vorbis
lu_zero
parents: 7674
diff changeset
250 }
3581
49082584828a altivec float optimizations
lu_zero
parents:
diff changeset
251 }