annotate ppc/fdct_altivec.c @ 5757:ace63c809071 libavcodec

Remove uses of SIGILL for CPU extension detection; that method is not acceptable in a library. This should not change anything for PPC: the autodetection is currently pointless because other code is compiled with -maltivec as well (detection for OSX and AmigaOS remains in place). SPARC binaries built with VIS support can now only run on systems with VIS.
author reimar
date Tue, 02 Oct 2007 18:18:35 +0000
parents d5ba514e3f4a
children 33674fb857b5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
1 /* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
2 * AltiVec optimized library for the FFmpeg Multimedia System
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
3 * Copyright (C) 2003 James Klicman <james@klicman.org>
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
4 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
11 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
15 * Lesser General Public License for more details.
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
16 *
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2819
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
20 */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
21
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
22
2819
alex
parents: 2612
diff changeset
23 #include "common.h"
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 3973
diff changeset
24 #include "dsputil.h"
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
25 #include "dsputil_altivec.h"
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
26 #include "gcc_fixes.h"
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
27
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
28
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
29 #define vs16(v) ((vector signed short)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
30 #define vs32(v) ((vector signed int)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
31 #define vu8(v) ((vector unsigned char)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
32 #define vu16(v) ((vector unsigned short)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
33 #define vu32(v) ((vector unsigned int)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
34
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
35
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
36 #define C1 0.98078525066375732421875000 /* cos(1*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
37 #define C2 0.92387950420379638671875000 /* cos(2*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
38 #define C3 0.83146959543228149414062500 /* cos(3*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
39 #define C4 0.70710676908493041992187500 /* cos(4*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
40 #define C5 0.55557024478912353515625000 /* cos(5*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
41 #define C6 0.38268342614173889160156250 /* cos(6*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
42 #define C7 0.19509032368659973144531250 /* cos(7*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
43 #define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
44
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
45
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
46 #define W0 -(2 * C2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
47 #define W1 (2 * C6)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
48 #define W2 (SQRT_2 * C6)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
49 #define W3 (SQRT_2 * C3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
50 #define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
51 #define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
52 #define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
53 #define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
54 #define W8 (SQRT_2 * ( C7 - C3))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
55 #define W9 (SQRT_2 * (-C1 - C3))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
56 #define WA (SQRT_2 * (-C3 - C5))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
57 #define WB (SQRT_2 * ( C5 - C3))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
58
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
59
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
60 static vector float fdctconsts[3] = {
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
61 (vector float)AVV( W0, W1, W2, W3 ),
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
62 (vector float)AVV( W4, W5, W6, W7 ),
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
63 (vector float)AVV( W8, W9, WA, WB )
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
64 };
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
65
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
66 #define LD_W0 vec_splat(cnsts0, 0)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
67 #define LD_W1 vec_splat(cnsts0, 1)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
68 #define LD_W2 vec_splat(cnsts0, 2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
69 #define LD_W3 vec_splat(cnsts0, 3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
70 #define LD_W4 vec_splat(cnsts1, 0)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
71 #define LD_W5 vec_splat(cnsts1, 1)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
72 #define LD_W6 vec_splat(cnsts1, 2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
73 #define LD_W7 vec_splat(cnsts1, 3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
74 #define LD_W8 vec_splat(cnsts2, 0)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
75 #define LD_W9 vec_splat(cnsts2, 1)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
76 #define LD_WA vec_splat(cnsts2, 2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
77 #define LD_WB vec_splat(cnsts2, 3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
78
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
79
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
80 #define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
81 x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
82 x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
83 x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
84 x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
85 x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
86 x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
87 x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
88 x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
89 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
90 b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
91 b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
92 b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
93 b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
94 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
95 b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
96 b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
97 b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
98 cnst = LD_W2; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
99 b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
100 cnst = LD_W1; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
101 b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
102 cnst = LD_W0; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
103 b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
104 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
105 x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
106 x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
107 x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
108 x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
109 x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
110 cnst = LD_W3; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
111 x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
112 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
113 cnst = LD_W8; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
114 x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
115 cnst = LD_W9; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
116 x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
117 cnst = LD_WA; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
118 x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
119 cnst = LD_WB; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
120 x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
121 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
122 cnst = LD_W4; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
123 b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
124 cnst = LD_W5; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
125 b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
126 cnst = LD_W6; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
127 b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
128 cnst = LD_W7; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
129 b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
130 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
131 b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
132 b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
133 b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
134 b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
135 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
136
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
137 #define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
138 x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
139 x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
140 x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
141 x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
142 x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
143 x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
144 x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
145 x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
146 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
147 b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
148 b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
149 b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
150 b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
151 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
152 b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
153 b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
154 b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
155 cnst = LD_W2; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
156 b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
157 cnst = LD_W1; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
158 b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
159 cnst = LD_W0; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
160 b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
161 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
162 x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
163 x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
164 x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
165 x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
166 x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
167 cnst = LD_W3; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
168 x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
169 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
170 cnst = LD_W8; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
171 x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
172 cnst = LD_W9; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
173 x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
174 cnst = LD_WA; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
175 x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
176 cnst = LD_WB; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
177 x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
178 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
179 cnst = LD_W4; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
180 b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
181 cnst = LD_W5; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
182 b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
183 cnst = LD_W6; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
184 b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
185 cnst = LD_W7; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
186 b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
187 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
188 b7 = vec_add(b7, x2); /* b7 += x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
189 b5 = vec_add(b5, x3); /* b5 += x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
190 b3 = vec_add(b3, x2); /* b3 += x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
191 b1 = vec_add(b1, x3); /* b1 += x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
192 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
193
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
194
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
195
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
196 /* two dimensional discrete cosine transform */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
197
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
198 void fdct_altivec(int16_t *block)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
199 {
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
200 POWERPC_PERF_DECLARE(altivec_fdct, 1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
201 vector signed short *bp;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
202 vector float *cp;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
203 vector float b00, b10, b20, b30, b40, b50, b60, b70;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
204 vector float b01, b11, b21, b31, b41, b51, b61, b71;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
205 vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
206 vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
207
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
208 POWERPC_PERF_START_COUNT(altivec_fdct, 1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
209
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
210
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
211 /* setup constants {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
212 /* mzero = -0.0 */
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
213 mzero = ((vector float)vec_splat_u32(-1));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
214 mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
215 cp = fdctconsts;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
216 cnsts0 = vec_ld(0, cp); cp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
217 cnsts1 = vec_ld(0, cp); cp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
218 cnsts2 = vec_ld(0, cp);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
219 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
220
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
221
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
222 /* 8x8 matrix transpose (vector short[8]) {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
223 #define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
224
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
225 bp = (vector signed short*)block;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
226 b00 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
227 b40 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
228 b01 = ((vector float)MERGE_S16(h, b00, b40));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
229 b11 = ((vector float)MERGE_S16(l, b00, b40));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
230 bp++;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
231 b10 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
232 b50 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
233 b21 = ((vector float)MERGE_S16(h, b10, b50));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
234 b31 = ((vector float)MERGE_S16(l, b10, b50));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
235 bp++;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
236 b20 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
237 b60 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
238 b41 = ((vector float)MERGE_S16(h, b20, b60));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
239 b51 = ((vector float)MERGE_S16(l, b20, b60));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
240 bp++;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
241 b30 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
242 b70 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
243 b61 = ((vector float)MERGE_S16(h, b30, b70));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
244 b71 = ((vector float)MERGE_S16(l, b30, b70));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
245
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
246 x0 = ((vector float)MERGE_S16(h, b01, b41));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
247 x1 = ((vector float)MERGE_S16(l, b01, b41));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
248 x2 = ((vector float)MERGE_S16(h, b11, b51));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
249 x3 = ((vector float)MERGE_S16(l, b11, b51));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
250 x4 = ((vector float)MERGE_S16(h, b21, b61));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
251 x5 = ((vector float)MERGE_S16(l, b21, b61));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
252 x6 = ((vector float)MERGE_S16(h, b31, b71));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
253 x7 = ((vector float)MERGE_S16(l, b31, b71));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
254
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
255 b00 = ((vector float)MERGE_S16(h, x0, x4));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
256 b10 = ((vector float)MERGE_S16(l, x0, x4));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
257 b20 = ((vector float)MERGE_S16(h, x1, x5));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
258 b30 = ((vector float)MERGE_S16(l, x1, x5));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
259 b40 = ((vector float)MERGE_S16(h, x2, x6));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
260 b50 = ((vector float)MERGE_S16(l, x2, x6));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
261 b60 = ((vector float)MERGE_S16(h, x3, x7));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
262 b70 = ((vector float)MERGE_S16(l, x3, x7));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
263
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
264 #undef MERGE_S16
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
265 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
266
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
267
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
268 /* Some of the initial calculations can be done as vector short before
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
269 * conversion to vector float. The following code section takes advantage
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
270 * of this.
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
271 */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
272 #if 1
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
273 /* fdct rows {{{ */
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
274 x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
275 x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
276 x1 = ((vector float)vec_add(vs16(b10), vs16(b60)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
277 x6 = ((vector float)vec_sub(vs16(b10), vs16(b60)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
278 x2 = ((vector float)vec_add(vs16(b20), vs16(b50)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
279 x5 = ((vector float)vec_sub(vs16(b20), vs16(b50)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
280 x3 = ((vector float)vec_add(vs16(b30), vs16(b40)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
281 x4 = ((vector float)vec_sub(vs16(b30), vs16(b40)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
282
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
283 b70 = ((vector float)vec_add(vs16(x0), vs16(x3)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
284 b10 = ((vector float)vec_add(vs16(x1), vs16(x2)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
285
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
286 b00 = ((vector float)vec_add(vs16(b70), vs16(b10)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
287 b40 = ((vector float)vec_sub(vs16(b70), vs16(b10)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
288
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
289 #define CTF0(n) \
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
290 b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
291 b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
292 b##n##1 = vec_ctf(vs32(b##n##1), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
293 b##n##0 = vec_ctf(vs32(b##n##0), 0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
294
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
295 CTF0(0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
296 CTF0(4);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
297
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
298 b20 = ((vector float)vec_sub(vs16(x0), vs16(x3)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
299 b60 = ((vector float)vec_sub(vs16(x1), vs16(x2)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
300
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
301 CTF0(2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
302 CTF0(6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
303
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
304 #undef CTF0
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
305
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
306 x0 = vec_add(b60, b20);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
307 x1 = vec_add(b61, b21);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
308
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
309 cnst = LD_W2;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
310 x0 = vec_madd(cnst, x0, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
311 x1 = vec_madd(cnst, x1, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
312 cnst = LD_W1;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
313 b20 = vec_madd(cnst, b20, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
314 b21 = vec_madd(cnst, b21, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
315 cnst = LD_W0;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
316 b60 = vec_madd(cnst, b60, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
317 b61 = vec_madd(cnst, b61, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
318
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
319 #define CTFX(x,b) \
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
320 b##0 = ((vector float)vec_unpackh(vs16(x))); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
321 b##1 = ((vector float)vec_unpackl(vs16(x))); \
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
322 b##0 = vec_ctf(vs32(b##0), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
323 b##1 = vec_ctf(vs32(b##1), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
324
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
325 CTFX(x4, b7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
326 CTFX(x5, b5);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
327 CTFX(x6, b3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
328 CTFX(x7, b1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
329
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
330 #undef CTFX
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
331
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
332
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
333 x0 = vec_add(b70, b10);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
334 x1 = vec_add(b50, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
335 x2 = vec_add(b70, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
336 x3 = vec_add(b50, b10);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
337 x8 = vec_add(x2, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
338 cnst = LD_W3;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
339 x8 = vec_madd(cnst, x8, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
340
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
341 cnst = LD_W8;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
342 x0 = vec_madd(cnst, x0, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
343 cnst = LD_W9;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
344 x1 = vec_madd(cnst, x1, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
345 cnst = LD_WA;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
346 x2 = vec_madd(cnst, x2, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
347 cnst = LD_WB;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
348 x3 = vec_madd(cnst, x3, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
349
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
350 cnst = LD_W4;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
351 b70 = vec_madd(cnst, b70, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
352 cnst = LD_W5;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
353 b50 = vec_madd(cnst, b50, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
354 cnst = LD_W6;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
355 b30 = vec_madd(cnst, b30, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
356 cnst = LD_W7;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
357 b10 = vec_madd(cnst, b10, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
358
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
359 b70 = vec_add(b70, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
360 b50 = vec_add(b50, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
361 b30 = vec_add(b30, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
362 b10 = vec_add(b10, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
363
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
364
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
365 x0 = vec_add(b71, b11);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
366 x1 = vec_add(b51, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
367 x2 = vec_add(b71, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
368 x3 = vec_add(b51, b11);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
369 x8 = vec_add(x2, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
370 cnst = LD_W3;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
371 x8 = vec_madd(cnst, x8, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
372
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
373 cnst = LD_W8;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
374 x0 = vec_madd(cnst, x0, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
375 cnst = LD_W9;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
376 x1 = vec_madd(cnst, x1, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
377 cnst = LD_WA;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
378 x2 = vec_madd(cnst, x2, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
379 cnst = LD_WB;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
380 x3 = vec_madd(cnst, x3, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
381
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
382 cnst = LD_W4;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
383 b71 = vec_madd(cnst, b71, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
384 cnst = LD_W5;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
385 b51 = vec_madd(cnst, b51, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
386 cnst = LD_W6;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
387 b31 = vec_madd(cnst, b31, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
388 cnst = LD_W7;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
389 b11 = vec_madd(cnst, b11, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
390
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
391 b71 = vec_add(b71, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
392 b51 = vec_add(b51, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
393 b31 = vec_add(b31, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
394 b11 = vec_add(b11, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
395 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
396 #else
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
397 /* convert to float {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
398 #define CTF(n) \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
399 vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
400 vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
401 b##n##1 = vec_ctf(vs32(b##n##1), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
402 b##n##0 = vec_ctf(vs32(b##n##0), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
403
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
404 CTF(0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
405 CTF(1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
406 CTF(2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
407 CTF(3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
408 CTF(4);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
409 CTF(5);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
410 CTF(6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
411 CTF(7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
412
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
413 #undef CTF
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
414 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
415
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
416 FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
417 FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
418 #endif
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
419
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
420
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
421 /* 8x8 matrix transpose (vector float[8][2]) {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
422 x0 = vec_mergel(b00, b20);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
423 x1 = vec_mergeh(b00, b20);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
424 x2 = vec_mergel(b10, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
425 x3 = vec_mergeh(b10, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
426
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
427 b00 = vec_mergeh(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
428 b10 = vec_mergel(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
429 b20 = vec_mergeh(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
430 b30 = vec_mergel(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
431
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
432 x4 = vec_mergel(b41, b61);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
433 x5 = vec_mergeh(b41, b61);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
434 x6 = vec_mergel(b51, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
435 x7 = vec_mergeh(b51, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
436
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
437 b41 = vec_mergeh(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
438 b51 = vec_mergel(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
439 b61 = vec_mergeh(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
440 b71 = vec_mergel(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
441
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
442 x0 = vec_mergel(b01, b21);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
443 x1 = vec_mergeh(b01, b21);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
444 x2 = vec_mergel(b11, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
445 x3 = vec_mergeh(b11, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
446
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
447 x4 = vec_mergel(b40, b60);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
448 x5 = vec_mergeh(b40, b60);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
449 x6 = vec_mergel(b50, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
450 x7 = vec_mergeh(b50, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
451
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
452 b40 = vec_mergeh(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
453 b50 = vec_mergel(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
454 b60 = vec_mergeh(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
455 b70 = vec_mergel(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
456
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
457 b01 = vec_mergeh(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
458 b11 = vec_mergel(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
459 b21 = vec_mergeh(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
460 b31 = vec_mergel(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
461 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
462
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
463
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
464 FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
465 FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
466
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
467
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
468 /* round to nearest, convert to 32-bit ints, pack to shorts, store to block {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
469 #define CTS(n) \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
470 b##n##0 = vec_round(b##n##0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
471 b##n##1 = vec_round(b##n##1); \
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
472 b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
473 b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
474 b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
475 vec_st(vs16(b##n##0), 0, bp);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
476
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
477 bp = (vector signed short*)block;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
478 CTS(0); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
479 CTS(1); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
480 CTS(2); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
481 CTS(3); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
482 CTS(4); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
483 CTS(5); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
484 CTS(6); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
485 CTS(7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
486
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
487 #undef CTS
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
488 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
489
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
490 POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
491 }
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
492
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
493 /* vim:set foldmethod=marker foldlevel=0: */