annotate ppc/fdct_altivec.c @ 10893:2aafcafbe1f0 libavcodec

Replace cabac checks in inline functions from h264.h with constants. No benchmark because it's just replacing variables with literal constants (so no risk of slowdown outside gcc silliness) and I need sleep.
author michael
date Sat, 16 Jan 2010 05:41:33 +0000
parents dd2b5e52336a
children 50415a8f1451
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9171
bcaba3f602d3 cosmetics: Remove file name from file header.
diego
parents: 7373
diff changeset
1 /*
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
2 * Copyright (C) 2003 James Klicman <james@klicman.org>
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
3 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
4 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
5 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
10 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
14 * Lesser General Public License for more details.
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
15 *
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3036
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2819
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
19 */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
20
9421
dd2b5e52336a Remove gcc_fixes.h. It only contains workarounds for unsupported gcc versions.
diego
parents: 9171
diff changeset
21 #include "config.h"
dd2b5e52336a Remove gcc_fixes.h. It only contains workarounds for unsupported gcc versions.
diego
parents: 9171
diff changeset
22 #if HAVE_ALTIVEC_H
dd2b5e52336a Remove gcc_fixes.h. It only contains workarounds for unsupported gcc versions.
diego
parents: 9171
diff changeset
23 #include <altivec.h>
dd2b5e52336a Remove gcc_fixes.h. It only contains workarounds for unsupported gcc versions.
diego
parents: 9171
diff changeset
24 #endif
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6105
diff changeset
25 #include "libavutil/common.h"
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6105
diff changeset
26 #include "libavcodec/dsputil.h"
6105
33674fb857b5 Change some files to only include the necessary headers.
diego
parents: 5010
diff changeset
27 #include "dsputil_ppc.h"
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
28
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
29
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
30 #define vs16(v) ((vector signed short)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
31 #define vs32(v) ((vector signed int)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
32 #define vu8(v) ((vector unsigned char)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
33 #define vu16(v) ((vector unsigned short)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
34 #define vu32(v) ((vector unsigned int)(v))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
35
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
36
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
37 #define C1 0.98078525066375732421875000 /* cos(1*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
38 #define C2 0.92387950420379638671875000 /* cos(2*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
39 #define C3 0.83146959543228149414062500 /* cos(3*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
40 #define C4 0.70710676908493041992187500 /* cos(4*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
41 #define C5 0.55557024478912353515625000 /* cos(5*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
42 #define C6 0.38268342614173889160156250 /* cos(6*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
43 #define C7 0.19509032368659973144531250 /* cos(7*PI/16) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
44 #define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
45
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
46
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
47 #define W0 -(2 * C2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
48 #define W1 (2 * C6)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
49 #define W2 (SQRT_2 * C6)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
50 #define W3 (SQRT_2 * C3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
51 #define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
52 #define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
53 #define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
54 #define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
55 #define W8 (SQRT_2 * ( C7 - C3))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
56 #define W9 (SQRT_2 * (-C1 - C3))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
57 #define WA (SQRT_2 * (-C3 - C5))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
58 #define WB (SQRT_2 * ( C5 - C3))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
59
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
60
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
61 static vector float fdctconsts[3] = {
7373
266d4949aa15 Remove AltiVec vector declaration compiler compatibility macros.
diego
parents: 7223
diff changeset
62 { W0, W1, W2, W3 },
266d4949aa15 Remove AltiVec vector declaration compiler compatibility macros.
diego
parents: 7223
diff changeset
63 { W4, W5, W6, W7 },
266d4949aa15 Remove AltiVec vector declaration compiler compatibility macros.
diego
parents: 7223
diff changeset
64 { W8, W9, WA, WB }
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
65 };
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
66
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
67 #define LD_W0 vec_splat(cnsts0, 0)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
68 #define LD_W1 vec_splat(cnsts0, 1)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
69 #define LD_W2 vec_splat(cnsts0, 2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
70 #define LD_W3 vec_splat(cnsts0, 3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
71 #define LD_W4 vec_splat(cnsts1, 0)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
72 #define LD_W5 vec_splat(cnsts1, 1)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
73 #define LD_W6 vec_splat(cnsts1, 2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
74 #define LD_W7 vec_splat(cnsts1, 3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
75 #define LD_W8 vec_splat(cnsts2, 0)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
76 #define LD_W9 vec_splat(cnsts2, 1)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
77 #define LD_WA vec_splat(cnsts2, 2)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
78 #define LD_WB vec_splat(cnsts2, 3)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
79
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
80
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
81 #define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
82 x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
83 x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
84 x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
85 x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
86 x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
87 x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
88 x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
89 x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
90 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
91 b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
92 b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
93 b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
94 b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
95 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
96 b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
97 b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
98 b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
99 cnst = LD_W2; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
100 b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
101 cnst = LD_W1; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
102 b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
103 cnst = LD_W0; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
104 b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
105 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
106 x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
107 x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
108 x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
109 x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
110 x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
111 cnst = LD_W3; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
112 x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
113 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
114 cnst = LD_W8; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
115 x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
116 cnst = LD_W9; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
117 x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
118 cnst = LD_WA; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
119 x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
120 cnst = LD_WB; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
121 x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
122 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
123 cnst = LD_W4; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
124 b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
125 cnst = LD_W5; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
126 b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
127 cnst = LD_W6; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
128 b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
129 cnst = LD_W7; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
130 b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
131 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
132 b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
133 b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
134 b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
135 b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
136 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
137
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
138 #define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
139 x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
140 x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
141 x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
142 x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
143 x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
144 x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
145 x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
146 x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
147 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
148 b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
149 b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
150 b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
151 b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
152 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
153 b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
154 b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
155 b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
156 cnst = LD_W2; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
157 b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
158 cnst = LD_W1; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
159 b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
160 cnst = LD_W0; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
161 b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
162 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
163 x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
164 x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
165 x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
166 x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
167 x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
168 cnst = LD_W3; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
169 x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
170 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
171 cnst = LD_W8; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
172 x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
173 cnst = LD_W9; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
174 x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
175 cnst = LD_WA; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
176 x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
177 cnst = LD_WB; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
178 x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
179 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
180 cnst = LD_W4; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
181 b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
182 cnst = LD_W5; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
183 b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
184 cnst = LD_W6; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
185 b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
186 cnst = LD_W7; \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
187 b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
188 \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
189 b7 = vec_add(b7, x2); /* b7 += x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
190 b5 = vec_add(b5, x3); /* b5 += x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
191 b3 = vec_add(b3, x2); /* b3 += x2; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
192 b1 = vec_add(b1, x3); /* b1 += x3; */ \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
193 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
194
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
195
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
196
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
197 /* two-dimensional discrete cosine transform */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
198
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
199 void fdct_altivec(int16_t *block)
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
200 {
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
201 POWERPC_PERF_DECLARE(altivec_fdct, 1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
202 vector signed short *bp;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
203 vector float *cp;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
204 vector float b00, b10, b20, b30, b40, b50, b60, b70;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
205 vector float b01, b11, b21, b31, b41, b51, b61, b71;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
206 vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
207 vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
208
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
209 POWERPC_PERF_START_COUNT(altivec_fdct, 1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
210
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
211
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
212 /* setup constants {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
213 /* mzero = -0.0 */
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
214 mzero = ((vector float)vec_splat_u32(-1));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
215 mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
216 cp = fdctconsts;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
217 cnsts0 = vec_ld(0, cp); cp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
218 cnsts1 = vec_ld(0, cp); cp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
219 cnsts2 = vec_ld(0, cp);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
220 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
221
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
222
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
223 /* 8x8 matrix transpose (vector short[8]) {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
224 #define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
225
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
226 bp = (vector signed short*)block;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
227 b00 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
228 b40 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
229 b01 = ((vector float)MERGE_S16(h, b00, b40));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
230 b11 = ((vector float)MERGE_S16(l, b00, b40));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
231 bp++;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
232 b10 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
233 b50 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
234 b21 = ((vector float)MERGE_S16(h, b10, b50));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
235 b31 = ((vector float)MERGE_S16(l, b10, b50));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
236 bp++;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
237 b20 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
238 b60 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
239 b41 = ((vector float)MERGE_S16(h, b20, b60));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
240 b51 = ((vector float)MERGE_S16(l, b20, b60));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
241 bp++;
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
242 b30 = ((vector float)vec_ld(0, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
243 b70 = ((vector float)vec_ld(16*4, bp));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
244 b61 = ((vector float)MERGE_S16(h, b30, b70));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
245 b71 = ((vector float)MERGE_S16(l, b30, b70));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
246
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
247 x0 = ((vector float)MERGE_S16(h, b01, b41));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
248 x1 = ((vector float)MERGE_S16(l, b01, b41));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
249 x2 = ((vector float)MERGE_S16(h, b11, b51));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
250 x3 = ((vector float)MERGE_S16(l, b11, b51));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
251 x4 = ((vector float)MERGE_S16(h, b21, b61));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
252 x5 = ((vector float)MERGE_S16(l, b21, b61));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
253 x6 = ((vector float)MERGE_S16(h, b31, b71));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
254 x7 = ((vector float)MERGE_S16(l, b31, b71));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
255
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
256 b00 = ((vector float)MERGE_S16(h, x0, x4));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
257 b10 = ((vector float)MERGE_S16(l, x0, x4));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
258 b20 = ((vector float)MERGE_S16(h, x1, x5));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
259 b30 = ((vector float)MERGE_S16(l, x1, x5));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
260 b40 = ((vector float)MERGE_S16(h, x2, x6));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
261 b50 = ((vector float)MERGE_S16(l, x2, x6));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
262 b60 = ((vector float)MERGE_S16(h, x3, x7));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
263 b70 = ((vector float)MERGE_S16(l, x3, x7));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
264
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
265 #undef MERGE_S16
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
266 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
267
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
268
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
269 /* Some of the initial calculations can be done as vector short before
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
270 * conversion to vector float. The following code section takes advantage
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
271 * of this.
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
272 */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
273 #if 1
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
274 /* fdct rows {{{ */
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
275 x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
276 x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
277 x1 = ((vector float)vec_add(vs16(b10), vs16(b60)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
278 x6 = ((vector float)vec_sub(vs16(b10), vs16(b60)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
279 x2 = ((vector float)vec_add(vs16(b20), vs16(b50)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
280 x5 = ((vector float)vec_sub(vs16(b20), vs16(b50)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
281 x3 = ((vector float)vec_add(vs16(b30), vs16(b40)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
282 x4 = ((vector float)vec_sub(vs16(b30), vs16(b40)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
283
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
284 b70 = ((vector float)vec_add(vs16(x0), vs16(x3)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
285 b10 = ((vector float)vec_add(vs16(x1), vs16(x2)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
286
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
287 b00 = ((vector float)vec_add(vs16(b70), vs16(b10)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
288 b40 = ((vector float)vec_sub(vs16(b70), vs16(b10)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
289
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
290 #define CTF0(n) \
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
291 b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
292 b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
293 b##n##1 = vec_ctf(vs32(b##n##1), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
294 b##n##0 = vec_ctf(vs32(b##n##0), 0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
295
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
296 CTF0(0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
297 CTF0(4);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
298
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
299 b20 = ((vector float)vec_sub(vs16(x0), vs16(x3)));
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
300 b60 = ((vector float)vec_sub(vs16(x1), vs16(x2)));
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
301
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
302 CTF0(2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
303 CTF0(6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
304
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
305 #undef CTF0
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
306
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
307 x0 = vec_add(b60, b20);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
308 x1 = vec_add(b61, b21);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
309
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
310 cnst = LD_W2;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
311 x0 = vec_madd(cnst, x0, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
312 x1 = vec_madd(cnst, x1, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
313 cnst = LD_W1;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
314 b20 = vec_madd(cnst, b20, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
315 b21 = vec_madd(cnst, b21, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
316 cnst = LD_W0;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
317 b60 = vec_madd(cnst, b60, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
318 b61 = vec_madd(cnst, b61, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
319
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
320 #define CTFX(x,b) \
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
321 b##0 = ((vector float)vec_unpackh(vs16(x))); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
322 b##1 = ((vector float)vec_unpackl(vs16(x))); \
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
323 b##0 = vec_ctf(vs32(b##0), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
324 b##1 = vec_ctf(vs32(b##1), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
325
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
326 CTFX(x4, b7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
327 CTFX(x5, b5);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
328 CTFX(x6, b3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
329 CTFX(x7, b1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
330
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
331 #undef CTFX
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
332
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
333
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
334 x0 = vec_add(b70, b10);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
335 x1 = vec_add(b50, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
336 x2 = vec_add(b70, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
337 x3 = vec_add(b50, b10);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
338 x8 = vec_add(x2, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
339 cnst = LD_W3;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
340 x8 = vec_madd(cnst, x8, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
341
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
342 cnst = LD_W8;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
343 x0 = vec_madd(cnst, x0, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
344 cnst = LD_W9;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
345 x1 = vec_madd(cnst, x1, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
346 cnst = LD_WA;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
347 x2 = vec_madd(cnst, x2, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
348 cnst = LD_WB;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
349 x3 = vec_madd(cnst, x3, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
350
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
351 cnst = LD_W4;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
352 b70 = vec_madd(cnst, b70, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
353 cnst = LD_W5;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
354 b50 = vec_madd(cnst, b50, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
355 cnst = LD_W6;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
356 b30 = vec_madd(cnst, b30, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
357 cnst = LD_W7;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
358 b10 = vec_madd(cnst, b10, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
359
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
360 b70 = vec_add(b70, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
361 b50 = vec_add(b50, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
362 b30 = vec_add(b30, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
363 b10 = vec_add(b10, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
364
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
365
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
366 x0 = vec_add(b71, b11);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
367 x1 = vec_add(b51, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
368 x2 = vec_add(b71, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
369 x3 = vec_add(b51, b11);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
370 x8 = vec_add(x2, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
371 cnst = LD_W3;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
372 x8 = vec_madd(cnst, x8, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
373
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
374 cnst = LD_W8;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
375 x0 = vec_madd(cnst, x0, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
376 cnst = LD_W9;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
377 x1 = vec_madd(cnst, x1, mzero);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
378 cnst = LD_WA;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
379 x2 = vec_madd(cnst, x2, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
380 cnst = LD_WB;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
381 x3 = vec_madd(cnst, x3, x8);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
382
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
383 cnst = LD_W4;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
384 b71 = vec_madd(cnst, b71, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
385 cnst = LD_W5;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
386 b51 = vec_madd(cnst, b51, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
387 cnst = LD_W6;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
388 b31 = vec_madd(cnst, b31, x1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
389 cnst = LD_W7;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
390 b11 = vec_madd(cnst, b11, x0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
391
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
392 b71 = vec_add(b71, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
393 b51 = vec_add(b51, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
394 b31 = vec_add(b31, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
395 b11 = vec_add(b11, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
396 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
397 #else
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
398 /* convert to float {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
399 #define CTF(n) \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
400 vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
401 vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
402 b##n##1 = vec_ctf(vs32(b##n##1), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
403 b##n##0 = vec_ctf(vs32(b##n##0), 0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
404
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
405 CTF(0);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
406 CTF(1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
407 CTF(2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
408 CTF(3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
409 CTF(4);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
410 CTF(5);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
411 CTF(6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
412 CTF(7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
413
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
414 #undef CTF
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
415 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
416
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
417 FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
418 FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
419 #endif
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
420
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
421
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
422 /* 8x8 matrix transpose (vector float[8][2]) {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
423 x0 = vec_mergel(b00, b20);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
424 x1 = vec_mergeh(b00, b20);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
425 x2 = vec_mergel(b10, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
426 x3 = vec_mergeh(b10, b30);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
427
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
428 b00 = vec_mergeh(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
429 b10 = vec_mergel(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
430 b20 = vec_mergeh(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
431 b30 = vec_mergel(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
432
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
433 x4 = vec_mergel(b41, b61);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
434 x5 = vec_mergeh(b41, b61);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
435 x6 = vec_mergel(b51, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
436 x7 = vec_mergeh(b51, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
437
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
438 b41 = vec_mergeh(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
439 b51 = vec_mergel(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
440 b61 = vec_mergeh(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
441 b71 = vec_mergel(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
442
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
443 x0 = vec_mergel(b01, b21);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
444 x1 = vec_mergeh(b01, b21);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
445 x2 = vec_mergel(b11, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
446 x3 = vec_mergeh(b11, b31);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
447
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
448 x4 = vec_mergel(b40, b60);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
449 x5 = vec_mergeh(b40, b60);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
450 x6 = vec_mergel(b50, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
451 x7 = vec_mergeh(b50, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
452
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
453 b40 = vec_mergeh(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
454 b50 = vec_mergel(x1, x3);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
455 b60 = vec_mergeh(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
456 b70 = vec_mergel(x0, x2);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
457
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
458 b01 = vec_mergeh(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
459 b11 = vec_mergel(x5, x7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
460 b21 = vec_mergeh(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
461 b31 = vec_mergel(x4, x6);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
462 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
463
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
464
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
465 FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
466 FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
467
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
468
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
469 /* round, convert back to short {{{ */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
470 #define CTS(n) \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
471 b##n##0 = vec_round(b##n##0); \
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
472 b##n##1 = vec_round(b##n##1); \
2612
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
473 b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
474 b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \
210cca8f5764 gcc 4 compilation fix
diego
parents: 1578
diff changeset
475 b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \
1578
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
476 vec_st(vs16(b##n##0), 0, bp);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
477
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
478 bp = (vector signed short*)block;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
479 CTS(0); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
480 CTS(1); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
481 CTS(2); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
482 CTS(3); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
483 CTS(4); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
484 CTS(5); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
485 CTS(6); bp++;
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
486 CTS(7);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
487
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
488 #undef CTS
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
489 /* }}} */
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
490
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
491 POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
492 }
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
493
6a4cfc5f9f96 AltiVec optimized fdct patch by (James Klicman <james at klicman dot org>)
michael
parents:
diff changeset
494 /* vim:set foldmethod=marker foldlevel=0: */