annotate ppc/vc1dsp_altivec.c @ 12530:63edd10ad4bc libavcodec tip

Try to fix crashes introduced by r25218. r25218 made assumptions about the existence of past reference frames that weren't necessarily true.
author darkshikari
date Tue, 28 Sep 2010 09:06:22 +0000
parents 50415a8f1451
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
1 /*
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
2 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
3 * Copyright (c) 2006 Konstantin Shishkov
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
4 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
11 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
15 * Lesser General Public License for more details.
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
16 *
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
20 */
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
21
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6028
diff changeset
22 #include "libavcodec/dsputil.h"
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
23
5750
09f99af1db40 Sanitize altivec code so it can be built with runtime check properly
lu_zero
parents: 5215
diff changeset
24 #include "util_altivec.h"
11382
50415a8f1451 PPC: move prototypes to headers and make some functions static
mru
parents: 9364
diff changeset
25 #include "dsputil_altivec.h"
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
26
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
/*
 * Main step of the 8x8 transform: one 1-D pass of the 8-point inverse
 * transform over eight vectors.
 *
 * s0..s7 are read as inputs and overwritten with the outputs; vec_rnd is
 * added to the two s0/s4 terms of the even part.  Per element:
 *
 *   even:  12*(s0 + s4) + rnd      12*(s0 - s4) + rnd
 *          16*s2 +  6*s6            6*s2 - 16*s6
 *   odd:   16*s1 + 15*s3 +  9*s5 +  4*s7
 *          15*s1 -  4*s3 - 16*s5 -  9*s7
 *           9*s1 - 16*s3 +  4*s5 + 15*s7
 *           4*s1 -  9*s3 + 15*s5 - 16*s7
 *
 * All multiplications are built from shifts and adds.  The macro is not
 * self-contained: it uses the temporaries t0..t7 and the shift counts
 * vec_1, vec_2, vec_3 and vec_4 from the scope in which it is expanded.
 */
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd)                      \
do {                                                                        \
    /* even part */                                                         \
    t0 = vec_sl(vec_add(s0, s4), vec_2);                                    \
    t0 = vec_add(vec_add(vec_sl(t0, vec_1), t0), vec_rnd);                  \
    t1 = vec_sl(vec_sub(s0, s4), vec_2);                                    \
    t1 = vec_add(vec_add(vec_sl(t1, vec_1), t1), vec_rnd);                  \
    t2 = vec_add(vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)),             \
                 vec_sl(s2, vec_4));                                        \
    t3 = vec_sub(vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)),             \
                 vec_sl(s6, vec_4));                                        \
    t4 = vec_add(t0, t2);                                                   \
    t5 = vec_add(t1, t3);                                                   \
    t6 = vec_sub(t1, t3);                                                   \
    t7 = vec_sub(t0, t2);                                                   \
                                                                            \
    /* odd part */                                                          \
    t0 = vec_add(vec_sl(vec_add(s1, s3), vec_4), vec_sl(s5, vec_3));        \
    t0 = vec_add(vec_add(t0, vec_sl(s7, vec_2)), vec_sub(s5, s3));          \
                                                                            \
    t1 = vec_sub(vec_sl(vec_sub(s1, s5), vec_4), vec_sl(s7, vec_3));        \
    t1 = vec_sub(vec_sub(t1, vec_sl(s3, vec_2)), vec_add(s1, s7));          \
                                                                            \
    t2 = vec_add(vec_sl(vec_sub(s7, s3), vec_4), vec_sl(s1, vec_3));        \
    t2 = vec_add(vec_add(t2, vec_sl(s5, vec_2)), vec_sub(s1, s7));          \
                                                                            \
    t3 = vec_sub(vec_sl(vec_sub(s5, s7), vec_4), vec_sl(s3, vec_3));        \
    t3 = vec_sub(vec_add(t3, vec_sl(s1, vec_2)), vec_add(s3, s5));          \
                                                                            \
    /* final butterflies: output k and output 7-k share one even/odd pair */\
    s0 = vec_add(t4, t0);                                                   \
    s7 = vec_sub(t4, t0);                                                   \
    s1 = vec_add(t5, t1);                                                   \
    s6 = vec_sub(t5, t1);                                                   \
    s2 = vec_add(t6, t2);                                                   \
    s5 = vec_sub(t6, t2);                                                   \
    s3 = vec_add(t7, t3);                                                   \
    s4 = vec_sub(t7, t3);                                                   \
} while (0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
74
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
/*
 * Scale the eight results of the first transform pass: arithmetic shift
 * right by 3 of every element.  Uses vec_3 from the expanding scope.
 */
#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7)                          \
do {                                                                        \
    s0 = vec_sra(s0, vec_3);    s1 = vec_sra(s1, vec_3);                    \
    s2 = vec_sra(s2, vec_3);    s3 = vec_sra(s3, vec_3);                    \
    s4 = vec_sra(s4, vec_3);    s5 = vec_sra(s5, vec_3);                    \
    s6 = vec_sra(s6, vec_3);    s7 = vec_sra(s7, vec_3);                    \
} while (0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
86
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
/*
 * Scale the eight results of the second transform pass: arithmetic shift
 * right by 7.  The last four outputs (s4..s7) get an extra +1 before the
 * shift; the first four are shifted as they are.
 * Uses vec_7 and vec_1s from the expanding scope.
 */
#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7)                         \
do {                                                                        \
    /* first half: plain shift */                                           \
    s0 = vec_sra(s0, vec_7);    s1 = vec_sra(s1, vec_7);                    \
    s2 = vec_sra(s2, vec_7);    s3 = vec_sra(s3, vec_7);                    \
    /* second half: add one, then shift */                                  \
    s4 = vec_sra(vec_add(s4, vec_1s), vec_7);                               \
    s5 = vec_sra(vec_add(s5, vec_1s), vec_7);                               \
    s6 = vec_sra(vec_add(s6, vec_1s), vec_7);                               \
    s7 = vec_sra(vec_add(s7, vec_1s), vec_7);                               \
} while (0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
98
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
/*
 * Main step of the 4x4 transform: one 1-D pass of the 4-point inverse
 * transform over four vectors.
 *
 * s0..s3 are read as inputs and overwritten with the outputs.  Per element:
 *
 *   e0 = 17*(s0 + s2) + rnd        e1 = 17*(s0 - s2) + rnd
 *   o0 = 22*s1 + 10*s3             o1 = 22*s3 - 10*s1
 *   s0 = e0 + o0   s1 = e1 - o1   s2 = e1 + o1   s3 = e0 - o0
 *
 * The macro uses the temporaries t0..t3 and the shift counts vec_1, vec_2,
 * vec_3, vec_4 and vec_5 from the scope in which it is expanded.
 */
#define STEP4(s0, s1, s2, s3, vec_rnd)                                      \
do {                                                                        \
    /* even part: 17*x is (x << 4) + x */                                   \
    t1 = vec_add(vec_add(vec_sl(s0, vec_4), s0), vec_rnd);                  \
    t2 = vec_add(vec_sl(s2, vec_4), s2);                                    \
    t0 = vec_add(t1, t2);                                                   \
    t1 = vec_sub(t1, t2);                                                   \
    /* odd part, starting from the shared term 10*(s3 - s1) */              \
    t3 = vec_sl(vec_sub(s3, s1), vec_1);                                    \
    t3 = vec_add(t3, vec_sl(t3, vec_2));                                    \
    t2 = vec_add(t3, vec_sl(s1, vec_5));                                    \
    t3 = vec_add(vec_add(t3, vec_sl(s3, vec_3)), vec_sl(s3, vec_2));        \
    /* butterflies */                                                       \
    s0 = vec_add(t0, t2);                                                   \
    s3 = vec_sub(t0, t2);                                                   \
    s1 = vec_sub(t1, t3);                                                   \
    s2 = vec_add(t1, t3);                                                   \
} while (0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
117
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
/*
 * Scale four vectors after the first transform pass: arithmetic shift
 * right by 3 of every element.  Uses vec_3 from the expanding scope.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and must be followed by a semicolon, like the
 * 8-wide SHIFT_HOR8.  Without the wrapper, using the macro as the body of
 * an unbraced if/else/loop would only guard the first assignment.
 */
#define SHIFT_HOR4(s0, s1, s2, s3)                                          \
do {                                                                        \
    s0 = vec_sra(s0, vec_3);                                                \
    s1 = vec_sra(s1, vec_3);                                                \
    s2 = vec_sra(s2, vec_3);                                                \
    s3 = vec_sra(s3, vec_3);                                                \
} while (0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
123
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
/*
 * Scale four vectors after the second transform pass: arithmetic shift
 * right by 7 of every element.  Uses vec_7 from the expanding scope.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and must be followed by a semicolon.  Without the
 * wrapper, using the macro as the body of an unbraced if/else/loop would
 * only guard the first assignment.
 */
#define SHIFT_VERT4(s0, s1, s2, s3)                                         \
do {                                                                        \
    s0 = vec_sra(s0, vec_7);                                                \
    s1 = vec_sra(s1, vec_7);                                                \
    s2 = vec_sra(s2, vec_7);                                                \
    s3 = vec_sra(s3, vec_7);                                                \
} while (0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
129
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
130 /** Do inverse transform on 8x8 block
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
131 */
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
132 static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
133 {
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
134 vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
135 vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
136 vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
137 vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
138 const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
139 const vector unsigned int vec_7 = vec_splat_u32(7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
140 const vector unsigned int vec_4 = vec_splat_u32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
141 const vector signed int vec_4s = vec_splat_s32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
142 const vector unsigned int vec_3 = vec_splat_u32(3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
143 const vector unsigned int vec_2 = vec_splat_u32(2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
144 const vector signed int vec_1s = vec_splat_s32(1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
145 const vector unsigned int vec_1 = vec_splat_u32(1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
146
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
147
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
148 src0 = vec_ld( 0, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
149 src1 = vec_ld( 16, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
150 src2 = vec_ld( 32, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
151 src3 = vec_ld( 48, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
152 src4 = vec_ld( 64, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
153 src5 = vec_ld( 80, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
154 src6 = vec_ld( 96, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
155 src7 = vec_ld(112, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
156
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
157 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
158 s0 = vec_unpackl(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
159 s1 = vec_unpackl(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
160 s2 = vec_unpackl(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
161 s3 = vec_unpackl(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
162 s4 = vec_unpackl(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
163 s5 = vec_unpackl(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
164 s6 = vec_unpackl(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
165 s7 = vec_unpackl(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
166 s8 = vec_unpackh(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
167 s9 = vec_unpackh(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
168 sA = vec_unpackh(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
169 sB = vec_unpackh(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
170 sC = vec_unpackh(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
171 sD = vec_unpackh(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
172 sE = vec_unpackh(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
173 sF = vec_unpackh(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
174 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
175 SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
176 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
177 SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
178 src0 = vec_pack(s8, s0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
179 src1 = vec_pack(s9, s1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
180 src2 = vec_pack(sA, s2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
181 src3 = vec_pack(sB, s3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
182 src4 = vec_pack(sC, s4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
183 src5 = vec_pack(sD, s5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
184 src6 = vec_pack(sE, s6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
185 src7 = vec_pack(sF, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
186 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
187
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
188 s0 = vec_unpackl(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
189 s1 = vec_unpackl(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
190 s2 = vec_unpackl(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
191 s3 = vec_unpackl(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
192 s4 = vec_unpackl(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
193 s5 = vec_unpackl(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
194 s6 = vec_unpackl(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
195 s7 = vec_unpackl(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
196 s8 = vec_unpackh(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
197 s9 = vec_unpackh(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
198 sA = vec_unpackh(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
199 sB = vec_unpackh(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
200 sC = vec_unpackh(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
201 sD = vec_unpackh(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
202 sE = vec_unpackh(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
203 sF = vec_unpackh(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
204 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
205 SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
206 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
207 SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
208 src0 = vec_pack(s8, s0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
209 src1 = vec_pack(s9, s1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
210 src2 = vec_pack(sA, s2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
211 src3 = vec_pack(sB, s3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
212 src4 = vec_pack(sC, s4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
213 src5 = vec_pack(sD, s5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
214 src6 = vec_pack(sE, s6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
215 src7 = vec_pack(sF, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
216
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
217 vec_st(src0, 0, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
218 vec_st(src1, 16, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
219 vec_st(src2, 32, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
220 vec_st(src3, 48, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
221 vec_st(src4, 64, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
222 vec_st(src5, 80, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
223 vec_st(src6, 96, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
224 vec_st(src7,112, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
225 }
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
226
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
227 /** Do inverse transform on 8x4 part of block
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
228 */
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
229 static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
230 {
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
231 vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
232 vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
233 vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
234 vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
235 const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
236 const vector unsigned int vec_7 = vec_splat_u32(7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
237 const vector unsigned int vec_5 = vec_splat_u32(5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
238 const vector unsigned int vec_4 = vec_splat_u32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
239 const vector signed int vec_4s = vec_splat_s32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
240 const vector unsigned int vec_3 = vec_splat_u32(3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
241 const vector unsigned int vec_2 = vec_splat_u32(2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
242 const vector unsigned int vec_1 = vec_splat_u32(1);
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
243 vector unsigned char tmp;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
244 vector signed short tmp2, tmp3;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
245 vector unsigned char perm0, perm1, p0, p1, p;
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
246
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
247 src0 = vec_ld( 0, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
248 src1 = vec_ld( 16, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
249 src2 = vec_ld( 32, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
250 src3 = vec_ld( 48, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
251 src4 = vec_ld( 64, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
252 src5 = vec_ld( 80, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
253 src6 = vec_ld( 96, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
254 src7 = vec_ld(112, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
255
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
256 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
257 s0 = vec_unpackl(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
258 s1 = vec_unpackl(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
259 s2 = vec_unpackl(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
260 s3 = vec_unpackl(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
261 s4 = vec_unpackl(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
262 s5 = vec_unpackl(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
263 s6 = vec_unpackl(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
264 s7 = vec_unpackl(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
265 s8 = vec_unpackh(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
266 s9 = vec_unpackh(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
267 sA = vec_unpackh(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
268 sB = vec_unpackh(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
269 sC = vec_unpackh(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
270 sD = vec_unpackh(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
271 sE = vec_unpackh(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
272 sF = vec_unpackh(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
273 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
274 SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
275 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
276 SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
277 src0 = vec_pack(s8, s0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
278 src1 = vec_pack(s9, s1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
279 src2 = vec_pack(sA, s2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
280 src3 = vec_pack(sB, s3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
281 src4 = vec_pack(sC, s4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
282 src5 = vec_pack(sD, s5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
283 src6 = vec_pack(sE, s6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
284 src7 = vec_pack(sF, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
285 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
286
6000
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
287 s0 = vec_unpackh(src0);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
288 s1 = vec_unpackh(src1);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
289 s2 = vec_unpackh(src2);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
290 s3 = vec_unpackh(src3);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
291 s8 = vec_unpackl(src0);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
292 s9 = vec_unpackl(src1);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
293 sA = vec_unpackl(src2);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
294 sB = vec_unpackl(src3);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
295 STEP4(s0, s1, s2, s3, vec_64);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
296 SHIFT_VERT4(s0, s1, s2, s3);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
297 STEP4(s8, s9, sA, sB, vec_64);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
298 SHIFT_VERT4(s8, s9, sA, sB);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
299 src0 = vec_pack(s0, s8);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
300 src1 = vec_pack(s1, s9);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
301 src2 = vec_pack(s2, sA);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
302 src3 = vec_pack(s3, sB);
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
303
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
304 p0 = vec_lvsl (0, dest);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
305 p1 = vec_lvsl (stride, dest);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
306 p = vec_splat_u8 (-1);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
307 perm0 = vec_mergeh (p, p0);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
308 perm1 = vec_mergeh (p, p1);
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
309
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
310 #define ADD(dest,src,perm) \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
311 /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
312 tmp = vec_ld (0, dest); \
6028
1ba8ee13e5b9 Make strict altivec parsers happy (gcc-4.3 and others)
lu_zero
parents: 6000
diff changeset
313 tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
314 tmp3 = vec_adds (tmp2, src); \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
315 tmp = vec_packsu (tmp3, tmp3); \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
316 vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
317 vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
318
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
319 ADD (dest, src0, perm0) dest += stride;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
320 ADD (dest, src1, perm1) dest += stride;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
321 ADD (dest, src2, perm0) dest += stride;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
322 ADD (dest, src3, perm1)
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
323 }
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
324
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
325
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
326 void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
327 dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
328 dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
329 }