annotate ppc/vc1dsp_altivec.c @ 11032:01bd040f8607 libavcodec

Unroll main loop so the edge==0 case is separate. This allows many things to be simplified away. The h264 decoder is overall 1% faster with an mbaff sample and 0.1% slower with the cathedral sample, probably because the slow loop filter code must be loaded into the code cache for the first MB of each row but isn't used for the following MBs.
author michael
date Thu, 28 Jan 2010 01:24:25 +0000
parents 7cee7292d5cc
children 50415a8f1451
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
1 /*
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
2 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
3 * Copyright (c) 2006 Konstantin Shishkov
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
4 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
11 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
15 * Lesser General Public License for more details.
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
16 *
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3537
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
20 */
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
21
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6028
diff changeset
22 #include "libavcodec/dsputil.h"
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
23
5750
09f99af1db40 Sanitize altivec code so it can be built with runtime check properly
lu_zero
parents: 5215
diff changeset
24 #include "util_altivec.h"
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
25
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
26 // main steps of 8x8 transform
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
27 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
28 do { \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
29 t0 = vec_sl(vec_add(s0, s4), vec_2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
30 t0 = vec_add(vec_sl(t0, vec_1), t0); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
31 t0 = vec_add(t0, vec_rnd); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
32 t1 = vec_sl(vec_sub(s0, s4), vec_2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
33 t1 = vec_add(vec_sl(t1, vec_1), t1); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
34 t1 = vec_add(t1, vec_rnd); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
35 t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
36 t2 = vec_add(t2, vec_sl(s2, vec_4)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
37 t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
38 t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
39 t4 = vec_add(t0, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
40 t5 = vec_add(t1, t3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
41 t6 = vec_sub(t1, t3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
42 t7 = vec_sub(t0, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
43 \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
44 t0 = vec_sl(vec_add(s1, s3), vec_4); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
45 t0 = vec_add(t0, vec_sl(s5, vec_3)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
46 t0 = vec_add(t0, vec_sl(s7, vec_2)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
47 t0 = vec_add(t0, vec_sub(s5, s3)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
48 \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
49 t1 = vec_sl(vec_sub(s1, s5), vec_4); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
50 t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
51 t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
52 t1 = vec_sub(t1, vec_add(s1, s7)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
53 \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
54 t2 = vec_sl(vec_sub(s7, s3), vec_4); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
55 t2 = vec_add(t2, vec_sl(s1, vec_3)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
56 t2 = vec_add(t2, vec_sl(s5, vec_2)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
57 t2 = vec_add(t2, vec_sub(s1, s7)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
58 \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
59 t3 = vec_sl(vec_sub(s5, s7), vec_4); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
60 t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
61 t3 = vec_add(t3, vec_sl(s1, vec_2)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
62 t3 = vec_sub(t3, vec_add(s3, s5)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
63 \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
64 s0 = vec_add(t4, t0); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
65 s1 = vec_add(t5, t1); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
66 s2 = vec_add(t6, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
67 s3 = vec_add(t7, t3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
68 s4 = vec_sub(t7, t3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
69 s5 = vec_sub(t6, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
70 s6 = vec_sub(t5, t1); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
71 s7 = vec_sub(t4, t0); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
72 }while(0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
73
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
74 #define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
75 do { \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
76 s0 = vec_sra(s0, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
77 s1 = vec_sra(s1, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
78 s2 = vec_sra(s2, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
79 s3 = vec_sra(s3, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
80 s4 = vec_sra(s4, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
81 s5 = vec_sra(s5, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
82 s6 = vec_sra(s6, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
83 s7 = vec_sra(s7, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
84 }while(0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
85
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
86 #define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
87 do { \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
88 s0 = vec_sra(s0, vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
89 s1 = vec_sra(s1, vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
90 s2 = vec_sra(s2, vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
91 s3 = vec_sra(s3, vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
92 s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
93 s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
94 s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
95 s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
96 }while(0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
97
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
98 /* main steps of 4x4 transform */
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
99 #define STEP4(s0, s1, s2, s3, vec_rnd) \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
100 do { \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
101 t1 = vec_add(vec_sl(s0, vec_4), s0); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
102 t1 = vec_add(t1, vec_rnd); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
103 t2 = vec_add(vec_sl(s2, vec_4), s2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
104 t0 = vec_add(t1, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
105 t1 = vec_sub(t1, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
106 t3 = vec_sl(vec_sub(s3, s1), vec_1); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
107 t3 = vec_add(t3, vec_sl(t3, vec_2)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
108 t2 = vec_add(t3, vec_sl(s1, vec_5)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
109 t3 = vec_add(t3, vec_sl(s3, vec_3)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
110 t3 = vec_add(t3, vec_sl(s3, vec_2)); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
111 s0 = vec_add(t0, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
112 s1 = vec_sub(t1, t3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
113 s2 = vec_add(t1, t3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
114 s3 = vec_sub(t0, t2); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
115 }while (0)
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
116
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
117 #define SHIFT_HOR4(s0, s1, s2, s3) \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
118 s0 = vec_sra(s0, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
119 s1 = vec_sra(s1, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
120 s2 = vec_sra(s2, vec_3); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
121 s3 = vec_sra(s3, vec_3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
122
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
123 #define SHIFT_VERT4(s0, s1, s2, s3) \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
124 s0 = vec_sra(s0, vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
125 s1 = vec_sra(s1, vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
126 s2 = vec_sra(s2, vec_7); \
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
127 s3 = vec_sra(s3, vec_7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
128
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
129 /** Do inverse transform on 8x8 block
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
130 */
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
131 static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
132 {
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
133 vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
134 vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
135 vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
136 vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
137 const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
138 const vector unsigned int vec_7 = vec_splat_u32(7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
139 const vector unsigned int vec_4 = vec_splat_u32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
140 const vector signed int vec_4s = vec_splat_s32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
141 const vector unsigned int vec_3 = vec_splat_u32(3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
142 const vector unsigned int vec_2 = vec_splat_u32(2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
143 const vector signed int vec_1s = vec_splat_s32(1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
144 const vector unsigned int vec_1 = vec_splat_u32(1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
145
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
146
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
147 src0 = vec_ld( 0, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
148 src1 = vec_ld( 16, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
149 src2 = vec_ld( 32, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
150 src3 = vec_ld( 48, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
151 src4 = vec_ld( 64, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
152 src5 = vec_ld( 80, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
153 src6 = vec_ld( 96, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
154 src7 = vec_ld(112, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
155
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
156 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
157 s0 = vec_unpackl(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
158 s1 = vec_unpackl(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
159 s2 = vec_unpackl(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
160 s3 = vec_unpackl(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
161 s4 = vec_unpackl(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
162 s5 = vec_unpackl(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
163 s6 = vec_unpackl(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
164 s7 = vec_unpackl(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
165 s8 = vec_unpackh(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
166 s9 = vec_unpackh(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
167 sA = vec_unpackh(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
168 sB = vec_unpackh(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
169 sC = vec_unpackh(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
170 sD = vec_unpackh(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
171 sE = vec_unpackh(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
172 sF = vec_unpackh(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
173 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
174 SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
175 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
176 SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
177 src0 = vec_pack(s8, s0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
178 src1 = vec_pack(s9, s1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
179 src2 = vec_pack(sA, s2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
180 src3 = vec_pack(sB, s3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
181 src4 = vec_pack(sC, s4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
182 src5 = vec_pack(sD, s5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
183 src6 = vec_pack(sE, s6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
184 src7 = vec_pack(sF, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
185 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
186
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
187 s0 = vec_unpackl(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
188 s1 = vec_unpackl(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
189 s2 = vec_unpackl(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
190 s3 = vec_unpackl(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
191 s4 = vec_unpackl(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
192 s5 = vec_unpackl(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
193 s6 = vec_unpackl(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
194 s7 = vec_unpackl(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
195 s8 = vec_unpackh(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
196 s9 = vec_unpackh(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
197 sA = vec_unpackh(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
198 sB = vec_unpackh(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
199 sC = vec_unpackh(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
200 sD = vec_unpackh(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
201 sE = vec_unpackh(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
202 sF = vec_unpackh(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
203 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
204 SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
205 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
206 SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
207 src0 = vec_pack(s8, s0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
208 src1 = vec_pack(s9, s1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
209 src2 = vec_pack(sA, s2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
210 src3 = vec_pack(sB, s3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
211 src4 = vec_pack(sC, s4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
212 src5 = vec_pack(sD, s5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
213 src6 = vec_pack(sE, s6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
214 src7 = vec_pack(sF, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
215
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
216 vec_st(src0, 0, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
217 vec_st(src1, 16, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
218 vec_st(src2, 32, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
219 vec_st(src3, 48, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
220 vec_st(src4, 64, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
221 vec_st(src5, 80, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
222 vec_st(src6, 96, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
223 vec_st(src7,112, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
224 }
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
225
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
226 /** Do inverse transform on 8x4 part of block
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
227 */
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
228 static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
229 {
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
230 vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
231 vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
232 vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
233 vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
234 const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
235 const vector unsigned int vec_7 = vec_splat_u32(7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
236 const vector unsigned int vec_5 = vec_splat_u32(5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
237 const vector unsigned int vec_4 = vec_splat_u32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
238 const vector signed int vec_4s = vec_splat_s32(4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
239 const vector unsigned int vec_3 = vec_splat_u32(3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
240 const vector unsigned int vec_2 = vec_splat_u32(2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
241 const vector unsigned int vec_1 = vec_splat_u32(1);
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
242 vector unsigned char tmp;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
243 vector signed short tmp2, tmp3;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
244 vector unsigned char perm0, perm1, p0, p1, p;
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
245
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
246 src0 = vec_ld( 0, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
247 src1 = vec_ld( 16, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
248 src2 = vec_ld( 32, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
249 src3 = vec_ld( 48, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
250 src4 = vec_ld( 64, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
251 src5 = vec_ld( 80, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
252 src6 = vec_ld( 96, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
253 src7 = vec_ld(112, block);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
254
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
255 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
256 s0 = vec_unpackl(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
257 s1 = vec_unpackl(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
258 s2 = vec_unpackl(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
259 s3 = vec_unpackl(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
260 s4 = vec_unpackl(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
261 s5 = vec_unpackl(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
262 s6 = vec_unpackl(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
263 s7 = vec_unpackl(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
264 s8 = vec_unpackh(src0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
265 s9 = vec_unpackh(src1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
266 sA = vec_unpackh(src2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
267 sB = vec_unpackh(src3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
268 sC = vec_unpackh(src4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
269 sD = vec_unpackh(src5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
270 sE = vec_unpackh(src6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
271 sF = vec_unpackh(src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
272 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
273 SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
274 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
275 SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
276 src0 = vec_pack(s8, s0);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
277 src1 = vec_pack(s9, s1);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
278 src2 = vec_pack(sA, s2);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
279 src3 = vec_pack(sB, s3);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
280 src4 = vec_pack(sC, s4);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
281 src5 = vec_pack(sD, s5);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
282 src6 = vec_pack(sE, s6);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
283 src7 = vec_pack(sF, s7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
284 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
285
6000
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
286 s0 = vec_unpackh(src0);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
287 s1 = vec_unpackh(src1);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
288 s2 = vec_unpackh(src2);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
289 s3 = vec_unpackh(src3);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
290 s8 = vec_unpackl(src0);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
291 s9 = vec_unpackl(src1);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
292 sA = vec_unpackl(src2);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
293 sB = vec_unpackl(src3);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
294 STEP4(s0, s1, s2, s3, vec_64);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
295 SHIFT_VERT4(s0, s1, s2, s3);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
296 STEP4(s8, s9, sA, sB, vec_64);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
297 SHIFT_VERT4(s8, s9, sA, sB);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
298 src0 = vec_pack(s0, s8);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
299 src1 = vec_pack(s1, s9);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
300 src2 = vec_pack(s2, sA);
791240825ac4 Reindent after last commit
kostya
parents: 5999
diff changeset
301 src3 = vec_pack(s3, sB);
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
302
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
303 p0 = vec_lvsl (0, dest);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
304 p1 = vec_lvsl (stride, dest);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
305 p = vec_splat_u8 (-1);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
306 perm0 = vec_mergeh (p, p0);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
307 perm1 = vec_mergeh (p, p1);
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
308
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
/*
 * ADD(dest, src, perm): add one row of residuals in src to the bytes at dest.
 *
 * Steps, as written below:
 *   - load a vector from dest with vec_ld;
 *   - vec_perm it against an all-zero byte vector using perm, and reinterpret
 *     the result as signed shorts (presumably this zero-extends the
 *     destination bytes to 16 bits -- confirm against how perm0/perm1 are
 *     built from vec_lvsl and the all-ones vector);
 *   - add src with vec_adds, then narrow with vec_packsu;
 *   - write two 32-bit elements back to dest with vec_ste, at byte offsets
 *     0 and 4.
 *
 * The macro uses tmp, tmp2 and tmp3 from the enclosing scope. Its expansion
 * already ends with ';', so it is invoked without a trailing semicolon.
 */
309 #define ADD(dest,src,perm) \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
310 /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
311 tmp = vec_ld (0, dest); \
6028
1ba8ee13e5b9 Make strict altivec parsers happy (gcc-4.3 and others)
lu_zero
parents: 6000
diff changeset
312 tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
313 tmp3 = vec_adds (tmp2, src); \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
314 tmp = vec_packsu (tmp3, tmp3); \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
315 vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
316 vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
317
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
318 ADD (dest, src0, perm0) dest += stride;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
319 ADD (dest, src1, perm1) dest += stride;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
320 ADD (dest, src2, perm0) dest += stride;
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
321 ADD (dest, src3, perm1)
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
322 }
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
323
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
324
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
/**
 * Install the AltiVec VC-1 inverse-transform routines into a DSP context.
 *
 * Sets dsp->vc1_inv_trans_8x8 and dsp->vc1_inv_trans_8x4 to the AltiVec
 * implementations defined in this file. No other field of dsp is written.
 *
 * @param dsp   DSP context whose function pointers are overwritten
 * @param avctx codec context; not referenced by this function
 */
325 void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
326 dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
5999
f22faee96323 Update Altivec variant of vc1_inv_trans_8x4
kostya
parents: 5997
diff changeset
327 dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
3537
f52e3f60481b Some AltiVec optimizations for VC-1
kostya
parents:
diff changeset
328 }