comparison postprocess_altivec_template.c @ 95:c24dab9bca80 libpostproc

cosmetics: Fix indentation to be 4 spaces and consistently place {}.
author diego
date Sat, 22 Mar 2008 15:46:34 +0000
parents 8181b013dafa
children e565483b1193
comparing 94:094413c45b0f with 95:c24dab9bca80
 */

#include "avutil.h"

#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
    do {                                                  \
        __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
        __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
        __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
        __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
        tempA1 = vec_mergeh (src_a, src_e);               \
        tempB1 = vec_mergel (src_a, src_e);               \
        tempC1 = vec_mergeh (src_b, src_f);               \
        tempD1 = vec_mergel (src_b, src_f);               \
        tempE1 = vec_mergeh (src_c, src_g);               \
        tempF1 = vec_mergel (src_c, src_g);               \
        tempG1 = vec_mergeh (src_d, src_h);               \
        tempH1 = vec_mergel (src_d, src_h);               \
        tempA2 = vec_mergeh (tempA1, tempE1);             \
        tempB2 = vec_mergel (tempA1, tempE1);             \
        tempC2 = vec_mergeh (tempB1, tempF1);             \
        tempD2 = vec_mergel (tempB1, tempF1);             \
        tempE2 = vec_mergeh (tempC1, tempG1);             \
        tempF2 = vec_mergel (tempC1, tempG1);             \
        tempG2 = vec_mergeh (tempD1, tempH1);             \
        tempH2 = vec_mergel (tempD1, tempH1);             \
        src_a = vec_mergeh (tempA2, tempE2);              \
        src_b = vec_mergel (tempA2, tempE2);              \
        src_c = vec_mergeh (tempB2, tempF2);              \
        src_d = vec_mergel (tempB2, tempF2);              \
        src_e = vec_mergeh (tempC2, tempG2);              \
        src_f = vec_mergel (tempC2, tempG2);              \
        src_g = vec_mergeh (tempD2, tempH2);              \
        src_h = vec_mergel (tempD2, tempH2);              \
    } while (0)

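For orientation, the macro above transposes an 8x8 block of 16-bit elements spread over eight vector registers in three rounds of mergeh/mergel pairs. A plain-C sketch of the equivalent operation on an 8x8 int16_t block (purely illustrative, not part of this file):

#include <stdint.h>

/* Scalar equivalent of ALTIVEC_TRANSPOSE_8x8_SHORT: swap rows and columns
 * of an 8x8 block of 16-bit samples.  The AltiVec macro reaches the same
 * result with 24 merge instructions instead of a double loop. */
static void transpose_8x8_short(int16_t block[8][8])
{
    for (int i = 0; i < 8; i++)
        for (int j = i + 1; j < 8; j++) {
            int16_t tmp = block[i][j];
            block[i][j] = block[j][i];
            block[j][i] = tmp;
        }
}
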
static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true.
    */
    DECLARE_ALIGNED(16, short, data[8]);
    int numEq;
    uint8_t *src2 = src;
    vector signed short v_dcOffset;
    vector signed short v2QP;
    vector unsigned short v4QP;
    vector unsigned short v_dcThreshold;
    const int properStride = (stride % 16);
    const int srcAlign = ((unsigned long)src2 % 16);
    const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
    const vector signed int zero = vec_splat_s32(0);
    const vector signed short mask = vec_splat_s16(1);
    vector signed int v_numEq = vec_splat_s32(0);

    data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
    data[1] = data[0] * 2 + 1;
    data[2] = c->QP * 2;
    data[3] = c->QP * 4;
    vector signed short v_data = vec_ld(0, data);
    v_dcOffset = vec_splat(v_data, 0);
    v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
    v2QP = vec_splat(v_data, 2);
    v4QP = (vector unsigned short)vec_splat(v_data, 3);

    src2 += stride * 4;

    vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;

#define LOAD_LINE(i)                                                    \
    register int j##i = i * stride;                                     \
    vector unsigned char perm##i = vec_lvsl(j##i, src2);                \
    const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);         \
    vector unsigned char v_srcA2##i;                                    \
    if (two_vectors)                                                    \
        v_srcA2##i = vec_ld(j##i + 16, src2);                           \
    const vector unsigned char v_srcA##i =                              \
        vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                      \
    v_srcAss##i =                                                       \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)v_srcA##i)

#define LOAD_LINE_ALIGNED(i)                                            \
    register int j##i = i * stride;                                     \
    const vector unsigned char v_srcA##i = vec_ld(j##i, src2);          \
    v_srcAss##i =                                                       \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)v_srcA##i)

    /* Special-casing the aligned case is worthwhile, as all calls from
     * the (transposed) horizontable deblocks will be aligned, in addition
     * to the naturally aligned vertical deblocks. */
    if (properStride && srcAlign) {
        LOAD_LINE_ALIGNED(0);
        LOAD_LINE_ALIGNED(1);
        LOAD_LINE_ALIGNED(2);
        LOAD_LINE_ALIGNED(3);
        LOAD_LINE_ALIGNED(4);
        LOAD_LINE_ALIGNED(5);
        LOAD_LINE_ALIGNED(6);
        LOAD_LINE_ALIGNED(7);
    } else {
        LOAD_LINE(0);
        LOAD_LINE(1);
        LOAD_LINE(2);
        LOAD_LINE(3);
        LOAD_LINE(4);
        LOAD_LINE(5);
        LOAD_LINE(6);
        LOAD_LINE(7);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

#define ITER(i, j)                                                      \
    const vector signed short v_diff##i =                               \
        vec_sub(v_srcAss##i, v_srcAss##j);                              \
    const vector signed short v_sum##i =                                \
        vec_add(v_diff##i, v_dcOffset);                                 \
    const vector signed short v_comp##i =                               \
        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
                                       v_dcThreshold);                  \
    const vector signed short v_part##i = vec_and(mask, v_comp##i);     \
    v_numEq = vec_sum4s(v_part##i, v_numEq);

    ITER(0, 1);
    ITER(1, 2);
    ITER(2, 3);
    ITER(3, 4);
    ITER(4, 5);
    ITER(5, 6);
    ITER(6, 7);
#undef ITER

    v_numEq = vec_sums(v_numEq, zero);

    v_numEq = vec_splat(v_numEq, 3);
    vec_ste(v_numEq, 0, &numEq);

    if (numEq > c->ppMode.flatnessThreshold){
        const vector unsigned char mmoP1 = (const vector unsigned char)
            AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
                0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
        const vector unsigned char mmoP2 = (const vector unsigned char)
            AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
                0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
        const vector unsigned char mmoP = (const vector unsigned char)
            vec_lvsl(8, (unsigned char*)0);

        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);

        if (vec_any_gt(mmoSum, v4QP))
            return 0;
        else
            return 1;
    }
    else return 2;
}
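
As a rough scalar picture of what the classifier above decides: it counts the vertical neighbour pairs in the 8x8 block whose difference stays within the DC offset and compares that count against the flatness threshold; the flat case is then refined by checking a set of cross-row differences against 2*QP to pick between the return values 0 and 1 (2 means "not flat"). A simplified sketch of the counting part only, with a hypothetical helper name:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar flatness check over an 8x8 block (not the exact
 * AltiVec semantics): count vertical neighbours that differ by at most
 * dcOffset and compare the count against the flatness threshold. */
static int vert_classify_scalar(const uint8_t *src, int stride,
                                int dcOffset, int flatnessThreshold)
{
    int numEq = 0;
    for (int y = 0; y < 7; y++)
        for (int x = 0; x < 8; x++)
            if (abs(src[y * stride + x] - src[(y + 1) * stride + x]) <= dcOffset)
                numEq++;
    return numEq > flatnessThreshold;   /* 1: flat enough to low-pass */
}
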

static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
    /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true. Quite a lot of load/stores
    can be removed by assuming proper alignment of
    src & stride :-(
    */
    uint8_t *src2 = src;
    const vector signed int zero = vec_splat_s32(0);
    const int properStride = (stride % 16);
    const int srcAlign = ((unsigned long)src2 % 16);
    DECLARE_ALIGNED(16, short, qp[8]);
    qp[0] = c->QP;
    vector signed short vqp = vec_ld(0, qp);
    vqp = vec_splat(vqp, 0);

    src2 += stride*3;

    vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
    vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
    vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
    vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;

#define LOAD_LINE(i)                                                    \
    const vector unsigned char perml##i =                               \
        vec_lvsl(i * stride, src2);                                     \
    vbA##i = vec_ld(i * stride, src2);                                  \
    vbB##i = vec_ld(i * stride + 16, src2);                             \
    vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                        \
    vb##i =                                                             \
        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
                                        (vector unsigned char)vbT##i)

#define LOAD_LINE_ALIGNED(i)                                            \
    register int j##i = i * stride;                                     \
    vbT##i = vec_ld(j##i, src2);                                        \
    vb##i =                                                             \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)vbT##i)

    /* Special-casing the aligned case is worthwhile, as all calls from
     * the (transposed) horizontable deblocks will be aligned, in addition
     * to the naturally aligned vertical deblocks. */
    if (properStride && srcAlign) {
        LOAD_LINE_ALIGNED(0);
        LOAD_LINE_ALIGNED(1);
        LOAD_LINE_ALIGNED(2);
        LOAD_LINE_ALIGNED(3);
        LOAD_LINE_ALIGNED(4);
        LOAD_LINE_ALIGNED(5);
        LOAD_LINE_ALIGNED(6);
        LOAD_LINE_ALIGNED(7);
        LOAD_LINE_ALIGNED(8);
        LOAD_LINE_ALIGNED(9);
    } else {
        LOAD_LINE(0);
        LOAD_LINE(1);
        LOAD_LINE(2);
        LOAD_LINE(3);
        LOAD_LINE(4);
        LOAD_LINE(5);
        LOAD_LINE(6);
        LOAD_LINE(7);
        LOAD_LINE(8);
        LOAD_LINE(9);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

    const vector unsigned short v_2 = vec_splat_u16(2);
    const vector unsigned short v_4 = vec_splat_u16(4);

    const vector signed short v_diff01 = vec_sub(vb0, vb1);
    const vector unsigned short v_cmp01 =
        (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
    const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
    const vector signed short v_diff89 = vec_sub(vb8, vb9);
    const vector unsigned short v_cmp89 =
        (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
    const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);

    const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
    const vector signed short temp02 = vec_add(vb2, vb3);
    const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
    const vector signed short v_sumsB0 = vec_add(temp02, temp03);

    const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
    const vector signed short v_sumsB1 = vec_add(temp11, vb4);

    const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
    const vector signed short v_sumsB2 = vec_add(temp21, vb5);

    const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
    const vector signed short v_sumsB3 = vec_add(temp31, vb6);

    const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
    const vector signed short v_sumsB4 = vec_add(temp41, vb7);

    const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
    const vector signed short v_sumsB5 = vec_add(temp51, vb8);

    const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
    const vector signed short v_sumsB6 = vec_add(temp61, v_last);

    const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
    const vector signed short v_sumsB7 = vec_add(temp71, v_last);

    const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
    const vector signed short v_sumsB8 = vec_add(temp81, v_last);

    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
    const vector signed short v_sumsB9 = vec_add(temp91, v_last);

#define COMPUTE_VR(i, j, k)                                             \
    const vector signed short temps1##i =                               \
        vec_add(v_sumsB##i, v_sumsB##k);                                \
    const vector signed short temps2##i =                               \
        vec_mladd(vb##j, (vector signed short)v_2, temps1##i);          \
    const vector signed short vr##j = vec_sra(temps2##i, v_4)

    COMPUTE_VR(0, 1, 2);
    COMPUTE_VR(1, 2, 3);
    COMPUTE_VR(2, 3, 4);
    COMPUTE_VR(3, 4, 5);
    COMPUTE_VR(4, 5, 6);
    COMPUTE_VR(5, 6, 7);
    COMPUTE_VR(6, 7, 8);
    COMPUTE_VR(7, 8, 9);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define PACK_AND_STORE(i)                                               \
    const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2)

#define PACK_AND_STORE_ALIGNED(i)                                       \
    const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    vec_st(vg##i, i * stride, src2)

    /* Special-casing the aligned case is worthwhile, as all calls from
     * the (transposed) horizontable deblocks will be aligned, in addition
     * to the naturally aligned vertical deblocks. */
    if (properStride && srcAlign) {
        PACK_AND_STORE_ALIGNED(1);
        PACK_AND_STORE_ALIGNED(2);
        PACK_AND_STORE_ALIGNED(3);
        PACK_AND_STORE_ALIGNED(4);
        PACK_AND_STORE_ALIGNED(5);
        PACK_AND_STORE_ALIGNED(6);
        PACK_AND_STORE_ALIGNED(7);
        PACK_AND_STORE_ALIGNED(8);
    } else {
        PACK_AND_STORE(1);
        PACK_AND_STORE(2);
        PACK_AND_STORE(3);
        PACK_AND_STORE(4);
        PACK_AND_STORE(5);
        PACK_AND_STORE(6);
        PACK_AND_STORE(7);
        PACK_AND_STORE(8);
    }
#undef PACK_AND_STORE
#undef PACK_AND_STORE_ALIGNED
}
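
The function above is a vertical low-pass over ten rows: the first and last rows are conditionally swapped with their neighbours when they differ by less than QP, a sliding sum of neighbouring rows is maintained (v_sumsB0 through v_sumsB9), and each output row j becomes (sums[j-1] + sums[j+1] + 2*row[j]) >> 4. A scalar transcription of that arithmetic for a single column, offered only as a reading aid (it is not the project's doVertLowPass_C fallback):

#include <stdlib.h>

/* Scalar transcription of the filter above for one column of ten samples
 * l[0..9]; out[1..8] receive the low-passed values.  This mirrors the
 * v_sumsB and COMPUTE_VR arithmetic of the vector code. */
static void vert_lowpass_column(const int l[10], int out[10], int QP)
{
    const int first = (abs(l[0] - l[1]) < QP) ? l[0] : l[1];
    const int last  = (abs(l[8] - l[9]) < QP) ? l[9] : l[8];
    int sums[10];

    sums[0] = 4 * first + l[1] + l[2] + l[3] + 4;   /* +4 is the rounding term */
    sums[1] = sums[0] - first + l[4];
    sums[2] = sums[1] - first + l[5];
    sums[3] = sums[2] - first + l[6];
    sums[4] = sums[3] - first + l[7];
    sums[5] = sums[4] - l[1]  + l[8];
    sums[6] = sums[5] - l[2]  + last;
    sums[7] = sums[6] - l[3]  + last;
    sums[8] = sums[7] - l[4]  + last;
    sums[9] = sums[8] - l[5]  + last;

    for (int j = 1; j <= 8; j++)
        out[j] = (sums[j - 1] + sums[j + 1] + 2 * l[j]) >> 4;
}
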


static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true. Quite a lot of load/stores
    can be removed by assuming proper alignment of
    src & stride :-(
    */
    uint8_t *src2 = src;
    const vector signed int zero = vec_splat_s32(0);
    DECLARE_ALIGNED(16, short, qp[8]);
    qp[0] = 8*c->QP;
    vector signed short vqp = vec_ld(0, qp);
    vqp = vec_splat(vqp, 0);

#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, src2);                                     \
    const vector unsigned char vbA##i =                                 \
        vec_ld(i * stride, src2);                                       \
    const vector unsigned char vbB##i =                                 \
        vec_ld(i * stride + 16, src2);                                  \
    const vector unsigned char vbT##i =                                 \
        vec_perm(vbA##i, vbB##i, perm##i);                              \
    const vector signed short vb##i =                                   \
        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
                                        (vector unsigned char)vbT##i)

    src2 += stride*3;

    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
#undef LOAD_LINE

    const vector signed short v_1 = vec_splat_s16(1);
    const vector signed short v_2 = vec_splat_s16(2);
    const vector signed short v_5 = vec_splat_s16(5);
    const vector signed short v_32 = vec_sl(v_1,
                                            (vector unsigned short)v_5);
    /* middle energy */
    const vector signed short l3minusl6 = vec_sub(vb3, vb6);
    const vector signed short l5minusl4 = vec_sub(vb5, vb4);
    const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
    const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
    const vector signed short absmE = vec_abs(mE);
    /* left & right energy */
    const vector signed short l1minusl4 = vec_sub(vb1, vb4);
    const vector signed short l3minusl2 = vec_sub(vb3, vb2);
    const vector signed short l5minusl8 = vec_sub(vb5, vb8);
    const vector signed short l7minusl6 = vec_sub(vb7, vb6);
    const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
    const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
    const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
    const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
    /* d */
    const vector signed short ddiff = vec_sub(absmE,
                                              vec_min(vec_abs(lE),
                                                      vec_abs(rE)));
    const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
    const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
    const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
    const vector signed short minusd = vec_sub((vector signed short)zero, d);
    const vector signed short finald = vec_sel(minusd,
                                               d,
                                               vec_cmpgt(vec_sub((vector signed short)zero, mE),
                                                         (vector signed short)zero));
    /* q */
    const vector signed short qtimes2 = vec_sub(vb4, vb5);
    /* for a shift right to behave like /2, we need to add one
       to all negative integer */
    const vector signed short rounddown = vec_sel((vector signed short)zero,
                                                  v_1,
                                                  vec_cmplt(qtimes2, (vector signed short)zero));
    const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
    /* clamp */
    const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
    const vector signed short dclamp_P = vec_min(dclamp_P1, q);
    const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
    const vector signed short dclamp_N = vec_max(dclamp_N1, q);

    const vector signed short dclampedfinal = vec_sel(dclamp_N,
                                                      dclamp_P,
                                                      vec_cmpgt(q, (vector signed short)zero));
    const vector signed short dornotd = vec_sel((vector signed short)zero,
                                                dclampedfinal,
                                                vec_cmplt(absmE, vqp));
    /* add/subtract to l4 and l5 */
    const vector signed short vb4minusd = vec_sub(vb4, dornotd);
    const vector signed short vb5plusd  = vec_add(vb5, dornotd);
    /* finally, stores */
    const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
    const vector unsigned char st5 = vec_packsu(vb5plusd,  (vector signed short)zero);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define STORE(i)                                                        \
    const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vg##i =                                  \
        vec_perm(st##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2)

    STORE(4);
    STORE(5);
}
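
The filter above estimates a middle, left and right "energy" around the block boundary between rows 4 and 5, derives a correction d from them, clamps it against half the boundary step, and applies it to rows 4 and 5 only when the middle energy stays below 8*QP. A per-column scalar transcription, again just as a reading aid with illustrative names:

#include <stdlib.h>

/* Scalar transcription of the per-column arithmetic above, for one column
 * with samples l[1..8] (l[4] and l[5] straddle the block boundary). */
static void vert_def_filter_column(int l[10], int QP)
{
    const int mE = 2 * (l[3] - l[6]) + 5 * (l[5] - l[4]);   /* middle energy */
    const int lE = 2 * (l[1] - l[4]) + 5 * (l[3] - l[2]);   /* left energy   */
    const int rE = 2 * (l[5] - l[8]) + 5 * (l[7] - l[6]);   /* right energy  */

    int d = abs(mE) - (abs(lE) < abs(rE) ? abs(lE) : abs(rE));
    if (d < 0) d = 0;
    d = (5 * d + 32) >> 6;
    d = (mE < 0) ? d : -d;                  /* push towards the flatter side */

    int q = (l[4] - l[5]) / 2;              /* half the boundary step */
    /* clamp the correction so it never overshoots the boundary step */
    if (q > 0) d = d < 0 ? 0 : (d > q ? q : d);
    else       d = d > 0 ? 0 : (d < q ? q : d);

    if (abs(mE) < 8 * QP) {                 /* only touch visible block edges */
        l[4] -= d;
        l[5] += d;
    }
}
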

static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true. Quite a lot of load/stores
    can be removed by assuming proper alignment of
    src & stride :-(
    */
    uint8_t *srcCopy = src;
    DECLARE_ALIGNED(16, uint8_t, dt[16]);
    const vector signed int zero = vec_splat_s32(0);
    vector unsigned char v_dt;
    dt[0] = deringThreshold;
    v_dt = vec_splat(vec_ld(0, dt), 0);

#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, srcCopy);                                  \
    vector unsigned char sA##i = vec_ld(i * stride, srcCopy);           \
    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);      \
    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
    LOAD_LINE(9);
#undef LOAD_LINE

    vector unsigned char v_avg;
    {
        const vector unsigned char trunc_perm = (vector unsigned char)
            AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
        const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
        const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
        const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
        const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);

#define EXTRACT(op) do {                                                \
    const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34);    \
    const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78);    \
    const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2);        \
    const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6);     \
    const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6);     \
    const vector unsigned char s##op##_9 = vec_##op(s##op##_8h, s##op##_8l);      \
    const vector unsigned char s##op##_9h = vec_mergeh(s##op##_9, s##op##_9);     \
    const vector unsigned char s##op##_9l = vec_mergel(s##op##_9, s##op##_9);     \
    const vector unsigned char s##op##_10 = vec_##op(s##op##_9h, s##op##_9l);     \
    const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10);  \
    const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10);  \
    const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l);   \
    const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11);  \
    const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11);  \
    v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)

        vector unsigned char v_min;
        vector unsigned char v_max;
        EXTRACT(min);
        EXTRACT(max);
#undef EXTRACT

        if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
            return;

        v_avg = vec_avg(v_min, v_max);
    }

    DECLARE_ALIGNED(16, signed int, S[8]);
    {
        const vector unsigned short mask1 = (vector unsigned short)
            AVV(0x0001, 0x0002, 0x0004, 0x0008,
                0x0010, 0x0020, 0x0040, 0x0080);
        const vector unsigned short mask2 = (vector unsigned short)
            AVV(0x0100, 0x0200, 0x0000, 0x0000,
                0x0000, 0x0000, 0x0000, 0x0000);

        const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
        const vector unsigned int vuint32_1 = vec_splat_u32(1);

#define COMPARE(i)                                                      \
    vector signed int sum##i;                                           \
    do {                                                                \
        const vector unsigned char cmp##i =                             \
            (vector unsigned char)vec_cmpgt(src##i, v_avg);             \
        const vector unsigned short cmpHi##i =                          \
            (vector unsigned short)vec_mergeh(cmp##i, cmp##i);          \
        const vector unsigned short cmpLi##i =                          \
            (vector unsigned short)vec_mergel(cmp##i, cmp##i);          \
        const vector signed short cmpHf##i =                            \
            (vector signed short)vec_and(cmpHi##i, mask1);              \
        const vector signed short cmpLf##i =                            \
            (vector signed short)vec_and(cmpLi##i, mask2);              \
        const vector signed int sump##i = vec_sum4s(cmpHf##i, zero);    \
        const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \
        sum##i = vec_sums(sumq##i, zero); } while (0)

        COMPARE(0);
        COMPARE(1);
        COMPARE(2);
        COMPARE(3);
        COMPARE(4);
        COMPARE(5);
        COMPARE(6);
        COMPARE(7);
        COMPARE(8);
        COMPARE(9);
#undef COMPARE
617 616
618 vector signed int sumA2; 617 vector signed int sumA2;
619 vector signed int sumB2; 618 vector signed int sumB2;
620 { 619 {
621 const vector signed int sump02 = vec_mergel(sum0, sum2); 620 const vector signed int sump02 = vec_mergel(sum0, sum2);
622 const vector signed int sump13 = vec_mergel(sum1, sum3); 621 const vector signed int sump13 = vec_mergel(sum1, sum3);
623 const vector signed int sumA = vec_mergel(sump02, sump13); 622 const vector signed int sumA = vec_mergel(sump02, sump13);
624 623
625 const vector signed int sump46 = vec_mergel(sum4, sum6); 624 const vector signed int sump46 = vec_mergel(sum4, sum6);
626 const vector signed int sump57 = vec_mergel(sum5, sum7); 625 const vector signed int sump57 = vec_mergel(sum5, sum7);
627 const vector signed int sumB = vec_mergel(sump46, sump57); 626 const vector signed int sumB = vec_mergel(sump46, sump57);
628 627
629 const vector signed int sump8A = vec_mergel(sum8, zero); 628 const vector signed int sump8A = vec_mergel(sum8, zero);
630 const vector signed int sump9B = vec_mergel(sum9, zero); 629 const vector signed int sump9B = vec_mergel(sum9, zero);
631 const vector signed int sumC = vec_mergel(sump8A, sump9B); 630 const vector signed int sumC = vec_mergel(sump8A, sump9B);
632 631
633 const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16); 632 const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
634 const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16); 633 const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
635 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16); 634 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
636 const vector signed int t2A = vec_or(sumA, tA); 635 const vector signed int t2A = vec_or(sumA, tA);
637 const vector signed int t2B = vec_or(sumB, tB); 636 const vector signed int t2B = vec_or(sumB, tB);
638 const vector signed int t2C = vec_or(sumC, tC); 637 const vector signed int t2C = vec_or(sumC, tC);
639 const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1), 638 const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
640 vec_sl(t2A, vuint32_1)); 639 vec_sl(t2A, vuint32_1));
641 const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1), 640 const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
642 vec_sl(t2B, vuint32_1)); 641 vec_sl(t2B, vuint32_1));
643 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1), 642 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
644 vec_sl(t2C, vuint32_1)); 643 vec_sl(t2C, vuint32_1));
645 const vector signed int yA = vec_and(t2A, t3A); 644 const vector signed int yA = vec_and(t2A, t3A);
646 const vector signed int yB = vec_and(t2B, t3B); 645 const vector signed int yB = vec_and(t2B, t3B);
647 const vector signed int yC = vec_and(t2C, t3C); 646 const vector signed int yC = vec_and(t2C, t3C);
648 647
649 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0); 648 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
650 const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0); 649 const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
651 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1); 650 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
652 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2); 651 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
653 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1); 652 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
654 const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2); 653 const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
655 const vector signed int sumAp = vec_and(yA, 654 const vector signed int sumAp = vec_and(yA,
656 vec_and(sumAd4,sumAd8)); 655 vec_and(sumAd4,sumAd8));
657 const vector signed int sumBp = vec_and(yB, 656 const vector signed int sumBp = vec_and(yB,
658 vec_and(sumBd4,sumBd8)); 657 vec_and(sumBd4,sumBd8));
659 sumA2 = vec_or(sumAp, 658 sumA2 = vec_or(sumAp,
660 vec_sra(sumAp, 659 vec_sra(sumAp,
661 vuint32_16)); 660 vuint32_16));
662 sumB2 = vec_or(sumBp, 661 sumB2 = vec_or(sumBp,
663 vec_sra(sumBp, 662 vec_sra(sumBp,
664 vuint32_16)); 663 vuint32_16));
665 } 664 }
666 vec_st(sumA2, 0, S); 665 vec_st(sumA2, 0, S);
667 vec_st(sumB2, 16, S); 666 vec_st(sumB2, 16, S);
668 } 667 }
669 668
670 /* I'm not sure the following is actually faster 669 /* I'm not sure the following is actually faster
671 than straight, unvectorized C code :-( */ 670 than straight, unvectorized C code :-( */
672 671
673 DECLARE_ALIGNED(16, int, tQP2[4]); 672 DECLARE_ALIGNED(16, int, tQP2[4]);
674 tQP2[0]= c->QP/2 + 1; 673 tQP2[0]= c->QP/2 + 1;
675 vector signed int vQP2 = vec_ld(0, tQP2); 674 vector signed int vQP2 = vec_ld(0, tQP2);
676 vQP2 = vec_splat(vQP2, 0); 675 vQP2 = vec_splat(vQP2, 0);
677 const vector signed int vsint32_8 = vec_splat_s32(8); 676 const vector signed int vsint32_8 = vec_splat_s32(8);
678 const vector unsigned int vuint32_4 = vec_splat_u32(4); 677 const vector unsigned int vuint32_4 = vec_splat_u32(4);
679 678
680 const vector unsigned char permA1 = (vector unsigned char) 679 const vector unsigned char permA1 = (vector unsigned char)
681 AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F, 680 AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
682 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); 681 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
683 const vector unsigned char permA2 = (vector unsigned char) 682 const vector unsigned char permA2 = (vector unsigned char)
684 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 683 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
685 0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); 684 0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
686 const vector unsigned char permA1inc = (vector unsigned char) 685 const vector unsigned char permA1inc = (vector unsigned char)
687 AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 686 AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
688 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); 687 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
689 const vector unsigned char permA2inc = (vector unsigned char) 688 const vector unsigned char permA2inc = (vector unsigned char)
690 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 689 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
691 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); 690 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
692 const vector unsigned char magic = (vector unsigned char) 691 const vector unsigned char magic = (vector unsigned char)
693 AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02, 692 AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
694 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); 693 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
695 const vector unsigned char extractPerm = (vector unsigned char) 694 const vector unsigned char extractPerm = (vector unsigned char)
696 AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01, 695 AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
697 0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01); 696 0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01);
698 const vector unsigned char extractPermInc = (vector unsigned char) 697 const vector unsigned char extractPermInc = (vector unsigned char)
699 AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 698 AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
700 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01); 699 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01);
701 const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0); 700 const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
702 const vector unsigned char tenRight = (vector unsigned char) 701 const vector unsigned char tenRight = (vector unsigned char)
703 AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 702 AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
704 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); 703 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
705 const vector unsigned char eightLeft = (vector unsigned char) 704 const vector unsigned char eightLeft = (vector unsigned char)
706 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 705 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
707 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08); 706 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08);
708 707
709 708
710 #define F_INIT(i) \ 709 #define F_INIT(i) \
711 vector unsigned char tenRightM##i = tenRight; \ 710 vector unsigned char tenRightM##i = tenRight; \
712 vector unsigned char permA1M##i = permA1; \ 711 vector unsigned char permA1M##i = permA1; \
713 vector unsigned char permA2M##i = permA2; \ 712 vector unsigned char permA2M##i = permA2; \
714 vector unsigned char extractPermM##i = extractPerm 713 vector unsigned char extractPermM##i = extractPerm
715 714
716 #define F2(i, j, k, l) \ 715 #define F2(i, j, k, l) \
717 if (S[i] & (1 << (l+1))) { \ 716 if (S[i] & (1 << (l+1))) { \
718 const vector unsigned char a_##j##_A##l = \ 717 const vector unsigned char a_##j##_A##l = \
719 vec_perm(src##i, src##j, permA1M##i); \ 718 vec_perm(src##i, src##j, permA1M##i); \
720 const vector unsigned char a_##j##_B##l = \ 719 const vector unsigned char a_##j##_B##l = \
721 vec_perm(a_##j##_A##l, src##k, permA2M##i); \ 720 vec_perm(a_##j##_A##l, src##k, permA2M##i); \
722 const vector signed int a_##j##_sump##l = \ 721 const vector signed int a_##j##_sump##l = \
723 (vector signed int)vec_msum(a_##j##_B##l, magic, \ 722 (vector signed int)vec_msum(a_##j##_B##l, magic, \
724 (vector unsigned int)zero); \ 723 (vector unsigned int)zero); \
725 vector signed int F_##j##_##l = \ 724 vector signed int F_##j##_##l = \
726 vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \ 725 vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \
727 F_##j##_##l = vec_splat(F_##j##_##l, 3); \ 726 F_##j##_##l = vec_splat(F_##j##_##l, 3); \
728 const vector signed int p_##j##_##l = \ 727 const vector signed int p_##j##_##l = \
729 (vector signed int)vec_perm(src##j, \ 728 (vector signed int)vec_perm(src##j, \
730 (vector unsigned char)zero, \ 729 (vector unsigned char)zero, \
731 extractPermM##i); \ 730 extractPermM##i); \
732 const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2);\ 731 const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2);\
733 const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2);\ 732 const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2);\
734 vector signed int newpm_##j##_##l; \ 733 vector signed int newpm_##j##_##l; \
735 if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \ 734 if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \
736 newpm_##j##_##l = sum_##j##_##l; \ 735 newpm_##j##_##l = sum_##j##_##l; \
737 else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \ 736 else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \
738 newpm_##j##_##l = diff_##j##_##l; \ 737 newpm_##j##_##l = diff_##j##_##l; \
739 else newpm_##j##_##l = F_##j##_##l; \ 738 else newpm_##j##_##l = F_##j##_##l; \
740 const vector unsigned char newpm2_##j##_##l = \ 739 const vector unsigned char newpm2_##j##_##l = \
741 vec_splat((vector unsigned char)newpm_##j##_##l, 15); \ 740 vec_splat((vector unsigned char)newpm_##j##_##l, 15); \
742 const vector unsigned char mask##j##l = vec_add(identity, \ 741 const vector unsigned char mask##j##l = vec_add(identity, \
743 tenRightM##i); \ 742 tenRightM##i); \
744 src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \ 743 src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \
745 } \ 744 } \
746 permA1M##i = vec_add(permA1M##i, permA1inc); \ 745 permA1M##i = vec_add(permA1M##i, permA1inc); \
747 permA2M##i = vec_add(permA2M##i, permA2inc); \ 746 permA2M##i = vec_add(permA2M##i, permA2inc); \
748 tenRightM##i = vec_sro(tenRightM##i, eightLeft); \ 747 tenRightM##i = vec_sro(tenRightM##i, eightLeft); \
749 extractPermM##i = vec_add(extractPermM##i, extractPermInc) 748 extractPermM##i = vec_add(extractPermM##i, extractPermInc)
750 749
751 #define ITER(i, j, k) \ 750 #define ITER(i, j, k) \
752 F_INIT(i); \ 751 F_INIT(i); \
753 F2(i, j, k, 0); \ 752 F2(i, j, k, 0); \
754 F2(i, j, k, 1); \ 753 F2(i, j, k, 1); \
755 F2(i, j, k, 2); \ 754 F2(i, j, k, 2); \
756 F2(i, j, k, 3); \ 755 F2(i, j, k, 3); \
757 F2(i, j, k, 4); \ 756 F2(i, j, k, 4); \
758 F2(i, j, k, 5); \ 757 F2(i, j, k, 5); \
759 F2(i, j, k, 6); \ 758 F2(i, j, k, 6); \
760 F2(i, j, k, 7) 759 F2(i, j, k, 7)
761 760
762 ITER(0, 1, 2); 761 ITER(0, 1, 2);
763 ITER(1, 2, 3); 762 ITER(1, 2, 3);
764 ITER(2, 3, 4); 763 ITER(2, 3, 4);
765 ITER(3, 4, 5); 764 ITER(3, 4, 5);
766 ITER(4, 5, 6); 765 ITER(4, 5, 6);
767 ITER(5, 6, 7); 766 ITER(5, 6, 7);
768 ITER(6, 7, 8); 767 ITER(6, 7, 8);
769 ITER(7, 8, 9); 768 ITER(7, 8, 9);
770 769
771 const vector signed char neg1 = vec_splat_s8(-1); 770 const vector signed char neg1 = vec_splat_s8(-1);
772 771
773 #define STORE_LINE(i) \ 772 #define STORE_LINE(i) \
774 const vector unsigned char permST##i = \ 773 const vector unsigned char permST##i = \
775 vec_lvsr(i * stride, srcCopy); \ 774 vec_lvsr(i * stride, srcCopy); \
776 const vector unsigned char maskST##i = \ 775 const vector unsigned char maskST##i = \
777 vec_perm((vector unsigned char)zero, \ 776 vec_perm((vector unsigned char)zero, \
778 (vector unsigned char)neg1, permST##i); \ 777 (vector unsigned char)neg1, permST##i);\
779 src##i = vec_perm(src##i ,src##i, permST##i); \ 778 src##i = vec_perm(src##i ,src##i, permST##i); \
780 sA##i= vec_sel(sA##i, src##i, maskST##i); \ 779 sA##i= vec_sel(sA##i, src##i, maskST##i); \
781 sB##i= vec_sel(src##i, sB##i, maskST##i); \ 780 sB##i= vec_sel(src##i, sB##i, maskST##i); \
782 vec_st(sA##i, i * stride, srcCopy); \ 781 vec_st(sA##i, i * stride, srcCopy); \
783 vec_st(sB##i, i * stride + 16, srcCopy) 782 vec_st(sB##i, i * stride + 16, srcCopy)
784 783
785 STORE_LINE(1); 784 STORE_LINE(1);
786 STORE_LINE(2); 785 STORE_LINE(2);
787 STORE_LINE(3); 786 STORE_LINE(3);
788 STORE_LINE(4); 787 STORE_LINE(4);
789 STORE_LINE(5); 788 STORE_LINE(5);
790 STORE_LINE(6); 789 STORE_LINE(6);
791 STORE_LINE(7); 790 STORE_LINE(7);
792 STORE_LINE(8); 791 STORE_LINE(8);
793 792
794 #undef STORE_LINE 793 #undef STORE_LINE
795 #undef ITER 794 #undef ITER
796 #undef F2 795 #undef F2
797 } 796 }
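For orientation, the F2 step above builds a filtered value F for one pixel position (vec_msum against the magic weights, then vec_sums and a shift) and clamps it so that it never moves more than QP2 away from the original pixel p before splicing it back into the line with vec_perm. A rough scalar model of that clamp; the function and parameter names are invented for this sketch and do not exist in the source:

    /* Scalar sketch of the clamp inside the F2 macro above (illustrative only). */
    static int clamp_filtered(int F, int p, int QP2)
    {
        if (p + QP2 < F)       /* vec_all_lt(sum_..., F_...)  */
            return p + QP2;
        if (p - QP2 > F)       /* vec_all_gt(diff_..., F_...) */
            return p - QP2;
        return F;              /* otherwise keep the filtered value */
    }
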
799 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a) 798 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
800 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a) 799 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
801 #define do_a_deblock_altivec(a...) do_a_deblock_C(a) 800 #define do_a_deblock_altivec(a...) do_a_deblock_C(a)
802 801
803 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, 802 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
804 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) 803 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
805 { 804 {
806 const vector signed int zero = vec_splat_s32(0); 805 const vector signed int zero = vec_splat_s32(0);
807 const vector signed short vsint16_1 = vec_splat_s16(1); 806 const vector signed short vsint16_1 = vec_splat_s16(1);
808 vector signed int v_dp = zero; 807 vector signed int v_dp = zero;
809 vector signed int v_sysdp = zero; 808 vector signed int v_sysdp = zero;
810 int d, sysd, i; 809 int d, sysd, i;
811 810
812 tempBluredPast[127]= maxNoise[0]; 811 tempBluredPast[127]= maxNoise[0];
813 tempBluredPast[128]= maxNoise[1]; 812 tempBluredPast[128]= maxNoise[1];
814 tempBluredPast[129]= maxNoise[2]; 813 tempBluredPast[129]= maxNoise[2];
815 814
816 #define LOAD_LINE(src, i) \ 815 #define LOAD_LINE(src, i) \
817 register int j##src##i = i * stride; \ 816 register int j##src##i = i * stride; \
818 vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \ 817 vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \
819 const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \ 818 const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
820 const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \ 819 const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
821 const vector unsigned char v_##src##A##i = \ 820 const vector unsigned char v_##src##A##i = \
822 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ 821 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
823 vector signed short v_##src##Ass##i = \ 822 vector signed short v_##src##Ass##i = \
824 (vector signed short)vec_mergeh((vector signed char)zero, \ 823 (vector signed short)vec_mergeh((vector signed char)zero, \
825 (vector signed char)v_##src##A##i) 824 (vector signed char)v_##src##A##i)
826 825
827 LOAD_LINE(src, 0); 826 LOAD_LINE(src, 0);
828 LOAD_LINE(src, 1); 827 LOAD_LINE(src, 1);
829 LOAD_LINE(src, 2); 828 LOAD_LINE(src, 2);
830 LOAD_LINE(src, 3); 829 LOAD_LINE(src, 3);
831 LOAD_LINE(src, 4); 830 LOAD_LINE(src, 4);
832 LOAD_LINE(src, 5); 831 LOAD_LINE(src, 5);
833 LOAD_LINE(src, 6); 832 LOAD_LINE(src, 6);
834 LOAD_LINE(src, 7); 833 LOAD_LINE(src, 7);
835 834
836 LOAD_LINE(tempBlured, 0); 835 LOAD_LINE(tempBlured, 0);
837 LOAD_LINE(tempBlured, 1); 836 LOAD_LINE(tempBlured, 1);
838 LOAD_LINE(tempBlured, 2); 837 LOAD_LINE(tempBlured, 2);
839 LOAD_LINE(tempBlured, 3); 838 LOAD_LINE(tempBlured, 3);
840 LOAD_LINE(tempBlured, 4); 839 LOAD_LINE(tempBlured, 4);
841 LOAD_LINE(tempBlured, 5); 840 LOAD_LINE(tempBlured, 5);
842 LOAD_LINE(tempBlured, 6); 841 LOAD_LINE(tempBlured, 6);
843 LOAD_LINE(tempBlured, 7); 842 LOAD_LINE(tempBlured, 7);
844 #undef LOAD_LINE 843 #undef LOAD_LINE
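The LOAD_LINE macro above is the usual AltiVec unaligned-load idiom: two aligned vec_ld covering the wanted bytes, combined through the permutation that vec_lvsl derives from the address; the first eight pixels of each line are then widened to signed 16-bit by interleaving with zero bytes so the later sums cannot overflow. A plain-C model of the unaligned 16-byte load part; the helper name is an assumption made for this sketch:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar equivalent of vec_lvsl + 2x vec_ld + vec_perm. */
    static void load_unaligned_16(const uint8_t *addr, uint8_t out[16])
    {
        const uint8_t *base = (const uint8_t *)((uintptr_t)addr & ~(uintptr_t)15);
        int shift = (int)((uintptr_t)addr & 15);  /* what vec_lvsl encodes    */
        uint8_t both[32];
        memcpy(both, base, 32);                   /* the two aligned loads    */
        memcpy(out, both + shift, 16);            /* the vec_perm byte select */
    }
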
845 844
846 #define ACCUMULATE_DIFFS(i) \ 845 #define ACCUMULATE_DIFFS(i) \
847 vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \ 846 vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \
848 v_srcAss##i); \ 847 v_srcAss##i); \
849 v_dp = vec_msums(v_d##i, v_d##i, v_dp); \ 848 v_dp = vec_msums(v_d##i, v_d##i, v_dp); \
850 v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp) 849 v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)
851 850
852 ACCUMULATE_DIFFS(0); 851 ACCUMULATE_DIFFS(0);
853 ACCUMULATE_DIFFS(1); 852 ACCUMULATE_DIFFS(1);
854 ACCUMULATE_DIFFS(2); 853 ACCUMULATE_DIFFS(2);
855 ACCUMULATE_DIFFS(3); 854 ACCUMULATE_DIFFS(3);
856 ACCUMULATE_DIFFS(4); 855 ACCUMULATE_DIFFS(4);
857 ACCUMULATE_DIFFS(5); 856 ACCUMULATE_DIFFS(5);
858 ACCUMULATE_DIFFS(6); 857 ACCUMULATE_DIFFS(6);
859 ACCUMULATE_DIFFS(7); 858 ACCUMULATE_DIFFS(7);
860 #undef ACCUMULATE_DIFFS 859 #undef ACCUMULATE_DIFFS
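Taken together, ACCUMULATE_DIFFS and the vec_sums/vec_splat/vec_ste sequence that follows reduce the 8x8 block to two scalars: d, the sum of squared differences between the incoming block and the stored tempBlured block, and sysd, the plain signed sum of those differences. A scalar restatement; the array and function names are invented for this sketch:

    #include <stdint.h>

    /* Illustrative scalar form of the difference accumulation above. */
    static void block_diffs(const uint8_t cur[8][8], const uint8_t prev[8][8],
                            int *d, int *sysd)
    {
        int x, y;
        *d = 0;
        *sysd = 0;
        for (y = 0; y < 8; y++) {
            for (x = 0; x < 8; x++) {
                int diff = prev[y][x] - cur[y][x];   /* v_d = tempBlured - src   */
                *d    += diff * diff;                /* vec_msums(v_d, v_d, ..)  */
                *sysd += diff;                       /* vec_msums(v_d, 1, ..)    */
            }
        }
    }
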
861 860
862 v_dp = vec_sums(v_dp, zero); 861 v_dp = vec_sums(v_dp, zero);
863 v_sysdp = vec_sums(v_sysdp, zero); 862 v_sysdp = vec_sums(v_sysdp, zero);
864 863
865 v_dp = vec_splat(v_dp, 3); 864 v_dp = vec_splat(v_dp, 3);
866 v_sysdp = vec_splat(v_sysdp, 3); 865 v_sysdp = vec_splat(v_sysdp, 3);
867 866
868 vec_ste(v_dp, 0, &d); 867 vec_ste(v_dp, 0, &d);
869 vec_ste(v_sysdp, 0, &sysd); 868 vec_ste(v_sysdp, 0, &sysd);
870 869
871 i = d; 870 i = d;
872 d = (4*d 871 d = (4*d
873 +(*(tempBluredPast-256)) 872 +(*(tempBluredPast-256))
874 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) 873 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
875 +(*(tempBluredPast+256)) 874 +(*(tempBluredPast+256))
876 +4)>>3; 875 +4)>>3;
877 876
878 *tempBluredPast=i; 877 *tempBluredPast=i;
879 878
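The block just above then smooths the raw difference: the unsmoothed value is parked in *tempBluredPast, and d is replaced by a rounded weighted average of itself (weight 4) and the stored differences of the four neighbouring blocks reachable at offsets ±1 and ±256 in tempBluredPast (weight 1 each). A worked form of the same arithmetic, with the offsets taken directly from the code above and the function name invented for this sketch:

    #include <stdint.h>

    /* Illustrative restatement of the temporal smoothing of d above. */
    static int smooth_block_noise(int d, const uint32_t *tempBluredPast)
    {
        return (4 * d
                + (int)tempBluredPast[-256]
                + (int)tempBluredPast[-1]
                + (int)tempBluredPast[+1]
                + (int)tempBluredPast[+256]
                + 4) >> 3;            /* divide by 8 with rounding */
    }
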
880 if (d > maxNoise[1]) { 879 if (d > maxNoise[1]) {
881 if (d < maxNoise[2]) { 880 if (d < maxNoise[2]) {
882 #define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i); 881 #define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i);
883 882
884 OP(0); 883 OP(0);
885 OP(1); 884 OP(1);
886 OP(2); 885 OP(2);
887 OP(3); 886 OP(3);
888 OP(4); 887 OP(4);
889 OP(5); 888 OP(5);
890 OP(6); 889 OP(6);
891 OP(7); 890 OP(7);
892 #undef OP 891 #undef OP
892 } else {
893 #define OP(i) v_tempBluredAss##i = v_srcAss##i;
894
895 OP(0);
896 OP(1);
897 OP(2);
898 OP(3);
899 OP(4);
900 OP(5);
901 OP(6);
902 OP(7);
903 #undef OP
904 }
893 } else { 905 } else {
894 #define OP(i) v_tempBluredAss##i = v_srcAss##i; 906 if (d < maxNoise[0]) {
895 907 const vector signed short vsint16_7 = vec_splat_s16(7);
896 OP(0); 908 const vector signed short vsint16_4 = vec_splat_s16(4);
897 OP(1); 909 const vector unsigned short vuint16_3 = vec_splat_u16(3);
898 OP(2); 910
899 OP(3); 911 #define OP(i) \
900 OP(4); 912 const vector signed short v_temp##i = \
901 OP(5); 913 vec_mladd(v_tempBluredAss##i, \
902 OP(6); 914 vsint16_7, v_srcAss##i); \
903 OP(7); 915 const vector signed short v_temp2##i = \
916 vec_add(v_temp##i, vsint16_4); \
917 v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)
918
919 OP(0);
920 OP(1);
921 OP(2);
922 OP(3);
923 OP(4);
924 OP(5);
925 OP(6);
926 OP(7);
904 #undef OP 927 #undef OP
928 } else {
929 const vector signed short vsint16_3 = vec_splat_s16(3);
930 const vector signed short vsint16_2 = vec_splat_s16(2);
931
932 #define OP(i) \
933 const vector signed short v_temp##i = \
934 vec_mladd(v_tempBluredAss##i, \
935 vsint16_3, v_srcAss##i); \
936 const vector signed short v_temp2##i = \
937 vec_add(v_temp##i, vsint16_2); \
938 v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)
939
940 OP(0);
941 OP(1);
942 OP(2);
943 OP(3);
944 OP(4);
945 OP(5);
946 OP(6);
947 OP(7);
948 #undef OP
949 }
905 } 950 }
906 } else { 951
907 if (d < maxNoise[0]) { 952 const vector signed char neg1 = vec_splat_s8(-1);
908 const vector signed short vsint16_7 = vec_splat_s16(7); 953 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
909 const vector signed short vsint16_4 = vec_splat_s16(4); 954 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
910 const vector unsigned short vuint16_3 = vec_splat_u16(3);
911
912 #define OP(i) \
913 const vector signed short v_temp##i = \
914 vec_mladd(v_tempBluredAss##i, \
915 vsint16_7, v_srcAss##i); \
916 const vector signed short v_temp2##i = \
917 vec_add(v_temp##i, vsint16_4); \
918 v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)
919
920 OP(0);
921 OP(1);
922 OP(2);
923 OP(3);
924 OP(4);
925 OP(5);
926 OP(6);
927 OP(7);
928 #undef OP
929 } else {
930 const vector signed short vsint16_3 = vec_splat_s16(3);
931 const vector signed short vsint16_2 = vec_splat_s16(2);
932
933 #define OP(i) \
934 const vector signed short v_temp##i = \
935 vec_mladd(v_tempBluredAss##i, \
936 vsint16_3, v_srcAss##i); \
937 const vector signed short v_temp2##i = \
938 vec_add(v_temp##i, vsint16_2); \
939 v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)
940
941 OP(0);
942 OP(1);
943 OP(2);
944 OP(3);
945 OP(4);
946 OP(5);
947 OP(6);
948 OP(7);
949 #undef OP
950 }
951 }
952
953 const vector signed char neg1 = vec_splat_s8(-1);
954 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
955 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
956 955
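Both revisions implement the same decision tree on the smoothed difference d: a very large d is treated as a scene change and the new pixels are copied through, a moderately large d averages old and new, and a small d keeps most of the stored (blurred) block, with a 7:1 blend below maxNoise[0] and a 3:1 blend otherwise. A scalar per-pixel sketch of that logic; the function name and the oldp/newp parameters are invented here:

    #include <stdint.h>

    /* Illustrative per-pixel form of the noise thresholds above.
     * oldp is the tempBlured (history) pixel, newp the incoming src pixel. */
    static uint8_t blend_pixel(int d, const int maxNoise[3], int oldp, int newp)
    {
        if (d > maxNoise[1]) {
            if (d < maxNoise[2])
                return (uint8_t)((oldp + newp + 1) >> 1);  /* vec_avg            */
            return (uint8_t)newp;                          /* copy: scene change */
        }
        if (d < maxNoise[0])
            return (uint8_t)((7 * oldp + newp + 4) >> 3);  /* vec_mladd, >> 3 */
        return (uint8_t)((3 * oldp + newp + 2) >> 2);      /* vec_mladd, >> 2 */
    }

The blended result then becomes both the output pixel and the new history, which is why PACK_AND_STORE below writes v_tempBluredAss back to src and tempBlured alike.
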
957 #define PACK_AND_STORE(src, i) \ 956 #define PACK_AND_STORE(src, i) \
958 const vector unsigned char perms##src##i = \ 957 const vector unsigned char perms##src##i = \
959 vec_lvsr(i * stride, src); \ 958 vec_lvsr(i * stride, src); \
960 const vector unsigned char vf##src##i = \ 959 const vector unsigned char vf##src##i = \
961 vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \ 960 vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \
962 const vector unsigned char vg##src##i = \ 961 const vector unsigned char vg##src##i = \
963 vec_perm(vf##src##i, v_##src##A##i, permHH); \ 962 vec_perm(vf##src##i, v_##src##A##i, permHH); \
964 const vector unsigned char mask##src##i = \ 963 const vector unsigned char mask##src##i = \
965 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \ 964 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
966 const vector unsigned char vg2##src##i = \ 965 const vector unsigned char vg2##src##i = \
967 vec_perm(vg##src##i, vg##src##i, perms##src##i); \ 966 vec_perm(vg##src##i, vg##src##i, perms##src##i); \
968 const vector unsigned char svA##src##i = \ 967 const vector unsigned char svA##src##i = \
969 vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \ 968 vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \
970 const vector unsigned char svB##src##i = \ 969 const vector unsigned char svB##src##i = \
971 vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \ 970 vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \
972 vec_st(svA##src##i, i * stride, src); \ 971 vec_st(svA##src##i, i * stride, src); \
973 vec_st(svB##src##i, i * stride + 16, src) 972 vec_st(svB##src##i, i * stride + 16, src)
974 973
975 PACK_AND_STORE(src, 0); 974 PACK_AND_STORE(src, 0);
976 PACK_AND_STORE(src, 1); 975 PACK_AND_STORE(src, 1);
977 PACK_AND_STORE(src, 2); 976 PACK_AND_STORE(src, 2);
978 PACK_AND_STORE(src, 3); 977 PACK_AND_STORE(src, 3);
979 PACK_AND_STORE(src, 4); 978 PACK_AND_STORE(src, 4);
980 PACK_AND_STORE(src, 5); 979 PACK_AND_STORE(src, 5);
981 PACK_AND_STORE(src, 6); 980 PACK_AND_STORE(src, 6);
982 PACK_AND_STORE(src, 7); 981 PACK_AND_STORE(src, 7);
983 PACK_AND_STORE(tempBlured, 0); 982 PACK_AND_STORE(tempBlured, 0);
984 PACK_AND_STORE(tempBlured, 1); 983 PACK_AND_STORE(tempBlured, 1);
985 PACK_AND_STORE(tempBlured, 2); 984 PACK_AND_STORE(tempBlured, 2);
986 PACK_AND_STORE(tempBlured, 3); 985 PACK_AND_STORE(tempBlured, 3);
987 PACK_AND_STORE(tempBlured, 4); 986 PACK_AND_STORE(tempBlured, 4);
988 PACK_AND_STORE(tempBlured, 5); 987 PACK_AND_STORE(tempBlured, 5);
989 PACK_AND_STORE(tempBlured, 6); 988 PACK_AND_STORE(tempBlured, 6);
990 PACK_AND_STORE(tempBlured, 7); 989 PACK_AND_STORE(tempBlured, 7);
991 #undef PACK_AND_STORE 990 #undef PACK_AND_STORE
992 } 991 }
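STORE_LINE earlier in this file and PACK_AND_STORE above share the same unaligned read-modify-write store idiom: load the two aligned vectors covering the destination, rotate the payload into position with the vec_lvsr permutation, merge it in under a mask built from zero and -1 bytes, and write both vectors back. A plain-C model of the byte movement; the helper name is an invention of this sketch:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar equivalent of the vec_lvsr/vec_perm/vec_sel/vec_st store. */
    static void store_unaligned_16(uint8_t *addr, const uint8_t in[16])
    {
        uint8_t *base = (uint8_t *)((uintptr_t)addr & ~(uintptr_t)15);
        int shift = (int)((uintptr_t)addr & 15);
        uint8_t both[32];
        memcpy(both, base, 32);        /* the two aligned loads              */
        memcpy(both + shift, in, 16);  /* vec_perm + vec_sel: splice payload */
        memcpy(base, both, 32);        /* the two aligned stores             */
    }
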
993 992
994 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { 993 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
995 const vector unsigned char zero = vec_splat_u8(0); 994 const vector unsigned char zero = vec_splat_u8(0);
996 995
997 #define LOAD_DOUBLE_LINE(i, j) \ 996 #define LOAD_DOUBLE_LINE(i, j) \
998 vector unsigned char perm1##i = vec_lvsl(i * stride, src); \ 997 vector unsigned char perm1##i = vec_lvsl(i * stride, src); \
999 vector unsigned char perm2##i = vec_lvsl(j * stride, src); \ 998 vector unsigned char perm2##i = vec_lvsl(j * stride, src); \
1000 vector unsigned char srcA##i = vec_ld(i * stride, src); \ 999 vector unsigned char srcA##i = vec_ld(i * stride, src); \
1001 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \ 1000 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
1002 vector unsigned char srcC##i = vec_ld(j * stride, src); \ 1001 vector unsigned char srcC##i = vec_ld(j * stride, src); \
1003 vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \ 1002 vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \
1004 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \ 1003 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
1005 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i) 1004 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1006 1005
1007 LOAD_DOUBLE_LINE(0, 1); 1006 LOAD_DOUBLE_LINE(0, 1);
1008 LOAD_DOUBLE_LINE(2, 3); 1007 LOAD_DOUBLE_LINE(2, 3);
1009 LOAD_DOUBLE_LINE(4, 5); 1008 LOAD_DOUBLE_LINE(4, 5);
1010 LOAD_DOUBLE_LINE(6, 7); 1009 LOAD_DOUBLE_LINE(6, 7);
1011 #undef LOAD_DOUBLE_LINE 1010 #undef LOAD_DOUBLE_LINE
1012 1011
1013 vector unsigned char tempA = vec_mergeh(src0, zero); 1012 vector unsigned char tempA = vec_mergeh(src0, zero);
1014 vector unsigned char tempB = vec_mergel(src0, zero); 1013 vector unsigned char tempB = vec_mergel(src0, zero);
1015 vector unsigned char tempC = vec_mergeh(src1, zero); 1014 vector unsigned char tempC = vec_mergeh(src1, zero);
1016 vector unsigned char tempD = vec_mergel(src1, zero); 1015 vector unsigned char tempD = vec_mergel(src1, zero);
1017 vector unsigned char tempE = vec_mergeh(src2, zero); 1016 vector unsigned char tempE = vec_mergeh(src2, zero);
1018 vector unsigned char tempF = vec_mergel(src2, zero); 1017 vector unsigned char tempF = vec_mergel(src2, zero);
1019 vector unsigned char tempG = vec_mergeh(src3, zero); 1018 vector unsigned char tempG = vec_mergeh(src3, zero);
1020 vector unsigned char tempH = vec_mergel(src3, zero); 1019 vector unsigned char tempH = vec_mergel(src3, zero);
1021 vector unsigned char tempI = vec_mergeh(src4, zero); 1020 vector unsigned char tempI = vec_mergeh(src4, zero);
1022 vector unsigned char tempJ = vec_mergel(src4, zero); 1021 vector unsigned char tempJ = vec_mergel(src4, zero);
1023 vector unsigned char tempK = vec_mergeh(src5, zero); 1022 vector unsigned char tempK = vec_mergeh(src5, zero);
1024 vector unsigned char tempL = vec_mergel(src5, zero); 1023 vector unsigned char tempL = vec_mergel(src5, zero);
1025 vector unsigned char tempM = vec_mergeh(src6, zero); 1024 vector unsigned char tempM = vec_mergeh(src6, zero);
1026 vector unsigned char tempN = vec_mergel(src6, zero); 1025 vector unsigned char tempN = vec_mergel(src6, zero);
1027 vector unsigned char tempO = vec_mergeh(src7, zero); 1026 vector unsigned char tempO = vec_mergeh(src7, zero);
1028 vector unsigned char tempP = vec_mergel(src7, zero); 1027 vector unsigned char tempP = vec_mergel(src7, zero);
1029 1028
1030 vector unsigned char temp0 = vec_mergeh(tempA, tempI); 1029 vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1031 vector unsigned char temp1 = vec_mergel(tempA, tempI); 1030 vector unsigned char temp1 = vec_mergel(tempA, tempI);
1032 vector unsigned char temp2 = vec_mergeh(tempB, tempJ); 1031 vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
1033 vector unsigned char temp3 = vec_mergel(tempB, tempJ); 1032 vector unsigned char temp3 = vec_mergel(tempB, tempJ);
1034 vector unsigned char temp4 = vec_mergeh(tempC, tempK); 1033 vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1035 vector unsigned char temp5 = vec_mergel(tempC, tempK); 1034 vector unsigned char temp5 = vec_mergel(tempC, tempK);
1036 vector unsigned char temp6 = vec_mergeh(tempD, tempL); 1035 vector unsigned char temp6 = vec_mergeh(tempD, tempL);
1037 vector unsigned char temp7 = vec_mergel(tempD, tempL); 1036 vector unsigned char temp7 = vec_mergel(tempD, tempL);
1038 vector unsigned char temp8 = vec_mergeh(tempE, tempM); 1037 vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1039 vector unsigned char temp9 = vec_mergel(tempE, tempM); 1038 vector unsigned char temp9 = vec_mergel(tempE, tempM);
1040 vector unsigned char temp10 = vec_mergeh(tempF, tempN); 1039 vector unsigned char temp10 = vec_mergeh(tempF, tempN);
1041 vector unsigned char temp11 = vec_mergel(tempF, tempN); 1040 vector unsigned char temp11 = vec_mergel(tempF, tempN);
1042 vector unsigned char temp12 = vec_mergeh(tempG, tempO); 1041 vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1043 vector unsigned char temp13 = vec_mergel(tempG, tempO); 1042 vector unsigned char temp13 = vec_mergel(tempG, tempO);
1044 vector unsigned char temp14 = vec_mergeh(tempH, tempP); 1043 vector unsigned char temp14 = vec_mergeh(tempH, tempP);
1045 vector unsigned char temp15 = vec_mergel(tempH, tempP); 1044 vector unsigned char temp15 = vec_mergel(tempH, tempP);
1046 1045
1047 tempA = vec_mergeh(temp0, temp8); 1046 tempA = vec_mergeh(temp0, temp8);
1048 tempB = vec_mergel(temp0, temp8); 1047 tempB = vec_mergel(temp0, temp8);
1049 tempC = vec_mergeh(temp1, temp9); 1048 tempC = vec_mergeh(temp1, temp9);
1050 tempD = vec_mergel(temp1, temp9); 1049 tempD = vec_mergel(temp1, temp9);
1051 tempE = vec_mergeh(temp2, temp10); 1050 tempE = vec_mergeh(temp2, temp10);
1052 tempF = vec_mergel(temp2, temp10); 1051 tempF = vec_mergel(temp2, temp10);
1053 tempG = vec_mergeh(temp3, temp11); 1052 tempG = vec_mergeh(temp3, temp11);
1054 tempH = vec_mergel(temp3, temp11); 1053 tempH = vec_mergel(temp3, temp11);
1055 tempI = vec_mergeh(temp4, temp12); 1054 tempI = vec_mergeh(temp4, temp12);
1056 tempJ = vec_mergel(temp4, temp12); 1055 tempJ = vec_mergel(temp4, temp12);
1057 tempK = vec_mergeh(temp5, temp13); 1056 tempK = vec_mergeh(temp5, temp13);
1058 tempL = vec_mergel(temp5, temp13); 1057 tempL = vec_mergel(temp5, temp13);
1059 tempM = vec_mergeh(temp6, temp14); 1058 tempM = vec_mergeh(temp6, temp14);
1060 tempN = vec_mergel(temp6, temp14); 1059 tempN = vec_mergel(temp6, temp14);
1061 tempO = vec_mergeh(temp7, temp15); 1060 tempO = vec_mergeh(temp7, temp15);
1062 tempP = vec_mergel(temp7, temp15); 1061 tempP = vec_mergel(temp7, temp15);
1063 1062
1064 temp0 = vec_mergeh(tempA, tempI); 1063 temp0 = vec_mergeh(tempA, tempI);
1065 temp1 = vec_mergel(tempA, tempI); 1064 temp1 = vec_mergel(tempA, tempI);
1066 temp2 = vec_mergeh(tempB, tempJ); 1065 temp2 = vec_mergeh(tempB, tempJ);
1067 temp3 = vec_mergel(tempB, tempJ); 1066 temp3 = vec_mergel(tempB, tempJ);
1068 temp4 = vec_mergeh(tempC, tempK); 1067 temp4 = vec_mergeh(tempC, tempK);
1069 temp5 = vec_mergel(tempC, tempK); 1068 temp5 = vec_mergel(tempC, tempK);
1070 temp6 = vec_mergeh(tempD, tempL); 1069 temp6 = vec_mergeh(tempD, tempL);
1071 temp7 = vec_mergel(tempD, tempL); 1070 temp7 = vec_mergel(tempD, tempL);
1072 temp8 = vec_mergeh(tempE, tempM); 1071 temp8 = vec_mergeh(tempE, tempM);
1073 temp9 = vec_mergel(tempE, tempM); 1072 temp9 = vec_mergel(tempE, tempM);
1074 temp10 = vec_mergeh(tempF, tempN); 1073 temp10 = vec_mergeh(tempF, tempN);
1075 temp11 = vec_mergel(tempF, tempN); 1074 temp11 = vec_mergel(tempF, tempN);
1076 temp12 = vec_mergeh(tempG, tempO); 1075 temp12 = vec_mergeh(tempG, tempO);
1077 temp13 = vec_mergel(tempG, tempO); 1076 temp13 = vec_mergel(tempG, tempO);
1078 temp14 = vec_mergeh(tempH, tempP); 1077 temp14 = vec_mergeh(tempH, tempP);
1079 temp15 = vec_mergel(tempH, tempP); 1078 temp15 = vec_mergel(tempH, tempP);
1080 1079
1081 vec_st(temp0, 0, dst); 1080 vec_st(temp0, 0, dst);
1082 vec_st(temp1, 16, dst); 1081 vec_st(temp1, 16, dst);
1083 vec_st(temp2, 32, dst); 1082 vec_st(temp2, 32, dst);
1084 vec_st(temp3, 48, dst); 1083 vec_st(temp3, 48, dst);
1085 vec_st(temp4, 64, dst); 1084 vec_st(temp4, 64, dst);
1086 vec_st(temp5, 80, dst); 1085 vec_st(temp5, 80, dst);
1087 vec_st(temp6, 96, dst); 1086 vec_st(temp6, 96, dst);
1088 vec_st(temp7, 112, dst); 1087 vec_st(temp7, 112, dst);
1089 vec_st(temp8, 128, dst); 1088 vec_st(temp8, 128, dst);
1090 vec_st(temp9, 144, dst); 1089 vec_st(temp9, 144, dst);
1091 vec_st(temp10, 160, dst); 1090 vec_st(temp10, 160, dst);
1092 vec_st(temp11, 176, dst); 1091 vec_st(temp11, 176, dst);
1093 vec_st(temp12, 192, dst); 1092 vec_st(temp12, 192, dst);
1094 vec_st(temp13, 208, dst); 1093 vec_st(temp13, 208, dst);
1095 vec_st(temp14, 224, dst); 1094 vec_st(temp14, 224, dst);
1096 vec_st(temp15, 240, dst); 1095 vec_st(temp15, 240, dst);
1097 } 1096 }
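transpose_16x8_char_toPackedAlign_altivec above is a classic merge-network transpose: one vec_mergeh/vec_mergel round against zero followed by three further merge rounds, so that each of the 16 source columns ends up as the first 8 bytes of one aligned 16-byte line in dst, with the remaining 8 bytes left as zero padding. A scalar statement of the resulting data movement, meant only to describe the layout, not the vector algorithm; the function name is invented for this sketch:

    #include <stdint.h>

    /* Illustrative scalar equivalent of the packed-aligned 16x8 transpose above. */
    static void transpose_16x8_to_packed(uint8_t *dst, const uint8_t *src, int stride)
    {
        int row, col;
        for (col = 0; col < 16; col++) {
            for (row = 0; row < 8; row++)
                dst[col * 16 + row] = src[row * stride + col]; /* column -> packed line */
            for (row = 8; row < 16; row++)
                dst[col * 16 + row] = 0;                       /* upper half: padding   */
        }
    }
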
1098 1097
1099 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { 1098 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1100 const vector unsigned char zero = vec_splat_u8(0); 1099 const vector unsigned char zero = vec_splat_u8(0);
1101 1100
1102 #define LOAD_DOUBLE_LINE(i, j) \ 1101 #define LOAD_DOUBLE_LINE(i, j) \
1103 vector unsigned char src##i = vec_ld(i * 16, src); \ 1102 vector unsigned char src##i = vec_ld(i * 16, src); \
1104 vector unsigned char src##j = vec_ld(j * 16, src) 1103 vector unsigned char src##j = vec_ld(j * 16, src)
1105 1104
1106 LOAD_DOUBLE_LINE(0, 1); 1105 LOAD_DOUBLE_LINE(0, 1);
1107 LOAD_DOUBLE_LINE(2, 3); 1106 LOAD_DOUBLE_LINE(2, 3);
1108 LOAD_DOUBLE_LINE(4, 5); 1107 LOAD_DOUBLE_LINE(4, 5);
1109 LOAD_DOUBLE_LINE(6, 7); 1108 LOAD_DOUBLE_LINE(6, 7);
1110 LOAD_DOUBLE_LINE(8, 9); 1109 LOAD_DOUBLE_LINE(8, 9);
1111 LOAD_DOUBLE_LINE(10, 11); 1110 LOAD_DOUBLE_LINE(10, 11);
1112 LOAD_DOUBLE_LINE(12, 13); 1111 LOAD_DOUBLE_LINE(12, 13);
1113 LOAD_DOUBLE_LINE(14, 15); 1112 LOAD_DOUBLE_LINE(14, 15);
1114 #undef LOAD_DOUBLE_LINE 1113 #undef LOAD_DOUBLE_LINE
1115 1114
1116 vector unsigned char tempA = vec_mergeh(src0, src8); 1115 vector unsigned char tempA = vec_mergeh(src0, src8);
1117 vector unsigned char tempB; 1116 vector unsigned char tempB;
1118 vector unsigned char tempC = vec_mergeh(src1, src9); 1117 vector unsigned char tempC = vec_mergeh(src1, src9);
1119 vector unsigned char tempD; 1118 vector unsigned char tempD;
1120 vector unsigned char tempE = vec_mergeh(src2, src10); 1119 vector unsigned char tempE = vec_mergeh(src2, src10);
1121 vector unsigned char tempG = vec_mergeh(src3, src11); 1120 vector unsigned char tempG = vec_mergeh(src3, src11);
1122 vector unsigned char tempI = vec_mergeh(src4, src12); 1121 vector unsigned char tempI = vec_mergeh(src4, src12);
1123 vector unsigned char tempJ; 1122 vector unsigned char tempJ;
1124 vector unsigned char tempK = vec_mergeh(src5, src13); 1123 vector unsigned char tempK = vec_mergeh(src5, src13);
1125 vector unsigned char tempL; 1124 vector unsigned char tempL;
1126 vector unsigned char tempM = vec_mergeh(src6, src14); 1125 vector unsigned char tempM = vec_mergeh(src6, src14);
1127 vector unsigned char tempO = vec_mergeh(src7, src15); 1126 vector unsigned char tempO = vec_mergeh(src7, src15);
1128 1127
1129 vector unsigned char temp0 = vec_mergeh(tempA, tempI); 1128 vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1130 vector unsigned char temp1 = vec_mergel(tempA, tempI); 1129 vector unsigned char temp1 = vec_mergel(tempA, tempI);
1131 vector unsigned char temp2; 1130 vector unsigned char temp2;
1132 vector unsigned char temp3; 1131 vector unsigned char temp3;
1133 vector unsigned char temp4 = vec_mergeh(tempC, tempK); 1132 vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1134 vector unsigned char temp5 = vec_mergel(tempC, tempK); 1133 vector unsigned char temp5 = vec_mergel(tempC, tempK);
1135 vector unsigned char temp6; 1134 vector unsigned char temp6;
1136 vector unsigned char temp7; 1135 vector unsigned char temp7;
1137 vector unsigned char temp8 = vec_mergeh(tempE, tempM); 1136 vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1138 vector unsigned char temp9 = vec_mergel(tempE, tempM); 1137 vector unsigned char temp9 = vec_mergel(tempE, tempM);
1139 vector unsigned char temp12 = vec_mergeh(tempG, tempO); 1138 vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1140 vector unsigned char temp13 = vec_mergel(tempG, tempO); 1139 vector unsigned char temp13 = vec_mergel(tempG, tempO);
1141 1140
1142 tempA = vec_mergeh(temp0, temp8); 1141 tempA = vec_mergeh(temp0, temp8);
1143 tempB = vec_mergel(temp0, temp8); 1142 tempB = vec_mergel(temp0, temp8);
1144 tempC = vec_mergeh(temp1, temp9); 1143 tempC = vec_mergeh(temp1, temp9);
1145 tempD = vec_mergel(temp1, temp9); 1144 tempD = vec_mergel(temp1, temp9);
1146 tempI = vec_mergeh(temp4, temp12); 1145 tempI = vec_mergeh(temp4, temp12);
1147 tempJ = vec_mergel(temp4, temp12); 1146 tempJ = vec_mergel(temp4, temp12);
1148 tempK = vec_mergeh(temp5, temp13); 1147 tempK = vec_mergeh(temp5, temp13);
1149 tempL = vec_mergel(temp5, temp13); 1148 tempL = vec_mergel(temp5, temp13);
1150 1149
1151 temp0 = vec_mergeh(tempA, tempI); 1150 temp0 = vec_mergeh(tempA, tempI);
1152 temp1 = vec_mergel(tempA, tempI); 1151 temp1 = vec_mergel(tempA, tempI);
1153 temp2 = vec_mergeh(tempB, tempJ); 1152 temp2 = vec_mergeh(tempB, tempJ);
1154 temp3 = vec_mergel(tempB, tempJ); 1153 temp3 = vec_mergel(tempB, tempJ);
1155 temp4 = vec_mergeh(tempC, tempK); 1154 temp4 = vec_mergeh(tempC, tempK);
1156 temp5 = vec_mergel(tempC, tempK); 1155 temp5 = vec_mergel(tempC, tempK);
1157 temp6 = vec_mergeh(tempD, tempL); 1156 temp6 = vec_mergeh(tempD, tempL);
1158 temp7 = vec_mergel(tempD, tempL); 1157 temp7 = vec_mergel(tempD, tempL);
1159 1158
1160 1159
1161 const vector signed char neg1 = vec_splat_s8(-1); 1160 const vector signed char neg1 = vec_splat_s8(-1);
1162 #define STORE_DOUBLE_LINE(i, j) \ 1161 #define STORE_DOUBLE_LINE(i, j) \
1163 vector unsigned char dstA##i = vec_ld(i * stride, dst); \ 1162 vector unsigned char dstA##i = vec_ld(i * stride, dst); \
1164 vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \ 1163 vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \
1165 vector unsigned char dstA##j = vec_ld(j * stride, dst); \ 1164 vector unsigned char dstA##j = vec_ld(j * stride, dst); \
1166 vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst); \ 1165 vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst); \
1167 vector unsigned char align##i = vec_lvsr(i * stride, dst); \ 1166 vector unsigned char align##i = vec_lvsr(i * stride, dst); \
1168 vector unsigned char align##j = vec_lvsr(j * stride, dst); \ 1167 vector unsigned char align##j = vec_lvsr(j * stride, dst); \
1169 vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \ 1168 vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \
1170 vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \ 1169 vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \
1171 vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i); \ 1170 vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);\
1172 vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j); \ 1171 vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);\
1173 vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \ 1172 vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \
1174 vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \ 1173 vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \
1175 vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \ 1174 vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \
1176 vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \ 1175 vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \
1177 vec_st(dstAF##i, i * stride, dst); \ 1176 vec_st(dstAF##i, i * stride, dst); \
1178 vec_st(dstBF##i, i * stride + 16, dst); \ 1177 vec_st(dstBF##i, i * stride + 16, dst); \
1179 vec_st(dstAF##j, j * stride, dst); \ 1178 vec_st(dstAF##j, j * stride, dst); \
1180 vec_st(dstBF##j, j * stride + 16, dst) 1179 vec_st(dstBF##j, j * stride + 16, dst)
1181 1180
1182 STORE_DOUBLE_LINE(0,1); 1181 STORE_DOUBLE_LINE(0,1);
1183 STORE_DOUBLE_LINE(2,3); 1182 STORE_DOUBLE_LINE(2,3);
1184 STORE_DOUBLE_LINE(4,5); 1183 STORE_DOUBLE_LINE(4,5);
1185 STORE_DOUBLE_LINE(6,7); 1184 STORE_DOUBLE_LINE(6,7);
1186 } 1185 }
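transpose_8x16_char_fromPackedAlign_altivec undoes the layout produced by the function above: it reads the 16 packed lines (only the first 8 bytes of each carry data, which is why only the mergeh halves are needed in the first round) and writes the reassembled 8 rows of 16 pixels back to a strided destination, STORE_DOUBLE_LINE using the same read-modify-write store idiom as PACK_AND_STORE earlier. A scalar statement of the inverse movement, under the assumption that src is in the packed layout described above; the function name is invented for this sketch:

    #include <stdint.h>

    /* Illustrative scalar equivalent of the packed-aligned 8x16 inverse transpose. */
    static void transpose_packed_to_8x16(uint8_t *dst, const uint8_t *src, int stride)
    {
        int row, col;
        for (row = 0; row < 8; row++)
            for (col = 0; col < 16; col++)
                dst[row * stride + col] = src[col * 16 + row]; /* packed line -> row */
    }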