comparison libpostproc/postprocess_altivec_template.c @ 2979:bfabfdf9ce55 libavcodec
COSMETICS: tabs --> spaces, some prettyprinting
author   | diego
date     | Thu, 22 Dec 2005 01:10:11 +0000
parents  | ef2149182f1c
children | 0b546eab515d
comparison
2978:403183bbb505 | 2979:bfabfdf9ce55
24  #else
25  #define AVV(x...) {x}
26  #endif
27
28  #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
29      do { \
30          __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
31          __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
32          __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
33          __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
34          tempA1 = vec_mergeh (src_a, src_e); \
35          tempB1 = vec_mergel (src_a, src_e); \
36          tempC1 = vec_mergeh (src_b, src_f); \
37          tempD1 = vec_mergel (src_b, src_f); \
38          tempE1 = vec_mergeh (src_c, src_g); \
39          tempF1 = vec_mergel (src_c, src_g); \
40          tempG1 = vec_mergeh (src_d, src_h); \
41          tempH1 = vec_mergel (src_d, src_h); \
42          tempA2 = vec_mergeh (tempA1, tempE1); \
43          tempB2 = vec_mergel (tempA1, tempE1); \
44          tempC2 = vec_mergeh (tempB1, tempF1); \
45          tempD2 = vec_mergel (tempB1, tempF1); \
46          tempE2 = vec_mergeh (tempC1, tempG1); \
47          tempF2 = vec_mergel (tempC1, tempG1); \
48          tempG2 = vec_mergeh (tempD1, tempH1); \
49          tempH2 = vec_mergel (tempD1, tempH1); \
50          src_a = vec_mergeh (tempA2, tempE2); \
51          src_b = vec_mergel (tempA2, tempE2); \
52          src_c = vec_mergeh (tempB2, tempF2); \
53          src_d = vec_mergel (tempB2, tempF2); \
54          src_e = vec_mergeh (tempC2, tempG2); \
55          src_f = vec_mergel (tempC2, tempG2); \
56          src_g = vec_mergeh (tempD2, tempH2); \
57          src_h = vec_mergel (tempD2, tempH2); \
58      } while (0)
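/* Editorial note: a minimal usage sketch of the macro above (not part of the
 * original file; the row variables r0..r7 are hypothetical). Three rounds of
 * vec_mergeh/vec_mergel interleaving exchange the rows and columns of an 8x8
 * block of 16-bit values held entirely in registers:
 *
 *     vector signed short r0, r1, r2, r3, r4, r5, r6, r7;
 *     // ... load eight rows of eight shorts into r0..r7 ...
 *     ALTIVEC_TRANSPOSE_8x8_SHORT(r0, r1, r2, r3, r4, r5, r6, r7);
 *     // r0 now holds what was column 0, r1 column 1, and so on
 */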
59
60
61  static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
62      /*
92
93      src2 += stride * 4;
94
95      vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
96
97  #define LOAD_LINE(i) \
98      register int j##i = i * stride; \
99      vector unsigned char perm##i = vec_lvsl(j##i, src2); \
100     const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
101     vector unsigned char v_srcA2##i; \
102     if (two_vectors) \
103         v_srcA2##i = vec_ld(j##i + 16, src2); \
104     const vector unsigned char v_srcA##i = \
105         vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
106     v_srcAss##i = \
107         (vector signed short)vec_mergeh((vector signed char)zero, \
108                                         (vector signed char)v_srcA##i)
109
110 #define LOAD_LINE_ALIGNED(i) \
111     register int j##i = i * stride; \
112     const vector unsigned char v_srcA##i = vec_ld(j##i, src2); \
113     v_srcAss##i = \
114         (vector signed short)vec_mergeh((vector signed char)zero, \
115                                         (vector signed char)v_srcA##i)
116
117     // special casing the aligned case is worthwhile, as all calls from
118     // the (transposed) horizontal deblocks will be aligned, in addition
119     // to the naturally aligned vertical deblocks.
120     if (properStride && srcAlign) {
137         LOAD_LINE(7);
138     }
139 #undef LOAD_LINE
140 #undef LOAD_LINE_ALIGNED
141
142 #define ITER(i, j) \
143     const vector signed short v_diff##i = \
144         vec_sub(v_srcAss##i, v_srcAss##j); \
145     const vector signed short v_sum##i = \
146         vec_add(v_diff##i, v_dcOffset); \
147     const vector signed short v_comp##i = \
148         (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
149                                        v_dcThreshold); \
150     const vector signed short v_part##i = vec_and(mask, v_comp##i); \
151     v_numEq = vec_sum4s(v_part##i, v_numEq);
152
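/* Editorial note: ITER(i, j) is the usual branch-free "almost equal" test:
 * the difference between neighbouring lines is biased by v_dcOffset and then
 * compared as an unsigned value against v_dcThreshold, which checks that the
 * signed difference falls inside a small window around zero (both constants
 * come from the elided setup code). vec_sum4s accumulates the per-lane hits
 * into v_numEq, the flatness count tested against flatnessThreshold below. */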
153     ITER(0, 1);
154     ITER(1, 2);
155     ITER(2, 3);
165     vec_ste(v_numEq, 0, &numEq);
166
167     if (numEq > c->ppMode.flatnessThreshold)
168     {
169         const vector unsigned char mmoP1 = (const vector unsigned char)
170             AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
171                 0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
172         const vector unsigned char mmoP2 = (const vector unsigned char)
173             AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
174                 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
175         const vector unsigned char mmoP = (const vector unsigned char)
176             vec_lvsl(8, (unsigned char*)0);
177
178         vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
179         vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
180         vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
181         vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
183         vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
184         vector signed short mmoDiff = vec_sub(mmoL, mmoR);
185         vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
186
187         if (vec_any_gt(mmoSum, v4QP))
188             return 0;
189         else
190             return 1;
191     }
192     else return 2;
193 }
194
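/* Editorial aside: the LOAD_LINE macro above uses the standard AltiVec
 * unaligned-load idiom. vec_ld only fetches from 16-byte-aligned addresses, so
 * a load from an arbitrary address is assembled from two aligned loads and a
 * vec_perm. A minimal self-contained sketch (the helper name is hypothetical,
 * not part of this file):
 */
static inline vector unsigned char load_unaligned_16(const uint8_t *p)
{
    const vector unsigned char align = vec_lvsl(0, p); /* permute derived from the low bits of p */
    const vector unsigned char lo    = vec_ld( 0, p);  /* aligned block containing p             */
    const vector unsigned char hi    = vec_ld(16, p);  /* following aligned block                */
    return vec_perm(lo, hi, align);                    /* the 16 bytes starting at p             */
}
/* The second vec_ld can read past the end of the buffer when the data is
 * already aligned, which is presumably why LOAD_LINE only issues it when
 * two_vectors is set. */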
195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
216     vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
217     vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
218     vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
219
220 #define LOAD_LINE(i) \
221     const vector unsigned char perml##i = \
222         vec_lvsl(i * stride, src2); \
223     vbA##i = vec_ld(i * stride, src2); \
224     vbB##i = vec_ld(i * stride + 16, src2); \
225     vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \
226     vb##i = \
227         (vector signed short)vec_mergeh((vector unsigned char)zero, \
228                                         (vector unsigned char)vbT##i)
229
230 #define LOAD_LINE_ALIGNED(i) \
231     register int j##i = i * stride; \
232     vbT##i = vec_ld(j##i, src2); \
233     vb##i = \
234         (vector signed short)vec_mergeh((vector signed char)zero, \
235                                         (vector signed char)vbT##i)
236
237     // special casing the aligned case is worthwhile, as all calls from
238     // the (transposed) horizontal deblocks will be aligned, in addition
239     // to the naturally aligned vertical deblocks.
240     if (properStride && srcAlign) {
306     const vector signed short v_sumsB8 = vec_add(temp81, v_last);
307
308     const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
309     const vector signed short v_sumsB9 = vec_add(temp91, v_last);
310
311 #define COMPUTE_VR(i, j, k) \
312     const vector signed short temps1##i = \
313         vec_add(v_sumsB##i, v_sumsB##k); \
314     const vector signed short temps2##i = \
315         vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
316     const vector signed short vr##j = vec_sra(temps2##i, v_4)
317
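/* Editorial note: COMPUTE_VR(i, j, k) produces one filtered line as
 * vr##j = (v_sumsB##i + v_sumsB##k + 2*vb##j) >> 4, i.e. a weighted average
 * drawn from the running sums v_sumsB0..v_sumsB9 built above. */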
318     COMPUTE_VR(0, 1, 2);
319     COMPUTE_VR(1, 2, 3);
320     COMPUTE_VR(2, 3, 4);
324     COMPUTE_VR(6, 7, 8);
325     COMPUTE_VR(7, 8, 9);
326
327     const vector signed char neg1 = vec_splat_s8(-1);
328     const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
329                                                                         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
330
331 #define PACK_AND_STORE(i) \
332     const vector unsigned char perms##i = \
333         vec_lvsr(i * stride, src2); \
334     const vector unsigned char vf##i = \
335         vec_packsu(vr##i, (vector signed short)zero); \
336     const vector unsigned char vg##i = \
337         vec_perm(vf##i, vbT##i, permHH); \
338     const vector unsigned char mask##i = \
339         vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
340     const vector unsigned char vg2##i = \
341         vec_perm(vg##i, vg##i, perms##i); \
342     const vector unsigned char svA##i = \
343         vec_sel(vbA##i, vg2##i, mask##i); \
344     const vector unsigned char svB##i = \
345         vec_sel(vg2##i, vbB##i, mask##i); \
346     vec_st(svA##i, i * stride, src2); \
347     vec_st(svB##i, i * stride + 16, src2)
348
349 #define PACK_AND_STORE_ALIGNED(i) \
350     const vector unsigned char vf##i = \
351         vec_packsu(vr##i, (vector signed short)zero); \
352     const vector unsigned char vg##i = \
353         vec_perm(vf##i, vbT##i, permHH); \
354     vec_st(vg##i, i * stride, src2)
355
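/* Editorial note on the store side: PACK_AND_STORE packs the filtered shorts
 * back to bytes, re-attaches the untouched second half of the original line
 * via permHH, and then does an unaligned read-modify-write store: vec_lvsr
 * yields the rotation for the destination address, a 0x00/0xFF byte mask built
 * with the same permute marks which bytes belong to the new data, vec_sel
 * merges them into the two aligned blocks loaded earlier (vbA/vbB), and both
 * blocks are written back with vec_st. The _ALIGNED variant can skip the
 * masking and store vg##i directly. */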
356     // special casing the aligned case is worthwhile, as all calls from
357     // the (transposed) horizontal deblocks will be aligned, in addition
358     // to the naturally aligned vertical deblocks.
396     qp[0] = 8*c->QP;
397     vector signed short vqp = vec_ld(0, qp);
398     vqp = vec_splat(vqp, 0);
399
400 #define LOAD_LINE(i) \
401     const vector unsigned char perm##i = \
402         vec_lvsl(i * stride, src2); \
403     const vector unsigned char vbA##i = \
404         vec_ld(i * stride, src2); \
405     const vector unsigned char vbB##i = \
406         vec_ld(i * stride + 16, src2); \
407     const vector unsigned char vbT##i = \
408         vec_perm(vbA##i, vbB##i, perm##i); \
409     const vector signed short vb##i = \
410         (vector signed short)vec_mergeh((vector unsigned char)zero, \
411                                         (vector unsigned char)vbT##i)
412
413     src2 += stride*3;
414
415     LOAD_LINE(1);
416     LOAD_LINE(2);
424
425     const vector signed short v_1 = vec_splat_s16(1);
426     const vector signed short v_2 = vec_splat_s16(2);
427     const vector signed short v_5 = vec_splat_s16(5);
428     const vector signed short v_32 = vec_sl(v_1,
429                                             (vector unsigned short)v_5);
430     /* middle energy */
431     const vector signed short l3minusl6 = vec_sub(vb3, vb6);
432     const vector signed short l5minusl4 = vec_sub(vb5, vb4);
433     const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
434     const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
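/* Editorial note: the three statements above evaluate, per column,
 * mE = 5*(l5 - l4) + 2*(l3 - l6), mirroring the "middleEnergy" term used by
 * the scalar default deblock filter to decide how strongly to correct the
 * boundary between lines 4 and 5. */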
481     const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
482     const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);
483
484     const vector signed char neg1 = vec_splat_s8(-1);
485     const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
486                                                                         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
487
488 #define STORE(i) \
489     const vector unsigned char perms##i = \
490         vec_lvsr(i * stride, src2); \
491     const vector unsigned char vg##i = \
492         vec_perm(st##i, vbT##i, permHH); \
493     const vector unsigned char mask##i = \
494         vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
495     const vector unsigned char vg2##i = \
496         vec_perm(vg##i, vg##i, perms##i); \
497     const vector unsigned char svA##i = \
498         vec_sel(vbA##i, vg2##i, mask##i); \
499     const vector unsigned char svB##i = \
500         vec_sel(vg2##i, vbB##i, mask##i); \
501     vec_st(svA##i, i * stride, src2); \
502     vec_st(svB##i, i * stride + 16, src2)
503
504     STORE(4);
505     STORE(5);
506 }
520     const vector signed int zero = vec_splat_s32(0);
521     vector unsigned char v_dt;
522     dt[0] = deringThreshold;
523     v_dt = vec_splat(vec_ld(0, dt), 0);
524
525 #define LOAD_LINE(i) \
526     const vector unsigned char perm##i = \
527         vec_lvsl(i * stride, srcCopy); \
528     vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
529     vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
530     vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
531
532     LOAD_LINE(0);
533     LOAD_LINE(1);
534     LOAD_LINE(2);
543
544     vector unsigned char v_avg;
545     {
546         const vector unsigned char trunc_perm = (vector unsigned char)
547             AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
548                 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
549         const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
550         const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
551         const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
552         const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
553
554 #define EXTRACT(op) do { \
555     const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
556     const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
557     const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
558     const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
559     const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \
582
583     signed int __attribute__((aligned(16))) S[8];
584     {
585         const vector unsigned short mask1 = (vector unsigned short)
586             AVV(0x0001, 0x0002, 0x0004, 0x0008,
587                 0x0010, 0x0020, 0x0040, 0x0080);
588         const vector unsigned short mask2 = (vector unsigned short)
589             AVV(0x0100, 0x0200, 0x0000, 0x0000,
590                 0x0000, 0x0000, 0x0000, 0x0000);
591
592         const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
593         const vector unsigned int vuint32_1 = vec_splat_u32(1);
594
595 #define COMPARE(i) \
596     vector signed int sum##i; \
597     do { \
598         const vector unsigned char cmp##i = \
599             (vector unsigned char)vec_cmpgt(src##i, v_avg); \
600         const vector unsigned short cmpHi##i = \
601             (vector unsigned short)vec_mergeh(cmp##i, cmp##i); \
602         const vector unsigned short cmpLi##i = \
603             (vector unsigned short)vec_mergel(cmp##i, cmp##i); \
604         const vector signed short cmpHf##i = \
605             (vector signed short)vec_and(cmpHi##i, mask1); \
606         const vector signed short cmpLf##i = \
607             (vector signed short)vec_and(cmpLi##i, mask2); \
608         const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \
609         const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \
610         sum##i = vec_sums(sumq##i, zero); } while (0)
611
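/* Editorial note: COMPARE(i) reduces line i to a small bitmask of "pixel is
 * brighter than the block average" decisions: vec_cmpgt yields 0xFF per byte,
 * mask1/mask2 give the first ten lanes distinct bit weights (0x0001..0x0200),
 * and vec_sum4s/vec_sums add those weights up, so sum##i ends up with one bit
 * set for every pixel of the line that exceeds v_avg. */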
612         COMPARE(0);
613         COMPARE(1);
614         COMPARE(2);
641         const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
642         const vector signed int t2A = vec_or(sumA, tA);
643         const vector signed int t2B = vec_or(sumB, tB);
644         const vector signed int t2C = vec_or(sumC, tC);
645         const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
646                                               vec_sl(t2A, vuint32_1));
647         const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
648                                               vec_sl(t2B, vuint32_1));
649         const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
650                                               vec_sl(t2C, vuint32_1));
651         const vector signed int yA = vec_and(t2A, t3A);
652         const vector signed int yB = vec_and(t2B, t3B);
653         const vector signed int yC = vec_and(t2C, t3C);
654
655         const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
657         const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
658         const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
659         const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
660         const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
661         const vector signed int sumAp = vec_and(yA,
662                                                 vec_and(sumAd4,sumAd8));
663         const vector signed int sumBp = vec_and(yB,
664                                                 vec_and(sumBd4,sumBd8));
665         sumA2 = vec_or(sumAp,
666                        vec_sra(sumAp,
667                                vuint32_16));
668         sumB2 = vec_or(sumBp,
669                        vec_sra(sumBp,
670                                vuint32_16));
671     }
672     vec_st(sumA2, 0, S);
673     vec_st(sumB2, 16, S);
674     }
675
684     const vector signed int vsint32_8 = vec_splat_s32(8);
685     const vector unsigned int vuint32_4 = vec_splat_u32(4);
686
687     const vector unsigned char permA1 = (vector unsigned char)
688         AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
689             0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
690     const vector unsigned char permA2 = (vector unsigned char)
691         AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
692             0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
693     const vector unsigned char permA1inc = (vector unsigned char)
694         AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
695             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
696     const vector unsigned char permA2inc = (vector unsigned char)
697         AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
698             0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
699     const vector unsigned char magic = (vector unsigned char)
700         AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
701             0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
702     const vector unsigned char extractPerm = (vector unsigned char)
703         AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
704             0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01);
705     const vector unsigned char extractPermInc = (vector unsigned char)
706         AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
707             0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01);
708     const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
709     const vector unsigned char tenRight = (vector unsigned char)
710         AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
711             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
712     const vector unsigned char eightLeft = (vector unsigned char)
713         AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
714             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08);
715
716
717 #define F_INIT(i) \
718     vector unsigned char tenRightM##i = tenRight; \
719     vector unsigned char permA1M##i = permA1; \
720     vector unsigned char permA2M##i = permA2; \
721     vector unsigned char extractPermM##i = extractPerm
722
723 #define F2(i, j, k, l) \
724     if (S[i] & (1 << (l+1))) { \
725         const vector unsigned char a_##j##_A##l = \
726             vec_perm(src##i, src##j, permA1M##i); \
727         const vector unsigned char a_##j##_B##l = \
728             vec_perm(a_##j##_A##l, src##k, permA2M##i); \
729         const vector signed int a_##j##_sump##l = \
730             (vector signed int)vec_msum(a_##j##_B##l, magic, \
731                                         (vector unsigned int)zero); \
732         vector signed int F_##j##_##l = \
733             vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \
734         F_##j##_##l = vec_splat(F_##j##_##l, 3); \
735         const vector signed int p_##j##_##l = \
736             (vector signed int)vec_perm(src##j, \
737                                         (vector unsigned char)zero, \
738                                         extractPermM##i); \
739         const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2); \
740         const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2); \
741         vector signed int newpm_##j##_##l; \
742         if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \
743             newpm_##j##_##l = sum_##j##_##l; \
744         else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \
745             newpm_##j##_##l = diff_##j##_##l; \
746         else newpm_##j##_##l = F_##j##_##l; \
747         const vector unsigned char newpm2_##j##_##l = \
748             vec_splat((vector unsigned char)newpm_##j##_##l, 15); \
749         const vector unsigned char mask##j##l = vec_add(identity, \
750                                                         tenRightM##i); \
751         src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \
752     } \
753     permA1M##i = vec_add(permA1M##i, permA1inc); \
754     permA2M##i = vec_add(permA2M##i, permA2inc); \
755     tenRightM##i = vec_sro(tenRightM##i, eightLeft); \
756     extractPermM##i = vec_add(extractPermM##i, extractPermInc)
757
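/* Editorial summary of F2 (as read from the code above): when bit (l+1) of
 * S[i] marks a pixel as deviating from the block average, a 1-2-1 / 2-4-2 /
 * 1-2-1 weighted sum of its 3x3 neighbourhood (rows i, j, k) is formed with
 * vec_msum and the "magic" weights, rounded via vsint32_8 and divided by 16,
 * then clamped to within vQP2 of the original pixel value before being merged
 * back into src##j. The trailing statements advance permA1M/permA2M, tenRightM
 * and extractPermM one pixel to the right so successive F2 invocations walk
 * across the line. */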
758 #define ITER(i, j, k) \
759     F_INIT(i); \
760     F2(i, j, k, 0); \
761     F2(i, j, k, 1); \
762     F2(i, j, k, 2); \
763     F2(i, j, k, 3); \
764     F2(i, j, k, 4); \
765     F2(i, j, k, 5); \
766     F2(i, j, k, 6); \
767     F2(i, j, k, 7)
768
769     ITER(0, 1, 2);
770     ITER(1, 2, 3);
771     ITER(2, 3, 4);
775     ITER(6, 7, 8);
776     ITER(7, 8, 9);
777
778     const vector signed char neg1 = vec_splat_s8(-1);
779
780 #define STORE_LINE(i) \
781     const vector unsigned char permST##i = \
782         vec_lvsr(i * stride, srcCopy); \
783     const vector unsigned char maskST##i = \
784         vec_perm((vector unsigned char)zero, \
785                  (vector unsigned char)neg1, permST##i); \
786     src##i = vec_perm(src##i ,src##i, permST##i); \
787     sA##i= vec_sel(sA##i, src##i, maskST##i); \
788     sB##i= vec_sel(src##i, sB##i, maskST##i); \
789     vec_st(sA##i, i * stride, srcCopy); \
790     vec_st(sB##i, i * stride + 16, srcCopy)
791
792     STORE_LINE(1);
793     STORE_LINE(2);
794     STORE_LINE(3);
806 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
807 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
808 #define do_a_deblock_altivec(a...) do_a_deblock_C(a)
809
810 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
811                                             uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
812 {
813     const vector signed int zero = vec_splat_s32(0);
814     const vector signed short vsint16_1 = vec_splat_s16(1);
815     vector signed int v_dp = zero;
816     vector signed int v_sysdp = zero;
818
819     tempBluredPast[127]= maxNoise[0];
820     tempBluredPast[128]= maxNoise[1];
821     tempBluredPast[129]= maxNoise[2];
822
823 #define LOAD_LINE(src, i) \
824     register int j##src##i = i * stride; \
825     vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \
826     const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
827     const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
828     const vector unsigned char v_##src##A##i = \
829         vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
830     vector signed short v_##src##Ass##i = \
831         (vector signed short)vec_mergeh((vector signed char)zero, \
832                                         (vector signed char)v_##src##A##i)
833
834     LOAD_LINE(src, 0);
835     LOAD_LINE(src, 1);
836     LOAD_LINE(src, 2);
837     LOAD_LINE(src, 3);
848     LOAD_LINE(tempBlured, 5);
849     LOAD_LINE(tempBlured, 6);
850     LOAD_LINE(tempBlured, 7);
851 #undef LOAD_LINE
852
853 #define ACCUMULATE_DIFFS(i) \
854     vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \
855                                          v_srcAss##i); \
856     v_dp = vec_msums(v_d##i, v_d##i, v_dp); \
857     v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)
858
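/* Editorial note: v_d##i is the difference between the stored (blurred) line
 * and the incoming source line; vec_msums accumulates the sum of squared
 * differences into v_dp and, via the vector of ones vsint16_1, the plain sum
 * of differences into v_sysdp. These totals feed the noise-threshold decisions
 * made further down. */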
859     ACCUMULATE_DIFFS(0);
860     ACCUMULATE_DIFFS(1);
861     ACCUMULATE_DIFFS(2);
914     if (d < maxNoise[0]) {
915         const vector signed short vsint16_7 = vec_splat_s16(7);
916         const vector signed short vsint16_4 = vec_splat_s16(4);
917         const vector unsigned short vuint16_3 = vec_splat_u16(3);
918
919 #define OP(i) \
920     const vector signed short v_temp##i = \
921         vec_mladd(v_tempBluredAss##i, \
922                   vsint16_7, v_srcAss##i); \
923     const vector signed short v_temp2##i = \
924         vec_add(v_temp##i, vsint16_4); \
925     v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)
926
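/* Editorial note: in this low-noise branch each output pixel becomes
 * (7*tempBlured + src + 4) >> 3, i.e. the stored block keeps weight 7/8 and
 * the new frame contributes 1/8, with the +4 providing rounding. */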
927         OP(0);
928         OP(1);
929         OP(2);
935 #undef OP
936     } else {
937         const vector signed short vsint16_3 = vec_splat_s16(3);
938         const vector signed short vsint16_2 = vec_splat_s16(2);
939
940 #define OP(i) \
941     const vector signed short v_temp##i = \
942         vec_mladd(v_tempBluredAss##i, \
943                   vsint16_3, v_srcAss##i); \
944     const vector signed short v_temp2##i = \
945         vec_add(v_temp##i, vsint16_2); \
946     v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)
947
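/* Editorial note: in this branch the temporal blend is weaker,
 * (3*tempBlured + src + 2) >> 2, i.e. 3/4 of the stored block and 1/4 of the
 * new frame, again with rounding. */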
948         OP(0);
949         OP(1);
950         OP(2);
957     }
958     }
959
960     const vector signed char neg1 = vec_splat_s8(-1);
961     const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
962                                                                         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
963
964 #define PACK_AND_STORE(src, i) \
965     const vector unsigned char perms##src##i = \
966         vec_lvsr(i * stride, src); \
967     const vector unsigned char vf##src##i = \
968         vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \
969     const vector unsigned char vg##src##i = \
970         vec_perm(vf##src##i, v_##src##A##i, permHH); \
971     const vector unsigned char mask##src##i = \
972         vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
973     const vector unsigned char vg2##src##i = \
974         vec_perm(vg##src##i, vg##src##i, perms##src##i); \
975     const vector unsigned char svA##src##i = \
976         vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \
977     const vector unsigned char svB##src##i = \
978         vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \
979     vec_st(svA##src##i, i * stride, src); \
980     vec_st(svB##src##i, i * stride + 16, src)
981
982     PACK_AND_STORE(src, 0);
983     PACK_AND_STORE(src, 1);
984     PACK_AND_STORE(src, 2);
999 }
1000
1001 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1002     const vector unsigned char zero = vec_splat_u8(0);
1003
1004 #define LOAD_DOUBLE_LINE(i, j) \
1005     vector unsigned char perm1##i = vec_lvsl(i * stride, src); \
1006     vector unsigned char perm2##i = vec_lvsl(j * stride, src); \
1007     vector unsigned char srcA##i = vec_ld(i * stride, src); \
1008     vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
1009     vector unsigned char srcC##i = vec_ld(j * stride, src); \
1010     vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \
1011     vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
1012     vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1013
1014     LOAD_DOUBLE_LINE(0, 1);
1015     LOAD_DOUBLE_LINE(2, 3);
1016     LOAD_DOUBLE_LINE(4, 5);
1105
1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1107     const vector unsigned char zero = vec_splat_u8(0);
1108     const vector unsigned char magic_perm = (const vector unsigned char)
1109         AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1110             0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
1111
1112 #define LOAD_DOUBLE_LINE(i, j) \
1113     vector unsigned char src##i = vec_ld(i * 16, src); \
1114     vector unsigned char src##j = vec_ld(j * 16, src)
1115
1116     LOAD_DOUBLE_LINE(0, 1);
1117     LOAD_DOUBLE_LINE(2, 3);
1118     LOAD_DOUBLE_LINE(4, 5);
1167     temp6 = vec_mergeh(tempD, tempL);
1168     temp7 = vec_mergel(tempD, tempL);
1169
1170
1171     const vector signed char neg1 = vec_splat_s8(-1);
1172 #define STORE_DOUBLE_LINE(i, j) \
1173     vector unsigned char dstA##i = vec_ld(i * stride, dst); \
1174     vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \
1175     vector unsigned char dstA##j = vec_ld(j * stride, dst); \
1176     vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst); \
1177     vector unsigned char align##i = vec_lvsr(i * stride, dst); \
1178     vector unsigned char align##j = vec_lvsr(j * stride, dst); \
1179     vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \
1180     vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \
1181     vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i); \
1182     vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j); \
1183     vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \
1184     vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \
1185     vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \
1186     vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \
1187     vec_st(dstAF##i, i * stride, dst); \
1188     vec_st(dstBF##i, i * stride + 16, dst); \
1189     vec_st(dstAF##j, j * stride, dst); \
1190     vec_st(dstBF##j, j * stride + 16, dst)
1191
1192     STORE_DOUBLE_LINE(0,1);
1193     STORE_DOUBLE_LINE(2,3);
1194     STORE_DOUBLE_LINE(4,5);