comparison libpostproc/postprocess_altivec_template.c @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents ef2149182f1c
children 0b546eab515d
comparison: 2978:403183bbb505 → 2979:bfabfdf9ce55
#else
#define AVV(x...) {x}
#endif

#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
    do { \
        __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
        __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
        __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
        __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
        tempA1 = vec_mergeh(src_a, src_e); \
        tempB1 = vec_mergel(src_a, src_e); \
        tempC1 = vec_mergeh(src_b, src_f); \
        tempD1 = vec_mergel(src_b, src_f); \
        tempE1 = vec_mergeh(src_c, src_g); \
        tempF1 = vec_mergel(src_c, src_g); \
        tempG1 = vec_mergeh(src_d, src_h); \
        tempH1 = vec_mergel(src_d, src_h); \
        tempA2 = vec_mergeh(tempA1, tempE1); \
        tempB2 = vec_mergel(tempA1, tempE1); \
        tempC2 = vec_mergeh(tempB1, tempF1); \
        tempD2 = vec_mergel(tempB1, tempF1); \
        tempE2 = vec_mergeh(tempC1, tempG1); \
        tempF2 = vec_mergel(tempC1, tempG1); \
        tempG2 = vec_mergeh(tempD1, tempH1); \
        tempH2 = vec_mergel(tempD1, tempH1); \
        src_a = vec_mergeh(tempA2, tempE2); \
        src_b = vec_mergel(tempA2, tempE2); \
        src_c = vec_mergeh(tempB2, tempF2); \
        src_d = vec_mergel(tempB2, tempF2); \
        src_e = vec_mergeh(tempC2, tempG2); \
        src_f = vec_mergel(tempC2, tempG2); \
        src_g = vec_mergeh(tempD2, tempH2); \
        src_h = vec_mergel(tempD2, tempH2); \
    } while (0)
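For reference, the macro above is a full 8x8 transpose of 16-bit elements built purely from interleaves: three rounds of vec_mergeh/vec_mergel, each round doubling the interleave distance, leave row r holding what was column r. A minimal scalar sketch of the same data movement (the helper name is illustrative, not part of this file):

    /* Scalar equivalent of ALTIVEC_TRANSPOSE_8x8_SHORT: swap m[i][j]
     * with m[j][i]. The AltiVec macro reaches the same layout with
     * 24 merge operations and no scalar indexing. */
    static void transpose_8x8_short_ref(short m[8][8])
    {
        for (int i = 0; i < 8; i++)
            for (int j = i + 1; j < 8; j++) {
                short t = m[i][j];
                m[i][j] = m[j][i];
                m[j][i] = t;
            }
    }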


static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
    /* [...] */

    src2 += stride * 4;

    vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;

#define LOAD_LINE(i) \
    register int j##i = i * stride; \
    vector unsigned char perm##i = vec_lvsl(j##i, src2); \
    const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
    vector unsigned char v_srcA2##i; \
    if (two_vectors) \
        v_srcA2##i = vec_ld(j##i + 16, src2); \
    const vector unsigned char v_srcA##i = \
        vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
    v_srcAss##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)v_srcA##i)

#define LOAD_LINE_ALIGNED(i) \
    register int j##i = i * stride; \
    const vector unsigned char v_srcA##i = vec_ld(j##i, src2); \
    v_srcAss##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)v_srcA##i)

    // special-casing the aligned case is worthwhile, as all calls from
    // the (transposed) horizontal deblocks will be aligned, in addition
    // to the naturally aligned vertical deblocks.
    if (properStride && srcAlign) {
        [...]
        LOAD_LINE(7);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED
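LOAD_LINE above is the classic AltiVec misaligned-load idiom: vec_ld can only load from a 16-byte-aligned address, so two aligned loads are stitched together by vec_perm, with the permute control produced by vec_lvsl from the low bits of the address. As a stand-alone sketch (function name illustrative):

    /* Load 16 bytes from a possibly unaligned address p, the same way
     * LOAD_LINE does. vec_ld(15, p) covers the second aligned vector
     * without reading past p + 15 when p is already aligned. */
    static vector unsigned char load_unaligned(const unsigned char *p)
    {
        vector unsigned char hi   = vec_ld(0, p);
        vector unsigned char lo   = vec_ld(15, p);
        vector unsigned char perm = vec_lvsl(0, p);  /* rotate count = p & 15 */
        return vec_perm(hi, lo, perm);
    }

LOAD_LINE_ALIGNED skips the second load and the permute entirely, which is why the aligned path is worth the duplication.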

#define ITER(i, j) \
    const vector signed short v_diff##i = \
        vec_sub(v_srcAss##i, v_srcAss##j); \
    const vector signed short v_sum##i = \
        vec_add(v_diff##i, v_dcOffset); \
    const vector signed short v_comp##i = \
        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
                                       v_dcThreshold); \
    const vector signed short v_part##i = vec_and(mask, v_comp##i); \
    v_numEq = vec_sum4s(v_part##i, v_numEq);

    ITER(0, 1);
    ITER(1, 2);
    ITER(2, 3);
    [...]
    vec_ste(v_numEq, 0, &numEq);

    if (numEq > c->ppMode.flatnessThreshold)
    {
        const vector unsigned char mmoP1 = (const vector unsigned char)
            AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
                0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
        const vector unsigned char mmoP2 = (const vector unsigned char)
            AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
                0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
        const vector unsigned char mmoP = (const vector unsigned char)
            vec_lvsl(8, (unsigned char*)0);

        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);

        if (vec_any_gt(mmoSum, v4QP))
            return 0;
        else
            return 1;
    }
    else return 2;
}
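In scalar terms, vertClassify_altivec counts vertically flat neighbour pairs and, when the block is flat enough, checks whether its luma swing stays within the quantiser tolerance. A rough single-column sketch of the decision (the vector code processes all eight columns at once, and samples pixel pairs through the mmoP permutes rather than computing an exact min/max):

    /* 2 = not flat, take the default-filter path;
     * 1 = flat and swing below 2*QP, apply the low-pass;
     * 0 = flat but swing too large, leave the block alone. */
    static int vert_classify_ref(const uint8_t *col, int stride,
                                 int dcThreshold, int flatnessThreshold, int QP)
    {
        int numEq = 0, min = 255, max = 0;
        for (int i = 0; i < 7; i++)
            if (abs(col[i*stride] - col[(i+1)*stride]) < dcThreshold)
                numEq++;
        if (numEq <= flatnessThreshold)
            return 2;
        for (int i = 0; i < 8; i++) {
            int v = col[i*stride];
            if (v < min) min = v;
            if (v > max) max = v;
        }
        return (max - min < 2*QP) ? 1 : 0;
    }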

static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
    [...]
    vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
    vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
    vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;

#define LOAD_LINE(i) \
    const vector unsigned char perml##i = \
        vec_lvsl(i * stride, src2); \
    vbA##i = vec_ld(i * stride, src2); \
    vbB##i = vec_ld(i * stride + 16, src2); \
    vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \
    vb##i = \
        (vector signed short)vec_mergeh((vector unsigned char)zero, \
                                        (vector unsigned char)vbT##i)

#define LOAD_LINE_ALIGNED(i) \
    register int j##i = i * stride; \
    vbT##i = vec_ld(j##i, src2); \
    vb##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)vbT##i)

    // special-casing the aligned case is worthwhile, as all calls from
    // the (transposed) horizontal deblocks will be aligned, in addition
    // to the naturally aligned vertical deblocks.
    if (properStride && srcAlign) {
        [...]
    const vector signed short v_sumsB8 = vec_add(temp81, v_last);

    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
    const vector signed short v_sumsB9 = vec_add(temp91, v_last);

#define COMPUTE_VR(i, j, k) \
    const vector signed short temps1##i = \
        vec_add(v_sumsB##i, v_sumsB##k); \
    const vector signed short temps2##i = \
        vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
    const vector signed short vr##j = vec_sra(temps2##i, v_4)

    COMPUTE_VR(0, 1, 2);
    COMPUTE_VR(1, 2, 3);
    COMPUTE_VR(2, 3, 4);
    COMPUTE_VR(3, 4, 5);
    COMPUTE_VR(4, 5, 6);
    COMPUTE_VR(5, 6, 7);
    COMPUTE_VR(6, 7, 8);
    COMPUTE_VR(7, 8, 9);
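COMPUTE_VR is where the low-pass output is formed: each filtered row j is (v_sumsB[j-1] + v_sumsB[j+1] + 2*vb[j]) >> 4, with the v_sumsB terms being the sliding window sums built just above. The per-pixel arithmetic, as a sketch (helper name illustrative):

    /* Same arithmetic as COMPUTE_VR(j-1, j, j+1) for one pixel. */
    static int lowpass_pixel_ref(const int sumsB[10], const int vb[10], int j)
    {
        return (sumsB[j-1] + sumsB[j+1] + 2 * vb[j]) >> 4;
    }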

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define PACK_AND_STORE(i) \
    const vector unsigned char perms##i = \
        vec_lvsr(i * stride, src2); \
    const vector unsigned char vf##i = \
        vec_packsu(vr##i, (vector signed short)zero); \
    const vector unsigned char vg##i = \
        vec_perm(vf##i, vbT##i, permHH); \
    const vector unsigned char mask##i = \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i = \
        vec_perm(vg##i, vg##i, perms##i); \
    const vector unsigned char svA##i = \
        vec_sel(vbA##i, vg2##i, mask##i); \
    const vector unsigned char svB##i = \
        vec_sel(vg2##i, vbB##i, mask##i); \
    vec_st(svA##i, i * stride, src2); \
    vec_st(svB##i, i * stride + 16, src2)

#define PACK_AND_STORE_ALIGNED(i) \
    const vector unsigned char vf##i = \
        vec_packsu(vr##i, (vector signed short)zero); \
    const vector unsigned char vg##i = \
        vec_perm(vf##i, vbT##i, permHH); \
    vec_st(vg##i, i * stride, src2)
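PACK_AND_STORE is the matching misaligned-store idiom: AltiVec has no unaligned store, so the packed result is rotated into place with a vec_lvsr permute, a byte mask is built by permuting a zero/-1 pair, and vec_sel splices the rotated data into the two aligned vectors it straddles before both are written back. A stand-alone sketch of the pattern (function name illustrative):

    /* Read-modify-write 16 bytes to a possibly unaligned address p. */
    static void store_unaligned(vector unsigned char v, unsigned char *p)
    {
        vector unsigned char lo   = vec_ld(0, p);    /* aligned vector covering the start */
        vector unsigned char hi   = vec_ld(16, p);   /* aligned vector covering the end   */
        vector unsigned char perm = vec_lvsr(0, p);
        vector unsigned char mask = vec_perm(vec_splat_u8(0),
                                             (vector unsigned char)vec_splat_s8(-1),
                                             perm);  /* zeros, then ones, at the split */
        vector unsigned char r    = vec_perm(v, v, perm);
        vec_st(vec_sel(lo, r, mask), 0, p);          /* preserve bytes before p        */
        vec_st(vec_sel(r, hi, mask), 16, p);         /* preserve bytes after p + 16    */
    }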

    // special-casing the aligned case is worthwhile, as all calls from
    // the (transposed) horizontal deblocks will be aligned, in addition
    // to the naturally aligned vertical deblocks.
    [...]
}

static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
    [...]

    qp[0] = 8*c->QP;
    vector signed short vqp = vec_ld(0, qp);
    vqp = vec_splat(vqp, 0);

#define LOAD_LINE(i) \
    const vector unsigned char perm##i = \
        vec_lvsl(i * stride, src2); \
    const vector unsigned char vbA##i = \
        vec_ld(i * stride, src2); \
    const vector unsigned char vbB##i = \
        vec_ld(i * stride + 16, src2); \
    const vector unsigned char vbT##i = \
        vec_perm(vbA##i, vbB##i, perm##i); \
    const vector signed short vb##i = \
        (vector signed short)vec_mergeh((vector unsigned char)zero, \
                                        (vector unsigned char)vbT##i)

    src2 += stride*3;

    LOAD_LINE(1);
    LOAD_LINE(2);
    [...]

    const vector signed short v_1 = vec_splat_s16(1);
    const vector signed short v_2 = vec_splat_s16(2);
    const vector signed short v_5 = vec_splat_s16(5);
    const vector signed short v_32 = vec_sl(v_1,
                                            (vector unsigned short)v_5);
    /* middle energy */
    const vector signed short l3minusl6 = vec_sub(vb3, vb6);
    const vector signed short l5minusl4 = vec_sub(vb5, vb4);
    const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
    const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
    [...]
    const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
    const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define STORE(i) \
    const vector unsigned char perms##i = \
        vec_lvsr(i * stride, src2); \
    const vector unsigned char vg##i = \
        vec_perm(st##i, vbT##i, permHH); \
    const vector unsigned char mask##i = \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i = \
        vec_perm(vg##i, vg##i, perms##i); \
    const vector unsigned char svA##i = \
        vec_sel(vbA##i, vg2##i, mask##i); \
    const vector unsigned char svB##i = \
        vec_sel(vg2##i, vbB##i, mask##i); \
    vec_st(svA##i, i * stride, src2); \
    vec_st(svB##i, i * stride + 16, src2)

    STORE(4);
    STORE(5);
}
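The elided middle of doVertDefFilter_altivec computes the correction that st4/st5 store back: the "middle energy" mE = 5*(l5-l4) + 2*(l3-l6) measures the step between rows 4 and 5, and when it is below 8*QP (the qp[0] constant above) a correction d, scaled by (5*d + 32) >> 6 and clipped against the neighbouring energies, is subtracted from row 4 and added to row 5. A scalar sketch of that step, under those assumptions, with illustrative naming (rows l0..l8 are successive lines):

    static void vert_def_filter_ref(uint8_t *l0, int stride, int QP)
    {
    #define ROW(n) l0[(n) * stride]
        int mE = 5*(ROW(5) - ROW(4)) + 2*(ROW(3) - ROW(6));
        if (abs(mE) < 8*QP) {
            int q  = (ROW(4) - ROW(5)) / 2;
            int lE = 5*(ROW(2) - ROW(1)) + 2*(ROW(0) - ROW(3));
            int rE = 5*(ROW(7) - ROW(6)) + 2*(ROW(5) - ROW(8));
            int m  = abs(lE) < abs(rE) ? abs(lE) : abs(rE);
            int d  = abs(mE) - m;
            if (d < 0) d = 0;
            d = (5*d + 32) >> 6;
            if (mE > 0) d = -d;                 /* move against the edge */
            if (q > 0) { if (d < 0) d = 0; if (d > q) d = q; }
            else       { if (d > 0) d = 0; if (d < q) d = q; }
            ROW(4) -= d;
            ROW(5) += d;
        }
    #undef ROW
    }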

static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
    [...]

    const vector signed int zero = vec_splat_s32(0);
    vector unsigned char v_dt;
    dt[0] = deringThreshold;
    v_dt = vec_splat(vec_ld(0, dt), 0);

#define LOAD_LINE(i) \
    const vector unsigned char perm##i = \
        vec_lvsl(i * stride, srcCopy); \
    vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    [...]

    vector unsigned char v_avg;
    {
        const vector unsigned char trunc_perm = (vector unsigned char)
            AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
        const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
        const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
        const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
        const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);

#define EXTRACT(op) do { \
        const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
        const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
        const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
        const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
        const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \
        [...]

    signed int __attribute__((aligned(16))) S[8];
    {
        const vector unsigned short mask1 = (vector unsigned short)
            AVV(0x0001, 0x0002, 0x0004, 0x0008,
                0x0010, 0x0020, 0x0040, 0x0080);
        const vector unsigned short mask2 = (vector unsigned short)
            AVV(0x0100, 0x0200, 0x0000, 0x0000,
                0x0000, 0x0000, 0x0000, 0x0000);

        const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
        const vector unsigned int vuint32_1 = vec_splat_u32(1);

#define COMPARE(i) \
        vector signed int sum##i; \
        do { \
            const vector unsigned char cmp##i = \
                (vector unsigned char)vec_cmpgt(src##i, v_avg); \
            const vector unsigned short cmpHi##i = \
                (vector unsigned short)vec_mergeh(cmp##i, cmp##i); \
            const vector unsigned short cmpLi##i = \
                (vector unsigned short)vec_mergel(cmp##i, cmp##i); \
            const vector signed short cmpHf##i = \
                (vector signed short)vec_and(cmpHi##i, mask1); \
            const vector signed short cmpLf##i = \
                (vector signed short)vec_and(cmpLi##i, mask2); \
            const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \
            const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \
            sum##i = vec_sums(sumq##i, zero); \
        } while (0)

        COMPARE(0);
        COMPARE(1);
        COMPARE(2);
        [...]
        const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
        const vector signed int t2A = vec_or(sumA, tA);
        const vector signed int t2B = vec_or(sumB, tB);
        const vector signed int t2C = vec_or(sumC, tC);
        const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
                                              vec_sl(t2A, vuint32_1));
        const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
                                              vec_sl(t2B, vuint32_1));
        const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
                                              vec_sl(t2C, vuint32_1));
        const vector signed int yA = vec_and(t2A, t3A);
        const vector signed int yB = vec_and(t2B, t3B);
        const vector signed int yC = vec_and(t2C, t3C);

        const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
        const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
        const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
        const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
        const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
        const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
        const vector signed int sumAp = vec_and(yA,
                                                vec_and(sumAd4, sumAd8));
        const vector signed int sumBp = vec_and(yB,
                                                vec_and(sumBd4, sumBd8));
        sumA2 = vec_or(sumAp,
                       vec_sra(sumAp,
                               vuint32_16));
        sumB2 = vec_or(sumBp,
                       vec_sra(sumBp,
                               vuint32_16));
        }
        vec_st(sumA2, 0, S);
        vec_st(sumB2, 16, S);
    }
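The COMPARE block condenses each row of the block into a bitmask stored in S[]: bit x is set when pixel x of that row is brighter than the block average v_avg (the vec_nor step builds the complementary mask, so an all-darker neighbourhood qualifies too). The t2/t3/y steps then keep a bit only when its left and right neighbours agree with it, and the d4/d8 permutes AND in the rows above and below, so a pixel survives only if its whole 3x3 neighbourhood lies on the same side of the average, which is the precondition for deringing it. A scalar sketch of the per-row mask (helper name illustrative):

    /* Bit x of the result = (row[x] > avg), as the COMPARE macro
     * computes it with cmpgt + per-lane bit weights + vec_sums. */
    static unsigned row_mask_ref(const uint8_t *row, int n, int avg)
    {
        unsigned m = 0;
        for (int x = 0; x < n; x++)
            if (row[x] > avg)
                m |= 1u << x;
        return m;
    }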

    [...]

    const vector signed int vsint32_8 = vec_splat_s32(8);
    const vector unsigned int vuint32_4 = vec_splat_u32(4);

    const vector unsigned char permA1 = (vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
            0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
    const vector unsigned char permA2 = (vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
            0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
    const vector unsigned char permA1inc = (vector unsigned char)
        AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char permA2inc = (vector unsigned char)
        AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char magic = (vector unsigned char)
        AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char extractPerm = (vector unsigned char)
        AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
            0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01);
    const vector unsigned char extractPermInc = (vector unsigned char)
        AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
            0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01);
    const vector unsigned char identity = vec_lvsl(0, (unsigned char *)0);
    const vector unsigned char tenRight = (vector unsigned char)
        AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char eightLeft = (vector unsigned char)
        AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08);


#define F_INIT(i) \
    vector unsigned char tenRightM##i = tenRight; \
    vector unsigned char permA1M##i = permA1; \
    vector unsigned char permA2M##i = permA2; \
    vector unsigned char extractPermM##i = extractPerm

#define F2(i, j, k, l) \
    if (S[i] & (1 << (l+1))) { \
        const vector unsigned char a_##j##_A##l = \
            vec_perm(src##i, src##j, permA1M##i); \
        const vector unsigned char a_##j##_B##l = \
            vec_perm(a_##j##_A##l, src##k, permA2M##i); \
        const vector signed int a_##j##_sump##l = \
            (vector signed int)vec_msum(a_##j##_B##l, magic, \
                                        (vector unsigned int)zero); \
        vector signed int F_##j##_##l = \
            vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \
        F_##j##_##l = vec_splat(F_##j##_##l, 3); \
        const vector signed int p_##j##_##l = \
            (vector signed int)vec_perm(src##j, \
                                        (vector unsigned char)zero, \
                                        extractPermM##i); \
        const vector signed int sum_##j##_##l = vec_add(p_##j##_##l, vQP2); \
        const vector signed int diff_##j##_##l = vec_sub(p_##j##_##l, vQP2); \
        vector signed int newpm_##j##_##l; \
        if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \
            newpm_##j##_##l = sum_##j##_##l; \
        else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \
            newpm_##j##_##l = diff_##j##_##l; \
        else newpm_##j##_##l = F_##j##_##l; \
        const vector unsigned char newpm2_##j##_##l = \
            vec_splat((vector unsigned char)newpm_##j##_##l, 15); \
        const vector unsigned char mask##j##l = vec_add(identity, \
                                                        tenRightM##i); \
        src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \
    } \
    permA1M##i = vec_add(permA1M##i, permA1inc); \
    permA2M##i = vec_add(permA2M##i, permA2inc); \
    tenRightM##i = vec_sro(tenRightM##i, eightLeft); \
    extractPermM##i = vec_add(extractPermM##i, extractPermInc)

#define ITER(i, j, k) \
    F_INIT(i); \
    F2(i, j, k, 0); \
    F2(i, j, k, 1); \
    F2(i, j, k, 2); \
    F2(i, j, k, 3); \
    F2(i, j, k, 4); \
    F2(i, j, k, 5); \
    F2(i, j, k, 6); \
    F2(i, j, k, 7)

    ITER(0, 1, 2);
    ITER(1, 2, 3);
    ITER(2, 3, 4);
    ITER(3, 4, 5);
    ITER(4, 5, 6);
    ITER(5, 6, 7);
    ITER(6, 7, 8);
    ITER(7, 8, 9);
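F2 applies the actual dering kernel to one pixel per invocation: magic holds the 3x3 binomial weights 1-2-1 / 2-4-2 / 1-2-1 (sum 16), vec_msum/vec_sums reduce the weighted neighbourhood gathered by the sliding permA1/permA2 permutes, and the result is rounded, shifted down by 4, then clamped to within QP2 of the original pixel before being spliced back into src##j. In scalar form, roughly (helper name illustrative):

    /* One F2 step: binomial 3x3 average, clamped so the pixel
     * moves by at most QP2. p points at the centre pixel. */
    static uint8_t dering_pixel_ref(const uint8_t *p, int stride, int QP2)
    {
        static const int w[3][3] = { {1,2,1}, {2,4,2}, {1,2,1} };
        int F = 8;                                   /* rounding term */
        for (int dy = -1; dy <= 1; dy++)
            for (int dx = -1; dx <= 1; dx++)
                F += w[dy+1][dx+1] * p[dy*stride + dx];
        F >>= 4;
        if (F > p[0] + QP2) F = p[0] + QP2;
        if (F < p[0] - QP2) F = p[0] - QP2;
        return (uint8_t)F;
    }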

    const vector signed char neg1 = vec_splat_s8(-1);

#define STORE_LINE(i) \
    const vector unsigned char permST##i = \
        vec_lvsr(i * stride, srcCopy); \
    const vector unsigned char maskST##i = \
        vec_perm((vector unsigned char)zero, \
                 (vector unsigned char)neg1, permST##i); \
    src##i = vec_perm(src##i, src##i, permST##i); \
    sA##i = vec_sel(sA##i, src##i, maskST##i); \
    sB##i = vec_sel(src##i, sB##i, maskST##i); \
    vec_st(sA##i, i * stride, srcCopy); \
    vec_st(sB##i, i * stride + 16, srcCopy)

    STORE_LINE(1);
    STORE_LINE(2);
    STORE_LINE(3);
    [...]
#define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
#define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
#define do_a_deblock_altivec(a...) do_a_deblock_C(a)

static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
                                            uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
{
    const vector signed int zero = vec_splat_s32(0);
    const vector signed short vsint16_1 = vec_splat_s16(1);
    vector signed int v_dp = zero;
    vector signed int v_sysdp = zero;
    int d, sysd, i;

    tempBluredPast[127] = maxNoise[0];
    tempBluredPast[128] = maxNoise[1];
    tempBluredPast[129] = maxNoise[2];

#define LOAD_LINE(src, i) \
    register int j##src##i = i * stride; \
    vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \
    const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
    const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
    const vector unsigned char v_##src##A##i = \
        vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
    vector signed short v_##src##Ass##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)v_##src##A##i)

    LOAD_LINE(src, 0);
    LOAD_LINE(src, 1);
    LOAD_LINE(src, 2);
    LOAD_LINE(src, 3);
    [...]
    LOAD_LINE(tempBlured, 5);
    LOAD_LINE(tempBlured, 6);
    LOAD_LINE(tempBlured, 7);
#undef LOAD_LINE

#define ACCUMULATE_DIFFS(i) \
    vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \
                                         v_srcAss##i); \
    v_dp = vec_msums(v_d##i, v_d##i, v_dp); \
    v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)

    ACCUMULATE_DIFFS(0);
    ACCUMULATE_DIFFS(1);
    ACCUMULATE_DIFFS(2);
    [...]
        if (d < maxNoise[0]) {
            const vector signed short vsint16_7 = vec_splat_s16(7);
            const vector signed short vsint16_4 = vec_splat_s16(4);
            const vector unsigned short vuint16_3 = vec_splat_u16(3);

#define OP(i) \
            const vector signed short v_temp##i = \
                vec_mladd(v_tempBluredAss##i, \
                          vsint16_7, v_srcAss##i); \
            const vector signed short v_temp2##i = \
                vec_add(v_temp##i, vsint16_4); \
            v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)

            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        } else {
            const vector signed short vsint16_3 = vec_splat_s16(3);
            const vector signed short vsint16_2 = vec_splat_s16(2);

#define OP(i) \
            const vector signed short v_temp##i = \
                vec_mladd(v_tempBluredAss##i, \
                          vsint16_3, v_srcAss##i); \
            const vector signed short v_temp2##i = \
                vec_add(v_temp##i, vsint16_2); \
            v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)

            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        }
    }
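The two OP variants are the temporal IIR blend at the heart of tempNoiseReducer: when the accumulated difference d between the incoming block and the stored one is below maxNoise[0], the stored block dominates 7:1; otherwise (but still below the copy-through threshold handled in the elided code above) the weight drops to 3:1. Per pixel, that is simply:

    /* Per-pixel form of the two OP() blends (illustrative helper). */
    static int temporal_blend_ref(int blurred, int srcpix, int low_noise)
    {
        return low_noise ? (7*blurred + srcpix + 4) >> 3    /* d < maxNoise[0] */
                         : (3*blurred + srcpix + 2) >> 2;
    }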

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define PACK_AND_STORE(src, i) \
    const vector unsigned char perms##src##i = \
        vec_lvsr(i * stride, src); \
    const vector unsigned char vf##src##i = \
        vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \
    const vector unsigned char vg##src##i = \
        vec_perm(vf##src##i, v_##src##A##i, permHH); \
    const vector unsigned char mask##src##i = \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
    const vector unsigned char vg2##src##i = \
        vec_perm(vg##src##i, vg##src##i, perms##src##i); \
    const vector unsigned char svA##src##i = \
        vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \
    const vector unsigned char svB##src##i = \
        vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \
    vec_st(svA##src##i, i * stride, src); \
    vec_st(svB##src##i, i * stride + 16, src)

    PACK_AND_STORE(src, 0);
    PACK_AND_STORE(src, 1);
    PACK_AND_STORE(src, 2);
    [...]
}

static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
    const vector unsigned char zero = vec_splat_u8(0);

#define LOAD_DOUBLE_LINE(i, j) \
    vector unsigned char perm1##i = vec_lvsl(i * stride, src); \
    vector unsigned char perm2##i = vec_lvsl(j * stride, src); \
    vector unsigned char srcA##i = vec_ld(i * stride, src); \
    vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
    vector unsigned char srcC##i = vec_ld(j * stride, src); \
    vector unsigned char srcD##i = vec_ld(j * stride + 16, src); \
    vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
    vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)

    LOAD_DOUBLE_LINE(0, 1);
    LOAD_DOUBLE_LINE(2, 3);
    LOAD_DOUBLE_LINE(4, 5);
    [...]

static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
    const vector unsigned char zero = vec_splat_u8(0);
    const vector unsigned char magic_perm = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define LOAD_DOUBLE_LINE(i, j) \
    vector unsigned char src##i = vec_ld(i * 16, src); \
    vector unsigned char src##j = vec_ld(j * 16, src)

    LOAD_DOUBLE_LINE(0, 1);
    LOAD_DOUBLE_LINE(2, 3);
    LOAD_DOUBLE_LINE(4, 5);
    [...]
    temp6 = vec_mergeh(tempD, tempL);
    temp7 = vec_mergel(tempD, tempL);


    const vector signed char neg1 = vec_splat_s8(-1);
#define STORE_DOUBLE_LINE(i, j) \
    vector unsigned char dstA##i = vec_ld(i * stride, dst); \
    vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \
    vector unsigned char dstA##j = vec_ld(j * stride, dst); \
    vector unsigned char dstB##j = vec_ld(j * stride + 16, dst); \
    vector unsigned char align##i = vec_lvsr(i * stride, dst); \
    vector unsigned char align##j = vec_lvsr(j * stride, dst); \
    vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \
    vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \
    vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i); \
    vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j); \
    vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \
    vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \
    vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \
    vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \
    vec_st(dstAF##i, i * stride, dst); \
    vec_st(dstBF##i, i * stride + 16, dst); \
    vec_st(dstAF##j, j * stride, dst); \
    vec_st(dstBF##j, j * stride + 16, dst)

    STORE_DOUBLE_LINE(0, 1);
    STORE_DOUBLE_LINE(2, 3);
    STORE_DOUBLE_LINE(4, 5);
    [...]
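These two transposes exist so the horizontal deblocking can reuse the vertical filters: the 16x8 block is transposed into a packed, 16-byte-aligned temporary (stride 16), the vertical filter runs on it and hits the aligned fast paths noted in the comments above, and the result is transposed back. A sketch of the calling pattern, with illustrative names and offsets (the real caller picks specific offsets into the temporary):

    unsigned char tempBlock[272] __attribute__((aligned(16)));
    /* assuming srcBlock, stride and an initialised PPContext c exist */
    transpose_16x8_char_toPackedAlign_altivec(tempBlock, srcBlock, stride);
    doVertLowPass_altivec(tempBlock, 16, &c);   /* aligned, properStride path */
    transpose_8x16_char_fromPackedAlign_altivec(srcBlock, tempBlock, stride);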