comparison libpostproc/postprocess_altivec_template.c @ 2967:ef2149182f1c libavcodec

COSMETICS: Remove all trailing whitespace.
author diego
date Sat, 17 Dec 2005 18:14:38 +0000
parents 703b80c99891
children bfabfdf9ce55
comparison
equal deleted inserted replaced
2966:564788471dd4 2967:ef2149182f1c
77 const int srcAlign = ((unsigned long)src2 % 16); 77 const int srcAlign = ((unsigned long)src2 % 16);
78 const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0; 78 const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
79 const vector signed int zero = vec_splat_s32(0); 79 const vector signed int zero = vec_splat_s32(0);
80 const vector signed short mask = vec_splat_s16(1); 80 const vector signed short mask = vec_splat_s16(1);
81 vector signed int v_numEq = vec_splat_s32(0); 81 vector signed int v_numEq = vec_splat_s32(0);
82 82
83 data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 83 data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
84 data[1] = data[0] * 2 + 1; 84 data[1] = data[0] * 2 + 1;
85 data[2] = c->QP * 2; 85 data[2] = c->QP * 2;
86 data[3] = c->QP * 4; 86 data[3] = c->QP * 4;
87 vector signed short v_data = vec_ld(0, data); 87 vector signed short v_data = vec_ld(0, data);
158 ITER(5, 6); 158 ITER(5, 6);
159 ITER(6, 7); 159 ITER(6, 7);
160 #undef ITER 160 #undef ITER
161 161
162 v_numEq = vec_sums(v_numEq, zero); 162 v_numEq = vec_sums(v_numEq, zero);
163 163
164 v_numEq = vec_splat(v_numEq, 3); 164 v_numEq = vec_splat(v_numEq, 3);
165 vec_ste(v_numEq, 0, &numEq); 165 vec_ste(v_numEq, 0, &numEq);
166 166
167 if (numEq > c->ppMode.flatnessThreshold) 167 if (numEq > c->ppMode.flatnessThreshold)
168 { 168 {
172 const vector unsigned char mmoP2 = (const vector unsigned char) 172 const vector unsigned char mmoP2 = (const vector unsigned char)
173 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F, 173 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
174 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f); 174 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
175 const vector unsigned char mmoP = (const vector unsigned char) 175 const vector unsigned char mmoP = (const vector unsigned char)
176 vec_lvsl(8, (unsigned char*)0); 176 vec_lvsl(8, (unsigned char*)0);
177 177
178 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1); 178 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
179 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2); 179 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
180 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP); 180 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
181 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1); 181 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
182 vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2); 182 vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
183 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP); 183 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
184 vector signed short mmoDiff = vec_sub(mmoL, mmoR); 184 vector signed short mmoDiff = vec_sub(mmoL, mmoR);
185 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP); 185 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
186 186
187 if (vec_any_gt(mmoSum, v4QP)) 187 if (vec_any_gt(mmoSum, v4QP))
188 return 0; 188 return 0;
189 else 189 else
190 return 1; 190 return 1;
191 } 191 }
192 else return 2; 192 else return 2;
193 } 193 }
194 194
195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { 195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
196 /* 196 /*
197 this code makes no assumption on src or stride. 197 this code makes no assumption on src or stride.
207 const int srcAlign = ((unsigned long)src2 % 16); 207 const int srcAlign = ((unsigned long)src2 % 16);
208 short __attribute__ ((aligned(16))) qp[8]; 208 short __attribute__ ((aligned(16))) qp[8];
209 qp[0] = c->QP; 209 qp[0] = c->QP;
210 vector signed short vqp = vec_ld(0, qp); 210 vector signed short vqp = vec_ld(0, qp);
211 vqp = vec_splat(vqp, 0); 211 vqp = vec_splat(vqp, 0);
212 212
213 src2 += stride*3; 213 src2 += stride*3;
214 214
215 vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9; 215 vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
216 vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9; 216 vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
217 vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9; 217 vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
218 vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9; 218 vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
219 219
220 #define LOAD_LINE(i) \ 220 #define LOAD_LINE(i) \
221 const vector unsigned char perml##i = \ 221 const vector unsigned char perml##i = \
222 vec_lvsl(i * stride, src2); \ 222 vec_lvsl(i * stride, src2); \
223 vbA##i = vec_ld(i * stride, src2); \ 223 vbA##i = vec_ld(i * stride, src2); \
224 vbB##i = vec_ld(i * stride + 16, src2); \ 224 vbB##i = vec_ld(i * stride + 16, src2); \
273 const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01); 273 const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
274 const vector signed short v_diff89 = vec_sub(vb8, vb9); 274 const vector signed short v_diff89 = vec_sub(vb8, vb9);
275 const vector unsigned short v_cmp89 = 275 const vector unsigned short v_cmp89 =
276 (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp); 276 (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
277 const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89); 277 const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
278 278
279 const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1); 279 const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
280 const vector signed short temp02 = vec_add(vb2, vb3); 280 const vector signed short temp02 = vec_add(vb2, vb3);
281 const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4); 281 const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
282 const vector signed short v_sumsB0 = vec_add(temp02, temp03); 282 const vector signed short v_sumsB0 = vec_add(temp02, temp03);
283 283
407 const vector unsigned char vbT##i = \ 407 const vector unsigned char vbT##i = \
408 vec_perm(vbA##i, vbB##i, perm##i); \ 408 vec_perm(vbA##i, vbB##i, perm##i); \
409 const vector signed short vb##i = \ 409 const vector signed short vb##i = \
410 (vector signed short)vec_mergeh((vector unsigned char)zero, \ 410 (vector signed short)vec_mergeh((vector unsigned char)zero, \
411 (vector unsigned char)vbT##i) 411 (vector unsigned char)vbT##i)
412 412
413 src2 += stride*3; 413 src2 += stride*3;
414 414
415 LOAD_LINE(1); 415 LOAD_LINE(1);
416 LOAD_LINE(2); 416 LOAD_LINE(2);
417 LOAD_LINE(3); 417 LOAD_LINE(3);
418 LOAD_LINE(4); 418 LOAD_LINE(4);
419 LOAD_LINE(5); 419 LOAD_LINE(5);
420 LOAD_LINE(6); 420 LOAD_LINE(6);
421 LOAD_LINE(7); 421 LOAD_LINE(7);
422 LOAD_LINE(8); 422 LOAD_LINE(8);
423 #undef LOAD_LINE 423 #undef LOAD_LINE
424 424
425 const vector signed short v_1 = vec_splat_s16(1); 425 const vector signed short v_1 = vec_splat_s16(1);
426 const vector signed short v_2 = vec_splat_s16(2); 426 const vector signed short v_2 = vec_splat_s16(2);
427 const vector signed short v_5 = vec_splat_s16(5); 427 const vector signed short v_5 = vec_splat_s16(5);
428 const vector signed short v_32 = vec_sl(v_1, 428 const vector signed short v_32 = vec_sl(v_1,
429 (vector unsigned short)v_5); 429 (vector unsigned short)v_5);
478 const vector signed short vb4minusd = vec_sub(vb4, dornotd); 478 const vector signed short vb4minusd = vec_sub(vb4, dornotd);
479 const vector signed short vb5plusd = vec_add(vb5, dornotd); 479 const vector signed short vb5plusd = vec_add(vb5, dornotd);
480 /* finally, stores */ 480 /* finally, stores */
481 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); 481 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
482 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); 482 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);
483 483
484 const vector signed char neg1 = vec_splat_s8(-1); 484 const vector signed char neg1 = vec_splat_s8(-1);
485 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 485 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
486 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); 486 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
487 487
488 #define STORE(i) \ 488 #define STORE(i) \
489 const vector unsigned char perms##i = \ 489 const vector unsigned char perms##i = \
490 vec_lvsr(i * stride, src2); \ 490 vec_lvsr(i * stride, src2); \
491 const vector unsigned char vg##i = \ 491 const vector unsigned char vg##i = \
492 vec_perm(st##i, vbT##i, permHH); \ 492 vec_perm(st##i, vbT##i, permHH); \
498 vec_sel(vbA##i, vg2##i, mask##i); \ 498 vec_sel(vbA##i, vg2##i, mask##i); \
499 const vector unsigned char svB##i = \ 499 const vector unsigned char svB##i = \
500 vec_sel(vg2##i, vbB##i, mask##i); \ 500 vec_sel(vg2##i, vbB##i, mask##i); \
501 vec_st(svA##i, i * stride, src2); \ 501 vec_st(svA##i, i * stride, src2); \
502 vec_st(svB##i, i * stride + 16, src2) 502 vec_st(svB##i, i * stride + 16, src2)
503 503
504 STORE(4); 504 STORE(4);
505 STORE(5); 505 STORE(5);
506 } 506 }
507 507
508 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) { 508 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
526 const vector unsigned char perm##i = \ 526 const vector unsigned char perm##i = \
527 vec_lvsl(i * stride, srcCopy); \ 527 vec_lvsl(i * stride, srcCopy); \
528 vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \ 528 vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
529 vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \ 529 vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
530 vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i) 530 vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
531 531
532 LOAD_LINE(0); 532 LOAD_LINE(0);
533 LOAD_LINE(1); 533 LOAD_LINE(1);
534 LOAD_LINE(2); 534 LOAD_LINE(2);
535 LOAD_LINE(3); 535 LOAD_LINE(3);
536 LOAD_LINE(4); 536 LOAD_LINE(4);
548 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18); 548 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
549 const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm); 549 const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
550 const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm); 550 const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
551 const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm); 551 const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
552 const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm); 552 const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
553 553
554 #define EXTRACT(op) do { \ 554 #define EXTRACT(op) do { \
555 const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \ 555 const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
556 const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \ 556 const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
557 const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \ 557 const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
558 const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \ 558 const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
565 const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \ 565 const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
566 const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \ 566 const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \
567 const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \ 567 const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
568 const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \ 568 const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
569 v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0) 569 v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)
570 570
571 vector unsigned char v_min; 571 vector unsigned char v_min;
572 vector unsigned char v_max; 572 vector unsigned char v_max;
573 EXTRACT(min); 573 EXTRACT(min);
574 EXTRACT(max); 574 EXTRACT(max);
575 #undef EXTRACT 575 #undef EXTRACT
576 576
577 if (vec_all_lt(vec_sub(v_max, v_min), v_dt)) 577 if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
578 return; 578 return;
579 579
580 v_avg = vec_avg(v_min, v_max); 580 v_avg = vec_avg(v_min, v_max);
581 } 581 }
582 582
583 signed int __attribute__((aligned(16))) S[8]; 583 signed int __attribute__((aligned(16))) S[8];
584 { 584 {
585 const vector unsigned short mask1 = (vector unsigned short) 585 const vector unsigned short mask1 = (vector unsigned short)
586 AVV(0x0001, 0x0002, 0x0004, 0x0008, 586 AVV(0x0001, 0x0002, 0x0004, 0x0008,
587 0x0010, 0x0020, 0x0040, 0x0080); 587 0x0010, 0x0020, 0x0040, 0x0080);
588 const vector unsigned short mask2 = (vector unsigned short) 588 const vector unsigned short mask2 = (vector unsigned short)
589 AVV(0x0100, 0x0200, 0x0000, 0x0000, 589 AVV(0x0100, 0x0200, 0x0000, 0x0000,
590 0x0000, 0x0000, 0x0000, 0x0000); 590 0x0000, 0x0000, 0x0000, 0x0000);
591 591
592 const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4)); 592 const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
593 const vector unsigned int vuint32_1 = vec_splat_u32(1); 593 const vector unsigned int vuint32_1 = vec_splat_u32(1);
594 594
595 #define COMPARE(i) \ 595 #define COMPARE(i) \
596 vector signed int sum##i; \ 596 vector signed int sum##i; \
597 do { \ 597 do { \
598 const vector unsigned char cmp##i = \ 598 const vector unsigned char cmp##i = \
599 (vector unsigned char)vec_cmpgt(src##i, v_avg); \ 599 (vector unsigned char)vec_cmpgt(src##i, v_avg); \
606 const vector signed short cmpLf##i = \ 606 const vector signed short cmpLf##i = \
607 (vector signed short)vec_and(cmpLi##i, mask2); \ 607 (vector signed short)vec_and(cmpLi##i, mask2); \
608 const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \ 608 const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \
609 const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \ 609 const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \
610 sum##i = vec_sums(sumq##i, zero); } while (0) 610 sum##i = vec_sums(sumq##i, zero); } while (0)
611 611
612 COMPARE(0); 612 COMPARE(0);
613 COMPARE(1); 613 COMPARE(1);
614 COMPARE(2); 614 COMPARE(2);
615 COMPARE(3); 615 COMPARE(3);
616 COMPARE(4); 616 COMPARE(4);
618 COMPARE(6); 618 COMPARE(6);
619 COMPARE(7); 619 COMPARE(7);
620 COMPARE(8); 620 COMPARE(8);
621 COMPARE(9); 621 COMPARE(9);
622 #undef COMPARE 622 #undef COMPARE
623 623
624 vector signed int sumA2; 624 vector signed int sumA2;
625 vector signed int sumB2; 625 vector signed int sumB2;
626 { 626 {
627 const vector signed int sump02 = vec_mergel(sum0, sum2); 627 const vector signed int sump02 = vec_mergel(sum0, sum2);
628 const vector signed int sump13 = vec_mergel(sum1, sum3); 628 const vector signed int sump13 = vec_mergel(sum1, sum3);
629 const vector signed int sumA = vec_mergel(sump02, sump13); 629 const vector signed int sumA = vec_mergel(sump02, sump13);
630 630
631 const vector signed int sump46 = vec_mergel(sum4, sum6); 631 const vector signed int sump46 = vec_mergel(sum4, sum6);
632 const vector signed int sump57 = vec_mergel(sum5, sum7); 632 const vector signed int sump57 = vec_mergel(sum5, sum7);
633 const vector signed int sumB = vec_mergel(sump46, sump57); 633 const vector signed int sumB = vec_mergel(sump46, sump57);
634 634
635 const vector signed int sump8A = vec_mergel(sum8, zero); 635 const vector signed int sump8A = vec_mergel(sum8, zero);
636 const vector signed int sump9B = vec_mergel(sum9, zero); 636 const vector signed int sump9B = vec_mergel(sum9, zero);
637 const vector signed int sumC = vec_mergel(sump8A, sump9B); 637 const vector signed int sumC = vec_mergel(sump8A, sump9B);
638 638
639 const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16); 639 const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
640 const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16); 640 const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
641 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16); 641 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
642 const vector signed int t2A = vec_or(sumA, tA); 642 const vector signed int t2A = vec_or(sumA, tA);
643 const vector signed int t2B = vec_or(sumB, tB); 643 const vector signed int t2B = vec_or(sumB, tB);
649 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1), 649 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
650 vec_sl(t2C, vuint32_1)); 650 vec_sl(t2C, vuint32_1));
651 const vector signed int yA = vec_and(t2A, t3A); 651 const vector signed int yA = vec_and(t2A, t3A);
652 const vector signed int yB = vec_and(t2B, t3B); 652 const vector signed int yB = vec_and(t2B, t3B);
653 const vector signed int yC = vec_and(t2C, t3C); 653 const vector signed int yC = vec_and(t2C, t3C);
654 654
655 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0); 655 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
656 const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0); 656 const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
657 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1); 657 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
658 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2); 658 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
659 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1); 659 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
666 vec_sra(sumAp, 666 vec_sra(sumAp,
667 vuint32_16)); 667 vuint32_16));
668 sumB2 = vec_or(sumBp, 668 sumB2 = vec_or(sumBp,
669 vec_sra(sumBp, 669 vec_sra(sumBp,
670 vuint32_16)); 670 vuint32_16));
671 } 671 }
672 vec_st(sumA2, 0, S); 672 vec_st(sumA2, 0, S);
673 vec_st(sumB2, 16, S); 673 vec_st(sumB2, 16, S);
674 } 674 }
675 675
676 /* I'm not sure the following is actually faster 676 /* I'm not sure the following is actually faster
677 than straight, unvectorized C code :-( */ 677 than straight, unvectorized C code :-( */
678 678
679 int __attribute__((aligned(16))) tQP2[4]; 679 int __attribute__((aligned(16))) tQP2[4];
680 tQP2[0]= c->QP/2 + 1; 680 tQP2[0]= c->QP/2 + 1;
681 vector signed int vQP2 = vec_ld(0, tQP2); 681 vector signed int vQP2 = vec_ld(0, tQP2);
682 vQP2 = vec_splat(vQP2, 0); 682 vQP2 = vec_splat(vQP2, 0);
683 const vector unsigned char vuint8_2 = vec_splat_u8(2); 683 const vector unsigned char vuint8_2 = vec_splat_u8(2);
774 ITER(5, 6, 7); 774 ITER(5, 6, 7);
775 ITER(6, 7, 8); 775 ITER(6, 7, 8);
776 ITER(7, 8, 9); 776 ITER(7, 8, 9);
777 777
778 const vector signed char neg1 = vec_splat_s8(-1); 778 const vector signed char neg1 = vec_splat_s8(-1);
779 779
780 #define STORE_LINE(i) \ 780 #define STORE_LINE(i) \
781 const vector unsigned char permST##i = \ 781 const vector unsigned char permST##i = \
782 vec_lvsr(i * stride, srcCopy); \ 782 vec_lvsr(i * stride, srcCopy); \
783 const vector unsigned char maskST##i = \ 783 const vector unsigned char maskST##i = \
784 vec_perm((vector unsigned char)zero, \ 784 vec_perm((vector unsigned char)zero, \
786 src##i = vec_perm(src##i ,src##i, permST##i); \ 786 src##i = vec_perm(src##i ,src##i, permST##i); \
787 sA##i= vec_sel(sA##i, src##i, maskST##i); \ 787 sA##i= vec_sel(sA##i, src##i, maskST##i); \
788 sB##i= vec_sel(src##i, sB##i, maskST##i); \ 788 sB##i= vec_sel(src##i, sB##i, maskST##i); \
789 vec_st(sA##i, i * stride, srcCopy); \ 789 vec_st(sA##i, i * stride, srcCopy); \
790 vec_st(sB##i, i * stride + 16, srcCopy) 790 vec_st(sB##i, i * stride + 16, srcCopy)
791 791
792 STORE_LINE(1); 792 STORE_LINE(1);
793 STORE_LINE(2); 793 STORE_LINE(2);
794 STORE_LINE(3); 794 STORE_LINE(3);
795 STORE_LINE(4); 795 STORE_LINE(4);
796 STORE_LINE(5); 796 STORE_LINE(5);
813 const vector signed int zero = vec_splat_s32(0); 813 const vector signed int zero = vec_splat_s32(0);
814 const vector signed short vsint16_1 = vec_splat_s16(1); 814 const vector signed short vsint16_1 = vec_splat_s16(1);
815 vector signed int v_dp = zero; 815 vector signed int v_dp = zero;
816 vector signed int v_sysdp = zero; 816 vector signed int v_sysdp = zero;
817 int d, sysd, i; 817 int d, sysd, i;
818 818
819 tempBluredPast[127]= maxNoise[0]; 819 tempBluredPast[127]= maxNoise[0];
820 tempBluredPast[128]= maxNoise[1]; 820 tempBluredPast[128]= maxNoise[1];
821 tempBluredPast[129]= maxNoise[2]; 821 tempBluredPast[129]= maxNoise[2];
822 822
823 #define LOAD_LINE(src, i) \ 823 #define LOAD_LINE(src, i) \
828 const vector unsigned char v_##src##A##i = \ 828 const vector unsigned char v_##src##A##i = \
829 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ 829 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
830 vector signed short v_##src##Ass##i = \ 830 vector signed short v_##src##Ass##i = \
831 (vector signed short)vec_mergeh((vector signed char)zero, \ 831 (vector signed short)vec_mergeh((vector signed char)zero, \
832 (vector signed char)v_##src##A##i) 832 (vector signed char)v_##src##A##i)
833 833
834 LOAD_LINE(src, 0); 834 LOAD_LINE(src, 0);
835 LOAD_LINE(src, 1); 835 LOAD_LINE(src, 1);
836 LOAD_LINE(src, 2); 836 LOAD_LINE(src, 2);
837 LOAD_LINE(src, 3); 837 LOAD_LINE(src, 3);
838 LOAD_LINE(src, 4); 838 LOAD_LINE(src, 4);
869 v_dp = vec_sums(v_dp, zero); 869 v_dp = vec_sums(v_dp, zero);
870 v_sysdp = vec_sums(v_sysdp, zero); 870 v_sysdp = vec_sums(v_sysdp, zero);
871 871
872 v_dp = vec_splat(v_dp, 3); 872 v_dp = vec_splat(v_dp, 3);
873 v_sysdp = vec_splat(v_sysdp, 3); 873 v_sysdp = vec_splat(v_sysdp, 3);
874 874
875 vec_ste(v_dp, 0, &d); 875 vec_ste(v_dp, 0, &d);
876 vec_ste(v_sysdp, 0, &sysd); 876 vec_ste(v_sysdp, 0, &sysd);
877 877
878 i = d; 878 i = d;
879 d = (4*d 879 d = (4*d
913 } else { 913 } else {
914 if (d < maxNoise[0]) { 914 if (d < maxNoise[0]) {
915 const vector signed short vsint16_7 = vec_splat_s16(7); 915 const vector signed short vsint16_7 = vec_splat_s16(7);
916 const vector signed short vsint16_4 = vec_splat_s16(4); 916 const vector signed short vsint16_4 = vec_splat_s16(4);
917 const vector unsigned short vuint16_3 = vec_splat_u16(3); 917 const vector unsigned short vuint16_3 = vec_splat_u16(3);
918 918
919 #define OP(i) \ 919 #define OP(i) \
920 const vector signed short v_temp##i = \ 920 const vector signed short v_temp##i = \
921 vec_mladd(v_tempBluredAss##i, \ 921 vec_mladd(v_tempBluredAss##i, \
922 vsint16_7, v_srcAss##i); \ 922 vsint16_7, v_srcAss##i); \
923 const vector signed short v_temp2##i = \ 923 const vector signed short v_temp2##i = \
934 OP(7); 934 OP(7);
935 #undef OP 935 #undef OP
936 } else { 936 } else {
937 const vector signed short vsint16_3 = vec_splat_s16(3); 937 const vector signed short vsint16_3 = vec_splat_s16(3);
938 const vector signed short vsint16_2 = vec_splat_s16(2); 938 const vector signed short vsint16_2 = vec_splat_s16(2);
939 939
940 #define OP(i) \ 940 #define OP(i) \
941 const vector signed short v_temp##i = \ 941 const vector signed short v_temp##i = \
942 vec_mladd(v_tempBluredAss##i, \ 942 vec_mladd(v_tempBluredAss##i, \
943 vsint16_3, v_srcAss##i); \ 943 vsint16_3, v_srcAss##i); \
944 const vector signed short v_temp2##i = \ 944 const vector signed short v_temp2##i = \
1008 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \ 1008 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
1009 vector unsigned char srcC##i = vec_ld(j * stride, src); \ 1009 vector unsigned char srcC##i = vec_ld(j * stride, src); \
1010 vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \ 1010 vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \
1011 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \ 1011 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
1012 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i) 1012 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1013 1013
1014 LOAD_DOUBLE_LINE(0, 1); 1014 LOAD_DOUBLE_LINE(0, 1);
1015 LOAD_DOUBLE_LINE(2, 3); 1015 LOAD_DOUBLE_LINE(2, 3);
1016 LOAD_DOUBLE_LINE(4, 5); 1016 LOAD_DOUBLE_LINE(4, 5);
1017 LOAD_DOUBLE_LINE(6, 7); 1017 LOAD_DOUBLE_LINE(6, 7);
1018 #undef LOAD_DOUBLE_LINE 1018 #undef LOAD_DOUBLE_LINE
1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { 1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1107 const vector unsigned char zero = vec_splat_u8(0); 1107 const vector unsigned char zero = vec_splat_u8(0);
1108 const vector unsigned char magic_perm = (const vector unsigned char) 1108 const vector unsigned char magic_perm = (const vector unsigned char)
1109 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1109 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1110 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); 1110 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
1111 1111
1112 #define LOAD_DOUBLE_LINE(i, j) \ 1112 #define LOAD_DOUBLE_LINE(i, j) \
1113 vector unsigned char src##i = vec_ld(i * 16, src); \ 1113 vector unsigned char src##i = vec_ld(i * 16, src); \
1114 vector unsigned char src##j = vec_ld(j * 16, src) 1114 vector unsigned char src##j = vec_ld(j * 16, src)
1115 1115
1116 LOAD_DOUBLE_LINE(0, 1); 1116 LOAD_DOUBLE_LINE(0, 1);