Mercurial > libavcodec.hg
comparison libpostproc/postprocess_altivec_template.c @ 2967:ef2149182f1c libavcodec
COSMETICS: Remove all trailing whitespace.
author | diego |
---|---|
date | Sat, 17 Dec 2005 18:14:38 +0000 |
parents | 703b80c99891 |
children | bfabfdf9ce55 |
comparison
equal
deleted
inserted
replaced
2966:564788471dd4 | 2967:ef2149182f1c |
---|---|
77 const int srcAlign = ((unsigned long)src2 % 16); | 77 const int srcAlign = ((unsigned long)src2 % 16); |
78 const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0; | 78 const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0; |
79 const vector signed int zero = vec_splat_s32(0); | 79 const vector signed int zero = vec_splat_s32(0); |
80 const vector signed short mask = vec_splat_s16(1); | 80 const vector signed short mask = vec_splat_s16(1); |
81 vector signed int v_numEq = vec_splat_s32(0); | 81 vector signed int v_numEq = vec_splat_s32(0); |
82 | 82 |
83 data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; | 83 data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; |
84 data[1] = data[0] * 2 + 1; | 84 data[1] = data[0] * 2 + 1; |
85 data[2] = c->QP * 2; | 85 data[2] = c->QP * 2; |
86 data[3] = c->QP * 4; | 86 data[3] = c->QP * 4; |
87 vector signed short v_data = vec_ld(0, data); | 87 vector signed short v_data = vec_ld(0, data); |
158 ITER(5, 6); | 158 ITER(5, 6); |
159 ITER(6, 7); | 159 ITER(6, 7); |
160 #undef ITER | 160 #undef ITER |
161 | 161 |
162 v_numEq = vec_sums(v_numEq, zero); | 162 v_numEq = vec_sums(v_numEq, zero); |
163 | 163 |
164 v_numEq = vec_splat(v_numEq, 3); | 164 v_numEq = vec_splat(v_numEq, 3); |
165 vec_ste(v_numEq, 0, &numEq); | 165 vec_ste(v_numEq, 0, &numEq); |
166 | 166 |
167 if (numEq > c->ppMode.flatnessThreshold) | 167 if (numEq > c->ppMode.flatnessThreshold) |
168 { | 168 { |
172 const vector unsigned char mmoP2 = (const vector unsigned char) | 172 const vector unsigned char mmoP2 = (const vector unsigned char) |
173 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F, | 173 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F, |
174 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f); | 174 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f); |
175 const vector unsigned char mmoP = (const vector unsigned char) | 175 const vector unsigned char mmoP = (const vector unsigned char) |
176 vec_lvsl(8, (unsigned char*)0); | 176 vec_lvsl(8, (unsigned char*)0); |
177 | 177 |
178 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1); | 178 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1); |
179 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2); | 179 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2); |
180 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP); | 180 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP); |
181 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1); | 181 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1); |
182 vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2); | 182 vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2); |
183 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP); | 183 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP); |
184 vector signed short mmoDiff = vec_sub(mmoL, mmoR); | 184 vector signed short mmoDiff = vec_sub(mmoL, mmoR); |
185 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP); | 185 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP); |
186 | 186 |
187 if (vec_any_gt(mmoSum, v4QP)) | 187 if (vec_any_gt(mmoSum, v4QP)) |
188 return 0; | 188 return 0; |
189 else | 189 else |
190 return 1; | 190 return 1; |
191 } | 191 } |
192 else return 2; | 192 else return 2; |
193 } | 193 } |
194 | 194 |
195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { | 195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { |
196 /* | 196 /* |
197 this code makes no assumption on src or stride. | 197 this code makes no assumption on src or stride. |
207 const int srcAlign = ((unsigned long)src2 % 16); | 207 const int srcAlign = ((unsigned long)src2 % 16); |
208 short __attribute__ ((aligned(16))) qp[8]; | 208 short __attribute__ ((aligned(16))) qp[8]; |
209 qp[0] = c->QP; | 209 qp[0] = c->QP; |
210 vector signed short vqp = vec_ld(0, qp); | 210 vector signed short vqp = vec_ld(0, qp); |
211 vqp = vec_splat(vqp, 0); | 211 vqp = vec_splat(vqp, 0); |
212 | 212 |
213 src2 += stride*3; | 213 src2 += stride*3; |
214 | 214 |
215 vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9; | 215 vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9; |
216 vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9; | 216 vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9; |
217 vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9; | 217 vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9; |
218 vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9; | 218 vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9; |
219 | 219 |
220 #define LOAD_LINE(i) \ | 220 #define LOAD_LINE(i) \ |
221 const vector unsigned char perml##i = \ | 221 const vector unsigned char perml##i = \ |
222 vec_lvsl(i * stride, src2); \ | 222 vec_lvsl(i * stride, src2); \ |
223 vbA##i = vec_ld(i * stride, src2); \ | 223 vbA##i = vec_ld(i * stride, src2); \ |
224 vbB##i = vec_ld(i * stride + 16, src2); \ | 224 vbB##i = vec_ld(i * stride + 16, src2); \ |
273 const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01); | 273 const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01); |
274 const vector signed short v_diff89 = vec_sub(vb8, vb9); | 274 const vector signed short v_diff89 = vec_sub(vb8, vb9); |
275 const vector unsigned short v_cmp89 = | 275 const vector unsigned short v_cmp89 = |
276 (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp); | 276 (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp); |
277 const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89); | 277 const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89); |
278 | 278 |
279 const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1); | 279 const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1); |
280 const vector signed short temp02 = vec_add(vb2, vb3); | 280 const vector signed short temp02 = vec_add(vb2, vb3); |
281 const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4); | 281 const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4); |
282 const vector signed short v_sumsB0 = vec_add(temp02, temp03); | 282 const vector signed short v_sumsB0 = vec_add(temp02, temp03); |
283 | 283 |
407 const vector unsigned char vbT##i = \ | 407 const vector unsigned char vbT##i = \ |
408 vec_perm(vbA##i, vbB##i, perm##i); \ | 408 vec_perm(vbA##i, vbB##i, perm##i); \ |
409 const vector signed short vb##i = \ | 409 const vector signed short vb##i = \ |
410 (vector signed short)vec_mergeh((vector unsigned char)zero, \ | 410 (vector signed short)vec_mergeh((vector unsigned char)zero, \ |
411 (vector unsigned char)vbT##i) | 411 (vector unsigned char)vbT##i) |
412 | 412 |
413 src2 += stride*3; | 413 src2 += stride*3; |
414 | 414 |
415 LOAD_LINE(1); | 415 LOAD_LINE(1); |
416 LOAD_LINE(2); | 416 LOAD_LINE(2); |
417 LOAD_LINE(3); | 417 LOAD_LINE(3); |
418 LOAD_LINE(4); | 418 LOAD_LINE(4); |
419 LOAD_LINE(5); | 419 LOAD_LINE(5); |
420 LOAD_LINE(6); | 420 LOAD_LINE(6); |
421 LOAD_LINE(7); | 421 LOAD_LINE(7); |
422 LOAD_LINE(8); | 422 LOAD_LINE(8); |
423 #undef LOAD_LINE | 423 #undef LOAD_LINE |
424 | 424 |
425 const vector signed short v_1 = vec_splat_s16(1); | 425 const vector signed short v_1 = vec_splat_s16(1); |
426 const vector signed short v_2 = vec_splat_s16(2); | 426 const vector signed short v_2 = vec_splat_s16(2); |
427 const vector signed short v_5 = vec_splat_s16(5); | 427 const vector signed short v_5 = vec_splat_s16(5); |
428 const vector signed short v_32 = vec_sl(v_1, | 428 const vector signed short v_32 = vec_sl(v_1, |
429 (vector unsigned short)v_5); | 429 (vector unsigned short)v_5); |
478 const vector signed short vb4minusd = vec_sub(vb4, dornotd); | 478 const vector signed short vb4minusd = vec_sub(vb4, dornotd); |
479 const vector signed short vb5plusd = vec_add(vb5, dornotd); | 479 const vector signed short vb5plusd = vec_add(vb5, dornotd); |
480 /* finally, stores */ | 480 /* finally, stores */ |
481 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); | 481 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); |
482 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); | 482 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); |
483 | 483 |
484 const vector signed char neg1 = vec_splat_s8(-1); | 484 const vector signed char neg1 = vec_splat_s8(-1); |
485 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | 485 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
486 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | 486 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
487 | 487 |
488 #define STORE(i) \ | 488 #define STORE(i) \ |
489 const vector unsigned char perms##i = \ | 489 const vector unsigned char perms##i = \ |
490 vec_lvsr(i * stride, src2); \ | 490 vec_lvsr(i * stride, src2); \ |
491 const vector unsigned char vg##i = \ | 491 const vector unsigned char vg##i = \ |
492 vec_perm(st##i, vbT##i, permHH); \ | 492 vec_perm(st##i, vbT##i, permHH); \ |
498 vec_sel(vbA##i, vg2##i, mask##i); \ | 498 vec_sel(vbA##i, vg2##i, mask##i); \ |
499 const vector unsigned char svB##i = \ | 499 const vector unsigned char svB##i = \ |
500 vec_sel(vg2##i, vbB##i, mask##i); \ | 500 vec_sel(vg2##i, vbB##i, mask##i); \ |
501 vec_st(svA##i, i * stride, src2); \ | 501 vec_st(svA##i, i * stride, src2); \ |
502 vec_st(svB##i, i * stride + 16, src2) | 502 vec_st(svB##i, i * stride + 16, src2) |
503 | 503 |
504 STORE(4); | 504 STORE(4); |
505 STORE(5); | 505 STORE(5); |
506 } | 506 } |
507 | 507 |
508 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) { | 508 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) { |
526 const vector unsigned char perm##i = \ | 526 const vector unsigned char perm##i = \ |
527 vec_lvsl(i * stride, srcCopy); \ | 527 vec_lvsl(i * stride, srcCopy); \ |
528 vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \ | 528 vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \ |
529 vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \ | 529 vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \ |
530 vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i) | 530 vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i) |
531 | 531 |
532 LOAD_LINE(0); | 532 LOAD_LINE(0); |
533 LOAD_LINE(1); | 533 LOAD_LINE(1); |
534 LOAD_LINE(2); | 534 LOAD_LINE(2); |
535 LOAD_LINE(3); | 535 LOAD_LINE(3); |
536 LOAD_LINE(4); | 536 LOAD_LINE(4); |
548 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18); | 548 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18); |
549 const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm); | 549 const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm); |
550 const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm); | 550 const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm); |
551 const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm); | 551 const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm); |
552 const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm); | 552 const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm); |
553 | 553 |
554 #define EXTRACT(op) do { \ | 554 #define EXTRACT(op) do { \ |
555 const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \ | 555 const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \ |
556 const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \ | 556 const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \ |
557 const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \ | 557 const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \ |
558 const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \ | 558 const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \ |
565 const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \ | 565 const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \ |
566 const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \ | 566 const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \ |
567 const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \ | 567 const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \ |
568 const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \ | 568 const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \ |
569 v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0) | 569 v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0) |
570 | 570 |
571 vector unsigned char v_min; | 571 vector unsigned char v_min; |
572 vector unsigned char v_max; | 572 vector unsigned char v_max; |
573 EXTRACT(min); | 573 EXTRACT(min); |
574 EXTRACT(max); | 574 EXTRACT(max); |
575 #undef EXTRACT | 575 #undef EXTRACT |
576 | 576 |
577 if (vec_all_lt(vec_sub(v_max, v_min), v_dt)) | 577 if (vec_all_lt(vec_sub(v_max, v_min), v_dt)) |
578 return; | 578 return; |
579 | 579 |
580 v_avg = vec_avg(v_min, v_max); | 580 v_avg = vec_avg(v_min, v_max); |
581 } | 581 } |
582 | 582 |
583 signed int __attribute__((aligned(16))) S[8]; | 583 signed int __attribute__((aligned(16))) S[8]; |
584 { | 584 { |
585 const vector unsigned short mask1 = (vector unsigned short) | 585 const vector unsigned short mask1 = (vector unsigned short) |
586 AVV(0x0001, 0x0002, 0x0004, 0x0008, | 586 AVV(0x0001, 0x0002, 0x0004, 0x0008, |
587 0x0010, 0x0020, 0x0040, 0x0080); | 587 0x0010, 0x0020, 0x0040, 0x0080); |
588 const vector unsigned short mask2 = (vector unsigned short) | 588 const vector unsigned short mask2 = (vector unsigned short) |
589 AVV(0x0100, 0x0200, 0x0000, 0x0000, | 589 AVV(0x0100, 0x0200, 0x0000, 0x0000, |
590 0x0000, 0x0000, 0x0000, 0x0000); | 590 0x0000, 0x0000, 0x0000, 0x0000); |
591 | 591 |
592 const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4)); | 592 const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4)); |
593 const vector unsigned int vuint32_1 = vec_splat_u32(1); | 593 const vector unsigned int vuint32_1 = vec_splat_u32(1); |
594 | 594 |
595 #define COMPARE(i) \ | 595 #define COMPARE(i) \ |
596 vector signed int sum##i; \ | 596 vector signed int sum##i; \ |
597 do { \ | 597 do { \ |
598 const vector unsigned char cmp##i = \ | 598 const vector unsigned char cmp##i = \ |
599 (vector unsigned char)vec_cmpgt(src##i, v_avg); \ | 599 (vector unsigned char)vec_cmpgt(src##i, v_avg); \ |
606 const vector signed short cmpLf##i = \ | 606 const vector signed short cmpLf##i = \ |
607 (vector signed short)vec_and(cmpLi##i, mask2); \ | 607 (vector signed short)vec_and(cmpLi##i, mask2); \ |
608 const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \ | 608 const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \ |
609 const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \ | 609 const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \ |
610 sum##i = vec_sums(sumq##i, zero); } while (0) | 610 sum##i = vec_sums(sumq##i, zero); } while (0) |
611 | 611 |
612 COMPARE(0); | 612 COMPARE(0); |
613 COMPARE(1); | 613 COMPARE(1); |
614 COMPARE(2); | 614 COMPARE(2); |
615 COMPARE(3); | 615 COMPARE(3); |
616 COMPARE(4); | 616 COMPARE(4); |
618 COMPARE(6); | 618 COMPARE(6); |
619 COMPARE(7); | 619 COMPARE(7); |
620 COMPARE(8); | 620 COMPARE(8); |
621 COMPARE(9); | 621 COMPARE(9); |
622 #undef COMPARE | 622 #undef COMPARE |
623 | 623 |
624 vector signed int sumA2; | 624 vector signed int sumA2; |
625 vector signed int sumB2; | 625 vector signed int sumB2; |
626 { | 626 { |
627 const vector signed int sump02 = vec_mergel(sum0, sum2); | 627 const vector signed int sump02 = vec_mergel(sum0, sum2); |
628 const vector signed int sump13 = vec_mergel(sum1, sum3); | 628 const vector signed int sump13 = vec_mergel(sum1, sum3); |
629 const vector signed int sumA = vec_mergel(sump02, sump13); | 629 const vector signed int sumA = vec_mergel(sump02, sump13); |
630 | 630 |
631 const vector signed int sump46 = vec_mergel(sum4, sum6); | 631 const vector signed int sump46 = vec_mergel(sum4, sum6); |
632 const vector signed int sump57 = vec_mergel(sum5, sum7); | 632 const vector signed int sump57 = vec_mergel(sum5, sum7); |
633 const vector signed int sumB = vec_mergel(sump46, sump57); | 633 const vector signed int sumB = vec_mergel(sump46, sump57); |
634 | 634 |
635 const vector signed int sump8A = vec_mergel(sum8, zero); | 635 const vector signed int sump8A = vec_mergel(sum8, zero); |
636 const vector signed int sump9B = vec_mergel(sum9, zero); | 636 const vector signed int sump9B = vec_mergel(sum9, zero); |
637 const vector signed int sumC = vec_mergel(sump8A, sump9B); | 637 const vector signed int sumC = vec_mergel(sump8A, sump9B); |
638 | 638 |
639 const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16); | 639 const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16); |
640 const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16); | 640 const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16); |
641 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16); | 641 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16); |
642 const vector signed int t2A = vec_or(sumA, tA); | 642 const vector signed int t2A = vec_or(sumA, tA); |
643 const vector signed int t2B = vec_or(sumB, tB); | 643 const vector signed int t2B = vec_or(sumB, tB); |
649 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1), | 649 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1), |
650 vec_sl(t2C, vuint32_1)); | 650 vec_sl(t2C, vuint32_1)); |
651 const vector signed int yA = vec_and(t2A, t3A); | 651 const vector signed int yA = vec_and(t2A, t3A); |
652 const vector signed int yB = vec_and(t2B, t3B); | 652 const vector signed int yB = vec_and(t2B, t3B); |
653 const vector signed int yC = vec_and(t2C, t3C); | 653 const vector signed int yC = vec_and(t2C, t3C); |
654 | 654 |
655 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0); | 655 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0); |
656 const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0); | 656 const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0); |
657 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1); | 657 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1); |
658 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2); | 658 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2); |
659 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1); | 659 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1); |
666 vec_sra(sumAp, | 666 vec_sra(sumAp, |
667 vuint32_16)); | 667 vuint32_16)); |
668 sumB2 = vec_or(sumBp, | 668 sumB2 = vec_or(sumBp, |
669 vec_sra(sumBp, | 669 vec_sra(sumBp, |
670 vuint32_16)); | 670 vuint32_16)); |
671 } | 671 } |
672 vec_st(sumA2, 0, S); | 672 vec_st(sumA2, 0, S); |
673 vec_st(sumB2, 16, S); | 673 vec_st(sumB2, 16, S); |
674 } | 674 } |
675 | 675 |
676 /* I'm not sure the following is actually faster | 676 /* I'm not sure the following is actually faster |
677 than straight, unvectorized C code :-( */ | 677 than straight, unvectorized C code :-( */ |
678 | 678 |
679 int __attribute__((aligned(16))) tQP2[4]; | 679 int __attribute__((aligned(16))) tQP2[4]; |
680 tQP2[0]= c->QP/2 + 1; | 680 tQP2[0]= c->QP/2 + 1; |
681 vector signed int vQP2 = vec_ld(0, tQP2); | 681 vector signed int vQP2 = vec_ld(0, tQP2); |
682 vQP2 = vec_splat(vQP2, 0); | 682 vQP2 = vec_splat(vQP2, 0); |
683 const vector unsigned char vuint8_2 = vec_splat_u8(2); | 683 const vector unsigned char vuint8_2 = vec_splat_u8(2); |
774 ITER(5, 6, 7); | 774 ITER(5, 6, 7); |
775 ITER(6, 7, 8); | 775 ITER(6, 7, 8); |
776 ITER(7, 8, 9); | 776 ITER(7, 8, 9); |
777 | 777 |
778 const vector signed char neg1 = vec_splat_s8(-1); | 778 const vector signed char neg1 = vec_splat_s8(-1); |
779 | 779 |
780 #define STORE_LINE(i) \ | 780 #define STORE_LINE(i) \ |
781 const vector unsigned char permST##i = \ | 781 const vector unsigned char permST##i = \ |
782 vec_lvsr(i * stride, srcCopy); \ | 782 vec_lvsr(i * stride, srcCopy); \ |
783 const vector unsigned char maskST##i = \ | 783 const vector unsigned char maskST##i = \ |
784 vec_perm((vector unsigned char)zero, \ | 784 vec_perm((vector unsigned char)zero, \ |
786 src##i = vec_perm(src##i ,src##i, permST##i); \ | 786 src##i = vec_perm(src##i ,src##i, permST##i); \ |
787 sA##i= vec_sel(sA##i, src##i, maskST##i); \ | 787 sA##i= vec_sel(sA##i, src##i, maskST##i); \ |
788 sB##i= vec_sel(src##i, sB##i, maskST##i); \ | 788 sB##i= vec_sel(src##i, sB##i, maskST##i); \ |
789 vec_st(sA##i, i * stride, srcCopy); \ | 789 vec_st(sA##i, i * stride, srcCopy); \ |
790 vec_st(sB##i, i * stride + 16, srcCopy) | 790 vec_st(sB##i, i * stride + 16, srcCopy) |
791 | 791 |
792 STORE_LINE(1); | 792 STORE_LINE(1); |
793 STORE_LINE(2); | 793 STORE_LINE(2); |
794 STORE_LINE(3); | 794 STORE_LINE(3); |
795 STORE_LINE(4); | 795 STORE_LINE(4); |
796 STORE_LINE(5); | 796 STORE_LINE(5); |
813 const vector signed int zero = vec_splat_s32(0); | 813 const vector signed int zero = vec_splat_s32(0); |
814 const vector signed short vsint16_1 = vec_splat_s16(1); | 814 const vector signed short vsint16_1 = vec_splat_s16(1); |
815 vector signed int v_dp = zero; | 815 vector signed int v_dp = zero; |
816 vector signed int v_sysdp = zero; | 816 vector signed int v_sysdp = zero; |
817 int d, sysd, i; | 817 int d, sysd, i; |
818 | 818 |
819 tempBluredPast[127]= maxNoise[0]; | 819 tempBluredPast[127]= maxNoise[0]; |
820 tempBluredPast[128]= maxNoise[1]; | 820 tempBluredPast[128]= maxNoise[1]; |
821 tempBluredPast[129]= maxNoise[2]; | 821 tempBluredPast[129]= maxNoise[2]; |
822 | 822 |
823 #define LOAD_LINE(src, i) \ | 823 #define LOAD_LINE(src, i) \ |
828 const vector unsigned char v_##src##A##i = \ | 828 const vector unsigned char v_##src##A##i = \ |
829 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ | 829 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ |
830 vector signed short v_##src##Ass##i = \ | 830 vector signed short v_##src##Ass##i = \ |
831 (vector signed short)vec_mergeh((vector signed char)zero, \ | 831 (vector signed short)vec_mergeh((vector signed char)zero, \ |
832 (vector signed char)v_##src##A##i) | 832 (vector signed char)v_##src##A##i) |
833 | 833 |
834 LOAD_LINE(src, 0); | 834 LOAD_LINE(src, 0); |
835 LOAD_LINE(src, 1); | 835 LOAD_LINE(src, 1); |
836 LOAD_LINE(src, 2); | 836 LOAD_LINE(src, 2); |
837 LOAD_LINE(src, 3); | 837 LOAD_LINE(src, 3); |
838 LOAD_LINE(src, 4); | 838 LOAD_LINE(src, 4); |
869 v_dp = vec_sums(v_dp, zero); | 869 v_dp = vec_sums(v_dp, zero); |
870 v_sysdp = vec_sums(v_sysdp, zero); | 870 v_sysdp = vec_sums(v_sysdp, zero); |
871 | 871 |
872 v_dp = vec_splat(v_dp, 3); | 872 v_dp = vec_splat(v_dp, 3); |
873 v_sysdp = vec_splat(v_sysdp, 3); | 873 v_sysdp = vec_splat(v_sysdp, 3); |
874 | 874 |
875 vec_ste(v_dp, 0, &d); | 875 vec_ste(v_dp, 0, &d); |
876 vec_ste(v_sysdp, 0, &sysd); | 876 vec_ste(v_sysdp, 0, &sysd); |
877 | 877 |
878 i = d; | 878 i = d; |
879 d = (4*d | 879 d = (4*d |
913 } else { | 913 } else { |
914 if (d < maxNoise[0]) { | 914 if (d < maxNoise[0]) { |
915 const vector signed short vsint16_7 = vec_splat_s16(7); | 915 const vector signed short vsint16_7 = vec_splat_s16(7); |
916 const vector signed short vsint16_4 = vec_splat_s16(4); | 916 const vector signed short vsint16_4 = vec_splat_s16(4); |
917 const vector unsigned short vuint16_3 = vec_splat_u16(3); | 917 const vector unsigned short vuint16_3 = vec_splat_u16(3); |
918 | 918 |
919 #define OP(i) \ | 919 #define OP(i) \ |
920 const vector signed short v_temp##i = \ | 920 const vector signed short v_temp##i = \ |
921 vec_mladd(v_tempBluredAss##i, \ | 921 vec_mladd(v_tempBluredAss##i, \ |
922 vsint16_7, v_srcAss##i); \ | 922 vsint16_7, v_srcAss##i); \ |
923 const vector signed short v_temp2##i = \ | 923 const vector signed short v_temp2##i = \ |
934 OP(7); | 934 OP(7); |
935 #undef OP | 935 #undef OP |
936 } else { | 936 } else { |
937 const vector signed short vsint16_3 = vec_splat_s16(3); | 937 const vector signed short vsint16_3 = vec_splat_s16(3); |
938 const vector signed short vsint16_2 = vec_splat_s16(2); | 938 const vector signed short vsint16_2 = vec_splat_s16(2); |
939 | 939 |
940 #define OP(i) \ | 940 #define OP(i) \ |
941 const vector signed short v_temp##i = \ | 941 const vector signed short v_temp##i = \ |
942 vec_mladd(v_tempBluredAss##i, \ | 942 vec_mladd(v_tempBluredAss##i, \ |
943 vsint16_3, v_srcAss##i); \ | 943 vsint16_3, v_srcAss##i); \ |
944 const vector signed short v_temp2##i = \ | 944 const vector signed short v_temp2##i = \ |
1008 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \ | 1008 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \ |
1009 vector unsigned char srcC##i = vec_ld(j * stride, src); \ | 1009 vector unsigned char srcC##i = vec_ld(j * stride, src); \ |
1010 vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \ | 1010 vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \ |
1011 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \ | 1011 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \ |
1012 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i) | 1012 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i) |
1013 | 1013 |
1014 LOAD_DOUBLE_LINE(0, 1); | 1014 LOAD_DOUBLE_LINE(0, 1); |
1015 LOAD_DOUBLE_LINE(2, 3); | 1015 LOAD_DOUBLE_LINE(2, 3); |
1016 LOAD_DOUBLE_LINE(4, 5); | 1016 LOAD_DOUBLE_LINE(4, 5); |
1017 LOAD_DOUBLE_LINE(6, 7); | 1017 LOAD_DOUBLE_LINE(6, 7); |
1018 #undef LOAD_DOUBLE_LINE | 1018 #undef LOAD_DOUBLE_LINE |
1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { | 1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { |
1107 const vector unsigned char zero = vec_splat_u8(0); | 1107 const vector unsigned char zero = vec_splat_u8(0); |
1108 const vector unsigned char magic_perm = (const vector unsigned char) | 1108 const vector unsigned char magic_perm = (const vector unsigned char) |
1109 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | 1109 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
1110 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | 1110 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
1111 | 1111 |
1112 #define LOAD_DOUBLE_LINE(i, j) \ | 1112 #define LOAD_DOUBLE_LINE(i, j) \ |
1113 vector unsigned char src##i = vec_ld(i * 16, src); \ | 1113 vector unsigned char src##i = vec_ld(i * 16, src); \ |
1114 vector unsigned char src##j = vec_ld(j * 16, src) | 1114 vector unsigned char src##j = vec_ld(j * 16, src) |
1115 | 1115 |
1116 LOAD_DOUBLE_LINE(0, 1); | 1116 LOAD_DOUBLE_LINE(0, 1); |