comparison simple_idct.c @ 744:2f7da29ede37 libavcodec

Move Alpha optimized IDCT to own file. Based on a patch by Måns Rullgård <mru@users.sourceforge.net>. I've left out the idctCol2 part, because W4 has recently been decreed to be 16383, and also I doubt it will give a noticeable speedup.
author mellum
date Fri, 11 Oct 2002 23:01:16 +0000
parents ff90043f4a2d
children caa77cd960c0
comparison
equal deleted inserted replaced
743:4cf7173a004e 744:2f7da29ede37
23 */ 23 */
24 #include "avcodec.h" 24 #include "avcodec.h"
25 #include "dsputil.h" 25 #include "dsputil.h"
26 #include "simple_idct.h" 26 #include "simple_idct.h"
27 27
28 //#define ARCH_ALPHA
29
30 #if 0 28 #if 0
31 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ 29 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
32 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ 30 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
33 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ 31 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
34 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ 32 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
47 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 45 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48 #define ROW_SHIFT 11 46 #define ROW_SHIFT 11
49 #define COL_SHIFT 20 // 6 47 #define COL_SHIFT 20 // 6
50 #endif 48 #endif
51 49
52 #ifdef ARCH_ALPHA
53 #define FAST_64BIT
54 #endif
55
56 #if defined(ARCH_POWERPC_405) 50 #if defined(ARCH_POWERPC_405)
57 51
58 /* signed 16x16 -> 32 multiply add accumulate */ 52 /* signed 16x16 -> 32 multiply add accumulate */
59 #define MAC16(rt, ra, rb) \ 53 #define MAC16(rt, ra, rb) \
60 asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); 54 asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
70 64
71 /* signed 16x16 -> 32 multiply */ 65 /* signed 16x16 -> 32 multiply */
72 #define MUL16(rt, ra, rb) rt = (ra) * (rb) 66 #define MUL16(rt, ra, rb) rt = (ra) * (rb)
73 67
74 #endif 68 #endif
75
76 #ifdef ARCH_ALPHA
77 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */
78 static inline int idctRowCondDC(int16_t *row)
79 {
80 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
81 uint64_t *lrow = (uint64_t *) row;
82
83 if (lrow[1] == 0) {
84 if (lrow[0] == 0)
85 return 0;
86 if ((lrow[0] & ~0xffffULL) == 0) {
87 uint64_t v;
88 #if 1 //is ok if |a0| < 1024 than theres an +-1 error (for the *W4 case for W4=16383 !!!)
89 a0 = row[0]<<3;
90 #else
91 a0 = W4 * row[0];
92 a0 += 1 << (ROW_SHIFT - 1);
93 a0 >>= ROW_SHIFT;
94 #endif
95 v = (uint16_t) a0;
96 v += v << 16;
97 v += v << 32;
98 lrow[0] = v;
99 lrow[1] = v;
100
101 return 1;
102 }
103 }
104
105 a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
106 a1 = a0;
107 a2 = a0;
108 a3 = a0;
109
110 if (row[2]) {
111 a0 += W2 * row[2];
112 a1 += W6 * row[2];
113 a2 -= W6 * row[2];
114 a3 -= W2 * row[2];
115 }
116
117 if (row[4]) {
118 a0 += W4 * row[4];
119 a1 -= W4 * row[4];
120 a2 -= W4 * row[4];
121 a3 += W4 * row[4];
122 }
123
124 if (row[6]) {
125 a0 += W6 * row[6];
126 a1 -= W2 * row[6];
127 a2 += W2 * row[6];
128 a3 -= W6 * row[6];
129 }
130
131 if (row[1]) {
132 b0 = W1 * row[1];
133 b1 = W3 * row[1];
134 b2 = W5 * row[1];
135 b3 = W7 * row[1];
136 } else {
137 b0 = 0;
138 b1 = 0;
139 b2 = 0;
140 b3 = 0;
141 }
142
143 if (row[3]) {
144 b0 += W3 * row[3];
145 b1 -= W7 * row[3];
146 b2 -= W1 * row[3];
147 b3 -= W5 * row[3];
148 }
149
150 if (row[5]) {
151 b0 += W5 * row[5];
152 b1 -= W1 * row[5];
153 b2 += W7 * row[5];
154 b3 += W3 * row[5];
155 }
156
157 if (row[7]) {
158 b0 += W7 * row[7];
159 b1 -= W5 * row[7];
160 b2 += W3 * row[7];
161 b3 -= W1 * row[7];
162 }
163
164 row[0] = (a0 + b0) >> ROW_SHIFT;
165 row[1] = (a1 + b1) >> ROW_SHIFT;
166 row[2] = (a2 + b2) >> ROW_SHIFT;
167 row[3] = (a3 + b3) >> ROW_SHIFT;
168 row[4] = (a3 - b3) >> ROW_SHIFT;
169 row[5] = (a2 - b2) >> ROW_SHIFT;
170 row[6] = (a1 - b1) >> ROW_SHIFT;
171 row[7] = (a0 - b0) >> ROW_SHIFT;
172
173 return 2;
174 }
175
176 inline static void idctSparseCol2(int16_t *col)
177 {
178 int a0, a1, a2, a3, b0, b1, b2, b3;
179
180 col[0] += (1 << (COL_SHIFT - 1)) / W4;
181
182 a0 = W4 * col[8 * 0];
183 a1 = W4 * col[8 * 0];
184 a2 = W4 * col[8 * 0];
185 a3 = W4 * col[8 * 0];
186
187 if (col[8 * 2]) {
188 a0 += W2 * col[8 * 2];
189 a1 += W6 * col[8 * 2];
190 a2 -= W6 * col[8 * 2];
191 a3 -= W2 * col[8 * 2];
192 }
193
194 if (col[8 * 4]) {
195 a0 += W4 * col[8 * 4];
196 a1 -= W4 * col[8 * 4];
197 a2 -= W4 * col[8 * 4];
198 a3 += W4 * col[8 * 4];
199 }
200
201 if (col[8 * 6]) {
202 a0 += W6 * col[8 * 6];
203 a1 -= W2 * col[8 * 6];
204 a2 += W2 * col[8 * 6];
205 a3 -= W6 * col[8 * 6];
206 }
207
208 if (col[8 * 1]) {
209 b0 = W1 * col[8 * 1];
210 b1 = W3 * col[8 * 1];
211 b2 = W5 * col[8 * 1];
212 b3 = W7 * col[8 * 1];
213 } else {
214 b0 = b1 = b2 = b3 = 0;
215 }
216
217 if (col[8 * 3]) {
218 b0 += W3 * col[8 * 3];
219 b1 -= W7 * col[8 * 3];
220 b2 -= W1 * col[8 * 3];
221 b3 -= W5 * col[8 * 3];
222 }
223
224 if (col[8 * 5]) {
225 b0 += W5 * col[8 * 5];
226 b1 -= W1 * col[8 * 5];
227 b2 += W7 * col[8 * 5];
228 b3 += W3 * col[8 * 5];
229 }
230
231 if (col[8 * 7]) {
232 b0 += W7 * col[8 * 7];
233 b1 -= W5 * col[8 * 7];
234 b2 += W3 * col[8 * 7];
235 b3 -= W1 * col[8 * 7];
236 }
237
238 col[8 * 0] = (a0 + b0) >> COL_SHIFT;
239 col[8 * 7] = (a0 - b0) >> COL_SHIFT;
240 col[8 * 1] = (a1 + b1) >> COL_SHIFT;
241 col[8 * 6] = (a1 - b1) >> COL_SHIFT;
242 col[8 * 2] = (a2 + b2) >> COL_SHIFT;
243 col[8 * 5] = (a2 - b2) >> COL_SHIFT;
244 col[8 * 3] = (a3 + b3) >> COL_SHIFT;
245 col[8 * 4] = (a3 - b3) >> COL_SHIFT;
246 }
247
248 #else /* not ARCH_ALPHA */
249 69
250 static inline void idctRowCondDC (int16_t * row) 70 static inline void idctRowCondDC (int16_t * row)
251 { 71 {
252 int a0, a1, a2, a3, b0, b1, b2, b3; 72 int a0, a1, a2, a3, b0, b1, b2, b3;
253 #ifdef FAST_64BIT 73 #ifdef FAST_64BIT
335 row[2] = (a2 + b2) >> ROW_SHIFT; 155 row[2] = (a2 + b2) >> ROW_SHIFT;
336 row[5] = (a2 - b2) >> ROW_SHIFT; 156 row[5] = (a2 - b2) >> ROW_SHIFT;
337 row[3] = (a3 + b3) >> ROW_SHIFT; 157 row[3] = (a3 + b3) >> ROW_SHIFT;
338 row[4] = (a3 - b3) >> ROW_SHIFT; 158 row[4] = (a3 - b3) >> ROW_SHIFT;
339 } 159 }
340 #endif /* not ARCH_ALPHA */
341 160
342 static inline void idctSparseColPut (UINT8 *dest, int line_size, 161 static inline void idctSparseColPut (UINT8 *dest, int line_size,
343 int16_t * col) 162 int16_t * col)
344 { 163 {
345 int a0, a1, a2, a3, b0, b1, b2, b3; 164 int a0, a1, a2, a3, b0, b1, b2, b3;
544 col[40] = ((a2 - b2) >> COL_SHIFT); 363 col[40] = ((a2 - b2) >> COL_SHIFT);
545 col[48] = ((a1 - b1) >> COL_SHIFT); 364 col[48] = ((a1 - b1) >> COL_SHIFT);
546 col[56] = ((a0 - b0) >> COL_SHIFT); 365 col[56] = ((a0 - b0) >> COL_SHIFT);
547 } 366 }
548 367
549
550 #ifdef ARCH_ALPHA
551 /* If all rows but the first one are zero after row transformation,
552 all rows will be identical after column transformation. */
553 static inline void idctCol2(int16_t *col)
554 {
555 int i;
556 uint64_t l, r;
557 uint64_t *lcol = (uint64_t *) col;
558
559 for (i = 0; i < 8; ++i) {
560 int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4;
561
562 a0 *= W4;
563 col[0] = a0 >> COL_SHIFT;
564 ++col;
565 }
566
567 l = lcol[0];
568 r = lcol[1];
569 lcol[ 2] = l; lcol[ 3] = r;
570 lcol[ 4] = l; lcol[ 5] = r;
571 lcol[ 6] = l; lcol[ 7] = r;
572 lcol[ 8] = l; lcol[ 9] = r;
573 lcol[10] = l; lcol[11] = r;
574 lcol[12] = l; lcol[13] = r;
575 lcol[14] = l; lcol[15] = r;
576 }
577
578 void simple_idct (short *block)
579 {
580
581 int i;
582 int rowsZero = 1; /* all rows except row 0 zero */
583 int rowsConstant = 1; /* all rows consist of a constant value */
584
585 for (i = 0; i < 8; i++) {
586 int sparseness = idctRowCondDC(block + 8 * i);
587
588 if (i > 0 && sparseness > 0)
589 rowsZero = 0;
590 if (sparseness == 2)
591 rowsConstant = 0;
592 }
593
594 if (rowsZero) {
595 idctCol2(block);
596 } else if (rowsConstant) {
597 uint64_t *lblock = (uint64_t *) block;
598
599 idctSparseCol2(block);
600 for (i = 0; i < 8; i++) {
601 uint64_t v = (uint16_t) block[i * 8];
602
603 v += v << 16;
604 v += v << 32;
605 lblock[0] = v;
606 lblock[1] = v;
607 lblock += 2;
608 }
609 } else {
610 for (i = 0; i < 8; i++)
611 idctSparseCol2(block + i);
612 }
613 }
614
615 /* XXX: suppress this mess */
616 void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
617 {
618 simple_idct(block);
619 put_pixels_clamped(block, dest, line_size);
620 }
621
622 void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
623 {
624 simple_idct(block);
625 add_pixels_clamped(block, dest, line_size);
626 }
627
628 #else
629
630 void simple_idct_put(UINT8 *dest, int line_size, INT16 *block) 368 void simple_idct_put(UINT8 *dest, int line_size, INT16 *block)
631 { 369 {
632 int i; 370 int i;
633 for(i=0; i<8; i++) 371 for(i=0; i<8; i++)
634 idctRowCondDC(block + i*8); 372 idctRowCondDC(block + i*8);
654 idctRowCondDC(block + i*8); 392 idctRowCondDC(block + i*8);
655 393
656 for(i=0; i<8; i++) 394 for(i=0; i<8; i++)
657 idctSparseCol(block + i); 395 idctSparseCol(block + i);
658 } 396 }
659
660 #endif
661 397
662 /* 2x4x8 idct */ 398 /* 2x4x8 idct */
663 399
664 #define CN_SHIFT 12 400 #define CN_SHIFT 12
665 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5)) 401 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))