Mercurial > libavcodec.hg
comparison simple_idct.c @ 744:2f7da29ede37 libavcodec
Move Alpha optimized IDCT to own file. Based on a patch by Måns
Rullgård <mru@users.sourceforge.net>.
I've left out the idctCol2 part, because W4 has recently been decreed
to be 16383, and also I doubt it will give a noticeable speedup.
author | mellum |
---|---|
date | Fri, 11 Oct 2002 23:01:16 +0000 |
parents | ff90043f4a2d |
children | caa77cd960c0 |
comparison
equal
deleted
inserted
replaced
743:4cf7173a004e | 744:2f7da29ede37 |
---|---|
23 */ | 23 */ |
24 #include "avcodec.h" | 24 #include "avcodec.h" |
25 #include "dsputil.h" | 25 #include "dsputil.h" |
26 #include "simple_idct.h" | 26 #include "simple_idct.h" |
27 | 27 |
28 //#define ARCH_ALPHA | |
29 | |
30 #if 0 | 28 #if 0 |
31 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | 29 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ |
32 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ | 30 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ |
33 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ | 31 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ |
34 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ | 32 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ |
47 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | 45 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
48 #define ROW_SHIFT 11 | 46 #define ROW_SHIFT 11 |
49 #define COL_SHIFT 20 // 6 | 47 #define COL_SHIFT 20 // 6 |
50 #endif | 48 #endif |
51 | 49 |
52 #ifdef ARCH_ALPHA | |
53 #define FAST_64BIT | |
54 #endif | |
55 | |
56 #if defined(ARCH_POWERPC_405) | 50 #if defined(ARCH_POWERPC_405) |
57 | 51 |
58 /* signed 16x16 -> 32 multiply add accumulate */ | 52 /* signed 16x16 -> 32 multiply add accumulate */ |
59 #define MAC16(rt, ra, rb) \ | 53 #define MAC16(rt, ra, rb) \ |
60 asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); | 54 asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); |
70 | 64 |
71 /* signed 16x16 -> 32 multiply */ | 65 /* signed 16x16 -> 32 multiply */ |
72 #define MUL16(rt, ra, rb) rt = (ra) * (rb) | 66 #define MUL16(rt, ra, rb) rt = (ra) * (rb) |
73 | 67 |
74 #endif | 68 #endif |
75 | |
76 #ifdef ARCH_ALPHA | |
77 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */ | |
78 static inline int idctRowCondDC(int16_t *row) | |
79 { | |
80 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; | |
81 uint64_t *lrow = (uint64_t *) row; | |
82 | |
83 if (lrow[1] == 0) { | |
84 if (lrow[0] == 0) | |
85 return 0; | |
86 if ((lrow[0] & ~0xffffULL) == 0) { | |
87 uint64_t v; | |
88 #if 1 //is ok if |a0| < 1024 than theres an +-1 error (for the *W4 case for W4=16383 !!!) | |
89 a0 = row[0]<<3; | |
90 #else | |
91 a0 = W4 * row[0]; | |
92 a0 += 1 << (ROW_SHIFT - 1); | |
93 a0 >>= ROW_SHIFT; | |
94 #endif | |
95 v = (uint16_t) a0; | |
96 v += v << 16; | |
97 v += v << 32; | |
98 lrow[0] = v; | |
99 lrow[1] = v; | |
100 | |
101 return 1; | |
102 } | |
103 } | |
104 | |
105 a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); | |
106 a1 = a0; | |
107 a2 = a0; | |
108 a3 = a0; | |
109 | |
110 if (row[2]) { | |
111 a0 += W2 * row[2]; | |
112 a1 += W6 * row[2]; | |
113 a2 -= W6 * row[2]; | |
114 a3 -= W2 * row[2]; | |
115 } | |
116 | |
117 if (row[4]) { | |
118 a0 += W4 * row[4]; | |
119 a1 -= W4 * row[4]; | |
120 a2 -= W4 * row[4]; | |
121 a3 += W4 * row[4]; | |
122 } | |
123 | |
124 if (row[6]) { | |
125 a0 += W6 * row[6]; | |
126 a1 -= W2 * row[6]; | |
127 a2 += W2 * row[6]; | |
128 a3 -= W6 * row[6]; | |
129 } | |
130 | |
131 if (row[1]) { | |
132 b0 = W1 * row[1]; | |
133 b1 = W3 * row[1]; | |
134 b2 = W5 * row[1]; | |
135 b3 = W7 * row[1]; | |
136 } else { | |
137 b0 = 0; | |
138 b1 = 0; | |
139 b2 = 0; | |
140 b3 = 0; | |
141 } | |
142 | |
143 if (row[3]) { | |
144 b0 += W3 * row[3]; | |
145 b1 -= W7 * row[3]; | |
146 b2 -= W1 * row[3]; | |
147 b3 -= W5 * row[3]; | |
148 } | |
149 | |
150 if (row[5]) { | |
151 b0 += W5 * row[5]; | |
152 b1 -= W1 * row[5]; | |
153 b2 += W7 * row[5]; | |
154 b3 += W3 * row[5]; | |
155 } | |
156 | |
157 if (row[7]) { | |
158 b0 += W7 * row[7]; | |
159 b1 -= W5 * row[7]; | |
160 b2 += W3 * row[7]; | |
161 b3 -= W1 * row[7]; | |
162 } | |
163 | |
164 row[0] = (a0 + b0) >> ROW_SHIFT; | |
165 row[1] = (a1 + b1) >> ROW_SHIFT; | |
166 row[2] = (a2 + b2) >> ROW_SHIFT; | |
167 row[3] = (a3 + b3) >> ROW_SHIFT; | |
168 row[4] = (a3 - b3) >> ROW_SHIFT; | |
169 row[5] = (a2 - b2) >> ROW_SHIFT; | |
170 row[6] = (a1 - b1) >> ROW_SHIFT; | |
171 row[7] = (a0 - b0) >> ROW_SHIFT; | |
172 | |
173 return 2; | |
174 } | |
175 | |
176 inline static void idctSparseCol2(int16_t *col) | |
177 { | |
178 int a0, a1, a2, a3, b0, b1, b2, b3; | |
179 | |
180 col[0] += (1 << (COL_SHIFT - 1)) / W4; | |
181 | |
182 a0 = W4 * col[8 * 0]; | |
183 a1 = W4 * col[8 * 0]; | |
184 a2 = W4 * col[8 * 0]; | |
185 a3 = W4 * col[8 * 0]; | |
186 | |
187 if (col[8 * 2]) { | |
188 a0 += W2 * col[8 * 2]; | |
189 a1 += W6 * col[8 * 2]; | |
190 a2 -= W6 * col[8 * 2]; | |
191 a3 -= W2 * col[8 * 2]; | |
192 } | |
193 | |
194 if (col[8 * 4]) { | |
195 a0 += W4 * col[8 * 4]; | |
196 a1 -= W4 * col[8 * 4]; | |
197 a2 -= W4 * col[8 * 4]; | |
198 a3 += W4 * col[8 * 4]; | |
199 } | |
200 | |
201 if (col[8 * 6]) { | |
202 a0 += W6 * col[8 * 6]; | |
203 a1 -= W2 * col[8 * 6]; | |
204 a2 += W2 * col[8 * 6]; | |
205 a3 -= W6 * col[8 * 6]; | |
206 } | |
207 | |
208 if (col[8 * 1]) { | |
209 b0 = W1 * col[8 * 1]; | |
210 b1 = W3 * col[8 * 1]; | |
211 b2 = W5 * col[8 * 1]; | |
212 b3 = W7 * col[8 * 1]; | |
213 } else { | |
214 b0 = b1 = b2 = b3 = 0; | |
215 } | |
216 | |
217 if (col[8 * 3]) { | |
218 b0 += W3 * col[8 * 3]; | |
219 b1 -= W7 * col[8 * 3]; | |
220 b2 -= W1 * col[8 * 3]; | |
221 b3 -= W5 * col[8 * 3]; | |
222 } | |
223 | |
224 if (col[8 * 5]) { | |
225 b0 += W5 * col[8 * 5]; | |
226 b1 -= W1 * col[8 * 5]; | |
227 b2 += W7 * col[8 * 5]; | |
228 b3 += W3 * col[8 * 5]; | |
229 } | |
230 | |
231 if (col[8 * 7]) { | |
232 b0 += W7 * col[8 * 7]; | |
233 b1 -= W5 * col[8 * 7]; | |
234 b2 += W3 * col[8 * 7]; | |
235 b3 -= W1 * col[8 * 7]; | |
236 } | |
237 | |
238 col[8 * 0] = (a0 + b0) >> COL_SHIFT; | |
239 col[8 * 7] = (a0 - b0) >> COL_SHIFT; | |
240 col[8 * 1] = (a1 + b1) >> COL_SHIFT; | |
241 col[8 * 6] = (a1 - b1) >> COL_SHIFT; | |
242 col[8 * 2] = (a2 + b2) >> COL_SHIFT; | |
243 col[8 * 5] = (a2 - b2) >> COL_SHIFT; | |
244 col[8 * 3] = (a3 + b3) >> COL_SHIFT; | |
245 col[8 * 4] = (a3 - b3) >> COL_SHIFT; | |
246 } | |
247 | |
248 #else /* not ARCH_ALPHA */ | |
249 | 69 |
250 static inline void idctRowCondDC (int16_t * row) | 70 static inline void idctRowCondDC (int16_t * row) |
251 { | 71 { |
252 int a0, a1, a2, a3, b0, b1, b2, b3; | 72 int a0, a1, a2, a3, b0, b1, b2, b3; |
253 #ifdef FAST_64BIT | 73 #ifdef FAST_64BIT |
335 row[2] = (a2 + b2) >> ROW_SHIFT; | 155 row[2] = (a2 + b2) >> ROW_SHIFT; |
336 row[5] = (a2 - b2) >> ROW_SHIFT; | 156 row[5] = (a2 - b2) >> ROW_SHIFT; |
337 row[3] = (a3 + b3) >> ROW_SHIFT; | 157 row[3] = (a3 + b3) >> ROW_SHIFT; |
338 row[4] = (a3 - b3) >> ROW_SHIFT; | 158 row[4] = (a3 - b3) >> ROW_SHIFT; |
339 } | 159 } |
340 #endif /* not ARCH_ALPHA */ | |
341 | 160 |
342 static inline void idctSparseColPut (UINT8 *dest, int line_size, | 161 static inline void idctSparseColPut (UINT8 *dest, int line_size, |
343 int16_t * col) | 162 int16_t * col) |
344 { | 163 { |
345 int a0, a1, a2, a3, b0, b1, b2, b3; | 164 int a0, a1, a2, a3, b0, b1, b2, b3; |
544 col[40] = ((a2 - b2) >> COL_SHIFT); | 363 col[40] = ((a2 - b2) >> COL_SHIFT); |
545 col[48] = ((a1 - b1) >> COL_SHIFT); | 364 col[48] = ((a1 - b1) >> COL_SHIFT); |
546 col[56] = ((a0 - b0) >> COL_SHIFT); | 365 col[56] = ((a0 - b0) >> COL_SHIFT); |
547 } | 366 } |
548 | 367 |
549 | |
550 #ifdef ARCH_ALPHA | |
551 /* If all rows but the first one are zero after row transformation, | |
552 all rows will be identical after column transformation. */ | |
553 static inline void idctCol2(int16_t *col) | |
554 { | |
555 int i; | |
556 uint64_t l, r; | |
557 uint64_t *lcol = (uint64_t *) col; | |
558 | |
559 for (i = 0; i < 8; ++i) { | |
560 int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4; | |
561 | |
562 a0 *= W4; | |
563 col[0] = a0 >> COL_SHIFT; | |
564 ++col; | |
565 } | |
566 | |
567 l = lcol[0]; | |
568 r = lcol[1]; | |
569 lcol[ 2] = l; lcol[ 3] = r; | |
570 lcol[ 4] = l; lcol[ 5] = r; | |
571 lcol[ 6] = l; lcol[ 7] = r; | |
572 lcol[ 8] = l; lcol[ 9] = r; | |
573 lcol[10] = l; lcol[11] = r; | |
574 lcol[12] = l; lcol[13] = r; | |
575 lcol[14] = l; lcol[15] = r; | |
576 } | |
577 | |
578 void simple_idct (short *block) | |
579 { | |
580 | |
581 int i; | |
582 int rowsZero = 1; /* all rows except row 0 zero */ | |
583 int rowsConstant = 1; /* all rows consist of a constant value */ | |
584 | |
585 for (i = 0; i < 8; i++) { | |
586 int sparseness = idctRowCondDC(block + 8 * i); | |
587 | |
588 if (i > 0 && sparseness > 0) | |
589 rowsZero = 0; | |
590 if (sparseness == 2) | |
591 rowsConstant = 0; | |
592 } | |
593 | |
594 if (rowsZero) { | |
595 idctCol2(block); | |
596 } else if (rowsConstant) { | |
597 uint64_t *lblock = (uint64_t *) block; | |
598 | |
599 idctSparseCol2(block); | |
600 for (i = 0; i < 8; i++) { | |
601 uint64_t v = (uint16_t) block[i * 8]; | |
602 | |
603 v += v << 16; | |
604 v += v << 32; | |
605 lblock[0] = v; | |
606 lblock[1] = v; | |
607 lblock += 2; | |
608 } | |
609 } else { | |
610 for (i = 0; i < 8; i++) | |
611 idctSparseCol2(block + i); | |
612 } | |
613 } | |
614 | |
615 /* XXX: suppress this mess */ | |
616 void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block) | |
617 { | |
618 simple_idct(block); | |
619 put_pixels_clamped(block, dest, line_size); | |
620 } | |
621 | |
622 void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block) | |
623 { | |
624 simple_idct(block); | |
625 add_pixels_clamped(block, dest, line_size); | |
626 } | |
627 | |
628 #else | |
629 | |
630 void simple_idct_put(UINT8 *dest, int line_size, INT16 *block) | 368 void simple_idct_put(UINT8 *dest, int line_size, INT16 *block) |
631 { | 369 { |
632 int i; | 370 int i; |
633 for(i=0; i<8; i++) | 371 for(i=0; i<8; i++) |
634 idctRowCondDC(block + i*8); | 372 idctRowCondDC(block + i*8); |
654 idctRowCondDC(block + i*8); | 392 idctRowCondDC(block + i*8); |
655 | 393 |
656 for(i=0; i<8; i++) | 394 for(i=0; i<8; i++) |
657 idctSparseCol(block + i); | 395 idctSparseCol(block + i); |
658 } | 396 } |
659 | |
660 #endif | |
661 | 397 |
662 /* 2x4x8 idct */ | 398 /* 2x4x8 idct */ |
663 | 399 |
664 #define CN_SHIFT 12 | 400 #define CN_SHIFT 12 |
665 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5)) | 401 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5)) |