Mercurial > libavcodec.hg
annotate jrevdct.c @ 10392:32ee88f14239 libavcodec
Fix w32thread implementation to handle job count > thread_count.
author | reimar |
---|---|
date | Tue, 13 Oct 2009 13:03:12 +0000 |
parents | 674090f13019 |
children | 7dd2a45249a9 |
rev | line source |
---|---|
0 | 1 /* |
2 * jrevdct.c | |
3 * | |
4 * This file is part of the Independent JPEG Group's software. | |
3669
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
5 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
6 * The authors make NO WARRANTY or representation, either express or implied, |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
7 * with respect to this software, its quality, accuracy, merchantability, or |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
8 * fitness for a particular purpose. This software is provided "AS IS", and |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
9 * you, its user, assume the entire risk as to its quality and accuracy. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
10 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
11 * This software is copyright (C) 1991, 1992, Thomas G. Lane. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
12 * All Rights Reserved except as specified below. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
13 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
14 * Permission is hereby granted to use, copy, modify, and distribute this |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
15 * software (or portions thereof) for any purpose, without fee, subject to |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
16 * these conditions: |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
17 * (1) If any part of the source code for this software is distributed, then |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
18 * this README file must be included, with this copyright and no-warranty |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
19 * notice unaltered; and any additions, deletions, or changes to the original |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
20 * files must be clearly indicated in accompanying documentation. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
21 * (2) If only executable code is distributed, then the accompanying |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
22 * documentation must state that "this software is based in part on the work |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
23 * of the Independent JPEG Group". |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
24 * (3) Permission for use of this software is granted only if the user accepts |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
25 * full responsibility for any undesirable consequences; the authors accept |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
26 * NO LIABILITY for damages of any kind. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
27 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
28 * These conditions apply to any software derived from or based on the IJG |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
29 * code, not just to the unmodified library. If you use our work, you ought |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
30 * to acknowledge us. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
31 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
32 * Permission is NOT granted for the use of any IJG author's name or company |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
33 * name in advertising or publicity relating to this software or products |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
34 * derived from it. This software may be referred to only as "the Independent |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
35 * JPEG Group's software". |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
36 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
37 * We specifically permit and encourage the use of this software as the basis |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
38 * of commercial products, provided that all warranty or liability claims are |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
39 * assumed by the product vendor. |
0 | 40 * |
41 * This file contains the basic inverse-DCT transformation subroutine. | |
42 * | |
43 * This implementation is based on an algorithm described in | |
44 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT | |
45 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, | |
46 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. | |
47 * The primary algorithm described there uses 11 multiplies and 29 adds. | |
48 * We use their alternate method with 12 multiplies and 32 adds. | |
49 * The advantage of this method is that no data path contains more than one | |
50 * multiplication; this allows a very simple and accurate implementation in | |
51 * scaled fixed-point arithmetic, with a minimal number of shifts. | |
2967 | 52 * |
0 | 53 * I've made lots of modifications to attempt to take advantage of the |
54 * sparse nature of the DCT matrices we're getting. Although the logic | |
55 * is cumbersome, it's straightforward and the resulting code is much | |
56 * faster. | |
57 * | |
58 * A better way to do this would be to pass in the DCT block as a sparse | |
59 * matrix, perhaps with the different cases encoded. | |
60 */ | |
2967 | 61 |
1106 | 62 /** |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
6763
diff
changeset
|
63 * @file libavcodec/jrevdct.c |
1106 | 64 * Independent JPEG Group's LLM idct. |
65 */ | |
2967 | 66 |
6763 | 67 #include "libavutil/common.h" |
0 | 68 #include "dsputil.h" |
69 | |
70 #define EIGHT_BIT_SAMPLES | |
71 | |
72 #define DCTSIZE 8 | |
73 #define DCTSIZE2 64 | |
74 | |
75 #define GLOBAL | |
76 | |
77 #define RIGHT_SHIFT(x, n) ((x) >> (n)) | |
78 | |
79 typedef DCTELEM DCTBLOCK[DCTSIZE2]; | |
80 | |
81 #define CONST_BITS 13 | |
82 | |
83 /* | |
84 * This routine is specialized to the case DCTSIZE = 8. | |
85 */ | |
86 | |
87 #if DCTSIZE != 8 | |
88 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ | |
89 #endif | |
90 | |
91 | |
92 /* | |
93 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT | |
94 * on each column. Direct algorithms are also available, but they are | |
95 * much more complex and seem not to be any faster when reduced to code. | |
96 * | |
97 * The poop on this scaling stuff is as follows: | |
98 * | |
99 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) | |
100 * larger than the true IDCT outputs. The final outputs are therefore | |
101 * a factor of N larger than desired; since N=8 this can be cured by | |
102 * a simple right shift at the end of the algorithm. The advantage of | |
103 * this arrangement is that we save two multiplications per 1-D IDCT, | |
104 * because the y0 and y4 inputs need not be divided by sqrt(N). | |
105 * | |
106 * We have to do addition and subtraction of the integer inputs, which | |
107 * is no problem, and multiplication by fractional constants, which is | |
108 * a problem to do in integer arithmetic. We multiply all the constants | |
109 * by CONST_SCALE and convert them to integer constants (thus retaining | |
110 * CONST_BITS bits of precision in the constants). After doing a | |
111 * multiplication we have to divide the product by CONST_SCALE, with proper | |
112 * rounding, to produce the correct output. This division can be done | |
113 * cheaply as a right shift of CONST_BITS bits. We postpone shifting | |
114 * as long as possible so that partial sums can be added together with | |
115 * full fractional precision. | |
116 * | |
117 * The outputs of the first pass are scaled up by PASS1_BITS bits so that | |
118 * they are represented to better-than-integral precision. These outputs | |
119 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word | |
120 * with the recommended scaling. (To scale up 12-bit sample data further, an | |
121 * intermediate int32 array would be needed.) | |
122 * | |
123 * To avoid overflow of the 32-bit intermediate results in pass 2, we must | |
124 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis | |
125 * shows that the values given below are the most effective. | |
126 */ | |
127 | |
128 #ifdef EIGHT_BIT_SAMPLES | |
129 #define PASS1_BITS 2 | |
130 #else | |
2979 | 131 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */ |
0 | 132 #endif |
133 | |
2979 | 134 #define ONE ((int32_t) 1) |
0 | 135 |
136 #define CONST_SCALE (ONE << CONST_BITS) | |
137 | |
138 /* Convert a positive real constant to an integer scaled by CONST_SCALE. | |
139 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time, | |
140 * you will pay a significant penalty in run time. In that case, figure | |
141 * the correct integer constant values and insert them by hand. | |
142 */ | |
143 | |
144 /* Note: FIX() is no longer used; the FIX_* constants below are all precomputed. */ | |
2979 | 145 #define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5)) |
0 | 146 |
1064 | 147 /* Descale and correctly round an int32_t value that's scaled by N bits. |
0 | 148 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding |
149 * the fudge factor is correct for either sign of X. | |
150 */ | |
151 | |
152 #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) | |
153 | |
1064 | 154 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result. |
0 | 155 * For 8-bit samples with the recommended scaling, all the variable |
156 * and constant values involved are no more than 16 bits wide, so a | |
157 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply; | |
158 * this provides a useful speedup on many machines. | |
159 * There is no way to specify a 16x16->32 multiply in portable C, but | |
160 * some C compilers will do the right thing if you provide the correct | |
161 * combination of casts. | |
162 * NB: for 12-bit samples, a full 32-bit multiplication will be needed. | |
163 */ | |
164 | |
165 #ifdef EIGHT_BIT_SAMPLES | |
2979 | 166 #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ |
1064 | 167 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const))) |
0 | 168 #endif |
2979 | 169 #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ |
1064 | 170 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const))) |
0 | 171 #endif |
172 #endif | |
173 | |
2979 | 174 #ifndef MULTIPLY /* default definition */ |
0 | 175 #define MULTIPLY(var,const) ((var) * (const)) |
176 #endif | |
177 | |
178 | |
2967 | 179 /* |
0 | 180 Unlike our decoder, where we approximate the FIX constants, we need to use exact |
2967 | 181 ones here, or successive P-frames will drift too much with reference frame coding. |
0 | 182 */ |
183 #define FIX_0_211164243 1730 | |
184 #define FIX_0_275899380 2260 | |
185 #define FIX_0_298631336 2446 | |
186 #define FIX_0_390180644 3196 | |
187 #define FIX_0_509795579 4176 | |
188 #define FIX_0_541196100 4433 | |
189 #define FIX_0_601344887 4926 | |
190 #define FIX_0_765366865 6270 | |
191 #define FIX_0_785694958 6436 | |
192 #define FIX_0_899976223 7373 | |
193 #define FIX_1_061594337 8697 | |
194 #define FIX_1_111140466 9102 | |
195 #define FIX_1_175875602 9633 | |
196 #define FIX_1_306562965 10703 | |
197 #define FIX_1_387039845 11363 | |
198 #define FIX_1_451774981 11893 | |
199 #define FIX_1_501321110 12299 | |
200 #define FIX_1_662939225 13623 | |
201 #define FIX_1_847759065 15137 | |
202 #define FIX_1_961570560 16069 | |
203 #define FIX_2_053119869 16819 | |
204 #define FIX_2_172734803 17799 | |
205 #define FIX_2_562915447 20995 | |
206 #define FIX_3_072711026 25172 | |
207 | |
208 /* | |
209 * Perform the inverse DCT on one block of coefficients. | |
210 */ | |
211 | |
212 void j_rev_dct(DCTBLOCK data) | |
213 { | |
1064 | 214 int32_t tmp0, tmp1, tmp2, tmp3; |
215 int32_t tmp10, tmp11, tmp12, tmp13; | |
216 int32_t z1, z2, z3, z4, z5; | |
217 int32_t d0, d1, d2, d3, d4, d5, d6, d7; | |
0 | 218 register DCTELEM *dataptr; |
219 int rowctr; | |
2967 | 220 |
0 | 221 /* Pass 1: process rows. */ |
222 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ | |
223 /* furthermore, we scale the results by 2**PASS1_BITS. */ | |
224 | |
225 dataptr = data; | |
226 | |
227 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
228 /* Due to quantization, we will usually find that many of the input | |
229 * coefficients are zero, especially the AC terms. We can exploit this | |
230 * by short-circuiting the IDCT calculation for any row in which all | |
231 * the AC terms are zero. In that case each output is equal to the | |
232 * DC coefficient (with scale factor as needed). | |
233 * With typical images and quantization tables, half or more of the | |
234 * row DCT calculations can be simplified this way. | |
235 */ | |
236 | |
237 register int *idataptr = (int*)dataptr; | |
238 | |
36
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
239 /* WARNING: we do the same permutation as MMX idct to simplify the |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
240 video core */ |
0 | 241 d0 = dataptr[0]; |
36
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
242 d2 = dataptr[1]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
243 d4 = dataptr[2]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
244 d6 = dataptr[3]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
245 d1 = dataptr[4]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
246 d3 = dataptr[5]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
247 d5 = dataptr[6]; |
0 | 248 d7 = dataptr[7]; |
249 | |
36
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
250 if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) { |
0 | 251 /* AC terms all zero */ |
252 if (d0) { | |
2979 | 253 /* Compute a 32 bit value to assign. */ |
254 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS); | |
255 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000); | |
2967 | 256 |
2979 | 257 idataptr[0] = v; |
258 idataptr[1] = v; | |
259 idataptr[2] = v; | |
260 idataptr[3] = v; | |
0 | 261 } |
2967 | 262 |
2979 | 263 dataptr += DCTSIZE; /* advance pointer to next row */ |
0 | 264 continue; |
265 } | |
266 | |
267 /* Even part: reverse the even part of the forward DCT. */ | |
268 /* The rotator is sqrt(2)*c(-6). */ | |
269 { | |
270 if (d6) { | |
2979 | 271 if (d2) { |
272 /* d2 != 0, d6 != 0 (d0 and d4 are not tested) */ |
273 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
274 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
275 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
0 | 276 |
2979 | 277 tmp0 = (d0 + d4) << CONST_BITS; |
278 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 279 |
2979 | 280 tmp10 = tmp0 + tmp3; |
281 tmp13 = tmp0 - tmp3; | |
282 tmp11 = tmp1 + tmp2; | |
283 tmp12 = tmp1 - tmp2; | |
284 } else { | |
285 /* d2 == 0, d6 != 0 (d0 and d4 are not tested) */ |
286 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
287 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
0 | 288 |
2979 | 289 tmp0 = (d0 + d4) << CONST_BITS; |
290 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 291 |
2979 | 292 tmp10 = tmp0 + tmp3; |
293 tmp13 = tmp0 - tmp3; | |
294 tmp11 = tmp1 + tmp2; | |
295 tmp12 = tmp1 - tmp2; | |
296 } | |
2263 | 297 } else { |
2979 | 298 if (d2) { |
299 /* d2 != 0, d6 == 0 (d0 and d4 are not tested) */ |
300 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
301 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
0 | 302 |
2979 | 303 tmp0 = (d0 + d4) << CONST_BITS; |
304 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 305 |
2979 | 306 tmp10 = tmp0 + tmp3; |
307 tmp13 = tmp0 - tmp3; | |
308 tmp11 = tmp1 + tmp2; | |
309 tmp12 = tmp1 - tmp2; | |
310 } else { | |
311 /* d2 == 0, d6 == 0 (d0 and d4 are not tested) */ |
312 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
313 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
314 } | |
0 | 315 } |
316 | |
317 /* Odd part per figure 8; the matrix is unitary and hence its | |
318 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. | |
319 */ | |
320 | |
321 if (d7) { | |
2979 | 322 if (d5) { |
323 if (d3) { | |
324 if (d1) { | |
325 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ | |
326 z1 = d7 + d1; | |
327 z2 = d5 + d3; | |
328 z3 = d7 + d3; | |
329 z4 = d5 + d1; | |
330 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); | |
2967 | 331 |
2979 | 332 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
333 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
334 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
335 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
336 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
337 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
338 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
339 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 340 |
2979 | 341 z3 += z5; |
342 z4 += z5; | |
2967 | 343 |
2979 | 344 tmp0 += z1 + z3; |
345 tmp1 += z2 + z4; | |
346 tmp2 += z2 + z3; | |
347 tmp3 += z1 + z4; | |
348 } else { | |
349 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ | |
350 z2 = d5 + d3; | |
351 z3 = d7 + d3; | |
352 z5 = MULTIPLY(z3 + d5, FIX_1_175875602); | |
2967 | 353 |
2979 | 354 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
355 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
356 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
357 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
358 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
359 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
360 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
2967 | 361 |
2979 | 362 z3 += z5; |
363 z4 += z5; | |
2967 | 364 |
2979 | 365 tmp0 += z1 + z3; |
366 tmp1 += z2 + z4; | |
367 tmp2 += z2 + z3; | |
368 tmp3 = z1 + z4; | |
369 } | |
370 } else { | |
371 if (d1) { | |
372 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ | |
373 z1 = d7 + d1; | |
374 z4 = d5 + d1; | |
375 z5 = MULTIPLY(d7 + z4, FIX_1_175875602); | |
2967 | 376 |
2979 | 377 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
378 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
379 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
380 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
381 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
382 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
383 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 384 |
2979 | 385 z3 += z5; |
386 z4 += z5; | |
2967 | 387 |
2979 | 388 tmp0 += z1 + z3; |
389 tmp1 += z2 + z4; | |
390 tmp2 = z2 + z3; | |
391 tmp3 += z1 + z4; | |
392 } else { | |
393 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ | |
394 tmp0 = MULTIPLY(-d7, FIX_0_601344887); | |
395 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
396 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
397 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
398 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
399 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
400 z5 = MULTIPLY(d5 + d7, FIX_1_175875602); | |
2967 | 401 |
2979 | 402 z3 += z5; |
403 z4 += z5; | |
2967 | 404 |
2979 | 405 tmp0 += z3; |
406 tmp1 += z4; | |
407 tmp2 = z2 + z3; | |
408 tmp3 = z1 + z4; | |
409 } | |
410 } | |
411 } else { | |
412 if (d3) { | |
413 if (d1) { | |
414 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ | |
415 z1 = d7 + d1; | |
416 z3 = d7 + d3; | |
417 z5 = MULTIPLY(z3 + d1, FIX_1_175875602); | |
2967 | 418 |
2979 | 419 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
420 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
421 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
422 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
423 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
424 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
425 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
2967 | 426 |
2979 | 427 z3 += z5; |
428 z4 += z5; | |
2967 | 429 |
2979 | 430 tmp0 += z1 + z3; |
431 tmp1 = z2 + z4; | |
432 tmp2 += z2 + z3; | |
433 tmp3 += z1 + z4; | |
434 } else { | |
435 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ | |
436 z3 = d7 + d3; | |
2967 | 437 |
2979 | 438 tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
439 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
440 tmp2 = MULTIPLY(d3, FIX_0_509795579); | |
441 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
442 z5 = MULTIPLY(z3, FIX_1_175875602); | |
443 z3 = MULTIPLY(-z3, FIX_0_785694958); | |
2967 | 444 |
2979 | 445 tmp0 += z3; |
446 tmp1 = z2 + z5; | |
447 tmp2 += z3; | |
448 tmp3 = z1 + z5; | |
449 } | |
450 } else { | |
451 if (d1) { | |
452 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ | |
453 z1 = d7 + d1; | |
454 z5 = MULTIPLY(z1, FIX_1_175875602); | |
0 | 455 |
2979 | 456 z1 = MULTIPLY(z1, FIX_0_275899380); |
457 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
458 tmp0 = MULTIPLY(-d7, FIX_1_662939225); | |
459 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
460 tmp3 = MULTIPLY(d1, FIX_1_111140466); | |
0 | 461 |
2979 | 462 tmp0 += z1; |
463 tmp1 = z4 + z5; | |
464 tmp2 = z3 + z5; | |
465 tmp3 += z1; | |
466 } else { | |
467 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ | |
468 tmp0 = MULTIPLY(-d7, FIX_1_387039845); | |
469 tmp1 = MULTIPLY(d7, FIX_1_175875602); | |
470 tmp2 = MULTIPLY(-d7, FIX_0_785694958); | |
471 tmp3 = MULTIPLY(d7, FIX_0_275899380); | |
472 } | |
473 } | |
474 } | |
0 | 475 } else { |
2979 | 476 if (d5) { |
477 if (d3) { | |
478 if (d1) { | |
479 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ | |
480 z2 = d5 + d3; | |
481 z4 = d5 + d1; | |
482 z5 = MULTIPLY(d3 + z4, FIX_1_175875602); | |
2967 | 483 |
2979 | 484 tmp1 = MULTIPLY(d5, FIX_2_053119869); |
485 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
486 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
487 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
488 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
489 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
490 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 491 |
2979 | 492 z3 += z5; |
493 z4 += z5; | |
2967 | 494 |
2979 | 495 tmp0 = z1 + z3; |
496 tmp1 += z2 + z4; | |
497 tmp2 += z2 + z3; | |
498 tmp3 += z1 + z4; | |
499 } else { | |
500 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ | |
501 z2 = d5 + d3; | |
2967 | 502 |
2979 | 503 z5 = MULTIPLY(z2, FIX_1_175875602); |
504 tmp1 = MULTIPLY(d5, FIX_1_662939225); | |
505 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
506 z2 = MULTIPLY(-z2, FIX_1_387039845); | |
507 tmp2 = MULTIPLY(d3, FIX_1_111140466); | |
508 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
2967 | 509 |
2979 | 510 tmp0 = z3 + z5; |
511 tmp1 += z2; | |
512 tmp2 += z2; | |
513 tmp3 = z4 + z5; | |
514 } | |
515 } else { | |
516 if (d1) { | |
517 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ | |
518 z4 = d5 + d1; | |
2967 | 519 |
2979 | 520 z5 = MULTIPLY(z4, FIX_1_175875602); |
521 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
522 tmp3 = MULTIPLY(d1, FIX_0_601344887); | |
523 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
524 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
525 z4 = MULTIPLY(z4, FIX_0_785694958); | |
2967 | 526 |
2979 | 527 tmp0 = z1 + z5; |
528 tmp1 += z4; | |
529 tmp2 = z2 + z5; | |
530 tmp3 += z4; | |
531 } else { | |
532 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ | |
533 tmp0 = MULTIPLY(d5, FIX_1_175875602); | |
534 tmp1 = MULTIPLY(d5, FIX_0_275899380); | |
535 tmp2 = MULTIPLY(-d5, FIX_1_387039845); | |
536 tmp3 = MULTIPLY(d5, FIX_0_785694958); | |
537 } | |
538 } | |
539 } else { | |
540 if (d3) { | |
541 if (d1) { | |
542 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ | |
543 z5 = d1 + d3; | |
544 tmp3 = MULTIPLY(d1, FIX_0_211164243); | |
545 tmp2 = MULTIPLY(-d3, FIX_1_451774981); | |
546 z1 = MULTIPLY(d1, FIX_1_061594337); | |
547 z2 = MULTIPLY(-d3, FIX_2_172734803); | |
548 z4 = MULTIPLY(z5, FIX_0_785694958); | |
549 z5 = MULTIPLY(z5, FIX_1_175875602); | |
2967 | 550 |
2979 | 551 tmp0 = z1 - z4; |
552 tmp1 = z2 + z4; | |
553 tmp2 += z5; | |
554 tmp3 += z5; | |
555 } else { | |
556 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ | |
557 tmp0 = MULTIPLY(-d3, FIX_0_785694958); | |
558 tmp1 = MULTIPLY(-d3, FIX_1_387039845); | |
559 tmp2 = MULTIPLY(-d3, FIX_0_275899380); | |
560 tmp3 = MULTIPLY(d3, FIX_1_175875602); | |
561 } | |
562 } else { | |
563 if (d1) { | |
564 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ | |
565 tmp0 = MULTIPLY(d1, FIX_0_275899380); | |
566 tmp1 = MULTIPLY(d1, FIX_0_785694958); | |
567 tmp2 = MULTIPLY(d1, FIX_1_175875602); | |
568 tmp3 = MULTIPLY(d1, FIX_1_387039845); | |
569 } else { | |
570 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ | |
571 tmp0 = tmp1 = tmp2 = tmp3 = 0; | |
572 } | |
573 } | |
574 } | |
0 | 575 } |
576 } | |
577 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
578 | |
579 dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); | |
580 dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); | |
581 dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); | |
582 dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); | |
583 dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); | |
584 dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); | |
585 dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); | |
586 dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); | |
587 | |
2979 | 588 dataptr += DCTSIZE; /* advance pointer to next row */ |
0 | 589 } |
590 | |
591 /* Pass 2: process columns. */ | |
592 /* Note that we must descale the results by a factor of 8 == 2**3, */ | |
593 /* and also undo the PASS1_BITS scaling. */ | |
594 | |
595 dataptr = data; | |
596 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
597 /* Columns of zeroes can be exploited in the same way as we did with rows. | |
598 * However, the row calculation has created many nonzero AC terms, so the | |
599 * simplification applies less often (typically 5% to 10% of the time). | |
600 * On machines with very fast multiplication, it's possible that the | |
601 * test takes more time than it's worth. In that case this section | |
602 * may be commented out. | |
603 */ | |
604 | |
605 d0 = dataptr[DCTSIZE*0]; | |
606 d1 = dataptr[DCTSIZE*1]; | |
607 d2 = dataptr[DCTSIZE*2]; | |
608 d3 = dataptr[DCTSIZE*3]; | |
609 d4 = dataptr[DCTSIZE*4]; | |
610 d5 = dataptr[DCTSIZE*5]; | |
611 d6 = dataptr[DCTSIZE*6]; | |
612 d7 = dataptr[DCTSIZE*7]; | |
613 | |
614 /* Even part: reverse the even part of the forward DCT. */ | |
615 /* The rotator is sqrt(2)*c(-6). */ | |
616 if (d6) { | |
2979 | 617 if (d2) { |
618 /* d2 != 0, d6 != 0 (d0 and d4 are not tested) */ |
619 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
620 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
621 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
0 | 622 |
2979 | 623 tmp0 = (d0 + d4) << CONST_BITS; |
624 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 625 |
2979 | 626 tmp10 = tmp0 + tmp3; |
627 tmp13 = tmp0 - tmp3; | |
628 tmp11 = tmp1 + tmp2; | |
629 tmp12 = tmp1 - tmp2; | |
630 } else { | |
631 /* d2 == 0, d6 != 0 (d0 and d4 are not tested) */ |
632 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
633 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
0 | 634 |
2979 | 635 tmp0 = (d0 + d4) << CONST_BITS; |
636 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 637 |
2979 | 638 tmp10 = tmp0 + tmp3; |
639 tmp13 = tmp0 - tmp3; | |
640 tmp11 = tmp1 + tmp2; | |
641 tmp12 = tmp1 - tmp2; | |
642 } | |
2263 | 643 } else { |
2979 | 644 if (d2) { |
645 /* d2 != 0, d6 == 0 (d0 and d4 are not tested) */ |
646 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
647 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
0 | 648 |
2979 | 649 tmp0 = (d0 + d4) << CONST_BITS; |
650 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 651 |
2979 | 652 tmp10 = tmp0 + tmp3; |
653 tmp13 = tmp0 - tmp3; | |
654 tmp11 = tmp1 + tmp2; | |
655 tmp12 = tmp1 - tmp2; | |
656 } else { | |
657 /* d2 == 0, d6 == 0 (d0 and d4 are not tested) */ |
658 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
659 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
660 } | |
0 | 661 } |
662 | |
663 /* Odd part per figure 8; the matrix is unitary and hence its | |
664 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. | |
665 */ | |
666 if (d7) { | |
2979 | 667 if (d5) { |
668 if (d3) { | |
669 if (d1) { | |
670 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ | |
671 z1 = d7 + d1; | |
672 z2 = d5 + d3; | |
673 z3 = d7 + d3; | |
674 z4 = d5 + d1; | |
675 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); | |
2967 | 676 |
2979 | 677 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
678 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
679 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
680 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
681 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
682 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
683 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
684 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 685 |
2979 | 686 z3 += z5; |
687 z4 += z5; | |
2967 | 688 |
2979 | 689 tmp0 += z1 + z3; |
690 tmp1 += z2 + z4; | |
691 tmp2 += z2 + z3; | |
692 tmp3 += z1 + z4; | |
693 } else { | |
694 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ | |
695 z2 = d5 + d3; | |
696 z3 = d7 + d3; | |
697 z5 = MULTIPLY(z3 + d5, FIX_1_175875602); | |
2967 | 698 |
2979 | 699 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
700 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
701 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
702 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
703 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
704 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
705 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
2967 | 706 |
2979 | 707 z3 += z5; |
708 z4 += z5; | |
2967 | 709 |
2979 | 710 tmp0 += z1 + z3; |
711 tmp1 += z2 + z4; | |
712 tmp2 += z2 + z3; | |
713 tmp3 = z1 + z4; | |
714 } | |
715 } else { | |
716 if (d1) { | |
717 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ | |
718 z1 = d7 + d1; | |
719 z3 = d7; | |
720 z4 = d5 + d1; | |
721 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); | |
2967 | 722 |
2979 | 723 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
724 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
725 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
726 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
727 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
728 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
729 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 730 |
2979 | 731 z3 += z5; |
732 z4 += z5; | |
2967 | 733 |
2979 | 734 tmp0 += z1 + z3; |
735 tmp1 += z2 + z4; | |
736 tmp2 = z2 + z3; | |
737 tmp3 += z1 + z4; | |
738 } else { | |
739 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ | |
740 tmp0 = MULTIPLY(-d7, FIX_0_601344887); | |
741 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
742 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
743 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
744 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
745 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
746 z5 = MULTIPLY(d5 + d7, FIX_1_175875602); | |
2967 | 747 |
2979 | 748 z3 += z5; |
749 z4 += z5; | |
2967 | 750 |
2979 | 751 tmp0 += z3; |
752 tmp1 += z4; | |
753 tmp2 = z2 + z3; | |
754 tmp3 = z1 + z4; | |
755 } | |
756 } | |
757 } else { | |
758 if (d3) { | |
759 if (d1) { | |
760 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ | |
761 z1 = d7 + d1; | |
762 z3 = d7 + d3; | |
763 z5 = MULTIPLY(z3 + d1, FIX_1_175875602); | |
2967 | 764 |
2979 | 765 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
766 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
767 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
768 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
769 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
770 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
771 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
2967 | 772 |
2979 | 773 z3 += z5; |
774 z4 += z5; | |
2967 | 775 |
2979 | 776 tmp0 += z1 + z3; |
777 tmp1 = z2 + z4; | |
778 tmp2 += z2 + z3; | |
779 tmp3 += z1 + z4; | |
780 } else { | |
781 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ | |
782 z3 = d7 + d3; | |
2967 | 783 |
2979 | 784 tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
785 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
786 tmp2 = MULTIPLY(d3, FIX_0_509795579); | |
787 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
788 z5 = MULTIPLY(z3, FIX_1_175875602); | |
789 z3 = MULTIPLY(-z3, FIX_0_785694958); | |
2967 | 790 |
2979 | 791 tmp0 += z3; |
792 tmp1 = z2 + z5; | |
793 tmp2 += z3; | |
794 tmp3 = z1 + z5; | |
795 } | |
796 } else { | |
797 if (d1) { | |
798 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ | |
799 z1 = d7 + d1; | |
800 z5 = MULTIPLY(z1, FIX_1_175875602); | |
0 | 801 |
2979 | 802 z1 = MULTIPLY(z1, FIX_0_275899380); |
803 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
804 tmp0 = MULTIPLY(-d7, FIX_1_662939225); | |
805 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
806 tmp3 = MULTIPLY(d1, FIX_1_111140466); | |
0 | 807 |
2979 | 808 tmp0 += z1; |
809 tmp1 = z4 + z5; | |
810 tmp2 = z3 + z5; | |
811 tmp3 += z1; | |
812 } else { | |
813 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ | |
814 tmp0 = MULTIPLY(-d7, FIX_1_387039845); | |
815 tmp1 = MULTIPLY(d7, FIX_1_175875602); | |
816 tmp2 = MULTIPLY(-d7, FIX_0_785694958); | |
817 tmp3 = MULTIPLY(d7, FIX_0_275899380); | |
818 } | |
819 } | |
820 } | |
0 | 821 } else { |
2979 | 822 if (d5) { |
823 if (d3) { | |
824 if (d1) { | |
825 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ | |
826 z2 = d5 + d3; | |
827 z4 = d5 + d1; | |
828 z5 = MULTIPLY(d3 + z4, FIX_1_175875602); | |
2967 | 829 |
2979 | 830 tmp1 = MULTIPLY(d5, FIX_2_053119869); |
831 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
832 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
833 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
834 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
835 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
836 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 837 |
2979 | 838 z3 += z5; |
839 z4 += z5; | |
2967 | 840 |
2979 | 841 tmp0 = z1 + z3; |
842 tmp1 += z2 + z4; | |
843 tmp2 += z2 + z3; | |
844 tmp3 += z1 + z4; | |
845 } else { | |
846 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ | |
847 z2 = d5 + d3; | |
2967 | 848 |
2979 | 849 z5 = MULTIPLY(z2, FIX_1_175875602); |
850 tmp1 = MULTIPLY(d5, FIX_1_662939225); | |
851 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
852 z2 = MULTIPLY(-z2, FIX_1_387039845); | |
853 tmp2 = MULTIPLY(d3, FIX_1_111140466); | |
854 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
2967 | 855 |
2979 | 856 tmp0 = z3 + z5; |
857 tmp1 += z2; | |
858 tmp2 += z2; | |
859 tmp3 = z4 + z5; | |
860 } | |
861 } else { | |
862 if (d1) { | |
863 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ | |
864 z4 = d5 + d1; | |
2967 | 865 |
2979 | 866 z5 = MULTIPLY(z4, FIX_1_175875602); |
867 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
868 tmp3 = MULTIPLY(d1, FIX_0_601344887); | |
869 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
870 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
871 z4 = MULTIPLY(z4, FIX_0_785694958); | |
2967 | 872 |
2979 | 873 tmp0 = z1 + z5; |
874 tmp1 += z4; | |
875 tmp2 = z2 + z5; | |
876 tmp3 += z4; | |
877 } else { | |
878 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ | |
879 tmp0 = MULTIPLY(d5, FIX_1_175875602); | |
880 tmp1 = MULTIPLY(d5, FIX_0_275899380); | |
881 tmp2 = MULTIPLY(-d5, FIX_1_387039845); | |
882 tmp3 = MULTIPLY(d5, FIX_0_785694958); | |
883 } | |
884 } | |
885 } else { | |
886 if (d3) { | |
887 if (d1) { | |
888 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ | |
889 z5 = d1 + d3; | |
890 tmp3 = MULTIPLY(d1, FIX_0_211164243); | |
891 tmp2 = MULTIPLY(-d3, FIX_1_451774981); | |
892 z1 = MULTIPLY(d1, FIX_1_061594337); | |
893 z2 = MULTIPLY(-d3, FIX_2_172734803); | |
894 z4 = MULTIPLY(z5, FIX_0_785694958); | |
895 z5 = MULTIPLY(z5, FIX_1_175875602); | |
2967 | 896 |
2979 | 897 tmp0 = z1 - z4; |
898 tmp1 = z2 + z4; | |
899 tmp2 += z5; | |
900 tmp3 += z5; | |
901 } else { | |
902 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ | |
903 tmp0 = MULTIPLY(-d3, FIX_0_785694958); | |
904 tmp1 = MULTIPLY(-d3, FIX_1_387039845); | |
905 tmp2 = MULTIPLY(-d3, FIX_0_275899380); | |
906 tmp3 = MULTIPLY(d3, FIX_1_175875602); | |
907 } | |
908 } else { | |
909 if (d1) { | |
910 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ | |
911 tmp0 = MULTIPLY(d1, FIX_0_275899380); | |
912 tmp1 = MULTIPLY(d1, FIX_0_785694958); | |
913 tmp2 = MULTIPLY(d1, FIX_1_175875602); | |
914 tmp3 = MULTIPLY(d1, FIX_1_387039845); | |
915 } else { | |
916 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ | |
917 tmp0 = tmp1 = tmp2 = tmp3 = 0; | |
918 } | |
919 } | |
920 } | |
0 | 921 } |
922 | |
923 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
924 | |
925 dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3, | |
2979 | 926 CONST_BITS+PASS1_BITS+3); |
0 | 927 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3, |
2979 | 928 CONST_BITS+PASS1_BITS+3); |
0 | 929 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2, |
2979 | 930 CONST_BITS+PASS1_BITS+3); |
0 | 931 dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2, |
2979 | 932 CONST_BITS+PASS1_BITS+3); |
0 | 933 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1, |
2979 | 934 CONST_BITS+PASS1_BITS+3); |
0 | 935 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1, |
2979 | 936 CONST_BITS+PASS1_BITS+3); |
0 | 937 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0, |
2979 | 938 CONST_BITS+PASS1_BITS+3); |
0 | 939 dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0, |
2979 | 940 CONST_BITS+PASS1_BITS+3); |
2967 | 941 |
2979 | 942 dataptr++; /* advance pointer to next column */ |
0 | 943 } |
944 } | |
945 | |
/* Geometry for the reduced-size transforms that follow: DCTSIZE becomes 4
 * (j_rev_dct4 loops DCTSIZE times over rows and again over columns), while
 * DCTSTRIDE (8) is the element distance between vertically adjacent
 * coefficients as indexed by j_rev_dct4 and j_rev_dct2.
 * NOTE(review): presumably the stride stays 8 because the coefficients still
 * live in the top-left corner of a full-size DCTBLOCK -- confirm with callers. */
2256 | 946 #undef DCTSIZE |
947 #define DCTSIZE 4 | |
948 #define DCTSTRIDE 8 | |
949 | |
950 void j_rev_dct4(DCTBLOCK data) | |
951 { | |
952 int32_t tmp0, tmp1, tmp2, tmp3; | |
953 int32_t tmp10, tmp11, tmp12, tmp13; | |
954 int32_t z1; | |
955 int32_t d0, d2, d4, d6; | |
956 register DCTELEM *dataptr; | |
957 int rowctr; | |
2262 | 958 |
2256 | 959 /* Pass 1: process rows. */ |
960 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ | |
961 /* furthermore, we scale the results by 2**PASS1_BITS. */ | |
962 | |
2262 | 963 data[0] += 4; |
2967 | 964 |
2256 | 965 dataptr = data; |
966 | |
967 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
968 /* Due to quantization, we will usually find that many of the input | |
969 * coefficients are zero, especially the AC terms. We can exploit this | |
970 * by short-circuiting the IDCT calculation for any row in which all | |
971 * the AC terms are zero. In that case each output is equal to the | |
972 * DC coefficient (with scale factor as needed). | |
973 * With typical images and quantization tables, half or more of the | |
974 * row DCT calculations can be simplified this way. | |
975 */ | |
976 | |
977 register int *idataptr = (int*)dataptr; | |
978 | |
979 d0 = dataptr[0]; | |
980 d2 = dataptr[1]; | |
981 d4 = dataptr[2]; | |
982 d6 = dataptr[3]; | |
983 | |
984 if ((d2 | d4 | d6) == 0) { | |
985 /* AC terms all zero */ | |
986 if (d0) { | |
2979 | 987 /* Compute a 32 bit value to assign. */ |
988 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS); | |
989 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000); | |
2967 | 990 |
2979 | 991 idataptr[0] = v; |
992 idataptr[1] = v; | |
2256 | 993 } |
2967 | 994 |
2979 | 995 dataptr += DCTSTRIDE; /* advance pointer to next row */ |
2256 | 996 continue; |
997 } | |
2967 | 998 |
2256 | 999 /* Even part: reverse the even part of the forward DCT. */ |
1000 /* The rotator is sqrt(2)*c(-6). */ | |
1001 if (d6) { | |
2979 | 1002 if (d2) { |
1003 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ | |
1004 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
1005 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
1006 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
2256 | 1007 |
2979 | 1008 tmp0 = (d0 + d4) << CONST_BITS; |
1009 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1010 |
2979 | 1011 tmp10 = tmp0 + tmp3; |
1012 tmp13 = tmp0 - tmp3; | |
1013 tmp11 = tmp1 + tmp2; | |
1014 tmp12 = tmp1 - tmp2; | |
1015 } else { | |
1016 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ | |
1017 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
1018 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
2256 | 1019 |
2979 | 1020 tmp0 = (d0 + d4) << CONST_BITS; |
1021 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1022 |
2979 | 1023 tmp10 = tmp0 + tmp3; |
1024 tmp13 = tmp0 - tmp3; | |
1025 tmp11 = tmp1 + tmp2; | |
1026 tmp12 = tmp1 - tmp2; | |
1027 } | |
2262 | 1028 } else { |
2979 | 1029 if (d2) { |
1030 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ | |
1031 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
1032 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
2256 | 1033 |
2979 | 1034 tmp0 = (d0 + d4) << CONST_BITS; |
1035 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1036 |
2979 | 1037 tmp10 = tmp0 + tmp3; |
1038 tmp13 = tmp0 - tmp3; | |
1039 tmp11 = tmp1 + tmp2; | |
1040 tmp12 = tmp1 - tmp2; | |
1041 } else { | |
1042 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ | |
1043 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
1044 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
1045 } | |
2256 | 1046 } |
1047 | |
1048 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
1049 | |
1050 dataptr[0] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); | |
1051 dataptr[1] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); | |
1052 dataptr[2] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); | |
1053 dataptr[3] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); | |
1054 | |
2979 | 1055 dataptr += DCTSTRIDE; /* advance pointer to next row */ |
2256 | 1056 } |
1057 | |
1058 /* Pass 2: process columns. */ | |
1059 /* Note that we must descale the results by a factor of 8 == 2**3, */ | |
1060 /* and also undo the PASS1_BITS scaling. */ | |
1061 | |
1062 dataptr = data; | |
1063 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
1064 /* Columns of zeroes can be exploited in the same way as we did with rows. | |
1065 * However, the row calculation has created many nonzero AC terms, so the | |
1066 * simplification applies less often (typically 5% to 10% of the time). | |
1067 * On machines with very fast multiplication, it's possible that the | |
1068 * test takes more time than it's worth. In that case this section | |
1069 * may be commented out. | |
1070 */ | |
1071 | |
1072 d0 = dataptr[DCTSTRIDE*0]; | |
1073 d2 = dataptr[DCTSTRIDE*1]; | |
1074 d4 = dataptr[DCTSTRIDE*2]; | |
1075 d6 = dataptr[DCTSTRIDE*3]; | |
1076 | |
1077 /* Even part: reverse the even part of the forward DCT. */ | |
1078 /* The rotator is sqrt(2)*c(-6). */ | |
1079 if (d6) { | |
2979 | 1080 if (d2) { |
1081 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ | |
1082 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
1083 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
1084 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
2256 | 1085 |
2979 | 1086 tmp0 = (d0 + d4) << CONST_BITS; |
1087 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1088 |
2979 | 1089 tmp10 = tmp0 + tmp3; |
1090 tmp13 = tmp0 - tmp3; | |
1091 tmp11 = tmp1 + tmp2; | |
1092 tmp12 = tmp1 - tmp2; | |
1093 } else { | |
1094 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ | |
1095 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
1096 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
2256 | 1097 |
2979 | 1098 tmp0 = (d0 + d4) << CONST_BITS; |
1099 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1100 |
2979 | 1101 tmp10 = tmp0 + tmp3; |
1102 tmp13 = tmp0 - tmp3; | |
1103 tmp11 = tmp1 + tmp2; | |
1104 tmp12 = tmp1 - tmp2; | |
1105 } | |
2262 | 1106 } else { |
2979 | 1107 if (d2) { |
1108 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ | |
1109 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
1110 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
2256 | 1111 |
2979 | 1112 tmp0 = (d0 + d4) << CONST_BITS; |
1113 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1114 |
2979 | 1115 tmp10 = tmp0 + tmp3; |
1116 tmp13 = tmp0 - tmp3; | |
1117 tmp11 = tmp1 + tmp2; | |
1118 tmp12 = tmp1 - tmp2; | |
1119 } else { | |
1120 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ | |
1121 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
1122 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
1123 } | |
2256 | 1124 } |
1125 | |
1126 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
1127 | |
2262 | 1128 dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3); |
1129 dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3); | |
1130 dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3); | |
1131 dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3); | |
2967 | 1132 |
2979 | 1133 dataptr++; /* advance pointer to next column */ |
2256 | 1134 } |
1135 } | |
1136 | |
2257 | 1137 void j_rev_dct2(DCTBLOCK data){ |
1138 int d00, d01, d10, d11; | |
1139 | |
1140 data[0] += 4; | |
1141 d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE]; | |
1142 d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE]; | |
1143 d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE]; | |
1144 d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE]; | |
2967 | 1145 |
2257 | 1146 data[0+0*DCTSTRIDE]= (d00 + d10)>>3; |
1147 data[1+0*DCTSTRIDE]= (d01 + d11)>>3; | |
1148 data[0+1*DCTSTRIDE]= (d00 - d10)>>3; | |
1149 data[1+1*DCTSTRIDE]= (d01 - d11)>>3; | |
1150 } | |
2256 | 1151 |
2259 | 1152 void j_rev_dct1(DCTBLOCK data){ |
1153 data[0] = (data[0] + 4)>>3; | |
1154 } | |
1155 | |
440
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
36
diff
changeset
|
1156 #undef FIX |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
36
diff
changeset
|
1157 #undef CONST_BITS |