Mercurial > libavcodec.hg
annotate jrevdct.c @ 9108:fdc0db1e0087 libavcodec
H.264: Simplify decode_residual()
author | alexc |
---|---|
date | Tue, 03 Mar 2009 05:26:39 +0000 |
parents | e9d9d946f213 |
children | ab2daba3e200 |
rev | line source |
---|---|
0 | 1 /* |
2 * jrevdct.c | |
3 * | |
4 * This file is part of the Independent JPEG Group's software. | |
3669
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
5 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
6 * The authors make NO WARRANTY or representation, either express or implied, |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
7 * with respect to this software, its quality, accuracy, merchantability, or |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
8 * fitness for a particular purpose. This software is provided "AS IS", and |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
9 * you, its user, assume the entire risk as to its quality and accuracy. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
10 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
11 * This software is copyright (C) 1991, 1992, Thomas G. Lane. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
12 * All Rights Reserved except as specified below. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
13 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
14 * Permission is hereby granted to use, copy, modify, and distribute this |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
15 * software (or portions thereof) for any purpose, without fee, subject to |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
16 * these conditions: |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
17 * (1) If any part of the source code for this software is distributed, then |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
18 * this README file must be included, with this copyright and no-warranty |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
19 * notice unaltered; and any additions, deletions, or changes to the original |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
20 * files must be clearly indicated in accompanying documentation. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
21 * (2) If only executable code is distributed, then the accompanying |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
22 * documentation must state that "this software is based in part on the work |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
23 * of the Independent JPEG Group". |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
24 * (3) Permission for use of this software is granted only if the user accepts |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
25 * full responsibility for any undesirable consequences; the authors accept |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
26 * NO LIABILITY for damages of any kind. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
27 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
28 * These conditions apply to any software derived from or based on the IJG |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
29 * code, not just to the unmodified library. If you use our work, you ought |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
30 * to acknowledge us. |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
31 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
32 * Permission is NOT granted for the use of any IJG author's name or company |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
33 * name in advertising or publicity relating to this software or products |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
34 * derived from it. This software may be referred to only as "the Independent |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
35 * JPEG Group's software". |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
36 * |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
37 * We specifically permit and encourage the use of this software as the basis |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
38 * of commercial products, provided that all warranty or liability claims are |
9b98e18a1b1c
Add copyright notice from the Independent JPEG Group instead of referring
diego
parents:
2979
diff
changeset
|
39 * assumed by the product vendor. |
0 | 40 * |
41 * This file contains the basic inverse-DCT transformation subroutine. | |
42 * | |
43 * This implementation is based on an algorithm described in | |
44 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT | |
45 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, | |
46 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. | |
47 * The primary algorithm described there uses 11 multiplies and 29 adds. | |
48 * We use their alternate method with 12 multiplies and 32 adds. | |
49 * The advantage of this method is that no data path contains more than one | |
50 * multiplication; this allows a very simple and accurate implementation in | |
51 * scaled fixed-point arithmetic, with a minimal number of shifts. | |
2967 | 52 * |
0 | 53 * I've made lots of modifications to attempt to take advantage of the |
54 * sparse nature of the DCT matrices we're getting. Although the logic | |
55 * is cumbersome, it's straightforward and the resulting code is much | |
56 * faster. | |
57 * | |
58 * A better way to do this would be to pass in the DCT block as a sparse | |
59 * matrix, perhaps with the different cases encoded. | |
60 */ | |
2967 | 61 |
1106 | 62 /** |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
6763
diff
changeset
|
63 * @file libavcodec/jrevdct.c |
1106 | 64 * Independent JPEG Group's LLM idct. |
65 */ | |
2967 | 66 |
6763 | 67 #include "libavutil/common.h" |
0 | 68 #include "dsputil.h" |
69 | |
70 #define EIGHT_BIT_SAMPLES | |
71 | |
72 #define DCTSIZE 8 | |
73 #define DCTSIZE2 64 | |
74 | |
75 #define GLOBAL | |
76 | |
77 #define RIGHT_SHIFT(x, n) ((x) >> (n)) | |
78 | |
79 typedef DCTELEM DCTBLOCK[DCTSIZE2]; | |
80 | |
81 #define CONST_BITS 13 | |
82 | |
83 /* | |
84 * This routine is specialized to the case DCTSIZE = 8. | |
85 */ | |
86 | |
87 #if DCTSIZE != 8 | |
88 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ | |
89 #endif | |
90 | |
91 | |
92 /* | |
93 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT | |
94 * on each column. Direct algorithms are also available, but they are | |
95 * much more complex and seem not to be any faster when reduced to code. | |
96 * | |
97 * The poop on this scaling stuff is as follows: | |
98 * | |
99 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) | |
100 * larger than the true IDCT outputs. The final outputs are therefore | |
101 * a factor of N larger than desired; since N=8 this can be cured by | |
102 * a simple right shift at the end of the algorithm. The advantage of | |
103 * this arrangement is that we save two multiplications per 1-D IDCT, | |
104 * because the y0 and y4 inputs need not be divided by sqrt(N). | |
105 * | |
106 * We have to do addition and subtraction of the integer inputs, which | |
107 * is no problem, and multiplication by fractional constants, which is | |
108 * a problem to do in integer arithmetic. We multiply all the constants | |
109 * by CONST_SCALE and convert them to integer constants (thus retaining | |
110 * CONST_BITS bits of precision in the constants). After doing a | |
111 * multiplication we have to divide the product by CONST_SCALE, with proper | |
112 * rounding, to produce the correct output. This division can be done | |
113 * cheaply as a right shift of CONST_BITS bits. We postpone shifting | |
114 * as long as possible so that partial sums can be added together with | |
115 * full fractional precision. | |
116 * | |
117 * The outputs of the first pass are scaled up by PASS1_BITS bits so that | |
118 * they are represented to better-than-integral precision. These outputs | |
119 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word | |
120 * with the recommended scaling. (To scale up 12-bit sample data further, an | |
121 * intermediate int32 array would be needed.) | |
122 * | |
123 * To avoid overflow of the 32-bit intermediate results in pass 2, we must | |
124 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis | |
125 * shows that the values given below are the most effective. | |
126 */ | |
127 | |
128 #ifdef EIGHT_BIT_SAMPLES | |
129 #define PASS1_BITS 2 | |
130 #else | |
2979 | 131 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */ |
0 | 132 #endif |
133 | |
2979 | 134 #define ONE ((int32_t) 1) |
0 | 135 |
136 #define CONST_SCALE (ONE << CONST_BITS) | |
137 | |
138 /* Convert a positive real constant to an integer scaled by CONST_SCALE. | |
139 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time, | |
140 * you will pay a significant penalty in run time. In that case, figure | |
141 * the correct integer constant values and insert them by hand. | |
142 */ | |
143 | |
144 /* Actually FIX is no longer used; the constants below are all precomputed. */ | |
2979 | 145 #define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5)) |
0 | 146 |
1064 | 147 /* Descale and correctly round an int32_t value that's scaled by N bits. |
0 | 148 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding |
149 * the fudge factor is correct for either sign of X. | |
150 */ | |
151 | |
152 #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) | |
153 | |
1064 | 154 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result. |
0 | 155 * For 8-bit samples with the recommended scaling, all the variable |
156 * and constant values involved are no more than 16 bits wide, so a | |
157 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply; | |
158 * this provides a useful speedup on many machines. | |
159 * There is no way to specify a 16x16->32 multiply in portable C, but | |
160 * some C compilers will do the right thing if you provide the correct | |
161 * combination of casts. | |
162 * NB: for 12-bit samples, a full 32-bit multiplication will be needed. | |
163 */ | |
164 | |
165 #ifdef EIGHT_BIT_SAMPLES | |
2979 | 166 #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ |
1064 | 167 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const))) |
0 | 168 #endif |
2979 | 169 #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ |
1064 | 170 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const))) |
0 | 171 #endif |
172 #endif | |
173 | |
2979 | 174 #ifndef MULTIPLY /* default definition */ |
0 | 175 #define MULTIPLY(var,const) ((var) * (const)) |
176 #endif | |
177 | |
178 | |
2967 | 179 /* |
0 | 180 Unlike our decoder, where we approximate the FIX constants, we need to use exact |
2967 | 181 ones here, or successive P-frames will drift too much with reference-frame coding. |
0 | 182 */ |
183 #define FIX_0_211164243 1730 | |
184 #define FIX_0_275899380 2260 | |
185 #define FIX_0_298631336 2446 | |
186 #define FIX_0_390180644 3196 | |
187 #define FIX_0_509795579 4176 | |
188 #define FIX_0_541196100 4433 | |
189 #define FIX_0_601344887 4926 | |
190 #define FIX_0_765366865 6270 | |
191 #define FIX_0_785694958 6436 | |
192 #define FIX_0_899976223 7373 | |
193 #define FIX_1_061594337 8697 | |
194 #define FIX_1_111140466 9102 | |
195 #define FIX_1_175875602 9633 | |
196 #define FIX_1_306562965 10703 | |
197 #define FIX_1_387039845 11363 | |
198 #define FIX_1_451774981 11893 | |
199 #define FIX_1_501321110 12299 | |
200 #define FIX_1_662939225 13623 | |
201 #define FIX_1_847759065 15137 | |
202 #define FIX_1_961570560 16069 | |
203 #define FIX_2_053119869 16819 | |
204 #define FIX_2_172734803 17799 | |
205 #define FIX_2_562915447 20995 | |
206 #define FIX_3_072711026 25172 | |
207 | |
208 /* | |
209 * Perform the inverse DCT on one block of coefficients. | |
210 */ | |
211 | |
212 void j_rev_dct(DCTBLOCK data) | |
213 { | |
1064 | 214 int32_t tmp0, tmp1, tmp2, tmp3; |
215 int32_t tmp10, tmp11, tmp12, tmp13; | |
216 int32_t z1, z2, z3, z4, z5; | |
217 int32_t d0, d1, d2, d3, d4, d5, d6, d7; | |
0 | 218 register DCTELEM *dataptr; |
219 int rowctr; | |
2967 | 220 |
0 | 221 /* Pass 1: process rows. */ |
222 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ | |
223 /* furthermore, we scale the results by 2**PASS1_BITS. */ | |
224 | |
225 dataptr = data; | |
226 | |
227 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
228 /* Due to quantization, we will usually find that many of the input | |
229 * coefficients are zero, especially the AC terms. We can exploit this | |
230 * by short-circuiting the IDCT calculation for any row in which all | |
231 * the AC terms are zero. In that case each output is equal to the | |
232 * DC coefficient (with scale factor as needed). | |
233 * With typical images and quantization tables, half or more of the | |
234 * row DCT calculations can be simplified this way. | |
235 */ | |
236 | |
237 register int *idataptr = (int*)dataptr; | |
238 | |
36
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
239 /* WARNING: we do the same permutation as MMX idct to simplify the |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
240 video core */ |
0 | 241 d0 = dataptr[0]; |
36
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
242 d2 = dataptr[1]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
243 d4 = dataptr[2]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
244 d6 = dataptr[3]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
245 d1 = dataptr[4]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
246 d3 = dataptr[5]; |
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
247 d5 = dataptr[6]; |
0 | 248 d7 = dataptr[7]; |
249 | |
36
23723a0ebd24
permuted coefs in normal IDCT to avoid having different cases there
glantau
parents:
0
diff
changeset
|
250 if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) { |
0 | 251 /* AC terms all zero */ |
252 if (d0) { | |
2979 | 253 /* Compute a 32 bit value to assign. */ |
254 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS); | |
255 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000); | |
2967 | 256 |
2979 | 257 idataptr[0] = v; |
258 idataptr[1] = v; | |
259 idataptr[2] = v; | |
260 idataptr[3] = v; | |
0 | 261 } |
2967 | 262 |
2979 | 263 dataptr += DCTSIZE; /* advance pointer to next row */ |
0 | 264 continue; |
265 } | |
266 | |
267 /* Even part: reverse the even part of the forward DCT. */ | |
268 /* The rotator is sqrt(2)*c(-6). */ | |
269 { | |
270 if (d6) { | |
2979 | 271 if (d2) { |
272 /* d2 != 0, d6 != 0 (d0 and d4 are not tested here) */ |
273 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
274 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
275 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
0 | 276 |
2979 | 277 tmp0 = (d0 + d4) << CONST_BITS; |
278 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 279 |
2979 | 280 tmp10 = tmp0 + tmp3; |
281 tmp13 = tmp0 - tmp3; | |
282 tmp11 = tmp1 + tmp2; | |
283 tmp12 = tmp1 - tmp2; | |
284 } else { | |
285 /* d2 == 0, d6 != 0 (d0 and d4 are not tested here) */ |
286 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
287 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
0 | 288 |
2979 | 289 tmp0 = (d0 + d4) << CONST_BITS; |
290 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 291 |
2979 | 292 tmp10 = tmp0 + tmp3; |
293 tmp13 = tmp0 - tmp3; | |
294 tmp11 = tmp1 + tmp2; | |
295 tmp12 = tmp1 - tmp2; | |
296 } | |
2263 | 297 } else { |
2979 | 298 if (d2) { |
299 /* d2 != 0, d6 == 0 (d0 and d4 are not tested here) */ |
300 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
301 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
0 | 302 |
2979 | 303 tmp0 = (d0 + d4) << CONST_BITS; |
304 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 305 |
2979 | 306 tmp10 = tmp0 + tmp3; |
307 tmp13 = tmp0 - tmp3; | |
308 tmp11 = tmp1 + tmp2; | |
309 tmp12 = tmp1 - tmp2; | |
310 } else { | |
311 /* d2 == 0, d6 == 0 (d0 and d4 are not tested here) */ |
312 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
313 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
314 } | |
0 | 315 } |
316 | |
317 /* Odd part per figure 8; the matrix is unitary and hence its | |
318 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. | |
319 */ | |
320 | |
321 if (d7) { | |
2979 | 322 if (d5) { |
323 if (d3) { | |
324 if (d1) { | |
325 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ | |
326 z1 = d7 + d1; | |
327 z2 = d5 + d3; | |
328 z3 = d7 + d3; | |
329 z4 = d5 + d1; | |
330 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); | |
2967 | 331 |
2979 | 332 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
333 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
334 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
335 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
336 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
337 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
338 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
339 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 340 |
2979 | 341 z3 += z5; |
342 z4 += z5; | |
2967 | 343 |
2979 | 344 tmp0 += z1 + z3; |
345 tmp1 += z2 + z4; | |
346 tmp2 += z2 + z3; | |
347 tmp3 += z1 + z4; | |
348 } else { | |
349 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ | |
350 z2 = d5 + d3; | |
351 z3 = d7 + d3; | |
352 z5 = MULTIPLY(z3 + d5, FIX_1_175875602); | |
2967 | 353 |
2979 | 354 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
355 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
356 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
357 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
358 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
359 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
360 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
2967 | 361 |
2979 | 362 z3 += z5; |
363 z4 += z5; | |
2967 | 364 |
2979 | 365 tmp0 += z1 + z3; |
366 tmp1 += z2 + z4; | |
367 tmp2 += z2 + z3; | |
368 tmp3 = z1 + z4; | |
369 } | |
370 } else { | |
371 if (d1) { | |
372 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ | |
373 z1 = d7 + d1; | |
374 z4 = d5 + d1; | |
375 z5 = MULTIPLY(d7 + z4, FIX_1_175875602); | |
2967 | 376 |
2979 | 377 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
378 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
379 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
380 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
381 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
382 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
383 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 384 |
2979 | 385 z3 += z5; |
386 z4 += z5; | |
2967 | 387 |
2979 | 388 tmp0 += z1 + z3; |
389 tmp1 += z2 + z4; | |
390 tmp2 = z2 + z3; | |
391 tmp3 += z1 + z4; | |
392 } else { | |
393 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ | |
394 tmp0 = MULTIPLY(-d7, FIX_0_601344887); | |
395 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
396 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
397 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
398 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
399 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
400 z5 = MULTIPLY(d5 + d7, FIX_1_175875602); | |
2967 | 401 |
2979 | 402 z3 += z5; |
403 z4 += z5; | |
2967 | 404 |
2979 | 405 tmp0 += z3; |
406 tmp1 += z4; | |
407 tmp2 = z2 + z3; | |
408 tmp3 = z1 + z4; | |
409 } | |
410 } | |
411 } else { | |
412 if (d3) { | |
413 if (d1) { | |
414 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ | |
415 z1 = d7 + d1; | |
416 z3 = d7 + d3; | |
417 z5 = MULTIPLY(z3 + d1, FIX_1_175875602); | |
2967 | 418 |
2979 | 419 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
420 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
421 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
422 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
423 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
424 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
425 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
2967 | 426 |
2979 | 427 z3 += z5; |
428 z4 += z5; | |
2967 | 429 |
2979 | 430 tmp0 += z1 + z3; |
431 tmp1 = z2 + z4; | |
432 tmp2 += z2 + z3; | |
433 tmp3 += z1 + z4; | |
434 } else { | |
435 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ | |
436 z3 = d7 + d3; | |
2967 | 437 |
2979 | 438 tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
439 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
440 tmp2 = MULTIPLY(d3, FIX_0_509795579); | |
441 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
442 z5 = MULTIPLY(z3, FIX_1_175875602); | |
443 z3 = MULTIPLY(-z3, FIX_0_785694958); | |
2967 | 444 |
2979 | 445 tmp0 += z3; |
446 tmp1 = z2 + z5; | |
447 tmp2 += z3; | |
448 tmp3 = z1 + z5; | |
449 } | |
450 } else { | |
451 if (d1) { | |
452 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ | |
453 z1 = d7 + d1; | |
454 z5 = MULTIPLY(z1, FIX_1_175875602); | |
0 | 455 |
2979 | 456 z1 = MULTIPLY(z1, FIX_0_275899380); |
457 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
458 tmp0 = MULTIPLY(-d7, FIX_1_662939225); | |
459 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
460 tmp3 = MULTIPLY(d1, FIX_1_111140466); | |
0 | 461 |
2979 | 462 tmp0 += z1; |
463 tmp1 = z4 + z5; | |
464 tmp2 = z3 + z5; | |
465 tmp3 += z1; | |
466 } else { | |
467 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ | |
468 tmp0 = MULTIPLY(-d7, FIX_1_387039845); | |
469 tmp1 = MULTIPLY(d7, FIX_1_175875602); | |
470 tmp2 = MULTIPLY(-d7, FIX_0_785694958); | |
471 tmp3 = MULTIPLY(d7, FIX_0_275899380); | |
472 } | |
473 } | |
474 } | |
0 | 475 } else { |
2979 | 476 if (d5) { |
477 if (d3) { | |
478 if (d1) { | |
479 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ | |
480 z2 = d5 + d3; | |
481 z4 = d5 + d1; | |
482 z5 = MULTIPLY(d3 + z4, FIX_1_175875602); | |
2967 | 483 |
2979 | 484 tmp1 = MULTIPLY(d5, FIX_2_053119869); |
485 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
486 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
487 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
488 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
489 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
490 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 491 |
2979 | 492 z3 += z5; |
493 z4 += z5; | |
2967 | 494 |
2979 | 495 tmp0 = z1 + z3; |
496 tmp1 += z2 + z4; | |
497 tmp2 += z2 + z3; | |
498 tmp3 += z1 + z4; | |
499 } else { | |
500 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ | |
501 z2 = d5 + d3; | |
2967 | 502 |
2979 | 503 z5 = MULTIPLY(z2, FIX_1_175875602); |
504 tmp1 = MULTIPLY(d5, FIX_1_662939225); | |
505 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
506 z2 = MULTIPLY(-z2, FIX_1_387039845); | |
507 tmp2 = MULTIPLY(d3, FIX_1_111140466); | |
508 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
2967 | 509 |
2979 | 510 tmp0 = z3 + z5; |
511 tmp1 += z2; | |
512 tmp2 += z2; | |
513 tmp3 = z4 + z5; | |
514 } | |
515 } else { | |
516 if (d1) { | |
517 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ | |
518 z4 = d5 + d1; | |
2967 | 519 |
2979 | 520 z5 = MULTIPLY(z4, FIX_1_175875602); |
521 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
522 tmp3 = MULTIPLY(d1, FIX_0_601344887); | |
523 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
524 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
525 z4 = MULTIPLY(z4, FIX_0_785694958); | |
2967 | 526 |
2979 | 527 tmp0 = z1 + z5; |
528 tmp1 += z4; | |
529 tmp2 = z2 + z5; | |
530 tmp3 += z4; | |
531 } else { | |
532 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ | |
533 tmp0 = MULTIPLY(d5, FIX_1_175875602); | |
534 tmp1 = MULTIPLY(d5, FIX_0_275899380); | |
535 tmp2 = MULTIPLY(-d5, FIX_1_387039845); | |
536 tmp3 = MULTIPLY(d5, FIX_0_785694958); | |
537 } | |
538 } | |
539 } else { | |
540 if (d3) { | |
541 if (d1) { | |
542 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ | |
543 z5 = d1 + d3; | |
544 tmp3 = MULTIPLY(d1, FIX_0_211164243); | |
545 tmp2 = MULTIPLY(-d3, FIX_1_451774981); | |
546 z1 = MULTIPLY(d1, FIX_1_061594337); | |
547 z2 = MULTIPLY(-d3, FIX_2_172734803); | |
548 z4 = MULTIPLY(z5, FIX_0_785694958); | |
549 z5 = MULTIPLY(z5, FIX_1_175875602); | |
2967 | 550 |
2979 | 551 tmp0 = z1 - z4; |
552 tmp1 = z2 + z4; | |
553 tmp2 += z5; | |
554 tmp3 += z5; | |
555 } else { | |
556 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ | |
557 tmp0 = MULTIPLY(-d3, FIX_0_785694958); | |
558 tmp1 = MULTIPLY(-d3, FIX_1_387039845); | |
559 tmp2 = MULTIPLY(-d3, FIX_0_275899380); | |
560 tmp3 = MULTIPLY(d3, FIX_1_175875602); | |
561 } | |
562 } else { | |
563 if (d1) { | |
564 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ | |
565 tmp0 = MULTIPLY(d1, FIX_0_275899380); | |
566 tmp1 = MULTIPLY(d1, FIX_0_785694958); | |
567 tmp2 = MULTIPLY(d1, FIX_1_175875602); | |
568 tmp3 = MULTIPLY(d1, FIX_1_387039845); | |
569 } else { | |
570 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ | |
571 tmp0 = tmp1 = tmp2 = tmp3 = 0; | |
572 } | |
573 } | |
574 } | |
0 | 575 } |
576 } | |
577 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
578 | |
579 dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); | |
580 dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); | |
581 dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); | |
582 dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); | |
583 dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); | |
584 dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); | |
585 dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); | |
586 dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); | |
587 | |
2979 | 588 dataptr += DCTSIZE; /* advance pointer to next row */ |
0 | 589 } |
590 | |
591 /* Pass 2: process columns. */ | |
592 /* Note that we must descale the results by a factor of 8 == 2**3, */ | |
593 /* and also undo the PASS1_BITS scaling. */ | |
594 | |
595 dataptr = data; | |
596 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
597 /* Columns of zeroes can be exploited in the same way as we did with rows. | |
598 * However, the row calculation has created many nonzero AC terms, so the | |
599 * simplification applies less often (typically 5% to 10% of the time). | |
600 * On machines with very fast multiplication, it's possible that the | |
601 * test takes more time than it's worth. In that case this section | |
602 * may be commented out. | |
603 */ | |
604 | |
605 d0 = dataptr[DCTSIZE*0]; | |
606 d1 = dataptr[DCTSIZE*1]; | |
607 d2 = dataptr[DCTSIZE*2]; | |
608 d3 = dataptr[DCTSIZE*3]; | |
609 d4 = dataptr[DCTSIZE*4]; | |
610 d5 = dataptr[DCTSIZE*5]; | |
611 d6 = dataptr[DCTSIZE*6]; | |
612 d7 = dataptr[DCTSIZE*7]; | |
613 | |
614 /* Even part: reverse the even part of the forward DCT. */ | |
615 /* The rotator is sqrt(2)*c(-6). */ | |
616 if (d6) { | |
2979 | 617 if (d2) { |
618 /* d2 != 0, d6 != 0 (d0 and d4 are not tested here) */ |
619 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
620 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
621 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
0 | 622 |
2979 | 623 tmp0 = (d0 + d4) << CONST_BITS; |
624 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 625 |
2979 | 626 tmp10 = tmp0 + tmp3; |
627 tmp13 = tmp0 - tmp3; | |
628 tmp11 = tmp1 + tmp2; | |
629 tmp12 = tmp1 - tmp2; | |
630 } else { | |
631 /* d2 == 0, d6 != 0 (d0 and d4 are not tested here) */ |
632 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
633 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
0 | 634 |
2979 | 635 tmp0 = (d0 + d4) << CONST_BITS; |
636 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 637 |
2979 | 638 tmp10 = tmp0 + tmp3; |
639 tmp13 = tmp0 - tmp3; | |
640 tmp11 = tmp1 + tmp2; | |
641 tmp12 = tmp1 - tmp2; | |
642 } | |
2263 | 643 } else { |
2979 | 644 if (d2) { |
645 /* d2 != 0, d6 == 0 (d0 and d4 are not tested here) */ |
646 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
647 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
0 | 648 |
2979 | 649 tmp0 = (d0 + d4) << CONST_BITS; |
650 tmp1 = (d0 - d4) << CONST_BITS; | |
0 | 651 |
2979 | 652 tmp10 = tmp0 + tmp3; |
653 tmp13 = tmp0 - tmp3; | |
654 tmp11 = tmp1 + tmp2; | |
655 tmp12 = tmp1 - tmp2; | |
656 } else { | |
657 /* d2 == 0, d6 == 0 (d0 and d4 are not tested here) */ |
658 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
659 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
660 } | |
0 | 661 } |
662 | |
663 /* Odd part per figure 8; the matrix is unitary and hence its | |
664 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. | |
665 */ | |
666 if (d7) { | |
2979 | 667 if (d5) { |
668 if (d3) { | |
669 if (d1) { | |
670 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ | |
671 z1 = d7 + d1; | |
672 z2 = d5 + d3; | |
673 z3 = d7 + d3; | |
674 z4 = d5 + d1; | |
675 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); | |
2967 | 676 |
2979 | 677 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
678 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
679 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
680 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
681 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
682 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
683 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
684 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 685 |
2979 | 686 z3 += z5; |
687 z4 += z5; | |
2967 | 688 |
2979 | 689 tmp0 += z1 + z3; |
690 tmp1 += z2 + z4; | |
691 tmp2 += z2 + z3; | |
692 tmp3 += z1 + z4; | |
693 } else { | |
694 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ | |
695 z1 = d7; | |
696 z2 = d5 + d3; | |
697 z3 = d7 + d3; | |
698 z5 = MULTIPLY(z3 + d5, FIX_1_175875602); | |
2967 | 699 |
2979 | 700 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
701 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
702 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
703 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
704 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
705 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
706 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
2967 | 707 |
2979 | 708 z3 += z5; |
709 z4 += z5; | |
2967 | 710 |
2979 | 711 tmp0 += z1 + z3; |
712 tmp1 += z2 + z4; | |
713 tmp2 += z2 + z3; | |
714 tmp3 = z1 + z4; | |
715 } | |
716 } else { | |
717 if (d1) { | |
718 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ | |
719 z1 = d7 + d1; | |
720 z2 = d5; | |
721 z3 = d7; | |
722 z4 = d5 + d1; | |
723 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); | |
2967 | 724 |
2979 | 725 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
726 tmp1 = MULTIPLY(d5, FIX_2_053119869); | |
727 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
728 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
729 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
730 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
731 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 732 |
2979 | 733 z3 += z5; |
734 z4 += z5; | |
2967 | 735 |
2979 | 736 tmp0 += z1 + z3; |
737 tmp1 += z2 + z4; | |
738 tmp2 = z2 + z3; | |
739 tmp3 += z1 + z4; | |
740 } else { | |
741 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ | |
742 tmp0 = MULTIPLY(-d7, FIX_0_601344887); | |
743 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
744 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
745 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
746 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
747 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
748 z5 = MULTIPLY(d5 + d7, FIX_1_175875602); | |
2967 | 749 |
2979 | 750 z3 += z5; |
751 z4 += z5; | |
2967 | 752 |
2979 | 753 tmp0 += z3; |
754 tmp1 += z4; | |
755 tmp2 = z2 + z3; | |
756 tmp3 = z1 + z4; | |
757 } | |
758 } | |
759 } else { | |
760 if (d3) { | |
761 if (d1) { | |
762 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ | |
763 z1 = d7 + d1; | |
764 z3 = d7 + d3; | |
765 z5 = MULTIPLY(z3 + d1, FIX_1_175875602); | |
2967 | 766 |
2979 | 767 tmp0 = MULTIPLY(d7, FIX_0_298631336); |
768 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
769 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
770 z1 = MULTIPLY(-z1, FIX_0_899976223); | |
771 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
772 z3 = MULTIPLY(-z3, FIX_1_961570560); | |
773 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
2967 | 774 |
2979 | 775 z3 += z5; |
776 z4 += z5; | |
2967 | 777 |
2979 | 778 tmp0 += z1 + z3; |
779 tmp1 = z2 + z4; | |
780 tmp2 += z2 + z3; | |
781 tmp3 += z1 + z4; | |
782 } else { | |
783 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ | |
784 z3 = d7 + d3; | |
2967 | 785 |
2979 | 786 tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
787 z1 = MULTIPLY(-d7, FIX_0_899976223); | |
788 tmp2 = MULTIPLY(d3, FIX_0_509795579); | |
789 z2 = MULTIPLY(-d3, FIX_2_562915447); | |
790 z5 = MULTIPLY(z3, FIX_1_175875602); | |
791 z3 = MULTIPLY(-z3, FIX_0_785694958); | |
2967 | 792 |
2979 | 793 tmp0 += z3; |
794 tmp1 = z2 + z5; | |
795 tmp2 += z3; | |
796 tmp3 = z1 + z5; | |
797 } | |
798 } else { | |
799 if (d1) { | |
800 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ | |
801 z1 = d7 + d1; | |
802 z5 = MULTIPLY(z1, FIX_1_175875602); | |
0 | 803 |
2979 | 804 z1 = MULTIPLY(z1, FIX_0_275899380); |
805 z3 = MULTIPLY(-d7, FIX_1_961570560); | |
806 tmp0 = MULTIPLY(-d7, FIX_1_662939225); | |
807 z4 = MULTIPLY(-d1, FIX_0_390180644); | |
808 tmp3 = MULTIPLY(d1, FIX_1_111140466); | |
0 | 809 |
2979 | 810 tmp0 += z1; |
811 tmp1 = z4 + z5; | |
812 tmp2 = z3 + z5; | |
813 tmp3 += z1; | |
814 } else { | |
815 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ | |
816 tmp0 = MULTIPLY(-d7, FIX_1_387039845); | |
817 tmp1 = MULTIPLY(d7, FIX_1_175875602); | |
818 tmp2 = MULTIPLY(-d7, FIX_0_785694958); | |
819 tmp3 = MULTIPLY(d7, FIX_0_275899380); | |
820 } | |
821 } | |
822 } | |
0 | 823 } else { |
2979 | 824 if (d5) { |
825 if (d3) { | |
826 if (d1) { | |
827 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ | |
828 z2 = d5 + d3; | |
829 z4 = d5 + d1; | |
830 z5 = MULTIPLY(d3 + z4, FIX_1_175875602); | |
2967 | 831 |
2979 | 832 tmp1 = MULTIPLY(d5, FIX_2_053119869); |
833 tmp2 = MULTIPLY(d3, FIX_3_072711026); | |
834 tmp3 = MULTIPLY(d1, FIX_1_501321110); | |
835 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
836 z2 = MULTIPLY(-z2, FIX_2_562915447); | |
837 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
838 z4 = MULTIPLY(-z4, FIX_0_390180644); | |
2967 | 839 |
2979 | 840 z3 += z5; |
841 z4 += z5; | |
2967 | 842 |
2979 | 843 tmp0 = z1 + z3; |
844 tmp1 += z2 + z4; | |
845 tmp2 += z2 + z3; | |
846 tmp3 += z1 + z4; | |
847 } else { | |
848 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ | |
849 z2 = d5 + d3; | |
2967 | 850 |
2979 | 851 z5 = MULTIPLY(z2, FIX_1_175875602); |
852 tmp1 = MULTIPLY(d5, FIX_1_662939225); | |
853 z4 = MULTIPLY(-d5, FIX_0_390180644); | |
854 z2 = MULTIPLY(-z2, FIX_1_387039845); | |
855 tmp2 = MULTIPLY(d3, FIX_1_111140466); | |
856 z3 = MULTIPLY(-d3, FIX_1_961570560); | |
2967 | 857 |
2979 | 858 tmp0 = z3 + z5; |
859 tmp1 += z2; | |
860 tmp2 += z2; | |
861 tmp3 = z4 + z5; | |
862 } | |
863 } else { | |
864 if (d1) { | |
865 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ | |
866 z4 = d5 + d1; | |
2967 | 867 |
2979 | 868 z5 = MULTIPLY(z4, FIX_1_175875602); |
869 z1 = MULTIPLY(-d1, FIX_0_899976223); | |
870 tmp3 = MULTIPLY(d1, FIX_0_601344887); | |
871 tmp1 = MULTIPLY(-d5, FIX_0_509795579); | |
872 z2 = MULTIPLY(-d5, FIX_2_562915447); | |
873 z4 = MULTIPLY(z4, FIX_0_785694958); | |
2967 | 874 |
2979 | 875 tmp0 = z1 + z5; |
876 tmp1 += z4; | |
877 tmp2 = z2 + z5; | |
878 tmp3 += z4; | |
879 } else { | |
880 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ | |
881 tmp0 = MULTIPLY(d5, FIX_1_175875602); | |
882 tmp1 = MULTIPLY(d5, FIX_0_275899380); | |
883 tmp2 = MULTIPLY(-d5, FIX_1_387039845); | |
884 tmp3 = MULTIPLY(d5, FIX_0_785694958); | |
885 } | |
886 } | |
887 } else { | |
888 if (d3) { | |
889 if (d1) { | |
890 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ | |
891 z5 = d1 + d3; | |
892 tmp3 = MULTIPLY(d1, FIX_0_211164243); | |
893 tmp2 = MULTIPLY(-d3, FIX_1_451774981); | |
894 z1 = MULTIPLY(d1, FIX_1_061594337); | |
895 z2 = MULTIPLY(-d3, FIX_2_172734803); | |
896 z4 = MULTIPLY(z5, FIX_0_785694958); | |
897 z5 = MULTIPLY(z5, FIX_1_175875602); | |
2967 | 898 |
2979 | 899 tmp0 = z1 - z4; |
900 tmp1 = z2 + z4; | |
901 tmp2 += z5; | |
902 tmp3 += z5; | |
903 } else { | |
904 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ | |
905 tmp0 = MULTIPLY(-d3, FIX_0_785694958); | |
906 tmp1 = MULTIPLY(-d3, FIX_1_387039845); | |
907 tmp2 = MULTIPLY(-d3, FIX_0_275899380); | |
908 tmp3 = MULTIPLY(d3, FIX_1_175875602); | |
909 } | |
910 } else { | |
911 if (d1) { | |
912 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ | |
913 tmp0 = MULTIPLY(d1, FIX_0_275899380); | |
914 tmp1 = MULTIPLY(d1, FIX_0_785694958); | |
915 tmp2 = MULTIPLY(d1, FIX_1_175875602); | |
916 tmp3 = MULTIPLY(d1, FIX_1_387039845); | |
917 } else { | |
918 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ | |
919 tmp0 = tmp1 = tmp2 = tmp3 = 0; | |
920 } | |
921 } | |
922 } | |
0 | 923 } |
924 | |
925 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
926 | |
927 dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3, | |
2979 | 928 CONST_BITS+PASS1_BITS+3); |
0 | 929 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3, |
2979 | 930 CONST_BITS+PASS1_BITS+3); |
0 | 931 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2, |
2979 | 932 CONST_BITS+PASS1_BITS+3); |
0 | 933 dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2, |
2979 | 934 CONST_BITS+PASS1_BITS+3); |
0 | 935 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1, |
2979 | 936 CONST_BITS+PASS1_BITS+3); |
0 | 937 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1, |
2979 | 938 CONST_BITS+PASS1_BITS+3); |
0 | 939 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0, |
2979 | 940 CONST_BITS+PASS1_BITS+3); |
0 | 941 dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0, |
2979 | 942 CONST_BITS+PASS1_BITS+3); |
2967 | 943 |
2979 | 944 dataptr++; /* advance pointer to next column */ |
0 | 945 } |
946 } | |
947 | |
/*
 * Geometry for the reduced-size transforms that follow.  They touch only
 * the top-left corner of the coefficient block: DCTSIZE is the number of
 * rows/columns processed, DCTSTRIDE is the element distance between two
 * vertically adjacent coefficients.
 */
#undef  DCTSIZE
#define DCTSIZE   4
#define DCTSTRIDE 8
951 | |
952 void j_rev_dct4(DCTBLOCK data) | |
953 { | |
954 int32_t tmp0, tmp1, tmp2, tmp3; | |
955 int32_t tmp10, tmp11, tmp12, tmp13; | |
956 int32_t z1; | |
957 int32_t d0, d2, d4, d6; | |
958 register DCTELEM *dataptr; | |
959 int rowctr; | |
2262 | 960 |
2256 | 961 /* Pass 1: process rows. */ |
962 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ | |
963 /* furthermore, we scale the results by 2**PASS1_BITS. */ | |
964 | |
2262 | 965 data[0] += 4; |
2967 | 966 |
2256 | 967 dataptr = data; |
968 | |
969 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
970 /* Due to quantization, we will usually find that many of the input | |
971 * coefficients are zero, especially the AC terms. We can exploit this | |
972 * by short-circuiting the IDCT calculation for any row in which all | |
973 * the AC terms are zero. In that case each output is equal to the | |
974 * DC coefficient (with scale factor as needed). | |
975 * With typical images and quantization tables, half or more of the | |
976 * row DCT calculations can be simplified this way. | |
977 */ | |
978 | |
979 register int *idataptr = (int*)dataptr; | |
980 | |
981 d0 = dataptr[0]; | |
982 d2 = dataptr[1]; | |
983 d4 = dataptr[2]; | |
984 d6 = dataptr[3]; | |
985 | |
986 if ((d2 | d4 | d6) == 0) { | |
987 /* AC terms all zero */ | |
988 if (d0) { | |
2979 | 989 /* Compute a 32 bit value to assign. */ |
990 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS); | |
991 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000); | |
2967 | 992 |
2979 | 993 idataptr[0] = v; |
994 idataptr[1] = v; | |
2256 | 995 } |
2967 | 996 |
2979 | 997 dataptr += DCTSTRIDE; /* advance pointer to next row */ |
2256 | 998 continue; |
999 } | |
2967 | 1000 |
2256 | 1001 /* Even part: reverse the even part of the forward DCT. */ |
1002 /* The rotator is sqrt(2)*c(-6). */ | |
1003 if (d6) { | |
2979 | 1004 if (d2) { |
1005 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ | |
1006 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
1007 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
1008 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
2256 | 1009 |
2979 | 1010 tmp0 = (d0 + d4) << CONST_BITS; |
1011 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1012 |
2979 | 1013 tmp10 = tmp0 + tmp3; |
1014 tmp13 = tmp0 - tmp3; | |
1015 tmp11 = tmp1 + tmp2; | |
1016 tmp12 = tmp1 - tmp2; | |
1017 } else { | |
1018 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ | |
1019 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
1020 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
2256 | 1021 |
2979 | 1022 tmp0 = (d0 + d4) << CONST_BITS; |
1023 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1024 |
2979 | 1025 tmp10 = tmp0 + tmp3; |
1026 tmp13 = tmp0 - tmp3; | |
1027 tmp11 = tmp1 + tmp2; | |
1028 tmp12 = tmp1 - tmp2; | |
1029 } | |
2262 | 1030 } else { |
2979 | 1031 if (d2) { |
1032 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ | |
1033 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
1034 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
2256 | 1035 |
2979 | 1036 tmp0 = (d0 + d4) << CONST_BITS; |
1037 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1038 |
2979 | 1039 tmp10 = tmp0 + tmp3; |
1040 tmp13 = tmp0 - tmp3; | |
1041 tmp11 = tmp1 + tmp2; | |
1042 tmp12 = tmp1 - tmp2; | |
1043 } else { | |
1044 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ | |
1045 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
1046 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
1047 } | |
2256 | 1048 } |
1049 | |
1050 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
1051 | |
1052 dataptr[0] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); | |
1053 dataptr[1] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); | |
1054 dataptr[2] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); | |
1055 dataptr[3] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); | |
1056 | |
2979 | 1057 dataptr += DCTSTRIDE; /* advance pointer to next row */ |
2256 | 1058 } |
1059 | |
1060 /* Pass 2: process columns. */ | |
1061 /* Note that we must descale the results by a factor of 8 == 2**3, */ | |
1062 /* and also undo the PASS1_BITS scaling. */ | |
1063 | |
1064 dataptr = data; | |
1065 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { | |
1066 /* Columns of zeroes can be exploited in the same way as we did with rows. | |
1067 * However, the row calculation has created many nonzero AC terms, so the | |
1068 * simplification applies less often (typically 5% to 10% of the time). | |
1069 * On machines with very fast multiplication, it's possible that the | |
1070 * test takes more time than it's worth. In that case this section | |
1071 * may be commented out. | |
1072 */ | |
1073 | |
1074 d0 = dataptr[DCTSTRIDE*0]; | |
1075 d2 = dataptr[DCTSTRIDE*1]; | |
1076 d4 = dataptr[DCTSTRIDE*2]; | |
1077 d6 = dataptr[DCTSTRIDE*3]; | |
1078 | |
1079 /* Even part: reverse the even part of the forward DCT. */ | |
1080 /* The rotator is sqrt(2)*c(-6). */ | |
1081 if (d6) { | |
2979 | 1082 if (d2) { |
1083 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ | |
1084 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); | |
1085 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); | |
1086 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); | |
2256 | 1087 |
2979 | 1088 tmp0 = (d0 + d4) << CONST_BITS; |
1089 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1090 |
2979 | 1091 tmp10 = tmp0 + tmp3; |
1092 tmp13 = tmp0 - tmp3; | |
1093 tmp11 = tmp1 + tmp2; | |
1094 tmp12 = tmp1 - tmp2; | |
1095 } else { | |
1096 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ | |
1097 tmp2 = MULTIPLY(-d6, FIX_1_306562965); | |
1098 tmp3 = MULTIPLY(d6, FIX_0_541196100); | |
2256 | 1099 |
2979 | 1100 tmp0 = (d0 + d4) << CONST_BITS; |
1101 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1102 |
2979 | 1103 tmp10 = tmp0 + tmp3; |
1104 tmp13 = tmp0 - tmp3; | |
1105 tmp11 = tmp1 + tmp2; | |
1106 tmp12 = tmp1 - tmp2; | |
1107 } | |
2262 | 1108 } else { |
2979 | 1109 if (d2) { |
1110 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ | |
1111 tmp2 = MULTIPLY(d2, FIX_0_541196100); | |
1112 tmp3 = MULTIPLY(d2, FIX_1_306562965); | |
2256 | 1113 |
2979 | 1114 tmp0 = (d0 + d4) << CONST_BITS; |
1115 tmp1 = (d0 - d4) << CONST_BITS; | |
2256 | 1116 |
2979 | 1117 tmp10 = tmp0 + tmp3; |
1118 tmp13 = tmp0 - tmp3; | |
1119 tmp11 = tmp1 + tmp2; | |
1120 tmp12 = tmp1 - tmp2; | |
1121 } else { | |
1122 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ | |
1123 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; | |
1124 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; | |
1125 } | |
2256 | 1126 } |
1127 | |
1128 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ | |
1129 | |
2262 | 1130 dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3); |
1131 dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3); | |
1132 dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3); | |
1133 dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3); | |
2967 | 1134 |
2979 | 1135 dataptr++; /* advance pointer to next column */ |
2256 | 1136 } |
1137 } | |
1138 | |
2257 | 1139 void j_rev_dct2(DCTBLOCK data){ |
1140 int d00, d01, d10, d11; | |
1141 | |
1142 data[0] += 4; | |
1143 d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE]; | |
1144 d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE]; | |
1145 d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE]; | |
1146 d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE]; | |
2967 | 1147 |
2257 | 1148 data[0+0*DCTSTRIDE]= (d00 + d10)>>3; |
1149 data[1+0*DCTSTRIDE]= (d01 + d11)>>3; | |
1150 data[0+1*DCTSTRIDE]= (d00 - d10)>>3; | |
1151 data[1+1*DCTSTRIDE]= (d01 - d11)>>3; | |
1152 } | |
2256 | 1153 |
2259 | 1154 void j_rev_dct1(DCTBLOCK data){ |
1155 data[0] = (data[0] + 4)>>3; | |
1156 } | |
1157 | |
/*
 * Drop the FIX and CONST_BITS macro names at the end of this file so they
 * cannot clash with other sources compiled into the same translation unit.
 */
#undef FIX
#undef CONST_BITS