Mercurial > libavcodec.hg
annotate jfdctfst.c @ 905:2b93dc762f9a libavcodec
fixing illegal 3. esc bug (the mpeg4 std only requires encoders to use unescaped symbols but not esc1 or esc2 if they are shorter than esc3, and just because it's logical to use the shortest possible vlc doesn't mean encoders do that)
author | michaelni |
---|---|
date | Wed, 04 Dec 2002 11:47:24 +0000 |
parents | 11dbd00682fc |
children | b32afefe7d33 |
rev | line source |
---|---|
0 | 1 /* |
2 * jfdctfst.c | |
3 * | |
4 * Copyright (C) 1994-1996, Thomas G. Lane. | |
5 * This file is part of the Independent JPEG Group's software. | |
6 * For conditions of distribution and use, see the accompanying README file. | |
7 * | |
8 * This file contains a fast, not so accurate integer implementation of the | |
9 * forward DCT (Discrete Cosine Transform). | |
10 * | |
11 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT | |
12 * on each column. Direct algorithms are also available, but they are | |
13 * much more complex and seem not to be any faster when reduced to code. | |
14 * | |
15 * This implementation is based on Arai, Agui, and Nakajima's algorithm for | |
16 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in | |
17 * Japanese, but the algorithm is described in the Pennebaker & Mitchell | |
18 * JPEG textbook (see REFERENCES section in file README). The following code | |
19 * is based directly on figure 4-8 in P&M. | |
20 * While an 8-point DCT cannot be done in less than 11 multiplies, it is | |
21 * possible to arrange the computation so that many of the multiplies are | |
22 * simple scalings of the final outputs. These multiplies can then be | |
23 * folded into the multiplications or divisions by the JPEG quantization | |
24 * table entries. The AA&N method leaves only 5 multiplies and 29 adds | |
25 * to be done in the DCT itself. | |
26 * The primary disadvantage of this method is that with fixed-point math, | |
27 * accuracy is lost due to imprecise representation of the scaled | |
28 * quantization values. The smaller the quantization table entry, the less | |
29 * precise the scaled value, so this implementation does worse with high- | |
30 * quality-setting files than with low-quality ones. | |
31 */ | |
32 | |
33 #include <stdlib.h> | |
34 #include <stdio.h> | |
35 #include "common.h" | |
36 #include "dsputil.h" | |
37 | |
38 #define DCTSIZE 8 | |
39 #define GLOBAL(x) x | |
40 #define RIGHT_SHIFT(x, n) ((x) >> (n)) | |
41 #define SHIFT_TEMPS | |
42 | |
43 /* | |
44 * This module is specialized to the case DCTSIZE = 8. | |
45 */ | |
46 | |
47 #if DCTSIZE != 8 | |
48 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ | |
49 #endif | |
50 | |
51 | |
52 /* Scaling decisions are generally the same as in the LL&M algorithm; | |
53 * see jfdctint.c for more details. However, we choose to descale | |
54 * (right shift) multiplication products as soon as they are formed, | |
55 * rather than carrying additional fractional bits into subsequent additions. | |
56 * This compromises accuracy slightly, but it lets us save a few shifts. | |
57 * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) | |
58 * everywhere except in the multiplications proper; this saves a good deal | |
59 * of work on 16-bit-int machines. | |
60 * | |
61 * Again to save a few shifts, the intermediate results between pass 1 and | |
62 * pass 2 are not upscaled, but are represented only to integral precision. | |
63 * | |
64 * A final compromise is to represent the multiplicative constants to only | |
65 * 8 fractional bits, rather than 13. This saves some shifting work on some | |
66 * machines, and may also reduce the cost of multiplication (since there | |
67 * are fewer one-bits in the constants). | |
68 */ | |
69 | |
70 #define CONST_BITS 8 | |
71 | |
72 | |
73 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus | |
74 * causing a lot of useless floating-point operations at run time. | |
75 * To get around this we use the following pre-calculated constants. | |
76 * If you change CONST_BITS you may want to add appropriate values. | |
77 * (With a reasonable C compiler, you can just rely on the FIX() macro...) | |
78 */ | |
79 | |
80 #if CONST_BITS == 8 | |
81 #define FIX_0_382683433 ((INT32) 98) /* FIX(0.382683433) */ | |
82 #define FIX_0_541196100 ((INT32) 139) /* FIX(0.541196100) */ | |
83 #define FIX_0_707106781 ((INT32) 181) /* FIX(0.707106781) */ | |
84 #define FIX_1_306562965 ((INT32) 334) /* FIX(1.306562965) */ | |
85 #else | |
86 #define FIX_0_382683433 FIX(0.382683433) | |
87 #define FIX_0_541196100 FIX(0.541196100) | |
88 #define FIX_0_707106781 FIX(0.707106781) | |
89 #define FIX_1_306562965 FIX(1.306562965) | |
90 #endif | |
91 | |
92 | |
93 /* We can gain a little more speed, with a further compromise in accuracy, | |
94 * by omitting the addition in a descaling shift. This yields an incorrectly | |
95 * rounded result half the time... | |
96 */ | |
97 | |
98 #ifndef USE_ACCURATE_ROUNDING | |
99 #undef DESCALE | |
100 #define DESCALE(x,n) RIGHT_SHIFT(x, n) | |
101 #endif | |
102 | |
103 | |
104 /* Multiply a DCTELEM variable by an INT32 constant, and immediately | |
105 * descale to yield a DCTELEM result. | |
106 */ | |
107 | |
108 #define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS)) | |
109 | |
110 | |
111 /* | |
112 * Perform the forward DCT on one block of samples. | |
113 */ | |
114 | |
115 GLOBAL(void) | |
474
11dbd00682fc
avoid name clash with libjpeg - added missing externs
bellard
parents:
440
diff
changeset
|
116 fdct_ifast (DCTELEM * data) |
0 | 117 { |
118 DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
119 DCTELEM tmp10, tmp11, tmp12, tmp13; | |
120 DCTELEM z1, z2, z3, z4, z5, z11, z13; | |
121 DCTELEM *dataptr; | |
122 int ctr; | |
123 SHIFT_TEMPS | |
124 | |
125 /* Pass 1: process rows. */ | |
126 | |
127 dataptr = data; | |
128 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { | |
129 tmp0 = dataptr[0] + dataptr[7]; | |
130 tmp7 = dataptr[0] - dataptr[7]; | |
131 tmp1 = dataptr[1] + dataptr[6]; | |
132 tmp6 = dataptr[1] - dataptr[6]; | |
133 tmp2 = dataptr[2] + dataptr[5]; | |
134 tmp5 = dataptr[2] - dataptr[5]; | |
135 tmp3 = dataptr[3] + dataptr[4]; | |
136 tmp4 = dataptr[3] - dataptr[4]; | |
137 | |
138 /* Even part */ | |
139 | |
140 tmp10 = tmp0 + tmp3; /* phase 2 */ | |
141 tmp13 = tmp0 - tmp3; | |
142 tmp11 = tmp1 + tmp2; | |
143 tmp12 = tmp1 - tmp2; | |
144 | |
145 dataptr[0] = tmp10 + tmp11; /* phase 3 */ | |
146 dataptr[4] = tmp10 - tmp11; | |
147 | |
148 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ | |
149 dataptr[2] = tmp13 + z1; /* phase 5 */ | |
150 dataptr[6] = tmp13 - z1; | |
151 | |
152 /* Odd part */ | |
153 | |
154 tmp10 = tmp4 + tmp5; /* phase 2 */ | |
155 tmp11 = tmp5 + tmp6; | |
156 tmp12 = tmp6 + tmp7; | |
157 | |
158 /* The rotator is modified from fig 4-8 to avoid extra negations. */ | |
159 z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
160 z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ | |
161 z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
162 z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
163 | |
164 z11 = tmp7 + z3; /* phase 5 */ | |
165 z13 = tmp7 - z3; | |
166 | |
167 dataptr[5] = z13 + z2; /* phase 6 */ | |
168 dataptr[3] = z13 - z2; | |
169 dataptr[1] = z11 + z4; | |
170 dataptr[7] = z11 - z4; | |
171 | |
172 dataptr += DCTSIZE; /* advance pointer to next row */ | |
173 } | |
174 | |
175 /* Pass 2: process columns. */ | |
176 | |
177 dataptr = data; | |
178 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { | |
179 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; | |
180 tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; | |
181 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; | |
182 tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; | |
183 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; | |
184 tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; | |
185 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; | |
186 tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; | |
187 | |
188 /* Even part */ | |
189 | |
190 tmp10 = tmp0 + tmp3; /* phase 2 */ | |
191 tmp13 = tmp0 - tmp3; | |
192 tmp11 = tmp1 + tmp2; | |
193 tmp12 = tmp1 - tmp2; | |
194 | |
195 dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ | |
196 dataptr[DCTSIZE*4] = tmp10 - tmp11; | |
197 | |
198 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ | |
199 dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ | |
200 dataptr[DCTSIZE*6] = tmp13 - z1; | |
201 | |
202 /* Odd part */ | |
203 | |
204 tmp10 = tmp4 + tmp5; /* phase 2 */ | |
205 tmp11 = tmp5 + tmp6; | |
206 tmp12 = tmp6 + tmp7; | |
207 | |
208 /* The rotator is modified from fig 4-8 to avoid extra negations. */ | |
209 z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
210 z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ | |
211 z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
212 z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
213 | |
214 z11 = tmp7 + z3; /* phase 5 */ | |
215 z13 = tmp7 - z3; | |
216 | |
217 dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ | |
218 dataptr[DCTSIZE*3] = z13 - z2; | |
219 dataptr[DCTSIZE*1] = z11 + z4; | |
220 dataptr[DCTSIZE*7] = z11 - z4; | |
221 | |
222 dataptr++; /* advance pointer to next column */ | |
223 } | |
224 } | |
440
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
225 |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
226 |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
227 #undef GLOBAL |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
228 #undef CONST_BITS |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
229 #undef DESCALE |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
230 #undef FIX_0_541196100 |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
231 #undef FIX_1_306562965 |