Mercurial > libavcodec.hg
annotate jfdctfst.c @ 3198:6b9f0c4fbdbe libavcodec
First part of a series of speed-enhancing patches.
This one sets up a snow.h and makes snow use the dsputil function pointer
framework to access the three functions that will be implemented in asm
in the other parts of the patchset.
Patch by Robert Edele < yartrebo AH earthlink POIS net>
Original thread:
Subject: [Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations
Date: Sun, 05 Feb 2006 12:47:14 -0500
author | gpoirier |
---|---|
date | Thu, 16 Mar 2006 19:18:18 +0000 |
parents | bfabfdf9ce55 |
children | 9b98e18a1b1c |
rev | line source |
---|---|
0 | 1 /* |
2 * jfdctfst.c | |
3 * | |
4 * Copyright (C) 1994-1996, Thomas G. Lane. | |
5 * This file is part of the Independent JPEG Group's software. | |
6 * For conditions of distribution and use, see the accompanying README file. | |
7 * | |
8 * This file contains a fast, not so accurate integer implementation of the | |
9 * forward DCT (Discrete Cosine Transform). | |
10 * | |
11 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT | |
12 * on each column. Direct algorithms are also available, but they are | |
13 * much more complex and seem not to be any faster when reduced to code. | |
14 * | |
15 * This implementation is based on Arai, Agui, and Nakajima's algorithm for | |
16 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in | |
17 * Japanese, but the algorithm is described in the Pennebaker & Mitchell | |
18 * JPEG textbook (see REFERENCES section in file README). The following code | |
19 * is based directly on figure 4-8 in P&M. | |
20 * While an 8-point DCT cannot be done in less than 11 multiplies, it is | |
21 * possible to arrange the computation so that many of the multiplies are | |
22 * simple scalings of the final outputs. These multiplies can then be | |
23 * folded into the multiplications or divisions by the JPEG quantization | |
24 * table entries. The AA&N method leaves only 5 multiplies and 29 adds | |
25 * to be done in the DCT itself. | |
26 * The primary disadvantage of this method is that with fixed-point math, | |
27 * accuracy is lost due to imprecise representation of the scaled | |
28 * quantization values. The smaller the quantization table entry, the less | |
29 * precise the scaled value, so this implementation does worse with high- | |
30 * quality-setting files than with low-quality ones. | |
31 */ | |
32 | |
1106 | 33 /** |
34 * @file jfdctfst.c | |
35 * Independent JPEG Group's fast AAN dct. | |
36 */ | |
2967 | 37 |
0 | 38 #include <stdlib.h> |
39 #include <stdio.h> | |
40 #include "common.h" | |
41 #include "dsputil.h" | |
42 | |
43 #define DCTSIZE 8 | |
44 #define GLOBAL(x) x | |
45 #define RIGHT_SHIFT(x, n) ((x) >> (n)) | |
46 #define SHIFT_TEMPS | |
47 | |
48 /* | |
49 * This module is specialized to the case DCTSIZE = 8. | |
50 */ | |
51 | |
52 #if DCTSIZE != 8 | |
53 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ | |
54 #endif | |
55 | |
56 | |
57 /* Scaling decisions are generally the same as in the LL&M algorithm; | |
58 * see jfdctint.c for more details. However, we choose to descale | |
59 * (right shift) multiplication products as soon as they are formed, | |
60 * rather than carrying additional fractional bits into subsequent additions. | |
61 * This compromises accuracy slightly, but it lets us save a few shifts. | |
62 * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) | |
63 * everywhere except in the multiplications proper; this saves a good deal | |
64 * of work on 16-bit-int machines. | |
65 * | |
66 * Again to save a few shifts, the intermediate results between pass 1 and | |
67 * pass 2 are not upscaled, but are represented only to integral precision. | |
68 * | |
69 * A final compromise is to represent the multiplicative constants to only | |
70 * 8 fractional bits, rather than 13. This saves some shifting work on some | |
71 * machines, and may also reduce the cost of multiplication (since there | |
72 * are fewer one-bits in the constants). | |
73 */ | |
74 | |
75 #define CONST_BITS 8 | |
76 | |
77 | |
78 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus | |
79 * causing a lot of useless floating-point operations at run time. | |
80 * To get around this we use the following pre-calculated constants. | |
81 * If you change CONST_BITS you may want to add appropriate values. | |
82 * (With a reasonable C compiler, you can just rely on the FIX() macro...) | |
83 */ | |
84 | |
85 #if CONST_BITS == 8 | |
2979 | 86 #define FIX_0_382683433 ((int32_t) 98) /* FIX(0.382683433) */ |
87 #define FIX_0_541196100 ((int32_t) 139) /* FIX(0.541196100) */ | |
88 #define FIX_0_707106781 ((int32_t) 181) /* FIX(0.707106781) */ | |
89 #define FIX_1_306562965 ((int32_t) 334) /* FIX(1.306562965) */ | |
0 | 90 #else |
91 #define FIX_0_382683433 FIX(0.382683433) | |
92 #define FIX_0_541196100 FIX(0.541196100) | |
93 #define FIX_0_707106781 FIX(0.707106781) | |
94 #define FIX_1_306562965 FIX(1.306562965) | |
95 #endif | |
96 | |
97 | |
98 /* We can gain a little more speed, with a further compromise in accuracy, | |
99 * by omitting the addition in a descaling shift. This yields an incorrectly | |
100 * rounded result half the time... | |
101 */ | |
102 | |
103 #ifndef USE_ACCURATE_ROUNDING | |
104 #undef DESCALE | |
105 #define DESCALE(x,n) RIGHT_SHIFT(x, n) | |
106 #endif | |
107 | |
108 | |
1064 | 109 /* Multiply a DCTELEM variable by an int32_t constant, and immediately |
0 | 110 * descale to yield a DCTELEM result. |
111 */ | |
112 | |
113 #define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS)) | |
114 | |
1589 | 115 static always_inline void row_fdct(DCTELEM * data){ |
116 int_fast16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
117 int_fast16_t tmp10, tmp11, tmp12, tmp13; | |
118 int_fast16_t z1, z2, z3, z4, z5, z11, z13; | |
0 | 119 DCTELEM *dataptr; |
120 int ctr; | |
121 SHIFT_TEMPS | |
122 | |
123 /* Pass 1: process rows. */ | |
124 | |
125 dataptr = data; | |
126 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { | |
127 tmp0 = dataptr[0] + dataptr[7]; | |
128 tmp7 = dataptr[0] - dataptr[7]; | |
129 tmp1 = dataptr[1] + dataptr[6]; | |
130 tmp6 = dataptr[1] - dataptr[6]; | |
131 tmp2 = dataptr[2] + dataptr[5]; | |
132 tmp5 = dataptr[2] - dataptr[5]; | |
133 tmp3 = dataptr[3] + dataptr[4]; | |
134 tmp4 = dataptr[3] - dataptr[4]; | |
2967 | 135 |
0 | 136 /* Even part */ |
2967 | 137 |
2979 | 138 tmp10 = tmp0 + tmp3; /* phase 2 */ |
0 | 139 tmp13 = tmp0 - tmp3; |
140 tmp11 = tmp1 + tmp2; | |
141 tmp12 = tmp1 - tmp2; | |
2967 | 142 |
0 | 143 dataptr[0] = tmp10 + tmp11; /* phase 3 */ |
144 dataptr[4] = tmp10 - tmp11; | |
2967 | 145 |
0 | 146 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ |
2979 | 147 dataptr[2] = tmp13 + z1; /* phase 5 */ |
0 | 148 dataptr[6] = tmp13 - z1; |
2967 | 149 |
0 | 150 /* Odd part */ |
151 | |
2979 | 152 tmp10 = tmp4 + tmp5; /* phase 2 */ |
0 | 153 tmp11 = tmp5 + tmp6; |
154 tmp12 = tmp6 + tmp7; | |
155 | |
156 /* The rotator is modified from fig 4-8 to avoid extra negations. */ | |
157 z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
2979 | 158 z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ |
159 z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
160 z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
0 | 161 |
2979 | 162 z11 = tmp7 + z3; /* phase 5 */ |
0 | 163 z13 = tmp7 - z3; |
164 | |
2979 | 165 dataptr[5] = z13 + z2; /* phase 6 */ |
0 | 166 dataptr[3] = z13 - z2; |
167 dataptr[1] = z11 + z4; | |
168 dataptr[7] = z11 - z4; | |
169 | |
2979 | 170 dataptr += DCTSIZE; /* advance pointer to next row */ |
0 | 171 } |
1589 | 172 } |
0 | 173 |
1589 | 174 /* |
175 * Perform the forward DCT on one block of samples. | |
176 */ | |
177 | |
178 GLOBAL(void) | |
179 fdct_ifast (DCTELEM * data) | |
180 { | |
181 int_fast16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
182 int_fast16_t tmp10, tmp11, tmp12, tmp13; | |
183 int_fast16_t z1, z2, z3, z4, z5, z11, z13; | |
184 DCTELEM *dataptr; | |
185 int ctr; | |
186 SHIFT_TEMPS | |
187 | |
188 row_fdct(data); | |
2967 | 189 |
0 | 190 /* Pass 2: process columns. */ |
191 | |
192 dataptr = data; | |
193 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { | |
194 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; | |
195 tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; | |
196 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; | |
197 tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; | |
198 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; | |
199 tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; | |
200 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; | |
201 tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; | |
2967 | 202 |
0 | 203 /* Even part */ |
2967 | 204 |
2979 | 205 tmp10 = tmp0 + tmp3; /* phase 2 */ |
0 | 206 tmp13 = tmp0 - tmp3; |
207 tmp11 = tmp1 + tmp2; | |
208 tmp12 = tmp1 - tmp2; | |
2967 | 209 |
0 | 210 dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ |
211 dataptr[DCTSIZE*4] = tmp10 - tmp11; | |
2967 | 212 |
0 | 213 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ |
214 dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ | |
215 dataptr[DCTSIZE*6] = tmp13 - z1; | |
2967 | 216 |
0 | 217 /* Odd part */ |
218 | |
2979 | 219 tmp10 = tmp4 + tmp5; /* phase 2 */ |
0 | 220 tmp11 = tmp5 + tmp6; |
221 tmp12 = tmp6 + tmp7; | |
222 | |
223 /* The rotator is modified from fig 4-8 to avoid extra negations. */ | |
224 z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
225 z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ | |
226 z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
227 z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
228 | |
2979 | 229 z11 = tmp7 + z3; /* phase 5 */ |
0 | 230 z13 = tmp7 - z3; |
231 | |
232 dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ | |
233 dataptr[DCTSIZE*3] = z13 - z2; | |
234 dataptr[DCTSIZE*1] = z11 + z4; | |
235 dataptr[DCTSIZE*7] = z11 - z4; | |
236 | |
2979 | 237 dataptr++; /* advance pointer to next column */ |
0 | 238 } |
239 } | |
440
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
240 |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
241 /* |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
242 * Perform the forward 2-4-8 DCT on one block of samples. |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
243 */ |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
244 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
245 GLOBAL(void) |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
246 fdct_ifast248 (DCTELEM * data) |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
247 { |
1589 | 248 int_fast16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
249 int_fast16_t tmp10, tmp11, tmp12, tmp13; | |
250 int_fast16_t z1; | |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
251 DCTELEM *dataptr; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
252 int ctr; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
253 SHIFT_TEMPS |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
254 |
1589 | 255 row_fdct(data); |
2967 | 256 |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
257 /* Pass 2: process columns. */ |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
258 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
259 dataptr = data; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
260 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
261 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
262 tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
263 tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
264 tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
265 tmp4 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*1]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
266 tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
267 tmp6 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
268 tmp7 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
269 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
270 /* Even part */ |
2967 | 271 |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
272 tmp10 = tmp0 + tmp3; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
273 tmp11 = tmp1 + tmp2; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
274 tmp12 = tmp1 - tmp2; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
275 tmp13 = tmp0 - tmp3; |
2967 | 276 |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
277 dataptr[DCTSIZE*0] = tmp10 + tmp11; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
278 dataptr[DCTSIZE*4] = tmp10 - tmp11; |
2967 | 279 |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
280 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
281 dataptr[DCTSIZE*2] = tmp13 + z1; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
282 dataptr[DCTSIZE*6] = tmp13 - z1; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
283 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
284 tmp10 = tmp4 + tmp7; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
285 tmp11 = tmp5 + tmp6; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
286 tmp12 = tmp5 - tmp6; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
287 tmp13 = tmp4 - tmp7; |
2967 | 288 |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
289 dataptr[DCTSIZE*1] = tmp10 + tmp11; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
290 dataptr[DCTSIZE*5] = tmp10 - tmp11; |
2967 | 291 |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
292 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
293 dataptr[DCTSIZE*3] = tmp13 + z1; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
294 dataptr[DCTSIZE*7] = tmp13 - z1; |
2967 | 295 |
2979 | 296 dataptr++; /* advance pointer to next column */ |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
297 } |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
298 } |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1106
diff
changeset
|
299 |
440
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
300 |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
301 #undef GLOBAL |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
302 #undef CONST_BITS |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
303 #undef DESCALE |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
304 #undef FIX_0_541196100 |
000aeeac27a2
* started to cleanup name clashes for onetime compilation
kabi
parents:
0
diff
changeset
|
305 #undef FIX_1_306562965 |