comparison bfin/fdct_bfin.S @ 4765:85298e8c55c4 libavcodec

bfin dsputils, basic pixel operations sads, diffs, motion compensation and standard IEEE 8x8 block transforms patch by Marc Hoffman, mmh pleasantst com
author diego
date Sun, 01 Apr 2007 22:28:45 +0000
parents
children 75bf61c6c385
comparison
equal deleted inserted replaced
4764:da0598df2e53 4765:85298e8c55c4
1 /*
2 * fdct BlackFin
3 *
4 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22 /*
23 void ff_bfin_fdct (DCTELEM *buf);
24
25 This implementation works only for 8x8 input. The range of input
26 must be -256 to 255 i.e. 8bit input represented in a 16bit data
27 word. The original data must be sign extended into the 16bit data
28 words.
29
30
31 Chen factorization of
32
33 7
34 X(m) = sum (x(n) * cos ((2n+1)*m*pi/16))
35 n=0
36
37 C4
38 0 --*-------------*0+7---*-----*0+3-------*-*-------------------> 0
39 \ / \ / X S4,S4
40 1 --*-\---------/-*1+6---*-\-/-*1+2-------*-*-------------------> 4
41 \ / \ -C4 C3
42 2 --*---\-----/---*2+5---*-/-\-*1-2---------------*-*-----------> 2
43 \ / / \ X S3,-S3
44 3 --*-----\-/-----*3+4---*-----*0-3---------------*-*-----------> 6
45 / C7 C3
46 4 --*-----/-\-----*3-4------------*-*4+5--*-----*---------------> 1
47 / \ -C4 X \ /S7 C3
48 5 --*---/-----\---*2-5---*-*------*=*4-5----\-/------*-*--------> 5
49 / \ X S4,S4 / X S3,-S3
50 6 --*-/---------\-*1-6---*-*------*=*7-6----/-\------*-*--------> 3
51 / \ C4 X / \-S7 C3
52 --*-------------*0-7------------*-*7+6--*-----*---------------> 7
53 C7
54
55 Notation
56 Cn = cos(n*pi/16) and Sn = sin(n*pi/16) are used throughout the code.
57
58
59 Registers used:
60 R0, R1, R2, R3, R4, R5, R6,R7, P0, P1, P2, P3, P4, P5, A0, A1.
61 Other registers used:
62 I0, I1, I2, I3, B0, B1, B2, B3, M0, M1, M2, L3 registers and LC0, LC1.
63
64 Input - r0 - pointer to start of DCTELEM *block
65
66 Output - The DCT output coefficients in the DCTELEM *block
67
68 Register constraint:
69 This code is called from jpeg_encode.
70 R6, R5, R4 if modified should be stored and restored.
71
72
73 Performance: (Timer version 0.6.33)
74 Code Size : 240 Bytes.
75 Memory Required :
76 Input Matrix : 8 * 8 * 2 Bytes.
77 Coefficients : 16 Bytes
78 Temporary matrix: 8 * 8 * 2 Bytes.
79 Cycle Count :26+{18+8*(14+2S)}*2 where S -> Stalls
80 (7.45 c/pel)
81 -----------------------------------------
82 | Size | Forward DCT | Inverse DCT |
83 -----------------------------------------
84 | 8x8 | 284 Cycles | 311 Cycles |
85 -----------------------------------------
86
87 Ck = int16(cos(k/16*pi)*32767+.5)/2 (except C4, which is listed at full scale, i.e. without the /2)
88 #define C4 23170
89 #define C3 13623
90 #define C6 6270
91 #define C7 3196
92
93 Sk = int16(sin(k/16*pi)*32767+.5)/2
94 #define S4 11585
95 #define S3 9102
96 #define S6 15137
97 #define S7 16069
98
99 the coefficients are ordered as follows:
100 short dct_coef[]
101 C4,S4,
102 C6,S6,
103 C7,S7,
104 S3,C3,
105
106 -----------------------------------------------------------
107 FFMPEG conformance testing results
108 -----------------------------------------------------------
109 dct-test: modified with the following
110 dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test);
111 produces the following output:
112
113 root:/u/ffmpeg/bhead/libavcodec> ./dct-test
114 ffmpeg DCT/IDCT test
115
116 2 -131 -6 -48 -36 33 -83 24
117 34 52 -24 -15 5 92 57 143
118 -67 -43 -1 74 -16 5 -71 32
119 -78 106 92 -34 -38 81 20 -18
120 7 -62 40 2 -15 90 -62 -83
121 -83 1 -104 -13 43 -19 7 11
122 -63 31 12 -29 83 72 21 10
123 -17 -63 -15 73 50 -91 159 -14
124 DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27
125 DCT BFINfdct: 92.1 kdct/s
126 root:/u/ffmpeg/bhead/libavcodec>
127
128 */
129
130 #include "config_bfin.h"
131
/*
 * Static data for fdct: the coefficient table and a scratch matrix, placed in
 * two different sections (.l1.data.B and .l1.data.A).
 * NOTE(review): the section names suggest separate L1 data banks -- confirm
 * against the linker script.
 */
132 .section .l1.data.B,"aw",@progbits
133 .align 4;
134 dct_coeff: // 8 x 16-bit = 16 bytes; fdct reads it through I3 with L3 = 16.
135 .short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537; // C4,S4, C6,S6, C7,S7, S3,C3
136
137 .section .l1.data.A,"aw",@progbits
138 .align 4
139 vtmp: .space 128 // 8 * 8 * 2 bytes: intermediate matrix between the two fdct passes.
140
/*
 * fdct: in-place forward 8x8 DCT of the DCTELEM block whose address arrives
 * in r0.
 *
 * Flow, as implemented below:
 *   1. Every 16-bit sample of the block is shifted left by 3, in place.
 *   2. Two passes of eight 1-D 8-point transforms are run (outer loop on LC0,
 *      inner loop on LC1). A pass reads rows from the buffer held in B0 and
 *      writes each row's eight results down one column of the buffer held in
 *      B2, i.e. transposed. B0 and B2 are exchanged after each pass, so
 *      pass 1 goes block -> vtmp and pass 2 goes vtmp -> block.
 *
 * R7:4 and P5:3 are saved on entry and restored on exit. L3 is set to 16 for
 * the coefficient reads through I3 and cleared again before returning.
 * NOTE(review): P3 is used as the base of the @GOT17M4 loads before it is
 * reassigned, so it presumably holds the GOT pointer on entry -- confirm
 * against the DEFUN macro / ABI in config_bfin.h.
 */
141 DEFUN(fdct,mL1,
142 (DCTELEM *block)):
143 [--SP] = (R7:4, P5:3); // Push the registers onto the stack.
144
145 b0 = r0; // B0 = caller's block (source of pass 1).
146 r0 = [P3+dct_coeff@GOT17M4];
147 b3 = r0; // B3 = address loaded for dct_coeff (base for I3).
148 r0 = [P3+vtmp@GOT17M4];
149 b2 = r0; // B2 = address loaded for vtmp (destination of pass 1).
150
151 L3 = 16; // L3 is set to 16 to make the coefficient
152 // array Circular.
153
154
155 //----------------------------------------------------------------------------
156
157 /*
158 * I0, I1, and I2 registers are used to read the input data. I3 register is used
159 * to read the coefficients. P0 and P1 registers are used for writing the output
160 * data.
161 */
162 M0 = 12 (X); // All these initializations are used in the
163 M1 = 16 (X); // modification of address offsets.
164
165 M2 = 128 (X); // NOTE(review): M2 is not referenced again in this routine.
166
167 P2 = 16; // One row in bytes (8 x 16-bit); LC1 is loaded with P2 >> 1 = 8.
168 P3 = 32 (X); // Two rows in bytes; also the prescale loop count.
169 P4 = -110 (X); // -7*16 + 2: from row 7 back to row 0 of the next column.
170 P5 = -62 (X); // -4*16 + 2: from row 5 back to row 1 of the next column.
171 P0 = 2(X); // Outer loop count (two passes); P0 is reused as a pointer later.
172
173
174 // Prescale the input to get the correct precision.
175 i0=b0;
176 i1=b0;
177
178 lsetup (.0, .1) LC0 = P3;
179 r0=[i0++]; // Preload the first pair of samples.
180 .0: r1=r0<<3 (v) || r0=[i0++] ; // Shift both 16-bit halves left by 3; fetch the next pair.
181 .1: [i1++]=r1; // Store the scaled pair back in place.
// NOTE(review): 1 preload + 32 in-loop loads = 33 word reads for 32 words of
// data, so the last prefetch reads one 32-bit word past the 64th sample; the
// value is overwritten before use.
182
183 /*
184 * B0 points to the "in" buffer.
185 * B2 points to "temp" buffer in the first iteration.
186 */
187
188 lsetup (.2, .3) LC0 = P0;
189 .2:
190 I0 = B0; // I0 points to Input Element (0, 0).
191 I1 = B0; // Element 1 and 0 is read in R0.
192 I1 += M0 || R0 = [I0++]; // I1 points to Input Element (0, 6).
193 I2 = I1; // Element 6 is read into R3.H.
194 I2 -= 4 || R3.H = W[I1++]; // I2 points to Input Element (0, 4).
195
196 I3 = B3; // I3 points to Coefficients.
197 P0 = B2; // P0 points to temporary array Element
198 // (0, 0).
199 P1 = B2; // P1 points to temporary array.
200 R7 = [P1++P2] || R2 = [I2++]; // P1 points to temporary array
201 // Element (1, 0).
202 // R7 is a dummy read. X4,X5
203 // are read into R2.
204 R3.L = W[I1--]; // X7 is read into R3.L.
205 R1.H = W[I0++]; // X2 is read into R1.H.
206
207
208 /*
209 * X0 = (X0 + X7) / 2.
210 * X1 = (X1 + X6) / 2.
211 * X6 = (X1 - X6) / 2.
212 * X7 = (X0 - X7) / 2.
213 * It reads the data 3 in R1.L.
 * NOTE(review): the add/sub below carries no scaling option, so the "/ 2"
 * in the four lines above looks stale -- confirm against the Blackfin
 * vector add/subtract definition.
214 */
215
216 R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP;
217
218 /*
219 * X2 = (X2 + X5) / 2.
220 * X3 = (X3 + X4) / 2.
221 * X4 = (X3 - X4) / 2.
222 * X5 = (X2 - X5) / 2.
223 * R7 = C4 = cos(4*pi/16)
 * NOTE(review): as above, no scaling option is present, so "/ 2" looks
 * stale. The load below fetches the first coefficient pair (C4,S4).
224 */
225
226 R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO) || NOP || R7 = [I3++];
227
228 /*
229 * At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
230 * R3 has (6,7).
231 * Where the notation (x, y) represents upper/lower half pairs.
232 */
233
234 /*
235 * X0 = X0 + X3.
236 * X1 = X1 + X2.
237 * X2 = X1 - X2.
238 * X3 = X0 - X3.
239 */
240 R0 = R0 +|+ R1, R1 = R0 -|- R1;
241
242 lsetup (.row0, .row1) LC1 = P2 >> 1; // 1d dct, loops 8x
243 .row0:
244
245 /*
246 * This is part 2 computation continued.....
247 * A1 = X6 * cos(pi/4)
248 * A0 = X6 * cos(pi/4)
249 * A1 = A1 - X5 * cos(pi/4)
250 * A0 = A0 + X5 * cos(pi/4).
251 * The instruction W[I0] = R3.L is used for packing it to R2.L.
 * I1 += M1 and I2 += M0 step the read pointers on to the next input row.
252 */
253
254 A1=R3.H*R7.l, A0=R3.H*R7.l || I1+=M1 || W[I0] = R3.L;
255 R4.H=(A1-=R2.L*R7.l), R4.L=(A0+=R2.L*R7.l) || I2+=M0 || NOP;
256
257 /* R0 = (X1,X0) R1 = (X2,X3) R4 = (X5, X6). */
258
259 /*
260 * A1 = X0 * cos(pi/4)
261 * A0 = X0 * cos(pi/4)
262 * A1 = A1 - X1 * cos(pi/4)
263 * A0 = A0 + X1 * cos(pi/4)
264 * R7 = (C2,C6)
265 */
266 A1=R0.L*R7.h, A0=R0.L*R7.h || NOP || R3.H=W[I1++];
267 R5.H=(A1-=R0.H*R7.h),R5.L=(A0+=R0.H*R7.h) || R7=[I3++] || NOP;
268
269 /*
270 * A1 = X2 * cos(3pi/8)
271 * A0 = X3 * cos(3pi/8)
272 * A1 = A1 + X3 * cos(pi/8)
273 * A0 = A0 - X2 * cos(pi/8)
274 * R3 = cos(pi/4)
 * NOTE(review): R3 is reloaded from the input here (R3.H above, R3.L
 * below), so the "R3 = cos(pi/4)" line looks stale -- confirm.
275 * R7 = (cos(7pi/16),cos(pi/16)), i.e. the C7,S7 coefficient pair.
276 * X4 = X4 + X5.
277 * X5 = X4 - X5.
278 * X6 = X7 - X6.
279 * X7 = X7 + X6.
280 */
281 A1=R1.H*R7.L, A0=R1.L*R7.L || W[P0++P3]=R5.L || R2.L=W[I0]; // R5.L -> destination row 0.
282 R2=R2+|+R4, R4=R2-|-R4 || I0+=4 || R3.L=W[I1--];
283 R6.H=(A1+=R1.L*R7.H),R6.L=(A0 -= R1.H * R7.H) || I0+=4 || R7=[I3++]; // The two I0+=4 move I0 to the start of the next input row.
284
285 /* R2 = (X4, X7) R4 = (X5,X6) R5 = (X1, X0) R6 = (X2,X3). */
286
287 /*
288 * A1 = X4 * cos(7pi/16)
289 * A0 = X7 * cos(7pi/16)
290 * A1 = A1 + X7 * cos(pi/16)
291 * A0 = A0 - X4 * cos(pi/16)
292 */
293
294 A1=R2.H*R7.L, A0=R2.L*R7.L || W[P0++P3]=R6.H || R0=[I0++]; // R6.H -> destination row 2.
295 R2.H=(A1+=R2.L*R7.H),R2.L=(A0-=R2.H*R7.H) || W[P0++P3]=R5.H || R7=[I3++]; // R5.H -> destination row 4.
296
297 /*
298 * A1 = X5 * cos(3pi/16)
299 * A0 = X6 * cos(3pi/16)
300 * A1 = A1 + X6 * cos(5pi/16)
301 * A0 = A0 - X5 * cos(5pi/16)
302 * The output values are written.
303 */
304
305 A1=R4.H*R7.H, A0=R4.L*R7.H || W[P0++P2]=R6.L || R1.H=W[I0++]; // R6.L -> destination row 6.
306 R4.H=(A1+=R4.L*R7.L),R4.L=(A0-=R4.H*R7.L) || W[P0++P4]=R2.L || R1.L=W[I0++]; // R2.L -> destination row 7; P0 then moves to row 0 of the next column.
307
308
309 /* Beginning of next stage, **pipelined** + drain and store the
310 rest of the column store. */
// The three stores below go to destination rows 1, 3 and 5; P5 then moves P1
// to row 1 of the next column. The R7 load is the fourth 4-byte coefficient
// read of the iteration, so with L3 = 16 I3 is back at the first pair.
// NOTE(review): the loads issued for the "next" row during the eighth
// iteration read up to one row (16 bytes) past the end of the source buffer;
// the values are never used.
311
312 R0=R0+|+R3,R3=R0-|-R3 || W[P1++P3]=R2.H || R2=[I2++];
313 R1=R1+|+R2,R2=R1-|-R2 (CO) || W[P1++P3]=R4.L || R7=[I3++];
314 .row1: R0=R0+|+R1,R1=R0-|-R1 || W[P1++P5]=R4.H || NOP;
315
316 // Exchange input with output.
317 B1 = B0;
318 B0 = B2;
319 .3: B2 = B1; // Last instruction of the two-pass outer loop.
320
321 L3=0; // Clear L3 again (it was set to 16 on entry).
322 (r7:4,p5:3) = [sp++]; // Restore the registers pushed on entry.
323 RTS;
324