comparison bfin/dsputil_bfin.c @ 4765:85298e8c55c4 libavcodec

bfin dsputils: basic pixel operations, SADs, diffs, motion compensation and standard IEEE 8x8 block transforms. Patch by Marc Hoffman, mmh pleasantst com
author diego
date Sun, 01 Apr 2007 22:28:45 +0000
parents c8c591fe26f8
children 65ee324848ac
comparison
equal deleted inserted replaced
4764:da0598df2e53 4765:85298e8c55c4
1 /* 1 /*
2 * Copyright (c) 2006 Michael Benjamin 2 * BlackFin DSPUTILS
3 *
4 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
3 * 6 *
4 * This file is part of FFmpeg. 7 * This file is part of FFmpeg.
5 * 8 *
6 * FFmpeg is free software; you can redistribute it and/or 9 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public 10 * modify it under the terms of the GNU Lesser General Public
16 * You should have received a copy of the GNU Lesser General Public 19 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software 20 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */ 22 */
20 23
24 #include <unistd.h>
25 #include <bits/bfin_sram.h>
21 #include "../avcodec.h" 26 #include "../avcodec.h"
22 #include "../dsputil.h" 27 #include "../dsputil.h"
23 28
24 static int sad8x8_bfin( void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h ) 29 #define USE_L1CODE
25 { 30
26 int sum; 31 #ifdef USE_L1CODE
27 __asm__ __volatile__ ( 32 #define L1CODE __attribute__ ((l1_text))
28 "P0 = %1;" // blk1 33 #else
29 "P1 = %2;" // blk2 34 #define L1CODE
30 "P2 = %3;\n" // h 35 #endif
31 "I0 = P0;" 36 int off;
32 "I1 = P1;\n" 37
33 "A0 = 0;" 38
34 "A1 = 0;\n" 39 extern void ff_bfin_idct (DCTELEM *block) L1CODE;
35 "M0 = P2;\n" 40 extern void ff_bfin_fdct (DCTELEM *block) L1CODE;
36 "P3 = 32;\n" 41 extern void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
37 "LSETUP (sad8x8LoopBegin, sad8x8LoopEnd) LC0=P3;\n" 42 extern void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
38 "sad8x8LoopBegin:\n" 43 extern void ff_bfin_diff_pixels (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride) L1CODE;
39 " DISALGNEXCPT || R0 = [I0] || R2 = [I1];\n" 44 extern void ff_bfin_get_pixels (DCTELEM *restrict block, const uint8_t *pixels, int line_size) L1CODE;
40 " DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];\n" 45 extern int ff_bfin_pix_norm1 (uint8_t * pix, int line_size) L1CODE;
41 "sad8x8LoopEnd:\n" 46 extern int ff_bfin_z_sad8x8 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
42 " SAA ( R1:0 , R3:2 );\n" 47 extern int ff_bfin_z_sad16x16 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
43 "R3 = A1.L + A1.H, R2 = A0.L + A0.H;\n" 48
44 "%0 = R2 + R3 (S);\n" 49 extern void ff_bfin_z_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
45 : "=&d" (sum) 50 extern void ff_bfin_z_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
46 : "m"(blk1), "m"(blk2), "m"(h) 51 extern void ff_bfin_put_pixels16_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
47 : "P0","P1","P2","I0","I1","A0","A1","R0","R1","R2","R3"); 52 extern void ff_bfin_put_pixels8_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
48 return sum; 53
49 } 54
55 extern int ff_bfin_pix_sum (uint8_t *p, int stride) L1CODE;
56
57 extern void ff_bfin_put_pixels8uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
58 extern void ff_bfin_put_pixels16uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
59 extern void ff_bfin_put_pixels8uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
60 extern void ff_bfin_put_pixels16uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
61
62 extern int ff_bfin_sse4 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
63 extern int ff_bfin_sse8 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
64 extern int ff_bfin_sse16 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
65
66
/*
 * Debug helper, compiled out by the surrounding #if 0.
 * Logs the address p, then h rows of w byte values via av_log, advancing p
 * by s bytes after each row.
 * NOTE(review): the first av_log passes the pointer p to a "%08x" conversion;
 * that only matches where pointers and unsigned int have the same size --
 * revisit before re-enabling this code.
 */
67 #if 0
68 void pblk (uint8_t *p, int w, int h, int s)
69 {
70 int i,j;
71 av_log (0,0,"0x%08x:\n", p);
72 for (i = 0;i<h;i++) {
73 for (j=0;j<w;j++)
74 av_log (0,0,"%3d ", p[j]);
/* step to the next row */
75 p+=s;
76 av_log (0,0,"\n");
77 }
78 av_log (0,0,"\n");
79 }
80 #endif
81
/**
 * Run the inverse DCT on a coefficient block and add the result to dest.
 *
 * Calls ff_bfin_idct on block (presumably transforming it in place) and then
 * hands block, dest and line_size to ff_bfin_add_pixels_clamped.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_bfin_add_pixels_clamped
 *                  (presumably the stride of dest in bytes)
 * @param block     coefficient block
 */
82 static void bfin_idct_add (uint8_t *dest, int line_size, DCTELEM *block)
83 {
84 ff_bfin_idct (block);
85 ff_bfin_add_pixels_clamped (block, dest, line_size);
86 }
87
/**
 * Run the inverse DCT on a coefficient block and store the result to dest.
 *
 * Calls ff_bfin_idct on block (presumably transforming it in place) and then
 * hands block, dest and line_size to ff_bfin_put_pixels_clamped.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_bfin_put_pixels_clamped
 *                  (presumably the stride of dest in bytes)
 * @param block     coefficient block
 */
88 static void bfin_idct_put (uint8_t *dest, int line_size, DCTELEM *block)
89 {
90 ff_bfin_idct (block);
91 ff_bfin_put_pixels_clamped (block, dest, line_size);
92 }
93
94
95 static void bfin_clear_blocks (DCTELEM *blocks)
96 {
97 // This is just a simple memset.
98 //
99 asm("P0=192; "
100 "I0=%0; "
101 "R0=0; "
102 "LSETUP(clear_blocks_blkfn_lab,clear_blocks_blkfn_lab)LC0=P0;"
103 "clear_blocks_blkfn_lab:"
104 "[I0++]=R0;"
105 ::"a" (blocks):"P0","I0","R0");
106 }
107
108
109
/**
 * 8-pixel-wide put without a halfpel offset.
 *
 * Forwards to ff_bfin_put_pixels8uc with pixels as both source arguments and
 * line_size as both the dest_size and line_size arguments.
 * NOTE(review): presumably the assembly routine combines its two sources, so
 * identical sources amount to a plain copy -- confirm in the asm.
 */
110 static void bfin_put_pixels8 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
111 {
112 ff_bfin_put_pixels8uc (block, pixels, pixels, line_size, line_size, h);
113 }
114
/**
 * 8-pixel-wide put for the horizontal halfpel position.
 *
 * Forwards to ff_bfin_put_pixels8uc with pixels and pixels+1 (one byte to the
 * right) as the two sources; line_size is used for both stride arguments.
 */
115 static void bfin_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
116 {
117 ff_bfin_put_pixels8uc (block, pixels, pixels+1, line_size, line_size, h);
118 }
119
/**
 * 8-pixel-wide put for the vertical halfpel position.
 *
 * Forwards to ff_bfin_put_pixels8uc with pixels and pixels+line_size (one
 * line further on) as the two sources; line_size is used for both stride
 * arguments.
 */
120 static void bfin_put_pixels8_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
121 {
122 ff_bfin_put_pixels8uc (block, pixels, pixels+line_size, line_size, line_size, h);
123 }
124
/**
 * 8-pixel-wide put for the diagonal (x and y) halfpel position.
 *
 * Forwards to ff_bfin_z_put_pixels8_xy2, passing line_size for both its
 * dest_size and line_size arguments.
 */
125 static void bfin_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
126 {
127 ff_bfin_z_put_pixels8_xy2 (block,s0,line_size, line_size, h);
128 }
129
/**
 * 16-pixel-wide put without a halfpel offset.
 *
 * Forwards to ff_bfin_put_pixels16uc with pixels as both source arguments and
 * line_size as both the dest_size and line_size arguments.
 * NOTE(review): presumably the assembly routine combines its two sources, so
 * identical sources amount to a plain copy -- confirm in the asm.
 */
130 static void bfin_put_pixels16 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
131 {
132 ff_bfin_put_pixels16uc (block, pixels, pixels, line_size, line_size, h);
133 }
134
/**
 * 16-pixel-wide put for the horizontal halfpel position.
 *
 * Forwards to ff_bfin_put_pixels16uc with pixels and pixels+1 (one byte to
 * the right) as the two sources; line_size is used for both stride arguments.
 */
135 static void bfin_put_pixels16_x2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
136 {
137 ff_bfin_put_pixels16uc (block, pixels, pixels+1, line_size, line_size, h);
138 }
139
/**
 * 16-pixel-wide put for the vertical halfpel position.
 *
 * Forwards to ff_bfin_put_pixels16uc with pixels and pixels+line_size (one
 * line further on) as the two sources; line_size is used for both stride
 * arguments.
 */
140 static void bfin_put_pixels16_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
141 {
142 ff_bfin_put_pixels16uc (block, pixels, pixels+line_size, line_size, line_size, h);
143 }
144
/**
 * 16-pixel-wide put for the diagonal (x and y) halfpel position.
 *
 * Forwards to ff_bfin_z_put_pixels16_xy2, passing line_size for both its
 * dest_size and line_size arguments.
 */
145 static void bfin_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
146 {
147 ff_bfin_z_put_pixels16_xy2 (block,s0,line_size, line_size, h);
148 }
149
/**
 * 8-pixel-wide no-rounding put without a halfpel offset.
 *
 * Forwards to ff_bfin_put_pixels8uc_nornd with pixels as both source
 * arguments; that routine takes a single line_size.
 * NOTE(review): unlike the neighbouring wrappers this function is not static,
 * so it has external linkage -- confirm whether anything outside this file
 * relies on that before changing it.
 */
150 void bfin_put_pixels8_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
151 {
152 ff_bfin_put_pixels8uc_nornd (block, pixels, pixels, line_size, h);
153 }
154
/**
 * 8-pixel-wide no-rounding put for the horizontal halfpel position.
 *
 * Forwards to ff_bfin_put_pixels8uc_nornd with pixels and pixels+1 (one byte
 * to the right) as the two sources.
 */
155 static void bfin_put_pixels8_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
156 {
157 ff_bfin_put_pixels8uc_nornd (block, pixels, pixels+1, line_size, h);
158 }
159
/**
 * 8-pixel-wide no-rounding put for the vertical halfpel position.
 *
 * Forwards to ff_bfin_put_pixels8uc_nornd with pixels and pixels+line_size
 * (one line further on) as the two sources.
 */
160 static void bfin_put_pixels8_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
161 {
162 ff_bfin_put_pixels8uc_nornd (block, pixels, pixels+line_size, line_size, h);
163 }
164
165
/**
 * 16-pixel-wide no-rounding put without a halfpel offset.
 *
 * Forwards to ff_bfin_put_pixels16uc_nornd with pixels as both source
 * arguments; that routine takes a single line_size.
 * NOTE(review): unlike the neighbouring wrappers this function is not static,
 * so it has external linkage -- confirm whether anything outside this file
 * relies on that before changing it.
 */
166 void bfin_put_pixels16_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
167 {
168 ff_bfin_put_pixels16uc_nornd (block, pixels, pixels, line_size, h);
169 }
170
/**
 * 16-pixel-wide no-rounding put for the horizontal halfpel position.
 *
 * Forwards to ff_bfin_put_pixels16uc_nornd with pixels and pixels+1 (one byte
 * to the right) as the two sources.
 */
171 static void bfin_put_pixels16_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
172 {
173 ff_bfin_put_pixels16uc_nornd (block, pixels, pixels+1, line_size, h);
174 }
175
/**
 * 16-pixel-wide no-rounding put for the vertical halfpel position.
 *
 * Forwards to ff_bfin_put_pixels16uc_nornd with pixels and pixels+line_size
 * (one line further on) as the two sources.
 */
176 static void bfin_put_pixels16_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
177 {
178 ff_bfin_put_pixels16uc_nornd (block, pixels, pixels+line_size, line_size, h);
179 }
180
/**
 * 16-wide block comparison of blk1 against blk2.
 *
 * Returns the value of ff_bfin_z_sad16x16, called with line_size for both of
 * its stride arguments (dsz and line_size).
 *
 * @param c unused
 */
181 static int bfin_pix_abs16 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
182 {
183 return ff_bfin_z_sad16x16 (blk1,blk2,line_size,line_size,h);
184 }
185
/*
 * 256-byte scratch block, placed with the l1_data_B attribute. The
 * bfin_pix_abs*_x2/_y2/_xy2 helpers write a temporary block into it and then
 * compare blk1 against it.
 * NOTE(review): this is one buffer shared by all of those helpers, so they
 * are not reentrant. 256 bytes corresponds to 16 rows at the stride of 16
 * that the 16-wide helpers pass -- presumably callers never use h > 16;
 * confirm.
 */
186 static uint8_t vtmp_blk[256] __attribute__((l1_data_B));
187
/**
 * 16-wide block comparison against the horizontal halfpel position of blk2.
 *
 * First has ff_bfin_put_pixels16uc build a block in vtmp_blk from blk2 and
 * blk2+1 (destination stride 16, source stride line_size), then returns
 * ff_bfin_z_sad16x16 of blk1 (stride line_size) against vtmp_blk (stride 16).
 *
 * @param c unused
 */
188 static int bfin_pix_abs16_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
189 {
190 ff_bfin_put_pixels16uc (vtmp_blk, blk2, blk2+1, 16, line_size, h);
191 return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
192 }
193
/**
 * 16-wide block comparison against the vertical halfpel position of blk2.
 *
 * First has ff_bfin_put_pixels16uc build a block in vtmp_blk from blk2 and
 * blk2+line_size (destination stride 16, source stride line_size), then
 * returns ff_bfin_z_sad16x16 of blk1 (stride line_size) against vtmp_blk
 * (stride 16).
 *
 * @param c unused
 */
194 static int bfin_pix_abs16_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
195 {
196 ff_bfin_put_pixels16uc (vtmp_blk, blk2, blk2+line_size, 16, line_size, h);
197 return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
198 }
199
/**
 * 16-wide block comparison against the diagonal halfpel position of blk2.
 *
 * First has ff_bfin_z_put_pixels16_xy2 build a block in vtmp_blk from blk2
 * (destination stride 16, source stride line_size), then returns
 * ff_bfin_z_sad16x16 of blk1 (stride line_size) against vtmp_blk (stride 16).
 *
 * @param c unused
 */
200 static int bfin_pix_abs16_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
201 {
202 ff_bfin_z_put_pixels16_xy2 (vtmp_blk, blk2, 16, line_size, h);
203 return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
204 }
205
/**
 * 8-wide block comparison of blk1 against blk2.
 *
 * Returns the value of ff_bfin_z_sad8x8, called with line_size for both of
 * its stride arguments (dsz and line_size).
 *
 * @param c unused
 */
206 static int bfin_pix_abs8 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
207 {
208 return ff_bfin_z_sad8x8 (blk1,blk2,line_size,line_size, h);
209 }
210
/**
 * 8-wide block comparison against the horizontal halfpel position of blk2.
 *
 * First has ff_bfin_put_pixels8uc build a block in vtmp_blk from blk2 and
 * blk2+1 (destination stride 8, source stride line_size), then returns
 * ff_bfin_z_sad8x8 of blk1 (stride line_size) against vtmp_blk (stride 8).
 *
 * @param c unused
 */
211 static int bfin_pix_abs8_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
212 {
213 ff_bfin_put_pixels8uc (vtmp_blk, blk2, blk2+1, 8, line_size, h);
214 return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
215 }
216
/**
 * 8-wide block comparison against the vertical halfpel position of blk2.
 *
 * First has ff_bfin_put_pixels8uc build a block in vtmp_blk from blk2 and
 * blk2+line_size (destination stride 8, source stride line_size), then
 * returns ff_bfin_z_sad8x8 of blk1 (stride line_size) against vtmp_blk
 * (stride 8).
 *
 * @param c unused
 */
217 static int bfin_pix_abs8_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
218 {
219 ff_bfin_put_pixels8uc (vtmp_blk, blk2, blk2+line_size, 8, line_size, h);
220 return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
221 }
222
/**
 * 8-wide block comparison against the diagonal halfpel position of blk2.
 *
 * First has ff_bfin_z_put_pixels8_xy2 build a block in vtmp_blk from blk2
 * (destination stride 8, source stride line_size), then returns
 * ff_bfin_z_sad8x8 of blk1 (stride line_size) against vtmp_blk (stride 8).
 *
 * @param c unused
 */
223 static int bfin_pix_abs8_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
224 {
225 ff_bfin_z_put_pixels8_xy2 (vtmp_blk, blk2, 8, line_size, h);
226 return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
227 }
228
229
230 /*
231 Decoder optimization log (100 frames of 352x240@25; "x off" = factor slower than real time):
232 2/11, starting point: compiled with -g for debugging and no optimization
233 9.824s ~ 2.44x off
234 6.360s ~ 1.58x off with -O2
235 5.740s ~ 1.43x off with idcts
236
237 2/20: 2.64s, same sman.mp4, decode only
238
239 */
50 240
/**
 * Install the Blackfin routines into a DSPContext.
 *
 * Each assignment stores a function pointer into c unconditionally; avctx is
 * not read.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context (unused here)
 */
51 void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx ) 241 void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
52 { 242 {
53 c->pix_abs[1][0] = sad8x8_bfin; 243 c->get_pixels = ff_bfin_get_pixels;
54 c->sad[1] = sad8x8_bfin; 244 c->diff_pixels = ff_bfin_diff_pixels;
55 } 245 c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
246 c->add_pixels_clamped = ff_bfin_add_pixels_clamped;
247
248 c->clear_blocks = bfin_clear_blocks;
249 c->pix_sum = ff_bfin_pix_sum;
250 c->pix_norm1 = ff_bfin_pix_norm1;
251
/* sad[] uses the same 16-wide / 8-wide routines as pix_abs[0][0] / pix_abs[1][0] below */
252 c->sad[0] = bfin_pix_abs16;
253 c->sad[1] = bfin_pix_abs8;
254
255 /* TODO [0] 16 [1] 8 */
256 c->pix_abs[0][0] = bfin_pix_abs16;
257 c->pix_abs[0][1] = bfin_pix_abs16_x2;
258 c->pix_abs[0][2] = bfin_pix_abs16_y2;
259 c->pix_abs[0][3] = bfin_pix_abs16_xy2;
260
261 c->pix_abs[1][0] = bfin_pix_abs8;
262 c->pix_abs[1][1] = bfin_pix_abs8_x2;
263 c->pix_abs[1][2] = bfin_pix_abs8_y2;
264 c->pix_abs[1][3] = bfin_pix_abs8_xy2;
265
266
/* sse[] entries in order: the 16-, 8- and 4-suffixed routines */
267 c->sse[0] = ff_bfin_sse16;
268 c->sse[1] = ff_bfin_sse8;
269 c->sse[2] = ff_bfin_sse4;
270
271
272 /**
273 * Halfpel motion compensation with rounding (a+b+1)>>1.
274 * The table is indexed by block width and by halfpel position;
275 * only the 16- and 8-pixel-wide rows ([0] and [1]) are filled in here:
276 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
277 * @param block destination where the result is stored
278 * @param pixels source
279 * @param line_size number of bytes in a horizontal line of block
280 * @param h height
281 */
282
283 c->put_pixels_tab[0][0] = bfin_put_pixels16;
284 c->put_pixels_tab[0][1] = bfin_put_pixels16_x2;
285 c->put_pixels_tab[0][2] = bfin_put_pixels16_y2;
286 c->put_pixels_tab[0][3] = bfin_put_pixels16_xy2;
287
288 c->put_pixels_tab[1][0] = bfin_put_pixels8;
289 c->put_pixels_tab[1][1] = bfin_put_pixels8_x2;
290 c->put_pixels_tab[1][2] = bfin_put_pixels8_y2;
291 c->put_pixels_tab[1][3] = bfin_put_pixels8_xy2;
292
/* Same [width][halfpel] layout for the _nornd routines; the xy2 entries point straight at the external routines, with no C wrapper */
293 c->put_no_rnd_pixels_tab[1][0] = bfin_put_pixels8_nornd;
294 c->put_no_rnd_pixels_tab[1][1] = bfin_put_pixels8_x2_nornd;
295 c->put_no_rnd_pixels_tab[1][2] = bfin_put_pixels8_y2_nornd;
296 c->put_no_rnd_pixels_tab[1][3] = ff_bfin_put_pixels8_xy2_nornd;
297
298 c->put_no_rnd_pixels_tab[0][0] = bfin_put_pixels16_nornd;
299 c->put_no_rnd_pixels_tab[0][1] = bfin_put_pixels16_x2_nornd;
300 c->put_no_rnd_pixels_tab[0][2] = bfin_put_pixels16_y2_nornd;
301 c->put_no_rnd_pixels_tab[0][3] = ff_bfin_put_pixels16_xy2_nornd;
302
/* Transforms: idct_add / idct_put are the local wrappers that run ff_bfin_idct and then the clamped add / put */
303 c->fdct = ff_bfin_fdct;
304 c->idct = ff_bfin_idct;
305 c->idct_add = bfin_idct_add;
306 c->idct_put = bfin_idct_put;
307 }
308
309
310