Mercurial > libavcodec.hg
annotate ppc/snow_altivec.c @ 5818:e0a872dd3ea1 libavcodec
Fix MJPEG decoder for AMV files.
Since decoding is done from the end and is aligned to 16,
the previous code worked correctly only when the picture height was divisible by 16;
otherwise it produced garbage in the top lines and truncated the bottom.
The new code adjusts the data[] pointers, taking the alignment issue into account.
author | voroshil |
---|---|
date | Sat, 13 Oct 2007 17:38:58 +0000 |
parents | 784dcbdc910f |
children | f7cbb7733146 |
rev | line source |
---|---|
3222 | 1 /* |
5749 | 2 * AltiVec-optimized snow DSP utils |
3222 | 3 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
3222 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
3222 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
3222 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3222 | 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 */ | |
21 | |
5010
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
3947
diff
changeset
|
22 #include "dsputil.h" |
3222 | 23 |
24 #include "gcc_fixes.h" | |
25 #include "dsputil_altivec.h" | |
5010
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
3947
diff
changeset
|
26 #include "snow.h" |
3222 | 27 |
28 #undef NDEBUG | |
29 #include <assert.h> | |
30 | |
31 | |
32 | |
33 //FIXME remove this replication | |
34 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num))) | |
35 | |
36 static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line) | |
37 { | |
38 int offset; | |
39 DWTELEM * buffer; | |
40 | |
41 // av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line); | |
42 | |
43 assert(buf->data_stack_top >= 0); | |
44 // assert(!buf->line[line]); | |
45 if (buf->line[line]) | |
46 return buf->line[line]; | |
47 | |
48 offset = buf->line_width * line; | |
49 buffer = buf->data_stack[buf->data_stack_top]; | |
50 buf->data_stack_top--; | |
51 buf->line[line] = buffer; | |
52 | |
53 // av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1); | |
54 | |
55 return buffer; | |
56 } | |
57 | |
58 | |
59 //altivec code | |
60 | |
/*
 * ff_snow_horizontal_compose97i_altivec(b, width)
 *
 * The whole body sits between "#if 0" and the matching "#endif", so as
 * written this function compiles to an empty body and leaves b untouched.
 *
 * The disabled code is organised as four lifting passes ("Lift 0" to
 * "Lift 3") followed by an interleave step that merges b with the temp[]
 * scratch array.
 *
 * NOTE(review): "Lift 0" uses IDWTELEM and vector signed short, but the
 * later passes still declare DWTELEM pointers and cast vbuf/vtmp to
 * vector signed int *, while vbuf, vtmp, t1, t2, x, y, tmp1 and tmp2 are
 * declared as vector signed short.  This mismatch is presumably why the
 * body is disabled - confirm before re-enabling it.
 */
5612 | 61 void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width) |
3222 | 62 { |
5612 | 63 #if 0 |
3222 | 64 const int w2= (width+1)>>1; |
5612 | 65 DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]); |
3222 | 66 const int w_l= (width>>1); |
67 const int w_r= w2 - 1; | |
68 int i; | |
5612 | 69 vector signed short t1, t2, x, y, tmp1, tmp2; |
70 vector signed short *vbuf, *vtmp; | |
3222 | 71 vector unsigned char align; |
72 | |
73 { // Lift 0 | |
5612 | 74 IDWTELEM * const ref = b + w2 - 1; |
75 IDWTELEM b_0 = b[0]; | |
76 vector signed short v7 = vec_splat_s16(7); | |
77 vbuf = (vector signed short *)b; | |
3222 | 78 |
79 tmp1 = vec_ld (0, ref); | |
80 align = vec_lvsl (0, ref); | |
81 tmp2 = vec_ld (15, ref); | |
5612 | 82 t1 = vec_perm(tmp1, tmp2, align); |
3222 | 83 |
84 for (i=0; i<w_l-15; i+=16) { | |
85 #if 0 | |
5612 | 86 /* b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3); |
3222 | 87 b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3); |
88 b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3); | |
5612 | 89 b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/ |
90 b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8); | |
3222 | 91 #else |
92 | |
5612 | 93 tmp1 = vec_ld (0, ref+8+i); |
94 tmp2 = vec_ld (15, ref+8+i); | |
3222 | 95 |
96 t2 = vec_perm(tmp1, tmp2, align); | |
97 | |
5612 | 98 y = vec_add(t1, vec_sld(t1,t2,2)); |
99 // y = vec_add(vec_add(y,y),y); | |
3222 | 100 |
5612 | 101 tmp1 = vec_ld (0, ref+12+i); |
3222 | 102 |
103 y = vec_add(y, vec_splat_s32(4)); | |
104 y = vec_sra(y, vec_splat_u32(3)); | |
105 | |
5612 | 106 tmp2 = vec_ld (15, ref+12+i); |
3222 | 107 |
108 *vbuf = vec_sub(*vbuf, y); | |
109 | |
5612 | 110 t1 = t2; |
3222 | 111 |
112 vbuf++; | |
113 | |
114 t2 = vec_perm(tmp1, tmp2, align); | |
115 | |
116 y = vec_add(t1,vec_sld(t1,t2,4)); | |
117 y = vec_add(vec_add(y,y),y); | |
118 | |
119 tmp1 = vec_ld (0, ref+12+i); | |
120 | |
121 y = vec_add(y, vec_splat_s32(4)); | |
122 y = vec_sra(y, vec_splat_u32(3)); | |
123 | |
124 tmp2 = vec_ld (15, ref+12+i); | |
125 | |
126 *vbuf = vec_sub(*vbuf, y); | |
127 | |
128 t1=t2; | |
129 | |
130 vbuf++; | |
131 | |
132 t2 = vec_perm(tmp1, tmp2, align); | |
133 | |
134 y = vec_add(t1,vec_sld(t1,t2,4)); | |
135 y = vec_add(vec_add(y,y),y); | |
136 | |
137 tmp1 = vec_ld (0, ref+16+i); | |
138 | |
139 y = vec_add(y, vec_splat_s32(4)); | |
140 y = vec_sra(y, vec_splat_u32(3)); | |
141 | |
142 tmp2 = vec_ld (15, ref+16+i); | |
143 | |
144 *vbuf = vec_sub(*vbuf, y); | |
145 | |
146 t1=t2; | |
147 | |
148 t2 = vec_perm(tmp1, tmp2, align); | |
149 | |
150 y = vec_add(t1,vec_sld(t1,t2,4)); | |
151 y = vec_add(vec_add(y,y),y); | |
152 | |
153 vbuf++; | |
154 | |
155 y = vec_add(y, vec_splat_s32(4)); | |
156 y = vec_sra(y, vec_splat_u32(3)); | |
157 *vbuf = vec_sub(*vbuf, y); | |
158 | |
159 t1=t2; | |
160 | |
161 vbuf++; | |
162 | |
163 #endif | |
5612 | 164 |
3222 | 165 } |
166 | |
167 snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); | |
168 b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); | |
169 } | |
170 | |
171 { // Lift 1 | |
172 DWTELEM * const dst = b+w2; | |
173 | |
174 i = 0; | |
175 for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){ | |
176 dst[i] = dst[i] - (b[i] + b[i + 1]); | |
177 } | |
178 | |
179 align = vec_lvsl(0, b+i); | |
180 tmp1 = vec_ld(0, b+i); | |
181 vbuf = (vector signed int*) (dst + i); | |
182 tmp2 = vec_ld(15, b+i); | |
183 | |
184 t1 = vec_perm(tmp1, tmp2, align); | |
185 | |
186 for (; i<w_r-3; i+=4) { | |
187 | |
188 #if 0 | |
189 dst[i] = dst[i] - (b[i] + b[i + 1]); | |
190 dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]); | |
191 dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]); | |
192 dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]); | |
193 #else | |
194 | |
195 tmp1 = vec_ld(0, b+4+i); | |
196 tmp2 = vec_ld(15, b+4+i); | |
197 | |
198 t2 = vec_perm(tmp1, tmp2, align); | |
199 | |
200 y = vec_add(t1, vec_sld(t1,t2,4)); | |
201 *vbuf = vec_sub (*vbuf, y); | |
202 | |
203 vbuf++; | |
204 | |
205 t1 = t2; | |
206 | |
207 #endif | |
208 | |
209 } | |
210 | |
211 snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); | |
212 } | |
213 | |
214 { // Lift 2 | |
215 DWTELEM * const ref = b+w2 - 1; | |
216 DWTELEM b_0 = b[0]; | |
217 vbuf= (vector signed int *) b; | |
218 | |
219 tmp1 = vec_ld (0, ref); | |
220 align = vec_lvsl (0, ref); | |
221 tmp2 = vec_ld (15, ref); | |
222 t1= vec_perm(tmp1, tmp2, align); | |
223 | |
224 i = 0; | |
225 for (; i<w_l-15; i+=16) { | |
226 #if 0 | |
227 b[i] = b[i] - (((8 -(ref[i] + ref[i+1])) - (b[i] <<2)) >> 4); | |
228 b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4); | |
229 b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4); | |
230 b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4); | |
231 #else | |
232 tmp1 = vec_ld (0, ref+4+i); | |
233 tmp2 = vec_ld (15, ref+4+i); | |
234 | |
235 t2 = vec_perm(tmp1, tmp2, align); | |
236 | |
237 y = vec_add(t1,vec_sld(t1,t2,4)); | |
238 y = vec_sub(vec_splat_s32(8),y); | |
239 | |
240 tmp1 = vec_ld (0, ref+8+i); | |
241 | |
242 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
243 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
244 | |
245 tmp2 = vec_ld (15, ref+8+i); | |
246 | |
247 *vbuf = vec_sub( *vbuf, y); | |
248 | |
249 t1 = t2; | |
250 | |
251 vbuf++; | |
252 | |
253 t2 = vec_perm(tmp1, tmp2, align); | |
254 | |
255 y = vec_add(t1,vec_sld(t1,t2,4)); | |
256 y = vec_sub(vec_splat_s32(8),y); | |
257 | |
258 tmp1 = vec_ld (0, ref+12+i); | |
259 | |
260 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
261 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
262 | |
263 tmp2 = vec_ld (15, ref+12+i); | |
264 | |
265 *vbuf = vec_sub( *vbuf, y); | |
266 | |
267 t1 = t2; | |
268 | |
269 vbuf++; | |
270 | |
271 t2 = vec_perm(tmp1, tmp2, align); | |
272 | |
273 y = vec_add(t1,vec_sld(t1,t2,4)); | |
274 y = vec_sub(vec_splat_s32(8),y); | |
275 | |
276 tmp1 = vec_ld (0, ref+16+i); | |
277 | |
278 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
279 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
280 | |
281 tmp2 = vec_ld (15, ref+16+i); | |
282 | |
283 *vbuf = vec_sub( *vbuf, y); | |
284 | |
285 t1 = t2; | |
286 | |
287 vbuf++; | |
288 | |
289 t2 = vec_perm(tmp1, tmp2, align); | |
290 | |
291 y = vec_add(t1,vec_sld(t1,t2,4)); | |
292 y = vec_sub(vec_splat_s32(8),y); | |
293 | |
294 t1 = t2; | |
295 | |
296 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
297 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
298 *vbuf = vec_sub( *vbuf, y); | |
299 | |
300 vbuf++; | |
301 | |
302 #endif | |
303 } | |
304 | |
305 snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); | |
306 b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS); | |
307 } | |
308 | |
309 { // Lift 3 | |
310 DWTELEM * const src = b+w2; | |
311 | |
312 vbuf = (vector signed int *)b; | |
313 vtmp = (vector signed int *)temp; | |
314 | |
315 i = 0; | |
316 align = vec_lvsl(0, src); | |
317 | |
318 for (; i<w_r-3; i+=4) { | |
319 #if 0 | |
320 temp[i] = src[i] - ((-3*(b[i] + b[i+1]))>>1); | |
321 temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1); | |
322 temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1); | |
323 temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1); | |
324 #else | |
325 tmp1 = vec_ld(0,src+i); | |
326 t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4)); | |
327 tmp2 = vec_ld(15,src+i); | |
328 t1 = vec_sub(vec_splat_s32(0),t1); //bad! | |
329 t1 = vec_add(t1,vec_add(t1,t1)); | |
330 t2 = vec_perm(tmp1 ,tmp2 ,align); | |
331 t1 = vec_sra(t1,vec_splat_u32(1)); | |
332 vbuf++; | |
333 *vtmp = vec_sub(t2,t1); | |
334 vtmp++; | |
335 | |
336 #endif | |
337 | |
338 } | |
339 | |
340 snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1); | |
341 } | |
342 | |
343 { | |
344 //Interleave | |
345 int a; | |
346 vector signed int *t = (vector signed int *)temp, | |
347 *v = (vector signed int *)b; | |
348 | |
349 snow_interleave_line_header(&i, width, b, temp); | |
350 | |
351 for (; (i & 0xE) != 0xE; i-=2){ | |
352 b[i+1] = temp[i>>1]; | |
353 b[i] = b[i>>1]; | |
354 } | |
355 for (i-=14; i>=0; i-=16){ | |
356 a=i/4; | |
357 | |
358 v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]); | |
359 v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]); | |
360 v[a+1]=vec_mergel(v[a>>1],t[a>>1]); | |
361 v[a]=vec_mergeh(v[a>>1],t[a>>1]); | |
362 | |
363 } | |
364 | |
365 } | |
5612 | 366 #endif |
3222 | 367 } |
368 | |
369 void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width) | |
370 { | |
371 int i, w4 = width/4; | |
372 vector signed int *v0, *v1,*v2,*v3,*v4,*v5; | |
373 vector signed int t1, t2; | |
374 | |
375 v0=(vector signed int *)b0; | |
376 v1=(vector signed int *)b1; | |
377 v2=(vector signed int *)b2; | |
378 v3=(vector signed int *)b3; | |
379 v4=(vector signed int *)b4; | |
380 v5=(vector signed int *)b5; | |
381 | |
382 for (i=0; i< w4;i++) | |
383 { | |
384 | |
385 #if 0 | |
386 b4[i] -= (3*(b3[i] + b5[i])+4)>>3; | |
387 b3[i] -= ((b2[i] + b4[i])); | |
388 b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4; | |
389 b1[i] += (3*(b0[i] + b2[i]))>>1; | |
390 #else | |
391 t1 = vec_add(v3[i], v5[i]); | |
392 t2 = vec_add(t1, vec_add(t1,t1)); | |
393 t1 = vec_add(t2, vec_splat_s32(4)); | |
394 v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3))); | |
395 | |
396 v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i])); | |
397 | |
398 t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i])); | |
399 t2 = vec_sl(v2[i], vec_splat_u32(2)); | |
400 v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4))); | |
401 t1 = vec_add(v0[i], v2[i]); | |
402 t2 = vec_add(t1, vec_add(t1,t1)); | |
403 v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1))); | |
404 | |
405 #endif | |
406 } | |
407 | |
408 for(i*=4; i < width; i++) | |
409 { | |
410 b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; | |
411 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; | |
412 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; | |
413 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; | |
414 } | |
415 } | |
416 | |
/*
 * Load 16 bytes from a possibly unaligned address into dst using the
 * vec_ld / vec_lvsl / vec_perm sequence.  Uses the tmp1, tmp2 and align
 * variables of the expansion site as scratch.
 */
#define SNOW_LOAD_UNALIGNED(dst, ptr) \
    tmp1  = vec_ld(0, (ptr));\
    align = vec_lvsl(0, (ptr));\
    tmp2  = vec_ld(15, (ptr));\
    dst   = vec_perm(tmp1,tmp2,align);

/* Fetch row y of the four prediction blocks into b3, b2, b1, b0 (in that order). */
#define LOAD_BLOCKS \
    SNOW_LOAD_UNALIGNED(b3, &block[3][y*src_stride])\
    SNOW_LOAD_UNALIGNED(b2, &block[2][y*src_stride])\
    SNOW_LOAD_UNALIGNED(b1, &block[1][y*src_stride])\
    SNOW_LOAD_UNALIGNED(b0, &block[0][y*src_stride])

/* Fetch the four OBMC weight rows obmc1..obmc4 into ob1..ob4. */
#define LOAD_OBMCS \
    SNOW_LOAD_UNALIGNED(ob1, obmc1)\
    SNOW_LOAD_UNALIGNED(ob2, obmc2)\
    SNOW_LOAD_UNALIGNED(ob3, obmc3)\
    SNOW_LOAD_UNALIGNED(ob4, obmc4)
466 | |
/* interleave logic
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 *
 * The weights (ob1..ob4) and the block pixels (b3..b0) are interleaved the
 * same way, so that vec_msum can multiply them bytewise and add each group
 * of four products into one 32-bit lane.
 */

/* Weighted sums for the first eight pixels of the row: results in v[0], v[1]. */
#define STEPS_0_1\
    h1  = (vector unsigned short) vec_mergeh(ob1, ob2);\
    h2  = (vector unsigned short) vec_mergeh(ob3, ob4);\
    l1  = (vector unsigned short) vec_mergeh(b3, b2);\
    l2  = (vector unsigned short) vec_mergeh(b1, b0);\
\
    ih  = (vector unsigned char) vec_mergeh(h1, h2);\
    il  = (vector unsigned char) vec_mergeh(l1, l2);\
    v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    ih1 = (vector unsigned char) vec_mergel(h1, h2);\
    il1 = (vector unsigned char) vec_mergel(l1, l2);\
    v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
496 | |
/*
 * Scalar write-back of the weighted sums held in vbuf[0..b_w-1].
 * With add set: add the slice-buffer line, round and shift by FRAC_BITS,
 * clamp to 0..255 and store the pixels into dst8.  Otherwise subtract the
 * sums from the slice-buffer line.
 */
#define FINAL_STEP_SCALAR\
    {\
        if(add){\
            for(x=0; x<b_w; x++){\
                vbuf[x] += dst[x + src_x];\
                vbuf[x]  = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
                if(vbuf[x]&(~255))\
                    vbuf[x] = ~(vbuf[x]>>31);\
                dst8[x + y*src_stride] = vbuf[x];\
            }\
        }else{\
            for(x=0; x<b_w; x++)\
                dst[x + src_x] -= vbuf[x];\
        }\
    }
3222 | 507 |
508 static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc, | |
509 const int obmc_stride, | |
510 uint8_t * * block, int b_w, | |
511 int b_h, int src_x, int src_y, | |
512 int src_stride, slice_buffer * sb, | |
513 int add, uint8_t * dst8) | |
514 { | |
515 int y, x; | |
516 DWTELEM * dst; | |
517 vector unsigned short h1, h2, l1, l2; | |
3272 | 518 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; |
3222 | 519 vector unsigned char b0,b1,b2,b3; |
3272 | 520 vector unsigned char ob1,ob2,ob3,ob4; |
521 | |
522 DECLARE_ALIGNED_16(int, vbuf[16]); | |
523 vector signed int *v = (vector signed int *)vbuf, *d; | |
3222 | 524 |
525 for(y=0; y<b_h; y++){ | |
5409 | 526 //FIXME ugly misuse of obmc_stride |
3222 | 527 |
528 uint8_t *obmc1= obmc + y*obmc_stride; | |
529 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
530 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
531 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
532 | |
533 dst = slice_buffer_get_line(sb, src_y + y); | |
534 d = (vector signed int *)(dst + src_x); | |
535 | |
3272 | 536 //FIXME i could avoid some loads! |
3222 | 537 |
3272 | 538 // load blocks |
539 LOAD_BLOCKS | |
3222 | 540 |
3272 | 541 // load obmcs |
542 LOAD_OBMCS | |
3271 | 543 |
3272 | 544 // steps 0 1 |
545 STEPS_0_1 | |
3222 | 546 |
3272 | 547 FINAL_STEP_SCALAR |
3222 | 548 |
549 } | |
550 | |
3272 | 551 } |
3222 | 552 |
/*
 * Weighted sums for pixels 8..15 of the row: same scheme as STEPS_0_1 but
 * starting from the low halves (vec_mergel) of the weight and block
 * vectors.  Results go to v[2] and v[3].
 */
#define STEPS_2_3\
    h1  = (vector unsigned short) vec_mergel(ob1, ob2);\
    h2  = (vector unsigned short) vec_mergel(ob3, ob4);\
    l1  = (vector unsigned short) vec_mergel(b3, b2);\
    l2  = (vector unsigned short) vec_mergel(b1, b0);\
\
    ih  = (vector unsigned char) vec_mergeh(h1,h2);\
    il  = (vector unsigned char) vec_mergeh(l1,l2);\
    v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    ih1 = (vector unsigned char) vec_mergel(h1,h2);\
    il1 = (vector unsigned char) vec_mergel(l1,l2);\
    v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
573 | |
3222 | 574 |
575 static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc, | |
576 const int obmc_stride, | |
577 uint8_t * * block, int b_w, | |
578 int b_h, int src_x, int src_y, | |
579 int src_stride, slice_buffer * sb, | |
580 int add, uint8_t * dst8) | |
581 { | |
582 int y, x; | |
583 DWTELEM * dst; | |
584 vector unsigned short h1, h2, l1, l2; | |
3272 | 585 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; |
3222 | 586 vector unsigned char b0,b1,b2,b3; |
3272 | 587 vector unsigned char ob1,ob2,ob3,ob4; |
588 DECLARE_ALIGNED_16(int, vbuf[b_w]); | |
589 vector signed int *v = (vector signed int *)vbuf, *d; | |
590 | |
591 for(y=0; y<b_h; y++){ | |
5409 | 592 //FIXME ugly misuse of obmc_stride |
3272 | 593 |
594 uint8_t *obmc1= obmc + y*obmc_stride; | |
595 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
596 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
597 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
598 | |
599 dst = slice_buffer_get_line(sb, src_y + y); | |
600 d = (vector signed int *)(dst + src_x); | |
601 | |
602 // load blocks | |
603 LOAD_BLOCKS | |
604 | |
605 // load obmcs | |
606 LOAD_OBMCS | |
607 | |
608 // steps 0 1 2 3 | |
609 STEPS_0_1 | |
610 | |
611 STEPS_2_3 | |
612 | |
613 FINAL_STEP_SCALAR | |
614 | |
615 } | |
616 } | |
617 | |
/*
 * Vector write-back of the weighted sums in v[0 .. b_w/4-1]; d must point
 * at the 16-byte aligned slice-buffer position dst + src_x.
 *
 * With add set, each lane follows the scalar reference
 *     t = (sum + line + 128) >> 8;
 *     if (t & ~255) t = ~(t >> 31);
 * and the pixels are then stored to dst8 through vbuf[].  Otherwise the
 * sums are subtracted from the slice-buffer line.
 *
 * The clamp selects the saturated value only for lanes that have bits set
 * outside 0..255, and the sign word is built with a cumulative 8+8+15 = 31
 * bit arithmetic shift (splat immediates are limited to 5 bits).
 *
 * NOTE(review): the rounding uses the literals 1<<7 and >>8 while
 * FINAL_STEP_SCALAR uses FRAC_BITS; they agree only if FRAC_BITS is 8 -
 * confirm against snow.h.
 */
#define FINAL_STEP_VEC \
\
    if(add)\
    {\
        for(x=0; x<b_w/4; x++)\
        {\
            v[x] = vec_add(v[x], d[x]);\
            v[x] = vec_sra(vec_add(v[x],\
                                   vec_sl( vec_splat_s32(1),\
                                           vec_splat_u32(7))),\
                           vec_splat_u32(8));\
\
            /* mask = ~255 in every lane */\
            mask = (vector bool int) vec_sl((vector signed int)\
                    vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
            /* keep only the bits of v[x] that lie outside 0..255 */\
            mask = (vector bool int) vec_and(v[x],mask);\
\
            /* all-ones in lanes that already fit into 8 bits */\
            mask = (vector bool int)\
                    vec_cmpeq((vector signed int)mask,\
                              (vector signed int)vec_splat_u32(0));\
\
            /* vs = ~(v[x] >> 31): 0 for negative lanes, -1 for positive */\
            vs = vec_sra(v[x],vec_splat_u32(8));\
            vs = vec_sra(vs,vec_splat_u32(8));\
            vs = vec_sra(vs,vec_splat_u32(15));\
\
            vs = vec_nor(vs,vs);\
\
            /* in-range lanes keep v[x], the others take the saturated vs */\
            v[x]= vec_sel(vs,v[x],mask);\
        }\
\
        for(x=0; x<b_w; x++)\
            dst8[x + y*src_stride] = vbuf[x];\
\
    }\
    else\
        for(x=0; x<b_w/4; x++)\
            d[x] = vec_sub(d[x], v[x]);
654 | |
655 static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc, | |
656 const int obmc_stride, | |
657 uint8_t * * block, int b_w, | |
658 int b_h, int src_x, int src_y, | |
659 int src_stride, slice_buffer * sb, | |
660 int add, uint8_t * dst8) | |
661 { | |
662 int y, x; | |
663 DWTELEM * dst; | |
664 vector bool int mask; | |
665 vector signed int vs; | |
666 vector unsigned short h1, h2, l1, l2; | |
667 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; | |
668 vector unsigned char b0,b1,b2,b3; | |
669 vector unsigned char ob1,ob2,ob3,ob4; | |
670 | |
671 DECLARE_ALIGNED_16(int, vbuf[16]); | |
672 vector signed int *v = (vector signed int *)vbuf, *d; | |
3222 | 673 |
674 for(y=0; y<b_h; y++){ | |
5409 | 675 //FIXME ugly misuse of obmc_stride |
3222 | 676 |
677 uint8_t *obmc1= obmc + y*obmc_stride; | |
678 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
679 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
680 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
681 | |
3272 | 682 dst = slice_buffer_get_line(sb, src_y + y); |
683 d = (vector signed int *)(dst + src_x); | |
684 | |
685 //FIXME i could avoid some loads! | |
686 | |
687 // load blocks | |
688 LOAD_BLOCKS | |
689 | |
690 // load obmcs | |
691 LOAD_OBMCS | |
692 | |
693 // steps 0 1 | |
694 STEPS_0_1 | |
695 | |
696 FINAL_STEP_VEC | |
697 | |
698 } | |
699 | |
700 } | |
3222 | 701 |
3272 | 702 static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc, |
703 const int obmc_stride, | |
704 uint8_t * * block, int b_w, | |
705 int b_h, int src_x, int src_y, | |
706 int src_stride, slice_buffer * sb, | |
707 int add, uint8_t * dst8) | |
708 { | |
709 int y, x; | |
710 DWTELEM * dst; | |
711 vector bool int mask; | |
712 vector signed int vs; | |
713 vector unsigned short h1, h2, l1, l2; | |
714 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; | |
715 vector unsigned char b0,b1,b2,b3; | |
716 vector unsigned char ob1,ob2,ob3,ob4; | |
717 DECLARE_ALIGNED_16(int, vbuf[b_w]); | |
718 vector signed int *v = (vector signed int *)vbuf, *d; | |
719 | |
720 for(y=0; y<b_h; y++){ | |
5409 | 721 //FIXME ugly misuse of obmc_stride |
3272 | 722 |
723 uint8_t *obmc1= obmc + y*obmc_stride; | |
724 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
725 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
726 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
3222 | 727 |
728 dst = slice_buffer_get_line(sb, src_y + y); | |
729 d = (vector signed int *)(dst + src_x); | |
730 | |
731 // load blocks | |
3272 | 732 LOAD_BLOCKS |
3222 | 733 |
3272 | 734 // load obmcs |
735 LOAD_OBMCS | |
3222 | 736 |
3272 | 737 // steps 0 1 2 3 |
738 STEPS_0_1 | |
3222 | 739 |
3272 | 740 STEPS_2_3 |
3222 | 741 |
3272 | 742 FINAL_STEP_VEC |
3222 | 743 |
3272 | 744 } |
3222 | 745 } |
746 | |
747 | |
748 void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, | |
749 uint8_t * * block, int b_w, int b_h, | |
750 int src_x, int src_y, int src_stride, | |
751 slice_buffer * sb, int add, | |
752 uint8_t * dst8) | |
753 { | |
3272 | 754 if (src_x&15) { |
755 if (b_w == 16) | |
756 inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, | |
757 b_w, b_h, src_x, src_y, | |
758 src_stride, sb, add, dst8); | |
759 else if (b_w == 8) | |
760 inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block, | |
761 b_w, b_h, src_x, src_y, | |
762 src_stride, sb, add, dst8); | |
763 else | |
764 ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, | |
765 src_y, src_stride, sb, add, dst8); | |
766 } else { | |
767 if (b_w == 16) | |
768 inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block, | |
769 b_w, b_h, src_x, src_y, | |
770 src_stride, sb, add, dst8); | |
771 else if (b_w == 8) | |
772 inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block, | |
773 b_w, b_h, src_x, src_y, | |
774 src_stride, sb, add, dst8); | |
775 else | |
776 ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, | |
777 src_y, src_stride, sb, add, dst8); | |
778 } | |
3222 | 779 } |
3547 | 780 |
781 | |
/*
 * Hook for installing the AltiVec snow routines into a DSPContext.
 *
 * The three function-pointer assignments are wrapped in "#if 0", so as
 * written this function does nothing with c or avctx.
 */
782 void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) | |
783 { | |
5612 | 784 #if 0 |
3547 | 785 c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; |
786 c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; | |
787 c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; | |
5612 | 788 #endif |
3547 | 789 } |