annotate ppc/snow_altivec.c @ 9003:b595a8a59967 libavcodec
Change the type of pblocks from pointers to a short array into
pointers to arrays of 64 DCTELEM, similarly to the other block fields.
This also gets rid of some casts and fixes a warning.
author   | iive
date     | Sun, 22 Feb 2009 09:02:06 +0000
parents  | a8a79f5385f6
children |
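For reference, a minimal sketch of the kind of declaration change the commit message describes (the DCTELEM typedef, the MAX_REF_FRAMES name and its value are illustrative assumptions, not copied from snow.h):

    /* illustrative only: DCTELEM was a plain short in the dsputil.h of this
       era, and MAX_REF_FRAMES is assumed here purely for the example */
    typedef short DCTELEM;
    #define MAX_REF_FRAMES 8

    /* before: pointers into flat short arrays; use sites needed casts to
       treat them as whole blocks */
    DCTELEM *pblocks_old[MAX_REF_FRAMES];

    /* after: pointers to arrays of 64 DCTELEM, like the other block fields;
       (*pblocks_new[i])[j] is coefficient j of block i, with no cast */
    DCTELEM (*pblocks_new[MAX_REF_FRAMES])[64];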
rev | line source
3222 | 1 /* |
5749 | 2 * AltiVec-optimized snow DSP utils |
3222 | 3 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
4 * | |
3947 | 5 * This file is part of FFmpeg. |
6 * |
7 * FFmpeg is free software; you can redistribute it and/or |
3222 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947 | 10 * version 2.1 of the License, or (at your option) any later version. |
3222 | 11 * |
3947 | 12 * FFmpeg is distributed in the hope that it will be useful, |
3222 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947 | 18 * License along with FFmpeg; if not, write to the Free Software |
3222 | 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 */ | |
21 | |
6763 | 22 #include "libavcodec/dsputil.h" |
23 #include "libavcodec/snow.h" | |
3222 | 24 |
25 #include "gcc_fixes.h" | |
26 #include "dsputil_altivec.h" | |
27 | |
28 #undef NDEBUG | |
29 #include <assert.h> | |
30 | |
31 | |
32 | |
33 //FIXME remove this replication | |
34 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num))) | |
35 | |
36 static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line) | |
37 { | |
38 int offset; | |
39 DWTELEM * buffer; | |
40 | |
41 // av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line); | |
42 | |
43 assert(buf->data_stack_top >= 0); | |
44 // assert(!buf->line[line]); | |
45 if (buf->line[line]) | |
46 return buf->line[line]; | |
47 | |
48 offset = buf->line_width * line; | |
49 buffer = buf->data_stack[buf->data_stack_top]; | |
50 buf->data_stack_top--; | |
51 buf->line[line] = buffer; | |
52 | |
53 // av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1); | |
54 | |
55 return buffer; | |
56 } | |
57 | |
58 | |
59 //altivec code | |
60 | |
5612 | 61 void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width) |
3222 | 62 { |
5612 | 63 #if 0 |
3222 | 64 const int w2= (width+1)>>1; |
5612 | 65 DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]); |
3222 | 66 const int w_l= (width>>1); |
67 const int w_r= w2 - 1; | |
68 int i; | |
5612 | 69 vector signed short t1, t2, x, y, tmp1, tmp2; |
70 vector signed short *vbuf, *vtmp; | |
3222 | 71 vector unsigned char align; |
72 | |
73 { // Lift 0 | |
5612 | 74 IDWTELEM * const ref = b + w2 - 1; |
75 IDWTELEM b_0 = b[0]; | |
76 vector signed short v7 = vec_splat_s16(7); | |
77 vbuf = (vector signed short *)b; | |
3222 | 78 |
79 tmp1 = vec_ld (0, ref); | |
80 align = vec_lvsl (0, ref); | |
81 tmp2 = vec_ld (15, ref); | |
5612 | 82 t1 = vec_perm(tmp1, tmp2, align); |
3222 | 83 |
84 for (i=0; i<w_l-15; i+=16) { | |
85 #if 0 | |
5612 | 86 /* b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3); |
3222 | 87 b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3); |
88 b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3); | |
5612 | 89 b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/ |
90 b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8); | |
3222 | 91 #else |
92 | |
5612 | 93 tmp1 = vec_ld (0, ref+8+i); |
94 tmp2 = vec_ld (15, ref+8+i); | |
3222 | 95 |
96 t2 = vec_perm(tmp1, tmp2, align); | |
97 | |
5612 | 98 y = vec_add(t1, vec_sld(t1,t2,2)); |
99 // y = vec_add(vec_add(y,y),y); | |
3222 | 100 |
5612 | 101 tmp1 = vec_ld (0, ref+12+i); |
3222 | 102 |
103 y = vec_add(y, vec_splat_s32(4)); | |
104 y = vec_sra(y, vec_splat_u32(3)); | |
105 | |
5612 | 106 tmp2 = vec_ld (15, ref+12+i); |
3222 | 107 |
108 *vbuf = vec_sub(*vbuf, y); | |
109 | |
5612 | 110 t1 = t2; |
3222 | 111 |
112 vbuf++; | |
113 | |
114 t2 = vec_perm(tmp1, tmp2, align); | |
115 | |
116 y = vec_add(t1,vec_sld(t1,t2,4)); | |
117 y = vec_add(vec_add(y,y),y); | |
118 | |
119 tmp1 = vec_ld (0, ref+12+i); | |
120 | |
121 y = vec_add(y, vec_splat_s32(4)); | |
122 y = vec_sra(y, vec_splat_u32(3)); | |
123 | |
124 tmp2 = vec_ld (15, ref+12+i); | |
125 | |
126 *vbuf = vec_sub(*vbuf, y); | |
127 | |
128 t1=t2; | |
129 | |
130 vbuf++; | |
131 | |
132 t2 = vec_perm(tmp1, tmp2, align); | |
133 | |
134 y = vec_add(t1,vec_sld(t1,t2,4)); | |
135 y = vec_add(vec_add(y,y),y); | |
136 | |
137 tmp1 = vec_ld (0, ref+16+i); | |
138 | |
139 y = vec_add(y, vec_splat_s32(4)); | |
140 y = vec_sra(y, vec_splat_u32(3)); | |
141 | |
142 tmp2 = vec_ld (15, ref+16+i); | |
143 | |
144 *vbuf = vec_sub(*vbuf, y); | |
145 | |
146 t1=t2; | |
147 | |
148 t2 = vec_perm(tmp1, tmp2, align); | |
149 | |
150 y = vec_add(t1,vec_sld(t1,t2,4)); | |
151 y = vec_add(vec_add(y,y),y); | |
152 | |
153 vbuf++; | |
154 | |
155 y = vec_add(y, vec_splat_s32(4)); | |
156 y = vec_sra(y, vec_splat_u32(3)); | |
157 *vbuf = vec_sub(*vbuf, y); | |
158 | |
159 t1=t2; | |
160 | |
161 vbuf++; | |
162 | |
163 #endif | |
5612 | 164 |
3222 | 165 } |
166 | |
167 snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); | |
168 b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); | |
169 } | |
170 | |
171 { // Lift 1 | |
172 DWTELEM * const dst = b+w2; | |
173 | |
174 i = 0; | |
175 for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){ | |
176 dst[i] = dst[i] - (b[i] + b[i + 1]); | |
177 } | |
178 | |
179 align = vec_lvsl(0, b+i); | |
180 tmp1 = vec_ld(0, b+i); | |
181 vbuf = (vector signed int*) (dst + i); | |
182 tmp2 = vec_ld(15, b+i); | |
183 | |
184 t1 = vec_perm(tmp1, tmp2, align); | |
185 | |
186 for (; i<w_r-3; i+=4) { | |
187 | |
188 #if 0 | |
189 dst[i] = dst[i] - (b[i] + b[i + 1]); | |
190 dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]); | |
191 dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]); | |
192 dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]); | |
193 #else | |
194 | |
195 tmp1 = vec_ld(0, b+4+i); | |
196 tmp2 = vec_ld(15, b+4+i); | |
197 | |
198 t2 = vec_perm(tmp1, tmp2, align); | |
199 | |
200 y = vec_add(t1, vec_sld(t1,t2,4)); | |
201 *vbuf = vec_sub (*vbuf, y); | |
202 | |
203 vbuf++; | |
204 | |
205 t1 = t2; | |
206 | |
207 #endif | |
208 | |
209 } | |
210 | |
211 snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); | |
212 } | |
213 | |
214 { // Lift 2 | |
215 DWTELEM * const ref = b+w2 - 1; | |
216 DWTELEM b_0 = b[0]; | |
217 vbuf= (vector signed int *) b; | |
218 | |
219 tmp1 = vec_ld (0, ref); | |
220 align = vec_lvsl (0, ref); | |
221 tmp2 = vec_ld (15, ref); | |
222 t1= vec_perm(tmp1, tmp2, align); | |
223 | |
224 i = 0; | |
225 for (; i<w_l-15; i+=16) { | |
226 #if 0 | |
227 b[i] = b[i] - (((8 -(ref[i] + ref[i+1])) - (b[i] <<2)) >> 4); | |
228 b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4); | |
229 b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4); | |
230 b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4); | |
231 #else | |
232 tmp1 = vec_ld (0, ref+4+i); | |
233 tmp2 = vec_ld (15, ref+4+i); | |
234 | |
235 t2 = vec_perm(tmp1, tmp2, align); | |
236 | |
237 y = vec_add(t1,vec_sld(t1,t2,4)); | |
238 y = vec_sub(vec_splat_s32(8),y); | |
239 | |
240 tmp1 = vec_ld (0, ref+8+i); | |
241 | |
242 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
243 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
244 | |
245 tmp2 = vec_ld (15, ref+8+i); | |
246 | |
247 *vbuf = vec_sub( *vbuf, y); | |
248 | |
249 t1 = t2; | |
250 | |
251 vbuf++; | |
252 | |
253 t2 = vec_perm(tmp1, tmp2, align); | |
254 | |
255 y = vec_add(t1,vec_sld(t1,t2,4)); | |
256 y = vec_sub(vec_splat_s32(8),y); | |
257 | |
258 tmp1 = vec_ld (0, ref+12+i); | |
259 | |
260 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
261 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
262 | |
263 tmp2 = vec_ld (15, ref+12+i); | |
264 | |
265 *vbuf = vec_sub( *vbuf, y); | |
266 | |
267 t1 = t2; | |
268 | |
269 vbuf++; | |
270 | |
271 t2 = vec_perm(tmp1, tmp2, align); | |
272 | |
273 y = vec_add(t1,vec_sld(t1,t2,4)); | |
274 y = vec_sub(vec_splat_s32(8),y); | |
275 | |
276 tmp1 = vec_ld (0, ref+16+i); | |
277 | |
278 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
279 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
280 | |
281 tmp2 = vec_ld (15, ref+16+i); | |
282 | |
283 *vbuf = vec_sub( *vbuf, y); | |
284 | |
285 t1 = t2; | |
286 | |
287 vbuf++; | |
288 | |
289 t2 = vec_perm(tmp1, tmp2, align); | |
290 | |
291 y = vec_add(t1,vec_sld(t1,t2,4)); | |
292 y = vec_sub(vec_splat_s32(8),y); | |
293 | |
294 t1 = t2; | |
295 | |
296 x = vec_sl(*vbuf,vec_splat_u32(2)); | |
297 y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); | |
298 *vbuf = vec_sub( *vbuf, y); | |
299 | |
300 vbuf++; | |
301 | |
302 #endif | |
303 } | |
304 | |
305 snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); | |
306 b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS); | |
307 } | |
308 | |
309 { // Lift 3 | |
310 DWTELEM * const src = b+w2; | |
311 | |
312 vbuf = (vector signed int *)b; | |
313 vtmp = (vector signed int *)temp; | |
314 | |
315 i = 0; | |
316 align = vec_lvsl(0, src); | |
317 | |
318 for (; i<w_r-3; i+=4) { | |
319 #if 0 | |
320 temp[i] = src[i] - ((-3*(b[i] + b[i+1]))>>1); | |
321 temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1); | |
322 temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1); | |
323 temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1); | |
324 #else | |
325 tmp1 = vec_ld(0,src+i); | |
326 t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4)); | |
327 tmp2 = vec_ld(15,src+i); | |
328 t1 = vec_sub(vec_splat_s32(0),t1); //bad! | |
329 t1 = vec_add(t1,vec_add(t1,t1)); | |
330 t2 = vec_perm(tmp1 ,tmp2 ,align); | |
331 t1 = vec_sra(t1,vec_splat_u32(1)); | |
332 vbuf++; | |
333 *vtmp = vec_sub(t2,t1); | |
334 vtmp++; | |
335 | |
336 #endif | |
337 | |
338 } | |
339 | |
340 snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1); | |
341 } | |
342 | |
343 { | |
344 //Interleave | |
345 int a; | |
346 vector signed int *t = (vector signed int *)temp, | |
347 *v = (vector signed int *)b; | |
348 | |
349 snow_interleave_line_header(&i, width, b, temp); | |
350 | |
351 for (; (i & 0xE) != 0xE; i-=2){ | |
352 b[i+1] = temp[i>>1]; | |
353 b[i] = b[i>>1]; | |
354 } | |
355 for (i-=14; i>=0; i-=16){ | |
356 a=i/4; | |
357 | |
358 v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]); | |
359 v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]); | |
360 v[a+1]=vec_mergel(v[a>>1],t[a>>1]); | |
361 v[a]=vec_mergeh(v[a>>1],t[a>>1]); | |
362 | |
363 } | |
364 | |
365 } | |
5612 | 366 #endif |
3222 | 367 } |
368 | |
369 void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width) | |
370 { | |
371 int i, w4 = width/4; | |
372 vector signed int *v0, *v1,*v2,*v3,*v4,*v5; | |
373 vector signed int t1, t2; | |
374 | |
375 v0=(vector signed int *)b0; | |
376 v1=(vector signed int *)b1; | |
377 v2=(vector signed int *)b2; | |
378 v3=(vector signed int *)b3; | |
379 v4=(vector signed int *)b4; | |
380 v5=(vector signed int *)b5; | |
381 | |
7333 | 382 for (i=0; i< w4;i++) {
3222 | 383 |
384 #if 0 | |
385 b4[i] -= (3*(b3[i] + b5[i])+4)>>3; | |
386 b3[i] -= ((b2[i] + b4[i])); | |
387 b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4; | |
388 b1[i] += (3*(b0[i] + b2[i]))>>1; | |
389 #else | |
390 t1 = vec_add(v3[i], v5[i]); | |
391 t2 = vec_add(t1, vec_add(t1,t1)); | |
392 t1 = vec_add(t2, vec_splat_s32(4)); | |
393 v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3))); | |
394 | |
395 v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i])); | |
396 | |
397 t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i])); | |
398 t2 = vec_sl(v2[i], vec_splat_u32(2)); | |
399 v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4))); | |
400 t1 = vec_add(v0[i], v2[i]); | |
401 t2 = vec_add(t1, vec_add(t1,t1)); | |
402 v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1))); | |
403 | |
404 #endif | |
405 } | |
406 | |
407 for(i*=4; i < width; i++) | |
408 { | |
409 b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; | |
410 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; | |
411 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; | |
412 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; | |
413 } | |
414 } | |
415 | |
3272 | 416 #define LOAD_BLOCKS \ |
417 tmp1 = vec_ld(0, &block[3][y*src_stride]);\ | |
418 align = vec_lvsl(0, &block[3][y*src_stride]);\ | |
419 tmp2 = vec_ld(15, &block[3][y*src_stride]);\ | |
420 \ | |
421 b3 = vec_perm(tmp1,tmp2,align);\ | |
422 \ | |
423 tmp1 = vec_ld(0, &block[2][y*src_stride]);\ | |
424 align = vec_lvsl(0, &block[2][y*src_stride]);\ | |
425 tmp2 = vec_ld(15, &block[2][y*src_stride]);\ | |
426 \ | |
427 b2 = vec_perm(tmp1,tmp2,align);\ | |
428 \ | |
429 tmp1 = vec_ld(0, &block[1][y*src_stride]);\ | |
430 align = vec_lvsl(0, &block[1][y*src_stride]);\ | |
431 tmp2 = vec_ld(15, &block[1][y*src_stride]);\ | |
432 \ | |
433 b1 = vec_perm(tmp1,tmp2,align);\ | |
434 \ | |
435 tmp1 = vec_ld(0, &block[0][y*src_stride]);\ | |
436 align = vec_lvsl(0, &block[0][y*src_stride]);\ | |
437 tmp2 = vec_ld(15, &block[0][y*src_stride]);\ | |
438 \ | |
439 b0 = vec_perm(tmp1,tmp2,align); | |
440 | |
441 #define LOAD_OBMCS \ | |
442 tmp1 = vec_ld(0, obmc1);\ | |
443 align = vec_lvsl(0, obmc1);\ | |
444 tmp2 = vec_ld(15, obmc1);\ | |
445 \ | |
446 ob1 = vec_perm(tmp1,tmp2,align);\ | |
447 \ | |
448 tmp1 = vec_ld(0, obmc2);\ | |
449 align = vec_lvsl(0, obmc2);\ | |
450 tmp2 = vec_ld(15, obmc2);\ | |
451 \ | |
452 ob2 = vec_perm(tmp1,tmp2,align);\ | |
453 \ | |
454 tmp1 = vec_ld(0, obmc3);\ | |
455 align = vec_lvsl(0, obmc3);\ | |
456 tmp2 = vec_ld(15, obmc3);\ | |
457 \ | |
458 ob3 = vec_perm(tmp1,tmp2,align);\ | |
459 \ | |
460 tmp1 = vec_ld(0, obmc4);\ | |
461 align = vec_lvsl(0, obmc4);\ | |
462 tmp2 = vec_ld(15, obmc4);\ | |
463 \ | |
464 ob4 = vec_perm(tmp1,tmp2,align); | |
465 | |
466 /* interleave logic | |
467 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ] | |
468 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ] | |
469 * h <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ] | |
470 */ | |
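In scalar terms, the merge described in the comment above amounts to the following (a standalone illustration only; the helper name is hypothetical, and the real code performs this step with vec_mergeh(h1, h2) in STEPS_0_1):

    #include <stdint.h>

    /* Interleave pairs from h1 ([a,b,a,b,...]) and h2 ([c,d,c,d,...]) into
       h ([a,b,c,d,a,b,c,d,...]); 'pairs' is the number of (a,b)/(c,d) pairs. */
    static void merge_pairs(const uint8_t *h1, const uint8_t *h2,
                            uint8_t *h, int pairs)
    {
        int i;
        for (i = 0; i < pairs; i++) {
            h[4*i + 0] = h1[2*i + 0];   /* a */
            h[4*i + 1] = h1[2*i + 1];   /* b */
            h[4*i + 2] = h2[2*i + 0];   /* c */
            h[4*i + 3] = h2[2*i + 1];   /* d */
        }
    }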
471 | |
472 #define STEPS_0_1\ | |
473 h1 = (vector unsigned short)\ | |
474 vec_mergeh(ob1, ob2);\ | |
475 \ | |
476 h2 = (vector unsigned short)\ | |
477 vec_mergeh(ob3, ob4);\ | |
478 \ | |
479 ih = (vector unsigned char)\ | |
480 vec_mergeh(h1,h2);\ | |
481 \ | |
482 l1 = (vector unsigned short) vec_mergeh(b3, b2);\ | |
483 \ | |
484 ih1 = (vector unsigned char) vec_mergel(h1, h2);\ | |
485 \ | |
486 l2 = (vector unsigned short) vec_mergeh(b1, b0);\ | |
487 \ | |
488 il = (vector unsigned char) vec_mergeh(l1, l2);\ | |
489 \ | |
490 v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\ | |
491 \ | |
492 il1 = (vector unsigned char) vec_mergel(l1, l2);\ | |
493 \ | |
494 v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0)); | |
495 | |
496 #define FINAL_STEP_SCALAR\ | |
497 for(x=0; x<b_w; x++)\ | |
498 if(add){\ | |
499 vbuf[x] += dst[x + src_x];\ | |
500 vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\ | |
501 if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\ | |
502 dst8[x + y*src_stride] = vbuf[x];\ | |
503 }else{\ | |
504 dst[x + src_x] -= vbuf[x];\ | |
505 } | |
3222 | 506 |
507 static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc, | |
508 const int obmc_stride, | |
509 uint8_t * * block, int b_w, | |
510 int b_h, int src_x, int src_y, | |
511 int src_stride, slice_buffer * sb, | |
512 int add, uint8_t * dst8) | |
513 { | |
514 int y, x; | |
515 DWTELEM * dst; | |
516 vector unsigned short h1, h2, l1, l2; | |
3272 | 517 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; |
3222 | 518 vector unsigned char b0,b1,b2,b3; |
3272 | 519 vector unsigned char ob1,ob2,ob3,ob4; |
520 | |
521 DECLARE_ALIGNED_16(int, vbuf[16]); | |
522 vector signed int *v = (vector signed int *)vbuf, *d; | |
3222 | 523 |
524 for(y=0; y<b_h; y++){ | |
5409 | 525 //FIXME ugly misuse of obmc_stride |
3222 | 526 |
527 uint8_t *obmc1= obmc + y*obmc_stride; | |
528 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
529 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
530 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
531 | |
532 dst = slice_buffer_get_line(sb, src_y + y); | |
533 d = (vector signed int *)(dst + src_x); | |
534 | |
3272 | 535 //FIXME i could avoid some loads! |
3222 | 536 |
3272 | 537 // load blocks |
538 LOAD_BLOCKS | |
3222 | 539 |
3272 | 540 // load obmcs |
541 LOAD_OBMCS | |
3271 | 542 |
3272 | 543 // steps 0 1 |
544 STEPS_0_1 | |
3222 | 545 |
3272 | 546 FINAL_STEP_SCALAR |
3222 | 547 |
548 } | |
549 | |
3272 | 550 } |
3222 | 551 |
3272 | 552 #define STEPS_2_3\ |
553 h1 = (vector unsigned short) vec_mergel(ob1, ob2);\ | |
554 \ | |
555 h2 = (vector unsigned short) vec_mergel(ob3, ob4);\ | |
556 \ | |
557 ih = (vector unsigned char) vec_mergeh(h1,h2);\ | |
558 \ | |
559 l1 = (vector unsigned short) vec_mergel(b3, b2);\ | |
560 \ | |
561 l2 = (vector unsigned short) vec_mergel(b1, b0);\ | |
562 \ | |
563 ih1 = (vector unsigned char) vec_mergel(h1,h2);\ | |
564 \ | |
565 il = (vector unsigned char) vec_mergeh(l1,l2);\ | |
566 \ | |
567 v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\ | |
568 \ | |
569 il1 = (vector unsigned char) vec_mergel(l1,l2);\ | |
570 \ | |
571 v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0)); | |
572 | |
3222 | 573 |
574 static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc, | |
575 const int obmc_stride, | |
576 uint8_t * * block, int b_w, | |
577 int b_h, int src_x, int src_y, | |
578 int src_stride, slice_buffer * sb, | |
579 int add, uint8_t * dst8) | |
580 { | |
581 int y, x; | |
582 DWTELEM * dst; | |
583 vector unsigned short h1, h2, l1, l2; | |
3272 | 584 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; |
3222 | 585 vector unsigned char b0,b1,b2,b3; |
3272 | 586 vector unsigned char ob1,ob2,ob3,ob4; |
587 DECLARE_ALIGNED_16(int, vbuf[b_w]); | |
588 vector signed int *v = (vector signed int *)vbuf, *d; | |
589 | |
590 for(y=0; y<b_h; y++){ | |
5409 | 591 //FIXME ugly misuse of obmc_stride |
3272 | 592 |
593 uint8_t *obmc1= obmc + y*obmc_stride; | |
594 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
595 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
596 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
597 | |
598 dst = slice_buffer_get_line(sb, src_y + y); | |
599 d = (vector signed int *)(dst + src_x); | |
600 | |
601 // load blocks | |
602 LOAD_BLOCKS | |
603 | |
604 // load obmcs | |
605 LOAD_OBMCS | |
606 | |
607 // steps 0 1 2 3 | |
608 STEPS_0_1 | |
609 | |
610 STEPS_2_3 | |
611 | |
612 FINAL_STEP_SCALAR | |
613 | |
614 } | |
615 } | |
616 | |
617 #define FINAL_STEP_VEC \ | |
618 \ | |
619 if(add)\ | |
620 {\ | |
621 for(x=0; x<b_w/4; x++)\ | |
622 {\ | |
623 v[x] = vec_add(v[x], d[x]);\ | |
624 v[x] = vec_sra(vec_add(v[x],\ | |
625 vec_sl( vec_splat_s32(1),\ | |
626 vec_splat_u32(7))),\ | |
627 vec_splat_u32(8));\ | |
628 \ | |
3288 | 629 mask = (vector bool int) vec_sl((vector signed int)\ |
3272 | 630 vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\ |
3288 | 631 mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\ |
3272 | 632 \ |
3288 | 633 mask = (vector bool int)\ |
3272 | 634 vec_cmpeq((vector signed int)mask,\ |
635 (vector signed int)vec_splat_u32(0));\ | |
636 \ | |
637 vs = vec_sra(v[x],vec_splat_u32(8));\ | |
638 vs = vec_sra(v[x],vec_splat_u32(8));\ | |
639 vs = vec_sra(v[x],vec_splat_u32(15));\ | |
640 \ | |
641 vs = vec_nor(vs,vs);\ | |
642 \ | |
643 v[x]= vec_sel(v[x],vs,mask);\ | |
644 }\ | |
645 \ | |
646 for(x=0; x<b_w; x++)\ | |
647 dst8[x + y*src_stride] = vbuf[x];\ | |
648 \ | |
649 }\ | |
650 else\ | |
651 for(x=0; x<b_w/4; x++)\ | |
652 d[x] = vec_sub(d[x], v[x]); | |
653 | |
654 static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc, | |
655 const int obmc_stride, | |
656 uint8_t * * block, int b_w, | |
657 int b_h, int src_x, int src_y, | |
658 int src_stride, slice_buffer * sb, | |
659 int add, uint8_t * dst8) | |
660 { | |
661 int y, x; | |
662 DWTELEM * dst; | |
663 vector bool int mask; | |
664 vector signed int vs; | |
665 vector unsigned short h1, h2, l1, l2; | |
666 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; | |
667 vector unsigned char b0,b1,b2,b3; | |
668 vector unsigned char ob1,ob2,ob3,ob4; | |
669 | |
670 DECLARE_ALIGNED_16(int, vbuf[16]); | |
671 vector signed int *v = (vector signed int *)vbuf, *d; | |
3222 | 672 |
673 for(y=0; y<b_h; y++){ | |
5409 | 674 //FIXME ugly misuse of obmc_stride |
3222 | 675 |
676 uint8_t *obmc1= obmc + y*obmc_stride; | |
677 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
678 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
679 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
680 | |
3272 | 681 dst = slice_buffer_get_line(sb, src_y + y); |
682 d = (vector signed int *)(dst + src_x); | |
683 | |
684 //FIXME i could avoid some loads! | |
685 | |
686 // load blocks | |
687 LOAD_BLOCKS | |
688 | |
689 // load obmcs | |
690 LOAD_OBMCS | |
691 | |
692 // steps 0 1 | |
693 STEPS_0_1 | |
694 | |
695 FINAL_STEP_VEC | |
696 | |
697 } | |
698 | |
699 } | |
3222 | 700 |
3272 | 701 static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc, |
702 const int obmc_stride, | |
703 uint8_t * * block, int b_w, | |
704 int b_h, int src_x, int src_y, | |
705 int src_stride, slice_buffer * sb, | |
706 int add, uint8_t * dst8) | |
707 { | |
708 int y, x; | |
709 DWTELEM * dst; | |
710 vector bool int mask; | |
711 vector signed int vs; | |
712 vector unsigned short h1, h2, l1, l2; | |
713 vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; | |
714 vector unsigned char b0,b1,b2,b3; | |
715 vector unsigned char ob1,ob2,ob3,ob4; | |
716 DECLARE_ALIGNED_16(int, vbuf[b_w]); | |
717 vector signed int *v = (vector signed int *)vbuf, *d; | |
718 | |
719 for(y=0; y<b_h; y++){ | |
5409 | 720 //FIXME ugly misuse of obmc_stride |
3272 | 721 |
722 uint8_t *obmc1= obmc + y*obmc_stride; | |
723 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
724 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
725 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
3222 | 726 |
727 dst = slice_buffer_get_line(sb, src_y + y); | |
728 d = (vector signed int *)(dst + src_x); | |
729 | |
730 // load blocks | |
3272 | 731 LOAD_BLOCKS |
3222 | 732 |
3272 | 733 // load obmcs |
734 LOAD_OBMCS | |
3222 | 735 |
3272 | 736 // steps 0 1 2 3 |
737 STEPS_0_1 | |
3222 | 738 |
3272 | 739 STEPS_2_3 |
3222 | 740 |
3272 | 741 FINAL_STEP_VEC |
3222 | 742 |
3272 | 743 } |
3222 | 744 } |
745 | |
746 | |
747 void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, | |
748 uint8_t * * block, int b_w, int b_h, | |
749 int src_x, int src_y, int src_stride, | |
750 slice_buffer * sb, int add, | |
751 uint8_t * dst8) | |
752 { | |
3272 | 753 if (src_x&15) { |
754 if (b_w == 16) | |
755 inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, | |
756 b_w, b_h, src_x, src_y, | |
757 src_stride, sb, add, dst8); | |
758 else if (b_w == 8) | |
759 inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block, | |
760 b_w, b_h, src_x, src_y, | |
761 src_stride, sb, add, dst8); | |
762 else | |
763 ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, | |
764 src_y, src_stride, sb, add, dst8); | |
765 } else { | |
766 if (b_w == 16) | |
767 inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block, | |
768 b_w, b_h, src_x, src_y, | |
769 src_stride, sb, add, dst8); | |
770 else if (b_w == 8) | |
771 inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block, | |
772 b_w, b_h, src_x, src_y, | |
773 src_stride, sb, add, dst8); | |
774 else | |
775 ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, | |
776 src_y, src_stride, sb, add, dst8); | |
777 } | |
3222 | 778 } |
3547 | 779 |
780 | |
781 void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) | |
782 { | |
5612 | 783 #if 0 |
7333 | 784 c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
785 c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; |
786 c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; |
5612 | 787 #endif |
3547 | 788 } |