comparison postproc/swscale_altivec_template.c @ 12130:2ef24558b732

AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
author michael
date Tue, 06 Apr 2004 00:25:47 +0000
parents 21e5cb258a95
children 79a2af950cf7
comparison
equal deleted inserted replaced
12129:c1aff21286dd 12130:2ef24558b732
18 You should have received a copy of the GNU General Public License 18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software 19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */ 21 */
22 22
/* File-scope AltiVec constants: vzero has all four lanes set to 0 and
   altivec_vectorShiftInt19 has all four lanes set to 19.  The CONFIG_DARWIN
   branch spells the vector literals with parentheses, the other branch with
   braces (presumably to match what each toolchain accepts -- confirm). */
23 static const vector unsigned int altivec_vectorShiftInt19 = {19, 19, 19, 19}; 23 #ifdef CONFIG_DARWIN
24 static const vector signed int vzero =
25 (vector signed int)(0, 0, 0, 0);
26 static const vector unsigned int altivec_vectorShiftInt19 =
27 (vector unsigned int)(19, 19, 19, 19);
28 #else
29 static const vector signed int vzero =
30 (vector signed int){0,0,0,0};
31 static const vector unsigned int altivec_vectorShiftInt19 =
32 (vector unsigned int){19, 19, 19, 19};
33
34 #endif
35
24 static inline void 36 static inline void
25 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) { 37 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) {
26 register int i; 38 register int i;
27 if ((unsigned long)dest % 16) { 39 if ((unsigned long)dest % 16) {
28 /* badly aligned store, we force store alignement */ 40 /* badly aligned store, we force store alignement */
199 } 211 }
200 altivec_packIntArrayToCharArray(u,uDest,chrDstW); 212 altivec_packIntArrayToCharArray(u,uDest,chrDstW);
201 altivec_packIntArrayToCharArray(v,vDest,chrDstW); 213 altivec_packIntArrayToCharArray(v,vDest,chrDstW);
202 } 214 }
203 } 215 }
216
217 static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) {
218 register int i;
219 int __attribute__ ((aligned (16))) tempo[4];
220
221 if (filterSize % 4) {
222 for(i=0; i<dstW; i++) {
223 register int j;
224 register int srcPos = filterPos[i];
225 register int val = 0;
226 for(j=0; j<filterSize; j++) {
227 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
228 }
229 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1);
230 }
231 }
232 else
233 switch (filterSize) {
234 case 4:
235 {
236 for(i=0; i<dstW; i++) {
237 register int j;
238 register int srcPos = filterPos[i];
239
240 vector unsigned char src_v0 = vec_ld(srcPos, src);
241 vector unsigned char src_v1;
242 if ((((int)src + srcPos)% 16) > 12) {
243 src_v1 = vec_ld(srcPos + 16, src);
244 }
245 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
246
247 vector signed short src_v = // vec_unpackh sign-extends...
248 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
249 // now put our elements in the even slots
250 src_v = vec_mergeh(src_v, (vector signed short)vzero);
251
252 vector signed short filter_v = vec_ld(i << 3, filter);
253 // the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2)
254
255 // the neat trick : we only care for half the elements,
256 // high or low depending on (i<<3)%16 (it's 0 or 8 here),
257 // and we're going to use vec_mule, so we chose
258 // carefully how to "unpack" the elements into the even slots
259 if ((i << 3) % 16)
260 filter_v = vec_mergel(filter_v,(vector signed short)vzero);
261 else
262 filter_v = vec_mergeh(filter_v,(vector signed short)vzero);
263
264 vector signed int val_vEven = vec_mule(src_v, filter_v);
265 vector signed int val_s = vec_sums(val_vEven, vzero);
266 vec_st(val_s, 0, tempo);
267 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
268 }
269 }
270 break;
271
272 case 8:
273 {
274 for(i=0; i<dstW; i++) {
275 register int srcPos = filterPos[i];
276
277 vector unsigned char src_v0 = vec_ld(srcPos, src);
278 vector unsigned char src_v1;
279 if ((((int)src + srcPos)% 16) > 8) {
280 src_v1 = vec_ld(srcPos + 16, src);
281 }
282 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
283
284 vector signed short src_v = // vec_unpackh sign-extends...
285 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
286 vector signed short filter_v = vec_ld(i << 4, filter);
287 // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)
288
289 vector signed int val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
290 vector signed int val_s = vec_sums(val_v, vzero);
291 vec_st(val_s, 0, tempo);
292 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
293 }
294 }
295 break;
296
297 case 16:
298 {
299 for(i=0; i<dstW; i++) {
300 register int srcPos = filterPos[i];
301
302 vector unsigned char src_v0 = vec_ld(srcPos, src);
303 vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
304 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
305
306 vector signed short src_vA = // vec_unpackh sign-extends...
307 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
308 vector signed short src_vB = // vec_unpackh sign-extends...
309 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
310
311 vector signed short filter_v0 = vec_ld(i << 5, filter);
312 vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
313 // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)
314
315 vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
316 vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
317
318 vector signed int val_s = vec_sums(val_v, vzero);
319
320 vec_st(val_s, 0, tempo);
321 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
322 }
323 }
324 break;
325
326 default:
327 {
328 for(i=0; i<dstW; i++) {
329 register int j;
330 register int srcPos = filterPos[i];
331
332 vector signed int val_v = (vector signed int)vzero;
333 vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
334 vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);
335
336 vector unsigned char src_v0 = vec_ld(srcPos, src);
337 vector unsigned char permS = vec_lvsl(srcPos, src);
338
339 for (j = 0 ; j < filterSize - 15; j += 16) {
340 vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
341 vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
342
343 vector signed short src_vA = // vec_unpackh sign-extends...
344 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
345 vector signed short src_vB = // vec_unpackh sign-extends...
346 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
347
348 vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
349 vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
350 vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
351 vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);
352
353 vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
354 val_v = vec_msums(src_vB, filter_v1, val_acc);
355
356 filter_v0R = filter_v2R;
357 src_v0 = src_v1;
358 }
359
360 if (j < (filterSize-7)) {
361 // loading src_v0 is useless, it's already done above
362 //vector unsigned char src_v0 = vec_ld(srcPos + j, src);
363 vector unsigned char src_v1;
364 if ((((int)src + srcPos)% 16) > 8) {
365 src_v1 = vec_ld(srcPos + j + 16, src);
366 }
367 vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
368
369 vector signed short src_v = // vec_unpackh sign-extends...
370 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
371 // loading filter_v0R is useless, it's already done above
372 //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
373 vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
374 vector signed short filter_v = vec_perm(filter_v0R, filter_v1R, permF);
375
376 val_v = vec_msums(src_v, filter_v, val_v);
377 }
378
379 vector signed int val_s = vec_sums(val_v, vzero);
380
381 vec_st(val_s, 0, tempo);
382 dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
383 }
384
385 }
386 }
387 }