comparison ppc/dsputil_altivec.c @ 1033:b4172ff70d27 libavcodec

AltiVec support on non-Darwin systems; patch by Romain Dolbeau
author bellard
date Sun, 26 Jan 2003 22:29:47 +0000
parents 9cc1031e1864
children b32afefe7d33
comparison
equal deleted inserted replaced
1032:8f440ca8e0b0 1033:b4172ff70d27
19 */ 19 */
20 20
21 #include "../dsputil.h" 21 #include "../dsputil.h"
22 #include "dsputil_altivec.h" 22 #include "dsputil_altivec.h"
23 23
24 #if CONFIG_DARWIN 24 #ifdef CONFIG_DARWIN
25 #include <sys/sysctl.h> 25 #include <sys/sysctl.h>
26 #endif 26 #else /* CONFIG_DARWIN */
27 #include <signal.h>
28 #include <setjmp.h>
29
30 static sigjmp_buf jmpbuf;
31 static volatile sig_atomic_t canjump = 0;
32
33 static void sigill_handler (int sig)
34 {
35 if (!canjump) {
36 signal (sig, SIG_DFL);
37 raise (sig);
38 }
39
40 canjump = 0;
41 siglongjmp (jmpbuf, 1);
42 }
43 #endif /* CONFIG_DARWIN */
27 44
28 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) 45 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
29 { 46 {
30 int i; 47 int i;
31 int s __attribute__((aligned(16))); 48 int s __attribute__((aligned(16)));
32 const vector unsigned char zero = (const vector unsigned char)(0); 49 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
33 vector unsigned char *tv; 50 vector unsigned char *tv;
34 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; 51 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
35 vector unsigned int sad; 52 vector unsigned int sad;
36 vector signed int sumdiffs; 53 vector signed int sumdiffs;
37 54
38 s = 0; 55 s = 0;
39 sad = (vector unsigned int)(0); 56 sad = (vector unsigned int)vec_splat_u32(0);
40 for(i=0;i<16;i++) { 57 for(i=0;i<16;i++) {
41 /* 58 /*
42 Read unaligned pixels into our vectors. The vectors are as follows: 59 Read unaligned pixels into our vectors. The vectors are as follows:
43 pix1v: pix1[0]-pix1[15] 60 pix1v: pix1[0]-pix1[15]
44 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] 61 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
74 91
75 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) 92 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
76 { 93 {
77 int i; 94 int i;
78 int s __attribute__((aligned(16))); 95 int s __attribute__((aligned(16)));
79 const vector unsigned char zero = (const vector unsigned char)(0); 96 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
80 vector unsigned char *tv; 97 vector unsigned char *tv;
81 vector unsigned char pix1v, pix2v, pix3v, avgv, t5; 98 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
82 vector unsigned int sad; 99 vector unsigned int sad;
83 vector signed int sumdiffs; 100 vector signed int sumdiffs;
84 uint8_t *pix3 = pix2 + line_size; 101 uint8_t *pix3 = pix2 + line_size;
85 102
86 s = 0; 103 s = 0;
87 sad = (vector unsigned int)(0); 104 sad = (vector unsigned int)vec_splat_u32(0);
88 105
89 /* 106 /*
90 Due to the fact that pix3 = pix2 + line_size, the pix3 of one 107 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
91 iteration becomes pix2 in the next iteration. We can use this 108 iteration becomes pix2 in the next iteration. We can use this
92 fact to avoid a potentially expensive unaligned read, each 109 fact to avoid a potentially expensive unaligned read, each
135 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) 152 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
136 { 153 {
137 int i; 154 int i;
138 int s __attribute__((aligned(16))); 155 int s __attribute__((aligned(16)));
139 uint8_t *pix3 = pix2 + line_size; 156 uint8_t *pix3 = pix2 + line_size;
140 const vector unsigned char zero = (const vector unsigned char)(0); 157 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
141 const vector unsigned short two = (const vector unsigned short)(2); 158 const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
142 vector unsigned char *tv, avgv, t5; 159 vector unsigned char *tv, avgv, t5;
143 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; 160 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
144 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; 161 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
145 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; 162 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
146 vector unsigned short avghv, avglv; 163 vector unsigned short avghv, avglv;
147 vector unsigned short t1, t2, t3, t4; 164 vector unsigned short t1, t2, t3, t4;
148 vector unsigned int sad; 165 vector unsigned int sad;
149 vector signed int sumdiffs; 166 vector signed int sumdiffs;
150 167
151 sad = (vector unsigned int)(0); 168 sad = (vector unsigned int)vec_splat_u32(0);
152 169
153 s = 0; 170 s = 0;
154 171
155 /* 172 /*
156 Due to the fact that pix3 = pix2 + line_size, the pix3 of one 173 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
235 252
236 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) 253 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
237 { 254 {
238 int i; 255 int i;
239 int s __attribute__((aligned(16))); 256 int s __attribute__((aligned(16)));
240 const vector unsigned int zero = (const vector unsigned int)(0); 257 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
241 vector unsigned char perm1, perm2, *pix1v, *pix2v; 258 vector unsigned char perm1, perm2, *pix1v, *pix2v;
242 vector unsigned char t1, t2, t3,t4, t5; 259 vector unsigned char t1, t2, t3,t4, t5;
243 vector unsigned int sad; 260 vector unsigned int sad;
244 vector signed int sumdiffs; 261 vector signed int sumdiffs;
245 262
246 sad = (vector unsigned int) (0); 263 sad = (vector unsigned int)vec_splat_u32(0);
247 264
248 265
249 for(i=0;i<16;i++) { 266 for(i=0;i<16;i++) {
250 /* Read potentially unaligned pixels into t1 and t2 */ 267 /* Read potentially unaligned pixels into t1 and t2 */
251 perm1 = vec_lvsl(0, pix1); 268 perm1 = vec_lvsl(0, pix1);
277 294
278 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) 295 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
279 { 296 {
280 int i; 297 int i;
281 int s __attribute__((aligned(16))); 298 int s __attribute__((aligned(16)));
282 const vector unsigned int zero = (const vector unsigned int)(0); 299 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
283 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 300 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
284 vector unsigned char t1, t2, t3,t4, t5; 301 vector unsigned char t1, t2, t3,t4, t5;
285 vector unsigned int sad; 302 vector unsigned int sad;
286 vector signed int sumdiffs; 303 vector signed int sumdiffs;
287 304
288 sad = (vector unsigned int)(0); 305 sad = (vector unsigned int)vec_splat_u32(0);
289 permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); 306 #ifdef CONFIG_DARWIN
307 permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
308 #else
309 permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
310 #endif
290 311
291 for(i=0;i<8;i++) { 312 for(i=0;i<8;i++) {
292 /* Read potentially unaligned pixels into t1 and t2 313 /* Read potentially unaligned pixels into t1 and t2
293 Since we're reading 16 pixels, and actually only want 8, 314 Since we're reading 16 pixels, and actually only want 8,
294 mask out the last 8 pixels. The 0s don't change the sum. */ 315 mask out the last 8 pixels. The 0s don't change the sum. */
321 342
322 int pix_norm1_altivec(uint8_t *pix, int line_size) 343 int pix_norm1_altivec(uint8_t *pix, int line_size)
323 { 344 {
324 int i; 345 int i;
325 int s __attribute__((aligned(16))); 346 int s __attribute__((aligned(16)));
326 const vector unsigned int zero = (const vector unsigned int)(0); 347 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
327 vector unsigned char *tv; 348 vector unsigned char *tv;
328 vector unsigned char pixv; 349 vector unsigned char pixv;
329 vector unsigned int sv; 350 vector unsigned int sv;
330 vector signed int sum; 351 vector signed int sum;
331 352
332 sv = (vector unsigned int)(0); 353 sv = (vector unsigned int)vec_splat_u32(0);
333 354
334 s = 0; 355 s = 0;
335 for (i = 0; i < 16; i++) { 356 for (i = 0; i < 16; i++) {
336 /* Read in the potentially unaligned pixels */ 357 /* Read in the potentially unaligned pixels */
337 tv = (vector unsigned char *) pix; 358 tv = (vector unsigned char *) pix;
357 */ 378 */
358 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) 379 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
359 { 380 {
360 int i; 381 int i;
361 int s __attribute__((aligned(16))); 382 int s __attribute__((aligned(16)));
362 const vector unsigned int zero = (const vector unsigned int)(0); 383 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
363 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 384 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
364 vector unsigned char t1, t2, t3,t4, t5; 385 vector unsigned char t1, t2, t3,t4, t5;
365 vector unsigned int sum; 386 vector unsigned int sum;
366 vector signed int sumsqr; 387 vector signed int sumsqr;
367 388
368 sum = (vector unsigned int)(0); 389 sum = (vector unsigned int)vec_splat_u32(0);
369 permclear = (vector unsigned char)(0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); 390 #ifdef CONFIG_DARWIN
391 permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
392 #else
393 permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
394 #endif
370 395
371 for(i=0;i<8;i++) { 396 for(i=0;i<8;i++) {
372 /* Read potentially unaligned pixels into t1 and t2 397 /* Read potentially unaligned pixels into t1 and t2
373 Since we're reading 16 pixels, and actually only want 8, 398 Since we're reading 16 pixels, and actually only want 8,
374 mask out the last 8 pixels. The 0s don't change the sum. */ 399 mask out the last 8 pixels. The 0s don't change the sum. */
411 */ 436 */
412 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) 437 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
413 { 438 {
414 int i; 439 int i;
415 int s __attribute__((aligned(16))); 440 int s __attribute__((aligned(16)));
416 const vector unsigned int zero = (const vector unsigned int)(0); 441 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
417 vector unsigned char perm1, perm2, *pix1v, *pix2v; 442 vector unsigned char perm1, perm2, *pix1v, *pix2v;
418 vector unsigned char t1, t2, t3,t4, t5; 443 vector unsigned char t1, t2, t3,t4, t5;
419 vector unsigned int sum; 444 vector unsigned int sum;
420 vector signed int sumsqr; 445 vector signed int sumsqr;
421 446
422 sum = (vector unsigned int)(0); 447 sum = (vector unsigned int)vec_splat_u32(0);
423 448
424 for(i=0;i<16;i++) { 449 for(i=0;i<16;i++) {
425 /* Read potentially unaligned pixels into t1 and t2 */ 450 /* Read potentially unaligned pixels into t1 and t2 */
426 perm1 = vec_lvsl(0, pix1); 451 perm1 = vec_lvsl(0, pix1);
427 pix1v = (vector unsigned char *) pix1; 452 pix1v = (vector unsigned char *) pix1;
455 return s; 480 return s;
456 } 481 }
457 482
458 int pix_sum_altivec(UINT8 * pix, int line_size) 483 int pix_sum_altivec(UINT8 * pix, int line_size)
459 { 484 {
460 const vector unsigned int zero = (const vector unsigned int)(0); 485 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
461 vector unsigned char perm, *pixv; 486 vector unsigned char perm, *pixv;
462 vector unsigned char t1; 487 vector unsigned char t1;
463 vector unsigned int sad; 488 vector unsigned int sad;
464 vector signed int sumdiffs; 489 vector signed int sumdiffs;
465 490
466 int i; 491 int i;
467 int s __attribute__((aligned(16))); 492 int s __attribute__((aligned(16)));
468 493
469 sad = (vector unsigned int) (0); 494 sad = (vector unsigned int)vec_splat_u32(0);
470 495
471 for (i = 0; i < 16; i++) { 496 for (i = 0; i < 16; i++) {
472 /* Read the potentially unaligned 16 pixels into t1 */ 497 /* Read the potentially unaligned 16 pixels into t1 */
473 perm = vec_lvsl(0, pix); 498 perm = vec_lvsl(0, pix);
474 pixv = (vector unsigned char *) pix; 499 pixv = (vector unsigned char *) pix;
490 515
491 void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size) 516 void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
492 { 517 {
493 int i; 518 int i;
494 vector unsigned char perm, bytes, *pixv; 519 vector unsigned char perm, bytes, *pixv;
495 const vector unsigned char zero = (const vector unsigned char) (0); 520 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
496 vector signed short shorts; 521 vector signed short shorts;
497 522
498 for(i=0;i<8;i++) 523 for(i=0;i<8;i++)
499 { 524 {
500 // Read potentially unaligned pixels. 525 // Read potentially unaligned pixels.
517 void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1, 542 void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
518 const UINT8 *s2, int stride) 543 const UINT8 *s2, int stride)
519 { 544 {
520 int i; 545 int i;
521 vector unsigned char perm, bytes, *pixv; 546 vector unsigned char perm, bytes, *pixv;
522 const vector unsigned char zero = (const vector unsigned char) (0); 547 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
523 vector signed short shorts1, shorts2; 548 vector signed short shorts1, shorts2;
524 549
525 for(i=0;i<4;i++) 550 for(i=0;i<4;i++)
526 { 551 {
527 // Read potentially unaligned pixels 552 // Read potentially unaligned pixels
828 pixelsavg; 853 pixelsavg;
829 register vector unsigned char 854 register vector unsigned char
830 blockv, temp1, temp2; 855 blockv, temp1, temp2;
831 register vector unsigned short 856 register vector unsigned short
832 pixelssum1, pixelssum2, temp3; 857 pixelssum1, pixelssum2, temp3;
833 register const vector unsigned char vczero = (const vector unsigned char)(0); 858 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
834 register const vector unsigned short vctwo = (const vector unsigned short)(2); 859 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
835 860
836 temp1 = vec_ld(0, pixels); 861 temp1 = vec_ld(0, pixels);
837 temp2 = vec_ld(16, pixels); 862 temp2 = vec_ld(16, pixels);
838 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 863 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
839 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 864 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
943 pixelsavg; 968 pixelsavg;
944 register vector unsigned char 969 register vector unsigned char
945 blockv, temp1, temp2; 970 blockv, temp1, temp2;
946 register vector unsigned short 971 register vector unsigned short
947 pixelssum1, pixelssum2, temp3; 972 pixelssum1, pixelssum2, temp3;
948 register const vector unsigned char vczero = (const vector unsigned char)(0); 973 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
949 register const vector unsigned short vcone = (const vector unsigned short)(1); 974 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
950 register const vector unsigned short vctwo = (const vector unsigned short)(2); 975 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
951 976
952 temp1 = vec_ld(0, pixels); 977 temp1 = vec_ld(0, pixels);
953 temp2 = vec_ld(16, pixels); 978 temp2 = vec_ld(16, pixels);
954 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 979 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
955 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 980 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1059 register vector unsigned char 1084 register vector unsigned char
1060 blockv, temp1, temp2; 1085 blockv, temp1, temp2;
1061 register vector unsigned short 1086 register vector unsigned short
1062 pixelssum1, pixelssum2, temp3, 1087 pixelssum1, pixelssum2, temp3,
1063 pixelssum3, pixelssum4, temp4; 1088 pixelssum3, pixelssum4, temp4;
1064 register const vector unsigned char vczero = (const vector unsigned char)(0); 1089 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1065 register const vector unsigned short vctwo = (const vector unsigned short)(2); 1090 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1066 1091
1067 temp1 = vec_ld(0, pixels); 1092 temp1 = vec_ld(0, pixels);
1068 temp2 = vec_ld(16, pixels); 1093 temp2 = vec_ld(16, pixels);
1069 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 1094 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1070 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 1095 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1179 register vector unsigned char 1204 register vector unsigned char
1180 blockv, temp1, temp2; 1205 blockv, temp1, temp2;
1181 register vector unsigned short 1206 register vector unsigned short
1182 pixelssum1, pixelssum2, temp3, 1207 pixelssum1, pixelssum2, temp3,
1183 pixelssum3, pixelssum4, temp4; 1208 pixelssum3, pixelssum4, temp4;
1184 register const vector unsigned char vczero = (const vector unsigned char)(0); 1209 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1185 register const vector unsigned short vcone = (const vector unsigned short)(1); 1210 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
1186 register const vector unsigned short vctwo = (const vector unsigned short)(2); 1211 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1187 1212
1188 temp1 = vec_ld(0, pixels); 1213 temp1 = vec_ld(0, pixels);
1189 temp2 = vec_ld(16, pixels); 1214 temp2 = vec_ld(16, pixels);
1190 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 1215 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1191 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 1216 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1252 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 1277 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1253 } 1278 }
1254 1279
1255 int has_altivec(void) 1280 int has_altivec(void)
1256 { 1281 {
1257 #if CONFIG_DARWIN 1282 #ifdef CONFIG_DARWIN
1258 int sels[2] = {CTL_HW, HW_VECTORUNIT}; 1283 int sels[2] = {CTL_HW, HW_VECTORUNIT};
1259 int has_vu = 0; 1284 int has_vu = 0;
1260 size_t len = sizeof(has_vu); 1285 size_t len = sizeof(has_vu);
1261 int err; 1286 int err;
1262 1287
1263 err = sysctl(sels, 2, &has_vu, &len, NULL, 0); 1288 err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1264 1289
1265 if (err == 0) return (has_vu != 0); 1290 if (err == 0) return (has_vu != 0);
1266 #endif 1291 #else /* CONFIG_DARWIN */
1292 /* no Darwin, do it the brute-force way */
1293 /* this is borrowed from the libmpeg2 library */
1294 {
1295 signal (SIGILL, sigill_handler);
1296 if (sigsetjmp (jmpbuf, 1)) {
1297 signal (SIGILL, SIG_DFL);
1298 } else {
1299 canjump = 1;
1300
1301 asm volatile ("mtspr 256, %0\n\t"
1302 "vand %%v0, %%v0, %%v0"
1303 :
1304 : "r" (-1));
1305
1306 signal (SIGILL, SIG_DFL);
1307 return 1;
1308 }
1309 }
1310 #endif /* CONFIG_DARWIN */
1267 return 0; 1311 return 0;
1268 } 1312 }