comparison dsputil.c @ 6385:40fbc878ce3f libavcodec

Pseudo-SIMD add_bytes and diff_bytes: 2x faster than scalar in 32-bit, 4x faster in 64-bit (as opposed to 8x in MMX).
author lorenm
date Thu, 21 Feb 2008 07:54:46 +0000
parents 0a403ade8c81
children 8b570faa9a8d
comparison
equal deleted inserted replaced
6384:0a403ade8c81 6385:40fbc878ce3f
47 /* pngdec.c */ 47 /* pngdec.c */
48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); 48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
49 49
50 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; 50 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
51 uint32_t ff_squareTbl[512] = {0, }; 51 uint32_t ff_squareTbl[512] = {0, };
52
53 static const unsigned long pb_7f = 0x7f7f7f7f7f7f7f7fUL;
54 static const unsigned long pb_80 = 0x8080808080808080UL;
52 55
53 const uint8_t ff_zigzag_direct[64] = { 56 const uint8_t ff_zigzag_direct[64] = {
54 0, 1, 8, 16, 9, 2, 3, 10, 57 0, 1, 8, 16, 9, 2, 3, 10,
55 17, 24, 32, 25, 18, 11, 4, 5, 58 17, 24, 32, 25, 18, 11, 4, 5,
56 12, 19, 26, 33, 40, 48, 41, 34, 59 12, 19, 26, 33, 40, 48, 41, 34,
3274 { 3277 {
3275 memset(blocks, 0, sizeof(DCTELEM)*6*64); 3278 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3276 } 3279 }
3277 3280
3278 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ 3281 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3279 int i; 3282 long i;
3280 for(i=0; i+7<w; i+=8){ 3283 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3281 dst[i+0] += src[i+0]; 3284 long a = *(long*)(src+i);
3282 dst[i+1] += src[i+1]; 3285 long b = *(long*)(dst+i);
3283 dst[i+2] += src[i+2]; 3286 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3284 dst[i+3] += src[i+3];
3285 dst[i+4] += src[i+4];
3286 dst[i+5] += src[i+5];
3287 dst[i+6] += src[i+6];
3288 dst[i+7] += src[i+7];
3289 } 3287 }
3290 for(; i<w; i++) 3288 for(; i<w; i++)
3291 dst[i+0] += src[i+0]; 3289 dst[i+0] += src[i+0];
3292 } 3290 }
3293 3291
3294 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 3292 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3295 int i; 3293 long i;
3296 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ 3294 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3297 long a = *(long*)(src1+i); 3295 long a = *(long*)(src1+i);
3298 long b = *(long*)(src2+i); 3296 long b = *(long*)(src2+i);
3299 *(long*)(dst+i) = ((a&0x7f7f7f7f7f7f7f7fL) + (b&0x7f7f7f7f7f7f7f7fL)) ^ ((a^b)&0x8080808080808080L); 3297 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3300 } 3298 }
3301 for(; i<w; i++) 3299 for(; i<w; i++)
3302 dst[i] = src1[i]+src2[i]; 3300 dst[i] = src1[i]+src2[i];
3303 } 3301 }
3304 3302
3305 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 3303 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3306 int i; 3304 long i;
3305 #ifndef HAVE_FAST_UNALIGNED
3306 if((long)src2 & (sizeof(long)-1)){
3307 for(i=0; i+7<w; i+=8){ 3307 for(i=0; i+7<w; i+=8){
3308 dst[i+0] = src1[i+0]-src2[i+0]; 3308 dst[i+0] = src1[i+0]-src2[i+0];
3309 dst[i+1] = src1[i+1]-src2[i+1]; 3309 dst[i+1] = src1[i+1]-src2[i+1];
3310 dst[i+2] = src1[i+2]-src2[i+2]; 3310 dst[i+2] = src1[i+2]-src2[i+2];
3311 dst[i+3] = src1[i+3]-src2[i+3]; 3311 dst[i+3] = src1[i+3]-src2[i+3];
3312 dst[i+4] = src1[i+4]-src2[i+4]; 3312 dst[i+4] = src1[i+4]-src2[i+4];
3313 dst[i+5] = src1[i+5]-src2[i+5]; 3313 dst[i+5] = src1[i+5]-src2[i+5];
3314 dst[i+6] = src1[i+6]-src2[i+6]; 3314 dst[i+6] = src1[i+6]-src2[i+6];
3315 dst[i+7] = src1[i+7]-src2[i+7]; 3315 dst[i+7] = src1[i+7]-src2[i+7];
3316 }
3317 }else
3318 #endif
3319 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3320 long a = *(long*)(src1+i);
3321 long b = *(long*)(src2+i);
3322 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3316 } 3323 }
3317 for(; i<w; i++) 3324 for(; i<w; i++)
3318 dst[i+0] = src1[i+0]-src2[i+0]; 3325 dst[i+0] = src1[i+0]-src2[i+0];
3319 } 3326 }
3320 3327