Mercurial > libavcodec.hg
comparison dsputil.c @ 6385:40fbc878ce3f libavcodec
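Pseudo-SIMD add_bytes and diff_bytes: operate on one long-sized word (sizeof(long) bytes) per loop iteration instead of byte by byte.
2x faster than scalar on 32-bit, 4x faster on 64-bit (as opposed to 8x with MMX).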
author | lorenm |
---|---|
date | Thu, 21 Feb 2008 07:54:46 +0000 |
parents | 0a403ade8c81 |
children | 8b570faa9a8d |
comparison legend: equal, deleted, inserted, replaced
6384:0a403ade8c81 | 6385:40fbc878ce3f |
---|---|
47 /* pngdec.c */ | 47 /* pngdec.c */ |
48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); | 48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); |
49 | 49 |
50 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; | 50 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; |
51 uint32_t ff_squareTbl[512] = {0, }; | 51 uint32_t ff_squareTbl[512] = {0, }; |
52 | |
53 static const unsigned long pb_7f = 0x7f7f7f7f7f7f7f7fUL; | |
54 static const unsigned long pb_80 = 0x8080808080808080UL; | |
52 | 55 |
53 const uint8_t ff_zigzag_direct[64] = { | 56 const uint8_t ff_zigzag_direct[64] = { |
54 0, 1, 8, 16, 9, 2, 3, 10, | 57 0, 1, 8, 16, 9, 2, 3, 10, |
55 17, 24, 32, 25, 18, 11, 4, 5, | 58 17, 24, 32, 25, 18, 11, 4, 5, |
56 12, 19, 26, 33, 40, 48, 41, 34, | 59 12, 19, 26, 33, 40, 48, 41, 34, |
3274 { | 3277 { |
3275 memset(blocks, 0, sizeof(DCTELEM)*6*64); | 3278 memset(blocks, 0, sizeof(DCTELEM)*6*64); |
3276 } | 3279 } |
3277 | 3280 |
3278 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ | 3281 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ |
3279 int i; | 3282 long i; |
3280 for(i=0; i+7<w; i+=8){ | 3283 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ |
3281 dst[i+0] += src[i+0]; | 3284 long a = *(long*)(src+i); |
3282 dst[i+1] += src[i+1]; | 3285 long b = *(long*)(dst+i); |
3283 dst[i+2] += src[i+2]; | 3286 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); |
3284 dst[i+3] += src[i+3]; | |
3285 dst[i+4] += src[i+4]; | |
3286 dst[i+5] += src[i+5]; | |
3287 dst[i+6] += src[i+6]; | |
3288 dst[i+7] += src[i+7]; | |
3289 } | 3287 } |
3290 for(; i<w; i++) | 3288 for(; i<w; i++) |
3291 dst[i+0] += src[i+0]; | 3289 dst[i+0] += src[i+0]; |
3292 } | 3290 } |
3293 | 3291 |
3294 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ | 3292 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
3295 int i; | 3293 long i; |
3296 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | 3294 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ |
3297 long a = *(long*)(src1+i); | 3295 long a = *(long*)(src1+i); |
3298 long b = *(long*)(src2+i); | 3296 long b = *(long*)(src2+i); |
3299 *(long*)(dst+i) = ((a&0x7f7f7f7f7f7f7f7fL) + (b&0x7f7f7f7f7f7f7f7fL)) ^ ((a^b)&0x8080808080808080L); | 3297 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); |
3300 } | 3298 } |
3301 for(; i<w; i++) | 3299 for(; i<w; i++) |
3302 dst[i] = src1[i]+src2[i]; | 3300 dst[i] = src1[i]+src2[i]; |
3303 } | 3301 } |
3304 | 3302 |
3305 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ | 3303 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
3306 int i; | 3304 long i; |
3305 #ifndef HAVE_FAST_UNALIGNED | |
3306 if((long)src2 & (sizeof(long)-1)){ | |
3307 for(i=0; i+7<w; i+=8){ | 3307 for(i=0; i+7<w; i+=8){ |
3308 dst[i+0] = src1[i+0]-src2[i+0]; | 3308 dst[i+0] = src1[i+0]-src2[i+0]; |
3309 dst[i+1] = src1[i+1]-src2[i+1]; | 3309 dst[i+1] = src1[i+1]-src2[i+1]; |
3310 dst[i+2] = src1[i+2]-src2[i+2]; | 3310 dst[i+2] = src1[i+2]-src2[i+2]; |
3311 dst[i+3] = src1[i+3]-src2[i+3]; | 3311 dst[i+3] = src1[i+3]-src2[i+3]; |
3312 dst[i+4] = src1[i+4]-src2[i+4]; | 3312 dst[i+4] = src1[i+4]-src2[i+4]; |
3313 dst[i+5] = src1[i+5]-src2[i+5]; | 3313 dst[i+5] = src1[i+5]-src2[i+5]; |
3314 dst[i+6] = src1[i+6]-src2[i+6]; | 3314 dst[i+6] = src1[i+6]-src2[i+6]; |
3315 dst[i+7] = src1[i+7]-src2[i+7]; | 3315 dst[i+7] = src1[i+7]-src2[i+7]; |
3316 } | |
3317 }else | |
3318 #endif | |
3319 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | |
3320 long a = *(long*)(src1+i); | |
3321 long b = *(long*)(src2+i); | |
3322 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80); | |
3316 } | 3323 } |
3317 for(; i<w; i++) | 3324 for(; i<w; i++) |
3318 dst[i+0] = src1[i+0]-src2[i+0]; | 3325 dst[i+0] = src1[i+0]-src2[i+0]; |
3319 } | 3326 } |
3320 | 3327 |