Mercurial > libavcodec.hg
changeset 6384:0a403ade8c81 libavcodec
SIMD-optimize and unroll png_filter_row
cycles per 1000 pixels on Core 2 (before->after):
left: 9211->5170
top: 9283->2138
avg: 12215->7611
paeth: 64024->17360
overall RGB PNG decoding speed: +45%
overall greyscale PNG decoding speed: +6%
author | lorenm |
---|---|
date | Thu, 21 Feb 2008 07:10:46 +0000 |
parents | 7ba06222bda7 |
children | 40fbc878ce3f |
files | dsputil.c dsputil.h i386/dsputil_mmx.c pngdec.c |
diffstat | 4 files changed, 196 insertions(+), 38 deletions(-) |
line wrap: on
line diff
--- a/dsputil.c Thu Feb 21 00:06:07 2008 +0000 +++ b/dsputil.c Thu Feb 21 07:10:46 2008 +0000 @@ -44,6 +44,9 @@ /* flacenc.c */ void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc); +/* pngdec.c */ +void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); + uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; uint32_t ff_squareTbl[512] = {0, }; @@ -3288,6 +3291,17 @@ dst[i+0] += src[i+0]; } +static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ + int i; + for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ + long a = *(long*)(src1+i); + long b = *(long*)(src2+i); + *(long*)(dst+i) = ((a&0x7f7f7f7f7f7f7f7fL) + (b&0x7f7f7f7f7f7f7f7fL)) ^ ((a^b)&0x8080808080808080L); + } + for(; i<w; i++) + dst[i] = src1[i]+src2[i]; +} + static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ int i; for(i=0; i+7<w; i+=8){ @@ -4232,9 +4246,13 @@ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c; c->add_bytes= add_bytes_c; + c->add_bytes_l2= add_bytes_l2_c; c->diff_bytes= diff_bytes_c; c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c; c->bswap_buf= bswap_buf; +#ifdef CONFIG_PNG_DECODER + c->add_png_paeth_prediction= ff_add_png_paeth_prediction; +#endif c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
--- a/dsputil.h Thu Feb 21 00:06:07 2008 +0000 +++ b/dsputil.h Thu Feb 21 07:10:46 2008 +0000 @@ -304,12 +304,15 @@ /* huffyuv specific */ void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w); + void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w); void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w); /** * subtract huffyuv's variant of median prediction * note, this might read from src1[-1], src2[-1] */ void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top); + /* this might write to dst[w] */ + void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w); void (*h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
--- a/i386/dsputil_mmx.c Thu Feb 21 00:06:07 2008 +0000 +++ b/i386/dsputil_mmx.c Thu Feb 21 07:10:46 2008 +0000 @@ -59,6 +59,7 @@ DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; @@ -605,6 +606,26 @@ dst[i+0] += src[i+0]; } +static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ + long i=0; + asm volatile( + "1: \n\t" + "movq (%2, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "paddb (%3, %0), %%mm0 \n\t" + "paddb 8(%3, %0), %%mm1 \n\t" + "movq %%mm0, (%1, %0) \n\t" + "movq %%mm1, 8(%1, %0) \n\t" + "add $16, %0 \n\t" + "cmp %4, %0 \n\t" + " jb 1b \n\t" + : "+r" (i) + : "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15) + ); + for(; i<w; i++) + dst[i] = src1[i] + src2[i]; +} + #define H263_LOOP_FILTER \ "pxor %%mm7, %%mm7 \n\t"\ "movq %0, %%mm0 \n\t"\ @@ -1564,6 +1585,80 @@ *left = src2[w-1]; } +#define PAETH(cpu, abs3)\ +void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ +{\ + long i = -bpp;\ + long end = w-3;\ + asm volatile(\ + "pxor %%mm7, %%mm7 \n"\ + "movd (%1,%0), %%mm0 \n"\ + "movd (%2,%0), %%mm1 \n"\ + "punpcklbw %%mm7, %%mm0 \n"\ + "punpcklbw %%mm7, %%mm1 \n"\ + "add %4, %0 \n"\ + "1: \n"\ + "movq %%mm1, %%mm2 \n"\ + "movd (%2,%0), %%mm1 \n"\ + "movq %%mm2, %%mm3 \n"\ + "punpcklbw %%mm7, %%mm1 \n"\ + "movq %%mm2, %%mm4 \n"\ + "psubw %%mm1, %%mm3 \n"\ + "psubw %%mm0, %%mm4 \n"\ + "movq %%mm3, %%mm5 \n"\ + "paddw %%mm4, %%mm5 \n"\ + abs3\ + "movq %%mm4, %%mm6 \n"\ + "pminsw %%mm5, %%mm6 \n"\ + "pcmpgtw %%mm6, %%mm3 \n"\ + "pcmpgtw %%mm5, %%mm4 \n"\ + "movq %%mm4, %%mm6 \n"\ + "pand %%mm3, %%mm4 \n"\ + "pandn %%mm3, %%mm6 
\n"\ + "pandn %%mm0, %%mm3 \n"\ + "movd (%3,%0), %%mm0 \n"\ + "pand %%mm1, %%mm6 \n"\ + "pand %%mm4, %%mm2 \n"\ + "punpcklbw %%mm7, %%mm0 \n"\ + "movq %6, %%mm5 \n"\ + "paddw %%mm6, %%mm0 \n"\ + "paddw %%mm2, %%mm3 \n"\ + "paddw %%mm3, %%mm0 \n"\ + "pand %%mm5, %%mm0 \n"\ + "movq %%mm0, %%mm3 \n"\ + "packuswb %%mm3, %%mm3 \n"\ + "movd %%mm3, (%1,%0) \n"\ + "add %4, %0 \n"\ + "cmp %5, %0 \n"\ + "jle 1b \n"\ + :"+r"(i)\ + :"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(end),\ + "m"(ff_pw_255)\ + :"memory"\ + );\ +} + +#define ABS3_MMX2\ + "psubw %%mm5, %%mm7 \n"\ + "pmaxsw %%mm7, %%mm5 \n"\ + "pxor %%mm6, %%mm6 \n"\ + "pxor %%mm7, %%mm7 \n"\ + "psubw %%mm3, %%mm6 \n"\ + "psubw %%mm4, %%mm7 \n"\ + "pmaxsw %%mm6, %%mm3 \n"\ + "pmaxsw %%mm7, %%mm4 \n"\ + "pxor %%mm7, %%mm7 \n" + +#define ABS3_SSSE3\ + "pabsw %%mm3, %%mm3 \n"\ + "pabsw %%mm4, %%mm4 \n"\ + "pabsw %%mm5, %%mm5 \n" + +PAETH(mmx2, ABS3_MMX2) +#ifdef HAVE_SSSE3 +PAETH(ssse3, ABS3_SSSE3) +#endif + #define DIFF_PIXELS_1(m,a,t,p1,p2)\ "mov"#m" "#p1", "#a" \n\t"\ "mov"#m" "#p2", "#t" \n\t"\ @@ -3317,6 +3412,7 @@ c->gmc= gmc_mmx; c->add_bytes= add_bytes_mmx; + c->add_bytes_l2= add_bytes_l2_mmx; #ifdef CONFIG_ENCODERS c->diff_bytes= diff_bytes_mmx; c->sum_abs_dctelem= sum_abs_dctelem_mmx; @@ -3471,6 +3567,7 @@ if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) ff_vc1dsp_init_mmx(c, avctx); + c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; #ifdef CONFIG_ENCODERS c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; #endif //CONFIG_ENCODERS @@ -3565,6 +3662,7 @@ H264_QPEL_FUNCS(3, 1, ssse3); H264_QPEL_FUNCS(3, 2, ssse3); H264_QPEL_FUNCS(3, 3, ssse3); + c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; } #endif
--- a/pngdec.c Thu Feb 21 00:06:07 2008 +0000 +++ b/pngdec.c Thu Feb 21 07:10:46 2008 +0000 @@ -21,6 +21,7 @@ #include "avcodec.h" #include "bytestream.h" #include "png.h" +#include "dsputil.h" /* TODO: * - add 2, 4 and 16 bit depth support @@ -31,6 +32,8 @@ //#define DEBUG typedef struct PNGDecContext { + DSPContext dsp; + const uint8_t *bytestream; const uint8_t *bytestream_start; const uint8_t *bytestream_end; @@ -129,12 +132,60 @@ } } -/* XXX: optimize */ +void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp) +{ + int i; + for(i = 0; i < w; i++) { + int a, b, c, p, pa, pb, pc; + + a = dst[i - bpp]; + b = top[i]; + c = top[i - bpp]; + + p = b - c; + pc = a - c; + + pa = abs(p); + pb = abs(pc); + pc = abs(p + pc); + + if (pa <= pb && pa <= pc) + p = a; + else if (pb <= pc) + p = b; + else + p = c; + dst[i] = p + src[i]; + } +} + +#define UNROLL1(bpp, op) {\ + r = dst[0];\ + if(bpp >= 2) g = dst[1];\ + if(bpp >= 3) b = dst[2];\ + if(bpp >= 4) a = dst[3];\ + for(; i < size; i+=bpp) {\ + dst[i+0] = r = op(r, src[i+0], last[i+0]);\ + if(bpp == 1) continue;\ + dst[i+1] = g = op(g, src[i+1], last[i+1]);\ + if(bpp == 2) continue;\ + dst[i+2] = b = op(b, src[i+2], last[i+2]);\ + if(bpp == 3) continue;\ + dst[i+3] = a = op(a, src[i+3], last[i+3]);\ + }\ +} + +#define UNROLL_FILTER(op)\ + if(bpp == 1) UNROLL1(1, op)\ + else if(bpp == 2) UNROLL1(2, op)\ + else if(bpp == 3) UNROLL1(3, op)\ + else if(bpp == 4) UNROLL1(4, op)\ + /* NOTE: 'dst' can be equal to 'last' */ -static void png_filter_row(uint8_t *dst, int filter_type, +static void png_filter_row(DSPContext *dsp, uint8_t *dst, int filter_type, uint8_t *src, uint8_t *last, int size, int bpp) { - int i, p; + int i, p, r, g, b, a; switch(filter_type) { case PNG_FILTER_VALUE_NONE: @@ -144,54 +195,41 @@ for(i = 0; i < bpp; i++) { dst[i] = src[i]; } - for(i = bpp; i < size; i++) { - p = dst[i - bpp]; - dst[i] = p + src[i]; + if(bpp == 4) { + p = *(int*)dst; + for(; i < size; i+=bpp) { + 
int s = *(int*)(src+i); + p = ((s&0x7f7f7f7f) + (p&0x7f7f7f7f)) ^ ((s^p)&0x80808080); + *(int*)(dst+i) = p; + } + } else { +#define OP_SUB(x,s,l) x+s + UNROLL_FILTER(OP_SUB); } break; case PNG_FILTER_VALUE_UP: - for(i = 0; i < size; i++) { - p = last[i]; - dst[i] = p + src[i]; - } + dsp->add_bytes_l2(dst, src, last, size); break; case PNG_FILTER_VALUE_AVG: for(i = 0; i < bpp; i++) { p = (last[i] >> 1); dst[i] = p + src[i]; } - for(i = bpp; i < size; i++) { - p = ((dst[i - bpp] + last[i]) >> 1); - dst[i] = p + src[i]; - } +#define OP_AVG(x,s,l) (((x + l) >> 1) + s) & 0xff + UNROLL_FILTER(OP_AVG); break; case PNG_FILTER_VALUE_PAETH: for(i = 0; i < bpp; i++) { p = last[i]; dst[i] = p + src[i]; } - for(i = bpp; i < size; i++) { - int a, b, c, pa, pb, pc; - - a = dst[i - bpp]; - b = last[i]; - c = last[i - bpp]; - - p = b - c; - pc = a - c; - - pa = abs(p); - pb = abs(pc); - pc = abs(p + pc); - - if (pa <= pb && pa <= pc) - p = a; - else if (pb <= pc) - p = b; - else - p = c; - dst[i] = p + src[i]; + if(bpp > 1 && size > 4) { + // would write off the end of the array if we let it process the last pixel with bpp=3 + int w = bpp==4 ? 
size : size-3; + dsp->add_png_paeth_prediction(dst+i, src+i, last+i, w-i, bpp); + i = w; } + ff_add_png_paeth_prediction(dst+i, src+i, last+i, size-i, bpp); break; } } @@ -222,7 +260,7 @@ ptr = s->image_buf + s->image_linesize * s->y; /* need to swap bytes correctly for RGB_ALPHA */ if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) { - png_filter_row(s->tmp_row, s->crow_buf[0], s->crow_buf + 1, + png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1, s->last_row, s->row_size, s->bpp); memcpy(s->last_row, s->tmp_row, s->row_size); convert_to_rgb32(ptr, s->tmp_row, s->width); @@ -233,7 +271,7 @@ else last_row = ptr - s->image_linesize; - png_filter_row(ptr, s->crow_buf[0], s->crow_buf + 1, + png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1, last_row, s->row_size, s->bpp); } s->y++; @@ -249,7 +287,7 @@ wait for the next one */ if (got_line) break; - png_filter_row(s->tmp_row, s->crow_buf[0], s->crow_buf + 1, + png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1, s->last_row, s->pass_row_size, s->bpp); memcpy(s->last_row, s->tmp_row, s->pass_row_size); got_line = 1; @@ -534,6 +572,7 @@ avcodec_get_frame_defaults((AVFrame*)&s->picture); avctx->coded_frame= (AVFrame*)&s->picture; + dsputil_init(&s->dsp, avctx); return 0; }