# HG changeset patch
# User lorenm
# Date 1203577846 0
# Node ID 0a403ade8c812d7b3ae4783bab91696eae13a839
# Parent 7ba06222bda758494b3308836a87713dd43161fa
simd and unroll png_filter_row
cycles per 1000 pixels on core2:
left: 9211->5170
top: 9283->2138
avg: 12215->7611
paeth: 64024->17360
overall rgb png decoding speed: +45%
overall greyscale png decoding speed: +6%
diff -r 7ba06222bda7 -r 0a403ade8c81 dsputil.c
--- a/dsputil.c Thu Feb 21 00:06:07 2008 +0000
+++ b/dsputil.c Thu Feb 21 07:10:46 2008 +0000
@@ -44,6 +44,9 @@
 /* flacenc.c */
 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
+/* pngdec.c */
+void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
+
 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
 uint32_t ff_squareTbl[512] = {0, };
@@ -3288,6 +3291,17 @@
         dst[i+0] += src[i+0];
 }
+static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
+    int i;
+    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
+        long a = *(long*)(src1+i);
+        long b = *(long*)(src2+i);
+        *(long*)(dst+i) = ((a&0x7f7f7f7f7f7f7f7fL) + (b&0x7f7f7f7f7f7f7f7fL)) ^ ((a^b)&0x8080808080808080L);
+    }
+    for(; i
[... text is missing from the source here, up to partway through the next line shown ...]
     ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
     c->add_bytes= add_bytes_c;
+    c->add_bytes_l2= add_bytes_l2_c;
     c->diff_bytes= diff_bytes_c;
     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
     c->bswap_buf= bswap_buf;
+#ifdef CONFIG_PNG_DECODER
+    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
+#endif
     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
diff -r 7ba06222bda7 -r 0a403ade8c81 dsputil.h
--- a/dsputil.h Thu Feb 21 00:06:07 2008 +0000
+++ b/dsputil.h Thu Feb 21 07:10:46 2008 +0000
@@ -304,12 +304,15 @@
     /* huffyuv specific */
     void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
+    void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w);
     void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
     /**
      * subtract huffyuv's variant of median prediction
      * note, this might read from src1[-1], src2[-1]
      */
     void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
+    /* this might write to dst[w] */
+    void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
     void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
     void (*h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
diff -r 7ba06222bda7 -r 0a403ade8c81 i386/dsputil_mmx.c
--- a/i386/dsputil_mmx.c Thu Feb 21 00:06:07 2008 +0000
+++ b/i386/dsputil_mmx.c Thu Feb 21 07:10:46 2008 +0000
@@ -59,6 +59,7 @@
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
@@ -605,6 +606,26 @@
         dst[i+0] += src[i+0];
 }
+static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
+    long i=0;
+    asm volatile(
+        "1: \n\t"
+        "movq (%2, %0), %%mm0 \n\t"
+        "movq 8(%2, %0), %%mm1 \n\t"
+        "paddb (%3, %0), %%mm0 \n\t"
+        "paddb 8(%3, %0), %%mm1 \n\t"
+        "movq %%mm0, (%1, %0) \n\t"
+        "movq %%mm1, 8(%1, %0) \n\t"
+        "add $16, %0 \n\t"
+        "cmp %4, %0 \n\t"
+        " jb 1b \n\t"
+        : "+r" (i)
+        : "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15)
+    );
+    for(; i
[... text is missing from the source here, up to partway through the next line shown ...]
         gmc= gmc_mmx;
         c->add_bytes= add_bytes_mmx;
+        c->add_bytes_l2= add_bytes_l2_mmx;
 #ifdef CONFIG_ENCODERS
         c->diff_bytes= diff_bytes_mmx;
         c->sum_abs_dctelem= sum_abs_dctelem_mmx;
@@ -3471,6 +3567,7 @@
             if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
                 ff_vc1dsp_init_mmx(c, avctx);
+            c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
 #ifdef CONFIG_ENCODERS
             c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
 #endif //CONFIG_ENCODERS
@@ -3565,6 +3662,7 @@
             H264_QPEL_FUNCS(3, 1, ssse3);
             H264_QPEL_FUNCS(3, 2, ssse3);
             H264_QPEL_FUNCS(3, 3, ssse3);
+            c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
         }
 #endif
diff -r 7ba06222bda7 -r 0a403ade8c81 pngdec.c
--- a/pngdec.c Thu Feb 21 00:06:07 2008 +0000
+++ b/pngdec.c Thu Feb 21 07:10:46 2008 +0000
@@ -21,6 +21,7 @@
 #include "avcodec.h"
 #include "bytestream.h"
 #include "png.h"
+#include "dsputil.h"
 /* TODO:
  * - add 2, 4 and 16 bit depth support
@@ -31,6 +32,8 @@
 //#define DEBUG
 typedef struct PNGDecContext {
+    DSPContext dsp;
+
     const uint8_t *bytestream;
     const uint8_t *bytestream_start;
     const uint8_t *bytestream_end;
@@ -129,12 +132,60 @@
     }
 }
-/* XXX: optimize */
+void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)
+{
+    int i;
+    for(i = 0; i < w; i++) {
+        int a, b, c, p, pa, pb, pc;
+
+        a = dst[i - bpp];
+        b = top[i];
+        c = top[i - bpp];
+
+        p = b - c;
+        pc = a - c;
+
+        pa = abs(p);
+        pb = abs(pc);
+        pc = abs(p + pc);
+
+        if (pa <= pb && pa <= pc)
+            p = a;
+        else if (pb <= pc)
+            p = b;
+        else
+            p = c;
+        dst[i] = p + src[i];
+    }
+}
+
+#define UNROLL1(bpp, op) {\
+    r = dst[0];\
+    if(bpp >= 2) g = dst[1];\
+    if(bpp >= 3) b = dst[2];\
+    if(bpp >= 4) a = dst[3];\
+    for(; i < size; i+=bpp) {\
+        dst[i+0] = r = op(r, src[i+0], last[i+0]);\
+        if(bpp == 1) continue;\
+        dst[i+1] = g = op(g, src[i+1], last[i+1]);\
+        if(bpp == 2) continue;\
+        dst[i+2] = b = op(b, src[i+2], last[i+2]);\
+        if(bpp == 3) continue;\
+        dst[i+3] = a = op(a, src[i+3], last[i+3]);\
+    }\
+}
+
+#define UNROLL_FILTER(op)\
+    if(bpp == 1) UNROLL1(1, op)\
+    else if(bpp == 2) UNROLL1(2, op)\
+    else if(bpp == 3) UNROLL1(3, op)\
+    else if(bpp == 4) UNROLL1(4, op)\
+
 /* NOTE: 'dst' can be equal to 'last' */
-static void png_filter_row(uint8_t *dst, int filter_type,
+static void png_filter_row(DSPContext *dsp, uint8_t *dst, int filter_type,
                            uint8_t *src, uint8_t *last, int size, int bpp)
 {
-    int i, p;
+    int i, p, r, g, b, a;
     switch(filter_type) {
     case PNG_FILTER_VALUE_NONE:
@@ -144,54 +195,41 @@
         for(i = 0; i < bpp; i++) {
             dst[i] = src[i];
         }
-        for(i = bpp; i < size; i++) {
-            p = dst[i - bpp];
-            dst[i] = p + src[i];
+        if(bpp == 4) {
+            p = *(int*)dst;
+            for(; i < size; i+=bpp) {
+                int s = *(int*)(src+i);
+                p = ((s&0x7f7f7f7f) + (p&0x7f7f7f7f)) ^ ((s^p)&0x80808080);
+                *(int*)(dst+i) = p;
+            }
+        } else {
+#define OP_SUB(x,s,l) x+s
+            UNROLL_FILTER(OP_SUB);
         }
         break;
     case PNG_FILTER_VALUE_UP:
-        for(i = 0; i < size; i++) {
-            p = last[i];
-            dst[i] = p + src[i];
-        }
+        dsp->add_bytes_l2(dst, src, last, size);
         break;
     case PNG_FILTER_VALUE_AVG:
         for(i = 0; i < bpp; i++) {
             p = (last[i] >> 1);
             dst[i] = p + src[i];
         }
-        for(i = bpp; i < size; i++) {
-            p = ((dst[i - bpp] + last[i]) >> 1);
-            dst[i] = p + src[i];
-        }
+#define OP_AVG(x,s,l) (((x + l) >> 1) + s) & 0xff
+        UNROLL_FILTER(OP_AVG);
         break;
     case PNG_FILTER_VALUE_PAETH:
         for(i = 0; i < bpp; i++) {
             p = last[i];
             dst[i] = p + src[i];
         }
-        for(i = bpp; i < size; i++) {
-            int a, b, c, pa, pb, pc;
-
-            a = dst[i - bpp];
-            b = last[i];
-            c = last[i - bpp];
-
-            p = b - c;
-            pc = a - c;
-
-            pa = abs(p);
-            pb = abs(pc);
-            pc = abs(p + pc);
-
-            if (pa <= pb && pa <= pc)
-                p = a;
-            else if (pb <= pc)
-                p = b;
-            else
-                p = c;
-            dst[i] = p + src[i];
+        if(bpp > 1 && size > 4) {
+            // would write off the end of the array if we let it process the last pixel with bpp=3
+            int w = bpp==4 ? size : size-3;
+            dsp->add_png_paeth_prediction(dst+i, src+i, last+i, w-i, bpp);
+            i = w;
         }
+        ff_add_png_paeth_prediction(dst+i, src+i, last+i, size-i, bpp);
         break;
     }
 }
@@ -222,7 +260,7 @@
         ptr = s->image_buf + s->image_linesize * s->y;
         /* need to swap bytes correctly for RGB_ALPHA */
         if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-            png_filter_row(s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
+            png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
                            s->last_row, s->row_size, s->bpp);
             memcpy(s->last_row, s->tmp_row, s->row_size);
             convert_to_rgb32(ptr, s->tmp_row, s->width);
@@ -233,7 +271,7 @@
             else
                 last_row = ptr - s->image_linesize;
-            png_filter_row(ptr, s->crow_buf[0], s->crow_buf + 1,
+            png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
                            last_row, s->row_size, s->bpp);
         }
         s->y++;
@@ -249,7 +287,7 @@
                    wait for the next one */
                 if (got_line)
                     break;
-                png_filter_row(s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
+                png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
                                s->last_row, s->pass_row_size, s->bpp);
                 memcpy(s->last_row, s->tmp_row, s->pass_row_size);
                 got_line = 1;
@@ -534,6 +572,7 @@
     avcodec_get_frame_defaults((AVFrame*)&s->picture);
     avctx->coded_frame= (AVFrame*)&s->picture;
+    dsputil_init(&s->dsp, avctx);
     return 0;
 }