comparison pngdec.c @ 6384:0a403ade8c81 libavcodec

simd and unroll png_filter_row. Cycles per 1000 pixels on core2: left: 9211->5170; top: 9283->2138; avg: 12215->7611; paeth: 64024->17360. Overall rgb png decoding speed: +45%. Overall greyscale png decoding speed: +6%.
author lorenm
date Thu, 21 Feb 2008 07:10:46 +0000
parents 817e302aae08
children 1a9af4a496f2
comparison
equal deleted inserted replaced
6383:7ba06222bda7 6384:0a403ade8c81
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */ 20 */
21 #include "avcodec.h" 21 #include "avcodec.h"
22 #include "bytestream.h" 22 #include "bytestream.h"
23 #include "png.h" 23 #include "png.h"
24 #include "dsputil.h"
24 25
25 /* TODO: 26 /* TODO:
26 * - add 2, 4 and 16 bit depth support 27 * - add 2, 4 and 16 bit depth support
27 */ 28 */
28 29
29 #include <zlib.h> 30 #include <zlib.h>
30 31
31 //#define DEBUG 32 //#define DEBUG
32 33
33 typedef struct PNGDecContext { 34 typedef struct PNGDecContext {
35 DSPContext dsp;
36
34 const uint8_t *bytestream; 37 const uint8_t *bytestream;
35 const uint8_t *bytestream_start; 38 const uint8_t *bytestream_start;
36 const uint8_t *bytestream_end; 39 const uint8_t *bytestream_end;
37 AVFrame picture; 40 AVFrame picture;
38 41
127 } 130 }
128 break; 131 break;
129 } 132 }
130 } 133 }
131 134
132 /* XXX: optimize */ 135 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)
136 {
137 int i;
138 for(i = 0; i < w; i++) {
139 int a, b, c, p, pa, pb, pc;
140
141 a = dst[i - bpp];
142 b = top[i];
143 c = top[i - bpp];
144
145 p = b - c;
146 pc = a - c;
147
148 pa = abs(p);
149 pb = abs(pc);
150 pc = abs(p + pc);
151
152 if (pa <= pb && pa <= pc)
153 p = a;
154 else if (pb <= pc)
155 p = b;
156 else
157 p = c;
158 dst[i] = p + src[i];
159 }
160 }
161
162 #define UNROLL1(bpp, op) {\
163 r = dst[0];\
164 if(bpp >= 2) g = dst[1];\
165 if(bpp >= 3) b = dst[2];\
166 if(bpp >= 4) a = dst[3];\
167 for(; i < size; i+=bpp) {\
168 dst[i+0] = r = op(r, src[i+0], last[i+0]);\
169 if(bpp == 1) continue;\
170 dst[i+1] = g = op(g, src[i+1], last[i+1]);\
171 if(bpp == 2) continue;\
172 dst[i+2] = b = op(b, src[i+2], last[i+2]);\
173 if(bpp == 3) continue;\
174 dst[i+3] = a = op(a, src[i+3], last[i+3]);\
175 }\
176 }
177
178 #define UNROLL_FILTER(op)\
179 if(bpp == 1) UNROLL1(1, op)\
180 else if(bpp == 2) UNROLL1(2, op)\
181 else if(bpp == 3) UNROLL1(3, op)\
182 else if(bpp == 4) UNROLL1(4, op)\
183
133 /* NOTE: 'dst' can be equal to 'last' */ 184 /* NOTE: 'dst' can be equal to 'last' */
134 static void png_filter_row(uint8_t *dst, int filter_type, 185 static void png_filter_row(DSPContext *dsp, uint8_t *dst, int filter_type,
135 uint8_t *src, uint8_t *last, int size, int bpp) 186 uint8_t *src, uint8_t *last, int size, int bpp)
136 { 187 {
137 int i, p; 188 int i, p, r, g, b, a;
138 189
139 switch(filter_type) { 190 switch(filter_type) {
140 case PNG_FILTER_VALUE_NONE: 191 case PNG_FILTER_VALUE_NONE:
141 memcpy(dst, src, size); 192 memcpy(dst, src, size);
142 break; 193 break;
143 case PNG_FILTER_VALUE_SUB: 194 case PNG_FILTER_VALUE_SUB:
144 for(i = 0; i < bpp; i++) { 195 for(i = 0; i < bpp; i++) {
145 dst[i] = src[i]; 196 dst[i] = src[i];
146 } 197 }
147 for(i = bpp; i < size; i++) { 198 if(bpp == 4) {
148 p = dst[i - bpp]; 199 p = *(int*)dst;
149 dst[i] = p + src[i]; 200 for(; i < size; i+=bpp) {
201 int s = *(int*)(src+i);
202 p = ((s&0x7f7f7f7f) + (p&0x7f7f7f7f)) ^ ((s^p)&0x80808080);
203 *(int*)(dst+i) = p;
204 }
205 } else {
206 #define OP_SUB(x,s,l) x+s
207 UNROLL_FILTER(OP_SUB);
150 } 208 }
151 break; 209 break;
152 case PNG_FILTER_VALUE_UP: 210 case PNG_FILTER_VALUE_UP:
153 for(i = 0; i < size; i++) { 211 dsp->add_bytes_l2(dst, src, last, size);
154 p = last[i];
155 dst[i] = p + src[i];
156 }
157 break; 212 break;
158 case PNG_FILTER_VALUE_AVG: 213 case PNG_FILTER_VALUE_AVG:
159 for(i = 0; i < bpp; i++) { 214 for(i = 0; i < bpp; i++) {
160 p = (last[i] >> 1); 215 p = (last[i] >> 1);
161 dst[i] = p + src[i]; 216 dst[i] = p + src[i];
162 } 217 }
163 for(i = bpp; i < size; i++) { 218 #define OP_AVG(x,s,l) (((x + l) >> 1) + s) & 0xff
164 p = ((dst[i - bpp] + last[i]) >> 1); 219 UNROLL_FILTER(OP_AVG);
165 dst[i] = p + src[i];
166 }
167 break; 220 break;
168 case PNG_FILTER_VALUE_PAETH: 221 case PNG_FILTER_VALUE_PAETH:
169 for(i = 0; i < bpp; i++) { 222 for(i = 0; i < bpp; i++) {
170 p = last[i]; 223 p = last[i];
171 dst[i] = p + src[i]; 224 dst[i] = p + src[i];
172 } 225 }
173 for(i = bpp; i < size; i++) { 226 if(bpp > 1 && size > 4) {
174 int a, b, c, pa, pb, pc; 227 // would write off the end of the array if we let it process the last pixel with bpp=3
175 228 int w = bpp==4 ? size : size-3;
176 a = dst[i - bpp]; 229 dsp->add_png_paeth_prediction(dst+i, src+i, last+i, w-i, bpp);
177 b = last[i]; 230 i = w;
178 c = last[i - bpp]; 231 }
179 232 ff_add_png_paeth_prediction(dst+i, src+i, last+i, size-i, bpp);
180 p = b - c;
181 pc = a - c;
182
183 pa = abs(p);
184 pb = abs(pc);
185 pc = abs(p + pc);
186
187 if (pa <= pb && pa <= pc)
188 p = a;
189 else if (pb <= pc)
190 p = b;
191 else
192 p = c;
193 dst[i] = p + src[i];
194 }
195 break; 233 break;
196 } 234 }
197 } 235 }
198 236
199 static void convert_to_rgb32(uint8_t *dst, const uint8_t *src, int width) 237 static void convert_to_rgb32(uint8_t *dst, const uint8_t *src, int width)
220 258
221 if (!s->interlace_type) { 259 if (!s->interlace_type) {
222 ptr = s->image_buf + s->image_linesize * s->y; 260 ptr = s->image_buf + s->image_linesize * s->y;
223 /* need to swap bytes correctly for RGB_ALPHA */ 261 /* need to swap bytes correctly for RGB_ALPHA */
224 if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) { 262 if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
225 png_filter_row(s->tmp_row, s->crow_buf[0], s->crow_buf + 1, 263 png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
226 s->last_row, s->row_size, s->bpp); 264 s->last_row, s->row_size, s->bpp);
227 memcpy(s->last_row, s->tmp_row, s->row_size); 265 memcpy(s->last_row, s->tmp_row, s->row_size);
228 convert_to_rgb32(ptr, s->tmp_row, s->width); 266 convert_to_rgb32(ptr, s->tmp_row, s->width);
229 } else { 267 } else {
230 /* in normal case, we avoid one copy */ 268 /* in normal case, we avoid one copy */
231 if (s->y == 0) 269 if (s->y == 0)
232 last_row = s->last_row; 270 last_row = s->last_row;
233 else 271 else
234 last_row = ptr - s->image_linesize; 272 last_row = ptr - s->image_linesize;
235 273
236 png_filter_row(ptr, s->crow_buf[0], s->crow_buf + 1, 274 png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
237 last_row, s->row_size, s->bpp); 275 last_row, s->row_size, s->bpp);
238 } 276 }
239 s->y++; 277 s->y++;
240 if (s->y == s->height) { 278 if (s->y == s->height) {
241 s->state |= PNG_ALLIMAGE; 279 s->state |= PNG_ALLIMAGE;
247 if ((ff_png_pass_ymask[s->pass] << (s->y & 7)) & 0x80) { 285 if ((ff_png_pass_ymask[s->pass] << (s->y & 7)) & 0x80) {
248 /* if we already read one row, it is time to stop to 286 /* if we already read one row, it is time to stop to
249 wait for the next one */ 287 wait for the next one */
250 if (got_line) 288 if (got_line)
251 break; 289 break;
252 png_filter_row(s->tmp_row, s->crow_buf[0], s->crow_buf + 1, 290 png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
253 s->last_row, s->pass_row_size, s->bpp); 291 s->last_row, s->pass_row_size, s->bpp);
254 memcpy(s->last_row, s->tmp_row, s->pass_row_size); 292 memcpy(s->last_row, s->tmp_row, s->pass_row_size);
255 got_line = 1; 293 got_line = 1;
256 } 294 }
257 if ((png_pass_dsp_ymask[s->pass] << (s->y & 7)) & 0x80) { 295 if ((png_pass_dsp_ymask[s->pass] << (s->y & 7)) & 0x80) {
532 static int png_dec_init(AVCodecContext *avctx){ 570 static int png_dec_init(AVCodecContext *avctx){
533 PNGDecContext *s = avctx->priv_data; 571 PNGDecContext *s = avctx->priv_data;
534 572
535 avcodec_get_frame_defaults((AVFrame*)&s->picture); 573 avcodec_get_frame_defaults((AVFrame*)&s->picture);
536 avctx->coded_frame= (AVFrame*)&s->picture; 574 avctx->coded_frame= (AVFrame*)&s->picture;
575 dsputil_init(&s->dsp, avctx);
537 576
538 return 0; 577 return 0;
539 } 578 }
540 579
541 AVCodec png_decoder = { 580 AVCodec png_decoder = {