Mercurial > libavcodec.hg
comparison vp8.c @ 12238:1a7903913e9b libavcodec
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations.
Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT
blocks are common.
TODO: tie this more directly into the MB mode, since the DC-level transform is
only used for non-splitmv blocks?
author | darkshikari |
---|---|
date | Fri, 23 Jul 2010 02:58:27 +0000 |
parents | f0c4dc49c8f1 |
children | e6ade5e849c9 |
comparison
equal
deleted
inserted
replaced
12237:f0c4dc49c8f1 | 12238:1a7903913e9b |
---|---|
1184 8, 8, 8, 8, width, height, &bmv[3]); | 1184 8, 8, 8, 8, width, height, &bmv[3]); |
1185 break; | 1185 break; |
1186 } | 1186 } |
1187 } | 1187 } |
1188 | 1188 |
1189 static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst, | 1189 static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb) |
1190 VP8Macroblock *mb) | 1190 { |
1191 { | 1191 int x, y, ch, nnz; |
1192 int x, y, nnz; | 1192 |
1193 | 1193 if (mb->mode != MODE_I4x4) { |
1194 if (mb->mode != MODE_I4x4) | 1194 uint8_t *y_dst = dst[0]; |
1195 for (y = 0; y < 4; y++) { | 1195 for (y = 0; y < 4; y++) { |
1196 for (x = 0; x < 4; x++) { | 1196 uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]); |
1197 nnz = s->non_zero_count_cache[y][x]; | 1197 if (nnz) { |
1198 if (nnz) { | 1198 if (nnz&~0x01010101) { |
1199 if (nnz == 1) | 1199 for (x = 0; x < 4; x++) { |
1200 s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize); | 1200 nnz = s->non_zero_count_cache[y][x]; |
1201 else | 1201 if (nnz) { |
1202 s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize); | 1202 if (nnz == 1) |
1203 s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize); | |
1204 else | |
1205 s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize); | |
1206 } | |
1207 } | |
1208 } else { | |
1209 s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize); | |
1203 } | 1210 } |
1204 } | 1211 } |
1205 y_dst += 4*s->linesize; | 1212 y_dst += 4*s->linesize; |
1206 } | 1213 } |
1207 | 1214 } |
1208 for (y = 0; y < 2; y++) { | 1215 |
1209 for (x = 0; x < 2; x++) { | 1216 for (ch = 0; ch < 2; ch++) { |
1210 nnz = s->non_zero_count_cache[4][(y<<1)+x]; | 1217 if (AV_RN32A(s->non_zero_count_cache[4+ch])) { |
1211 if (nnz) { | 1218 uint8_t *ch_dst = dst[1+ch]; |
1212 if (nnz == 1) | 1219 for (y = 0; y < 2; y++) { |
1213 s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize); | 1220 for (x = 0; x < 2; x++) { |
1214 else | 1221 nnz = s->non_zero_count_cache[4+ch][(y<<1)+x]; |
1215 s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize); | 1222 if (nnz) { |
1223 if (nnz == 1) | |
1224 s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize); | |
1225 else | |
1226 s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize); | |
1227 } | |
1228 } | |
1229 ch_dst += 4*s->uvlinesize; | |
1216 } | 1230 } |
1217 | 1231 } |
1218 nnz = s->non_zero_count_cache[5][(y<<1)+x]; | |
1219 if (nnz) { | |
1220 if (nnz == 1) | |
1221 s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize); | |
1222 else | |
1223 s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize); | |
1224 } | |
1225 } | |
1226 u_dst += 4*s->uvlinesize; | |
1227 v_dst += 4*s->uvlinesize; | |
1228 } | 1232 } |
1229 } | 1233 } |
1230 | 1234 |
1231 static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f ) | 1235 static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f ) |
1232 { | 1236 { |
1509 inter_predict(s, dst, mb, mb_x, mb_y); | 1513 inter_predict(s, dst, mb, mb_x, mb_y); |
1510 | 1514 |
1511 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); | 1515 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); |
1512 | 1516 |
1513 if (!mb->skip) { | 1517 if (!mb->skip) { |
1514 idct_mb(s, dst[0], dst[1], dst[2], mb); | 1518 idct_mb(s, dst, mb); |
1515 } else { | 1519 } else { |
1516 AV_ZERO64(s->left_nnz); | 1520 AV_ZERO64(s->left_nnz); |
1517 AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned | 1521 AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned |
1518 | 1522 |
1519 // Reset DC block predictors if they would exist if the mb had coefficients | 1523 // Reset DC block predictors if they would exist if the mb had coefficients |