comparison vp8.c @ 12238:1a7903913e9b libavcodec

VP8: 30% faster idct_mb. Take shortcuts based on statistically common situations. Add a 4-at-a-time idct_dc function (MMX and SSE2), since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks?
author darkshikari
date Fri, 23 Jul 2010 02:58:27 +0000
parents f0c4dc49c8f1
children e6ade5e849c9
comparison
equal deleted inserted replaced
12237:f0c4dc49c8f1 12238:1a7903913e9b
1184 8, 8, 8, 8, width, height, &bmv[3]); 1184 8, 8, 8, 8, width, height, &bmv[3]);
1185 break; 1185 break;
1186 } 1186 }
1187 } 1187 }
1188 1188
1189 static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst, 1189 static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
1190 VP8Macroblock *mb) 1190 {
1191 { 1191 int x, y, ch, nnz;
1192 int x, y, nnz; 1192
1193 1193 if (mb->mode != MODE_I4x4) {
1194 if (mb->mode != MODE_I4x4) 1194 uint8_t *y_dst = dst[0];
1195 for (y = 0; y < 4; y++) { 1195 for (y = 0; y < 4; y++) {
1196 for (x = 0; x < 4; x++) { 1196 uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
1197 nnz = s->non_zero_count_cache[y][x]; 1197 if (nnz) {
1198 if (nnz) { 1198 if (nnz&~0x01010101) {
1199 if (nnz == 1) 1199 for (x = 0; x < 4; x++) {
1200 s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize); 1200 nnz = s->non_zero_count_cache[y][x];
1201 else 1201 if (nnz) {
1202 s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize); 1202 if (nnz == 1)
1203 s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1204 else
1205 s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
1206 }
1207 }
1208 } else {
1209 s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
1203 } 1210 }
1204 } 1211 }
1205 y_dst += 4*s->linesize; 1212 y_dst += 4*s->linesize;
1206 } 1213 }
1207 1214 }
1208 for (y = 0; y < 2; y++) { 1215
1209 for (x = 0; x < 2; x++) { 1216 for (ch = 0; ch < 2; ch++) {
1210 nnz = s->non_zero_count_cache[4][(y<<1)+x]; 1217 if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
1211 if (nnz) { 1218 uint8_t *ch_dst = dst[1+ch];
1212 if (nnz == 1) 1219 for (y = 0; y < 2; y++) {
1213 s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize); 1220 for (x = 0; x < 2; x++) {
1214 else 1221 nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
1215 s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize); 1222 if (nnz) {
1223 if (nnz == 1)
1224 s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1225 else
1226 s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1227 }
1228 }
1229 ch_dst += 4*s->uvlinesize;
1216 } 1230 }
1217 1231 }
1218 nnz = s->non_zero_count_cache[5][(y<<1)+x];
1219 if (nnz) {
1220 if (nnz == 1)
1221 s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
1222 else
1223 s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
1224 }
1225 }
1226 u_dst += 4*s->uvlinesize;
1227 v_dst += 4*s->uvlinesize;
1228 } 1232 }
1229 } 1233 }
1230 1234
1231 static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f ) 1235 static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1232 { 1236 {
1509 inter_predict(s, dst, mb, mb_x, mb_y); 1513 inter_predict(s, dst, mb, mb_x, mb_y);
1510 1514
1511 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); 1515 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1512 1516
1513 if (!mb->skip) { 1517 if (!mb->skip) {
1514 idct_mb(s, dst[0], dst[1], dst[2], mb); 1518 idct_mb(s, dst, mb);
1515 } else { 1519 } else {
1516 AV_ZERO64(s->left_nnz); 1520 AV_ZERO64(s->left_nnz);
1517 AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned 1521 AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
1518 1522
1519 // Reset DC block predictors if they would exist if the mb had coefficients 1523 // Reset DC block predictors if they would exist if the mb had coefficients