comparison h264.h @ 11292:411ab09ada91 libavcodec

Get rid of mb2b8_xy and b8_stride; change arrays that were organized by b8_stride to ones organized by mb_stride in h264. About 20 CPU cycles faster overall per MB.
author michael
date Thu, 25 Feb 2010 23:44:42 +0000
parents d5dd13f345fc
children fb7e2f568ad9
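
The change replaces the 8x8-block-granular addressing of ref_index (two entries per row of 8x8 blocks, rows h->b8_stride apart, reached through the mb2b8_xy lookup table) with a per-macroblock layout: a macroblock's four reference entries now sit contiguously at 4*mb_xy, with an inner row stride of 2. A minimal sketch of the two addressing schemes follows; the helper names and parameters are illustrative stand-ins, not code from the decoder.

#include <stdint.h>

/* Illustrative only: mb_x, mb_y, b8_stride and mb_stride stand in for the
 * corresponding MpegEncContext/H264Context fields. */

/* Old layout: ref_index is a grid of 8x8 blocks, rows b8_stride entries apart,
 * so one macroblock's 2x2 group of references spans two rows. */
static const int8_t *mb_ref_block_old(const int8_t *ref_index,
                                      int mb_x, int mb_y, int b8_stride)
{
    return &ref_index[2*mb_x + 2*mb_y*b8_stride];   /* second row: +b8_stride */
}

/* New layout: the four references of one macroblock are stored contiguously
 * and are located directly from the macroblock index. */
static const int8_t *mb_ref_block_new(const int8_t *ref_index,
                                      int mb_x, int mb_y, int mb_stride)
{
    return &ref_index[4*(mb_x + mb_y*mb_stride)];   /* second row: +2 */
}

This is what the 4*mb_xy, 4*top_xy + 2 and ref += 2 expressions in the hunks below rely on: a neighbouring macroblock's references are found directly from its macroblock index instead of through mb2b8_xy and b8_stride.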
--- a/libavcodec/h264.h (11291:1527e25ec9d4)
+++ b/libavcodec/h264.h (11292:411ab09ada91)
@@ -345,13 +345,11 @@
     */
     int block_offset[2*(16+8)];
 
     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
     uint32_t *mb2br_xy;
-    uint32_t *mb2b8_xy;
     int b_stride; //FIXME use s->b4_stride
-    int b8_stride;
 
     int mb_linesize; ///< may be equal to s->linesize or s->linesize*2, for mbaff
     int mb_uvlinesize;
 
     int emu_edge_width;
@@ -988,53 +986,51 @@
 
             h->mv_cache_clean[list]= 0;
 
             if(USES_LIST(top_type, list)){
                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]);
                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
-                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
+                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 2];
                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
-                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
+                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 3];
             }else{
                 AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]);
                 AV_WN32A(&h->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
             }
 
             for(i=0; i<2; i++){
                 int cache_idx = scan8[0] - 1 + i*2*8;
                 if(USES_LIST(left_type[i], list)){
                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
-                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
+                    const int b8_xy= 4*left_xy[i] + 1;
                     AV_COPY32(h->mv_cache[list][cache_idx ], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]]);
                     AV_COPY32(h->mv_cache[list][cache_idx+8], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]]);
-                    h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
-                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
+                    h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + (left_block[0+i*2]&~1)];
+                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + (left_block[1+i*2]&~1)];
                 }else{
                     AV_ZERO32(h->mv_cache [list][cache_idx ]);
                     AV_ZERO32(h->mv_cache [list][cache_idx+8]);
                     h->ref_cache[list][cache_idx ]=
                     h->ref_cache[list][cache_idx+8]= (left_type[i]) ? LIST_NOT_USED : PART_NOT_AVAILABLE;
                 }
             }
 
             if(USES_LIST(topleft_type, list)){
                 const int b_xy = h->mb2b_xy [topleft_xy] + 3 + h->b_stride + (h->topleft_partition & 2*h->b_stride);
-                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (h->topleft_partition & h->b8_stride);
+                const int b8_xy= 4*topleft_xy + 1 + (h->topleft_partition & 2);
                 AV_COPY32(h->mv_cache[list][scan8[0] - 1 - 1*8], s->current_picture.motion_val[list][b_xy]);
                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
             }else{
                 AV_ZERO32(h->mv_cache[list][scan8[0] - 1 - 1*8]);
                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
             if(USES_LIST(topright_type, list)){
                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
-                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                 AV_COPY32(h->mv_cache[list][scan8[0] + 4 - 1*8], s->current_picture.motion_val[list][b_xy]);
-                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
+                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][4*topright_xy + 2];
             }else{
                 AV_ZERO32(h->mv_cache [list][scan8[0] + 4 - 1*8]);
                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
@@ -1239,16 +1235,16 @@
                 AV_WN32A(&h->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
                 AV_WN32A(&h->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
                 continue;
             }
 
-            ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+            ref = &s->current_picture.ref_index[list][4*mb_xy];
             {
                 int (*ref2frm)[64] = h->ref2frm[ h->slice_num&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
                 AV_WN32A(&h->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
                 AV_WN32A(&h->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                ref += h->b8_stride;
+                ref += 2;
                 AV_WN32A(&h->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
                 AV_WN32A(&h->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
             }
 
             b_stride = h->b_stride;
@@ -1317,11 +1313,11 @@
     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
         int list;
         for(list=0; list<h->list_count; list++){
             if(USES_LIST(top_type, list)){
                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
+                const int b8_xy= 4*top_xy + 2;
                 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
                 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]);
                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
                 h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]];
                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
@@ -1332,20 +1328,20 @@
             }
 
             if(!IS_INTERLACED(mb_type^left_type[0])){
                 if(USES_LIST(left_type[0], list)){
                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
-                    const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
+                    const int b8_xy= 4*left_xy[0] + 1;
                     int (*ref2frm)[64] = h->ref2frm[ h->slice_table[left_xy[0]]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 0 ], s->current_picture.motion_val[list][b_xy + h->b_stride*0]);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 8 ], s->current_picture.motion_val[list][b_xy + h->b_stride*1]);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 +16 ], s->current_picture.motion_val[list][b_xy + h->b_stride*2]);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 +24 ], s->current_picture.motion_val[list][b_xy + h->b_stride*3]);
                     h->ref_cache[list][scan8[0] - 1 + 0 ]=
-                    h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*0]];
+                    h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*0]];
                     h->ref_cache[list][scan8[0] - 1 +16 ]=
-                    h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*1]];
+                    h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*1]];
                 }else{
                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 0 ]);
                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 8 ]);
                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +16 ]);
                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +24 ]);
@@ -1386,16 +1382,16 @@
     AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]);
 }
 
 static inline void write_back_motion(H264Context *h, int mb_type){
     MpegEncContext * const s = &h->s;
-    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
-    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
+    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; //try mb2b(8)_xy
+    const int b8_xy= 4*h->mb_xy;
     int list;
 
     if(!USES_LIST(mb_type, 0))
-        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
+        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
 
     for(list=0; list<h->list_count; list++){
         int y, b_stride;
         int16_t (*mv_dst)[2];
         int16_t (*mv_src)[2];
@@ -1422,14 +1418,14 @@
             }
         }
 
         {
             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
-            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
-            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
-            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
-            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
+            ref_index[0+0*2]= h->ref_cache[list][scan8[0]];
+            ref_index[1+0*2]= h->ref_cache[list][scan8[4]];
+            ref_index[0+1*2]= h->ref_cache[list][scan8[8]];
+            ref_index[1+1*2]= h->ref_cache[list][scan8[12]];
         }
     }
 
     if(h->slice_type_nos == FF_B_TYPE && CABAC){
         if(IS_8X8(mb_type)){