Mercurial > libavcodec.hg
comparison h264.h @ 11292:411ab09ada91 libavcodec
Get rid of mb2b8_xy and b8_stride; change arrays organized based on b8_stride to
ones based on mb_stride in h264.
About 20 CPU cycles faster overall per MB.
author | michael |
---|---|
date | Thu, 25 Feb 2010 23:44:42 +0000 |
parents | d5dd13f345fc |
children | fb7e2f568ad9 |
comparison
equal
deleted
inserted
replaced
11291:1527e25ec9d4 | 11292:411ab09ada91 |
---|---|
345 */ | 345 */ |
346 int block_offset[2*(16+8)]; | 346 int block_offset[2*(16+8)]; |
347 | 347 |
348 uint32_t *mb2b_xy; //FIXME are these 4 a good idea? | 348 uint32_t *mb2b_xy; //FIXME are these 4 a good idea? |
349 uint32_t *mb2br_xy; | 349 uint32_t *mb2br_xy; |
350 uint32_t *mb2b8_xy; | |
351 int b_stride; //FIXME use s->b4_stride | 350 int b_stride; //FIXME use s->b4_stride |
352 int b8_stride; | |
353 | 351 |
354 int mb_linesize; ///< may be equal to s->linesize or s->linesize*2, for mbaff | 352 int mb_linesize; ///< may be equal to s->linesize or s->linesize*2, for mbaff |
355 int mb_uvlinesize; | 353 int mb_uvlinesize; |
356 | 354 |
357 int emu_edge_width; | 355 int emu_edge_width; |
988 | 986 |
989 h->mv_cache_clean[list]= 0; | 987 h->mv_cache_clean[list]= 0; |
990 | 988 |
991 if(USES_LIST(top_type, list)){ | 989 if(USES_LIST(top_type, list)){ |
992 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; | 990 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; |
993 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride; | |
994 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]); | 991 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]); |
995 h->ref_cache[list][scan8[0] + 0 - 1*8]= | 992 h->ref_cache[list][scan8[0] + 0 - 1*8]= |
996 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0]; | 993 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 2]; |
997 h->ref_cache[list][scan8[0] + 2 - 1*8]= | 994 h->ref_cache[list][scan8[0] + 2 - 1*8]= |
998 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1]; | 995 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 3]; |
999 }else{ | 996 }else{ |
1000 AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]); | 997 AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]); |
1001 AV_WN32A(&h->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); | 998 AV_WN32A(&h->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); |
1002 } | 999 } |
1003 | 1000 |
1004 for(i=0; i<2; i++){ | 1001 for(i=0; i<2; i++){ |
1005 int cache_idx = scan8[0] - 1 + i*2*8; | 1002 int cache_idx = scan8[0] - 1 + i*2*8; |
1006 if(USES_LIST(left_type[i], list)){ | 1003 if(USES_LIST(left_type[i], list)){ |
1007 const int b_xy= h->mb2b_xy[left_xy[i]] + 3; | 1004 const int b_xy= h->mb2b_xy[left_xy[i]] + 3; |
1008 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1; | 1005 const int b8_xy= 4*left_xy[i] + 1; |
1009 AV_COPY32(h->mv_cache[list][cache_idx ], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]]); | 1006 AV_COPY32(h->mv_cache[list][cache_idx ], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]]); |
1010 AV_COPY32(h->mv_cache[list][cache_idx+8], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]]); | 1007 AV_COPY32(h->mv_cache[list][cache_idx+8], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]]); |
1011 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)]; | 1008 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + (left_block[0+i*2]&~1)]; |
1012 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)]; | 1009 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + (left_block[1+i*2]&~1)]; |
1013 }else{ | 1010 }else{ |
1014 AV_ZERO32(h->mv_cache [list][cache_idx ]); | 1011 AV_ZERO32(h->mv_cache [list][cache_idx ]); |
1015 AV_ZERO32(h->mv_cache [list][cache_idx+8]); | 1012 AV_ZERO32(h->mv_cache [list][cache_idx+8]); |
1016 h->ref_cache[list][cache_idx ]= | 1013 h->ref_cache[list][cache_idx ]= |
1017 h->ref_cache[list][cache_idx+8]= (left_type[i]) ? LIST_NOT_USED : PART_NOT_AVAILABLE; | 1014 h->ref_cache[list][cache_idx+8]= (left_type[i]) ? LIST_NOT_USED : PART_NOT_AVAILABLE; |
1018 } | 1015 } |
1019 } | 1016 } |
1020 | 1017 |
1021 if(USES_LIST(topleft_type, list)){ | 1018 if(USES_LIST(topleft_type, list)){ |
1022 const int b_xy = h->mb2b_xy [topleft_xy] + 3 + h->b_stride + (h->topleft_partition & 2*h->b_stride); | 1019 const int b_xy = h->mb2b_xy [topleft_xy] + 3 + h->b_stride + (h->topleft_partition & 2*h->b_stride); |
1023 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (h->topleft_partition & h->b8_stride); | 1020 const int b8_xy= 4*topleft_xy + 1 + (h->topleft_partition & 2); |
1024 AV_COPY32(h->mv_cache[list][scan8[0] - 1 - 1*8], s->current_picture.motion_val[list][b_xy]); | 1021 AV_COPY32(h->mv_cache[list][scan8[0] - 1 - 1*8], s->current_picture.motion_val[list][b_xy]); |
1025 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy]; | 1022 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy]; |
1026 }else{ | 1023 }else{ |
1027 AV_ZERO32(h->mv_cache[list][scan8[0] - 1 - 1*8]); | 1024 AV_ZERO32(h->mv_cache[list][scan8[0] - 1 - 1*8]); |
1028 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; | 1025 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; |
1029 } | 1026 } |
1030 | 1027 |
1031 if(USES_LIST(topright_type, list)){ | 1028 if(USES_LIST(topright_type, list)){ |
1032 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride; | 1029 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride; |
1033 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride; | |
1034 AV_COPY32(h->mv_cache[list][scan8[0] + 4 - 1*8], s->current_picture.motion_val[list][b_xy]); | 1030 AV_COPY32(h->mv_cache[list][scan8[0] + 4 - 1*8], s->current_picture.motion_val[list][b_xy]); |
1035 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy]; | 1031 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][4*topright_xy + 2]; |
1036 }else{ | 1032 }else{ |
1037 AV_ZERO32(h->mv_cache [list][scan8[0] + 4 - 1*8]); | 1033 AV_ZERO32(h->mv_cache [list][scan8[0] + 4 - 1*8]); |
1038 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; | 1034 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; |
1039 } | 1035 } |
1040 | 1036 |
1239 AV_WN32A(&h->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); | 1235 AV_WN32A(&h->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); |
1240 AV_WN32A(&h->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); | 1236 AV_WN32A(&h->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); |
1241 continue; | 1237 continue; |
1242 } | 1238 } |
1243 | 1239 |
1244 ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]]; | 1240 ref = &s->current_picture.ref_index[list][4*mb_xy]; |
1245 { | 1241 { |
1246 int (*ref2frm)[64] = h->ref2frm[ h->slice_num&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); | 1242 int (*ref2frm)[64] = h->ref2frm[ h->slice_num&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); |
1247 AV_WN32A(&h->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); | 1243 AV_WN32A(&h->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); |
1248 AV_WN32A(&h->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); | 1244 AV_WN32A(&h->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); |
1249 ref += h->b8_stride; | 1245 ref += 2; |
1250 AV_WN32A(&h->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); | 1246 AV_WN32A(&h->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); |
1251 AV_WN32A(&h->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); | 1247 AV_WN32A(&h->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); |
1252 } | 1248 } |
1253 | 1249 |
1254 b_stride = h->b_stride; | 1250 b_stride = h->b_stride; |
1317 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ | 1313 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ |
1318 int list; | 1314 int list; |
1319 for(list=0; list<h->list_count; list++){ | 1315 for(list=0; list<h->list_count; list++){ |
1320 if(USES_LIST(top_type, list)){ | 1316 if(USES_LIST(top_type, list)){ |
1321 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; | 1317 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; |
1322 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride; | 1318 const int b8_xy= 4*top_xy + 2; |
1323 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); | 1319 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); |
1324 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]); | 1320 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]); |
1325 h->ref_cache[list][scan8[0] + 0 - 1*8]= | 1321 h->ref_cache[list][scan8[0] + 0 - 1*8]= |
1326 h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]]; | 1322 h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]]; |
1327 h->ref_cache[list][scan8[0] + 2 - 1*8]= | 1323 h->ref_cache[list][scan8[0] + 2 - 1*8]= |
1332 } | 1328 } |
1333 | 1329 |
1334 if(!IS_INTERLACED(mb_type^left_type[0])){ | 1330 if(!IS_INTERLACED(mb_type^left_type[0])){ |
1335 if(USES_LIST(left_type[0], list)){ | 1331 if(USES_LIST(left_type[0], list)){ |
1336 const int b_xy= h->mb2b_xy[left_xy[0]] + 3; | 1332 const int b_xy= h->mb2b_xy[left_xy[0]] + 3; |
1337 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1; | 1333 const int b8_xy= 4*left_xy[0] + 1; |
1338 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[left_xy[0]]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); | 1334 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[left_xy[0]]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); |
1339 AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 0 ], s->current_picture.motion_val[list][b_xy + h->b_stride*0]); | 1335 AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 0 ], s->current_picture.motion_val[list][b_xy + h->b_stride*0]); |
1340 AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 8 ], s->current_picture.motion_val[list][b_xy + h->b_stride*1]); | 1336 AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 8 ], s->current_picture.motion_val[list][b_xy + h->b_stride*1]); |
1341 AV_COPY32(h->mv_cache[list][scan8[0] - 1 +16 ], s->current_picture.motion_val[list][b_xy + h->b_stride*2]); | 1337 AV_COPY32(h->mv_cache[list][scan8[0] - 1 +16 ], s->current_picture.motion_val[list][b_xy + h->b_stride*2]); |
1342 AV_COPY32(h->mv_cache[list][scan8[0] - 1 +24 ], s->current_picture.motion_val[list][b_xy + h->b_stride*3]); | 1338 AV_COPY32(h->mv_cache[list][scan8[0] - 1 +24 ], s->current_picture.motion_val[list][b_xy + h->b_stride*3]); |
1343 h->ref_cache[list][scan8[0] - 1 + 0 ]= | 1339 h->ref_cache[list][scan8[0] - 1 + 0 ]= |
1344 h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*0]]; | 1340 h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*0]]; |
1345 h->ref_cache[list][scan8[0] - 1 +16 ]= | 1341 h->ref_cache[list][scan8[0] - 1 +16 ]= |
1346 h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*1]]; | 1342 h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*1]]; |
1347 }else{ | 1343 }else{ |
1348 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 0 ]); | 1344 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 0 ]); |
1349 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 8 ]); | 1345 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 8 ]); |
1350 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +16 ]); | 1346 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +16 ]); |
1351 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +24 ]); | 1347 AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +24 ]); |
1386 AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]); | 1382 AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]); |
1387 } | 1383 } |
1388 | 1384 |
1389 static inline void write_back_motion(H264Context *h, int mb_type){ | 1385 static inline void write_back_motion(H264Context *h, int mb_type){ |
1390 MpegEncContext * const s = &h->s; | 1386 MpegEncContext * const s = &h->s; |
1391 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; | 1387 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; //try mb2b(8)_xy |
1392 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride; | 1388 const int b8_xy= 4*h->mb_xy; |
1393 int list; | 1389 int list; |
1394 | 1390 |
1395 if(!USES_LIST(mb_type, 0)) | 1391 if(!USES_LIST(mb_type, 0)) |
1396 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1); | 1392 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); |
1397 | 1393 |
1398 for(list=0; list<h->list_count; list++){ | 1394 for(list=0; list<h->list_count; list++){ |
1399 int y, b_stride; | 1395 int y, b_stride; |
1400 int16_t (*mv_dst)[2]; | 1396 int16_t (*mv_dst)[2]; |
1401 int16_t (*mv_src)[2]; | 1397 int16_t (*mv_src)[2]; |
1422 } | 1418 } |
1423 } | 1419 } |
1424 | 1420 |
1425 { | 1421 { |
1426 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy]; | 1422 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy]; |
1427 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]]; | 1423 ref_index[0+0*2]= h->ref_cache[list][scan8[0]]; |
1428 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]]; | 1424 ref_index[1+0*2]= h->ref_cache[list][scan8[4]]; |
1429 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]]; | 1425 ref_index[0+1*2]= h->ref_cache[list][scan8[8]]; |
1430 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]]; | 1426 ref_index[1+1*2]= h->ref_cache[list][scan8[12]]; |
1431 } | 1427 } |
1432 } | 1428 } |
1433 | 1429 |
1434 if(h->slice_type_nos == FF_B_TYPE && CABAC){ | 1430 if(h->slice_type_nos == FF_B_TYPE && CABAC){ |
1435 if(IS_8X8(mb_type)){ | 1431 if(IS_8X8(mb_type)){ |