h264.h @ 11002:1c8892d7a090 (libavcodec)

H.264: Use 64-/128-bit write-combining macros for copies. 2-3% faster decode on x86-32 Core 2.
author astrange
date Mon, 25 Jan 2010 00:30:44 +0000
parents 85de0c8a19b7
children ec192d9ebac5
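
The AV_COPY64/AV_COPY128/AV_ZERO128 macros used below come from libavutil/intreadwrite.h. On x86-32, whose general-purpose registers are only 32 bits wide, they can be overridden with MMX/SSE register moves, which is what makes the wider copies pay off. Below is a minimal plain-C sketch of the portable fallback idea; the real macros go through __attribute__((may_alias)) unions and per-architecture specializations, and the *_SKETCH names here are illustrative only:

#include <stdint.h>

/* Hedged sketch of the copy/zero macros used in this change; see
 * libavutil/intreadwrite.h for the real, per-arch definitions. */
typedef union { uint64_t u64; uint32_t u32[2]; } alias64_sketch;

/* Copy 8 bytes as one 64-bit load/store that the compiler can keep
 * in a single register. */
#define COPY64_SKETCH(d, s) \
    (((alias64_sketch *)(void *)(d))->u64 = ((const alias64_sketch *)(const void *)(s))->u64)

/* Copy 16 bytes; a real SSE version would use one movdqa/movaps. */
#define COPY128_SKETCH(d, s) do {                          \
    COPY64_SKETCH((char *)(d),     (const char *)(s));     \
    COPY64_SKETCH((char *)(d) + 8, (const char *)(s) + 8); \
} while (0)

/* Zero 16 bytes in two 64-bit stores (one 128-bit store with SSE). */
#define ZERO128_SKETCH(d) do {                  \
    ((alias64_sketch *)(void *)(d))[0].u64 = 0; \
    ((alias64_sketch *)(void *)(d))[1].u64 = 0; \
} while (0)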
--- a/h264.h
+++ b/h264.h
@@ -310,11 +310,11 @@
 uint8_t (*non_zero_count)[32];
 
 /**
  * Motion vector cache.
  */
-DECLARE_ALIGNED_8(int16_t, mv_cache)[2][5*8][2];
+DECLARE_ALIGNED_16(int16_t, mv_cache)[2][5*8][2];
 DECLARE_ALIGNED_8(int8_t, ref_cache)[2][5*8];
 #define LIST_NOT_USED -1 //FIXME rename?
 #define PART_NOT_AVAILABLE -2
 
 /**
@@ -473,11 +473,11 @@
 int left_cbp;
 /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 uint8_t *chroma_pred_mode_table;
 int last_qscale_diff;
 int16_t (*mvd_table[2])[2];
-DECLARE_ALIGNED_8(int16_t, mvd_cache)[2][5*8][2];
+DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2];
 uint8_t *direct_table;
 uint8_t direct_cache[5*8];
 
 uint8_t zigzag_scan[16];
 uint8_t zigzag_scan8x8[64];
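
Both of the caches that are now copied 16 bytes at a time also move from 8- to 16-byte alignment: AV_COPY128/AV_ZERO128 may be implemented with aligned SSE stores, which fault on unaligned addresses. A GCC-flavoured sketch of what the DECLARE_ALIGNED_* macros expand to (an assumption for illustration; the real, multi-compiler definitions live in libavutil/mem.h):

#include <stdint.h>

/* GCC-flavoured sketch; the real macros live in libavutil/mem.h. */
#define DECLARE_ALIGNED_8_SKETCH(t, v)  t __attribute__((aligned(8)))  v
#define DECLARE_ALIGNED_16_SKETCH(t, v) t __attribute__((aligned(16))) v

/* Each cache entry is an int16_t[2] (4 bytes) and rows are 8 entries
 * (32 bytes) wide, so with a 16-byte-aligned base every 4-entry run
 * that AV_COPY128 touches starts on a 16-byte boundary. */
DECLARE_ALIGNED_16_SKETCH(int16_t, mv_cache_sketch)[2][5*8][2];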
@@ -807,15 +807,15 @@
 return 1;
 }
 if(IS_INTRA(mb_type))
 return 0;
 
-*((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]);
-*((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]);
+AV_COPY64(&h->non_zero_count_cache[0+8*1], &h->non_zero_count[mb_xy][ 0]);
+AV_COPY64(&h->non_zero_count_cache[0+8*2], &h->non_zero_count[mb_xy][ 8]);
 *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]);
 *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]);
-*((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]);
+AV_COPY64(&h->non_zero_count_cache[0+8*4], &h->non_zero_count[mb_xy][24]);
 
 h->cbp= h->cbp_table[mb_xy];
 
 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
@@ -823,11 +823,14 @@
 
 {
 int list;
 for(list=0; list<h->list_count; list++){
 int8_t *ref;
-int y, b_xy;
+int y, b_stride;
+int16_t (*mv_dst)[2];
+int16_t (*mv_src)[2];
+
 if(!USES_LIST(mb_type, list)){
 fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] =
 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
@@ -843,14 +846,15 @@
 ref += h->b8_stride;
 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101;
 }
 
-b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+b_stride = h->b_stride;
+mv_dst = &h->mv_cache[list][scan8[0]];
+mv_src = &s->current_picture.motion_val[list][4*s->mb_x + 4*s->mb_y*b_stride];
 for(y=0; y<4; y++){
-*(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride];
-*(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
+AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride);
 }
 
 }
 }
 }else{
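
This hunk is the heart of the change: the per-iteration address arithmetic is hoisted out of the copy loop into mv_dst/mv_src, and the two 64-bit copies per row collapse into one 128-bit copy, since a row of four int16_t[2] motion vectors is exactly 16 bytes. A self-contained sketch of the same fill pattern; picture_mv, cache, and B_STRIDE_SKETCH are illustrative names (not FFmpeg's), and memcpy stands in for AV_COPY128:

#include <stdint.h>
#include <string.h>

#define B_STRIDE_SKETCH 48   /* blocks per picture row; example value */

/* Copy a 4x4 block of motion vectors from a strided picture-wide
 * array into a cache laid out as rows of 8 entries. */
static void fill_mv_cache_sketch(int16_t (*cache)[2],
                                 const int16_t (*picture_mv)[2],
                                 int b_xy)
{
    const int16_t (*mv_src)[2] = &picture_mv[b_xy];
    int16_t (*mv_dst)[2]       = cache;
    int y;

    for (y = 0; y < 4; y++)
        /* 4 vectors * sizeof(int16_t[2]) = 16 bytes: one wide copy
         * per row instead of two 8-byte ones */
        memcpy(mv_dst + 8*y, mv_src + y*B_STRIDE_SKETCH, 4 * sizeof(*mv_src));
}

The hoisting also helps on its own: x86-32 has few general-purpose registers to spare for recomputing b_xy-based addresses every iteration.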
@@ -1057,12 +1061,11 @@
 h->mv_cache_clean[list]= 0;
 
 if(USES_LIST(top_type, list)){
 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
-*(uint64_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0];
-*(uint64_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2];
+AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]);
 if(for_deblock){
 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]];
 h->ref_cache[list][scan8[0] + 2 - 1*8]=
@@ -1072,12 +1075,11 @@
 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 }
 }else{
-*(uint64_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
-*(uint64_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 0;
+AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]);
 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= (((for_deblock||top_type) ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 }
 
 for(i=0; i<2; i++){
 int cache_idx = scan8[0] - 1 + i*2*8;
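
The else branch merges in the same way: the four cache entries in the row above the macroblock are adjacent, so the two chained 64-bit stores become a single AV_ZERO128. The size arithmetic, as a quick self-check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* One cache row above the macroblock: four int16_t[2] motion
     * vectors, 4 * 2 * 2 bytes = 16 bytes = 128 bits. */
    int16_t top_row[4][2];
    assert(sizeof(top_row) == 16);
    return 0;
}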
@@ -1141,19 +1143,13 @@
 
 if( CABAC ) {
 /* XXX beurk, Load mvd */
 if(USES_LIST(top_type, list)){
 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-*(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
-*(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
-*(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
-*(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
+AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
 }else{
-*(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
-*(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
-*(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
-*(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
+AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
 }
 if(USES_LIST(left_type[0], list)){
 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
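
Note what stays narrow here: the left-neighbour loads at the end of the hunk remain 32-bit copies, because scan8[0] - 1 + 0*8 and scan8[0] - 1 + 1*8 fall in different rows of the 8-entry-wide cache and so are a full row apart in memory. A small sketch of that offset arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Horizontal cache neighbours are one entry (4 bytes) apart and
     * can merge into a wide copy; vertical neighbours are a whole
     * 8-entry row (32 bytes) apart and cannot. */
    const int entry_bytes = (int)sizeof(int16_t[2]);
    const int row_bytes   = 8 * entry_bytes;
    printf("horizontal step: %d bytes, vertical step: %d bytes\n",
           entry_bytes, row_bytes);
    return 0;
}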
@@ -1273,15 +1269,15 @@
 }
 
 static inline void write_back_non_zero_count(H264Context *h){
 const int mb_xy= h->mb_xy;
 
-*((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]);
-*((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]);
+AV_COPY64(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[0+8*1]);
+AV_COPY64(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[0+8*2]);
 *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]);
 *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]);
-*((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]);
+AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]);
 }
 
 static inline void write_back_motion(H264Context *h, int mb_type){
 MpegEncContext * const s = &h->s;
 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
@@ -1290,25 +1286,31 @@
 
 if(!USES_LIST(mb_type, 0))
 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
 
 for(list=0; list<h->list_count; list++){
-int y;
+int y, b_stride;
+int16_t (*mv_dst)[2];
+int16_t (*mv_src)[2];
+
 if(!USES_LIST(mb_type, list))
 continue;
 
+b_stride = h->b_stride;
+mv_dst = &s->current_picture.motion_val[list][b_xy];
+mv_src = &h->mv_cache[list][scan8[0]];
 for(y=0; y<4; y++){
-*(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
-*(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
+AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
 }
 if( CABAC ) {
+int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
+int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
 if(IS_SKIP(mb_type))
-fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
+fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4);
 else
 for(y=0; y<4; y++){
-*(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
-*(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
+AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y);
 }
 }
 
 {
 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];