Mercurial > libavcodec.hg
comparison h264.h @ 11002:1c8892d7a090 libavcodec
H.264: Use 64-/128-bit write-combining macros for copies
2-3% faster decode on x86-32 core2.
| field | value |
|---|---|
| author | astrange |
| date | Mon, 25 Jan 2010 00:30:44 +0000 |
| parents | 85de0c8a19b7 |
| children | ec192d9ebac5 |
Comparison legend: equal, deleted, inserted, replaced
11001:621268959a5c | 11002:1c8892d7a090 |
---|---|
310 uint8_t (*non_zero_count)[32]; | 310 uint8_t (*non_zero_count)[32]; |
311 | 311 |
312 /** | 312 /** |
313 * Motion vector cache. | 313 * Motion vector cache. |
314 */ | 314 */ |
315 DECLARE_ALIGNED_8(int16_t, mv_cache)[2][5*8][2]; | 315 DECLARE_ALIGNED_16(int16_t, mv_cache)[2][5*8][2]; |
316 DECLARE_ALIGNED_8(int8_t, ref_cache)[2][5*8]; | 316 DECLARE_ALIGNED_8(int8_t, ref_cache)[2][5*8]; |
317 #define LIST_NOT_USED -1 //FIXME rename? | 317 #define LIST_NOT_USED -1 //FIXME rename? |
318 #define PART_NOT_AVAILABLE -2 | 318 #define PART_NOT_AVAILABLE -2 |
319 | 319 |
320 /** | 320 /** |
473 int left_cbp; | 473 int left_cbp; |
474 /* chroma_pred_mode for i4x4 or i16x16, else 0 */ | 474 /* chroma_pred_mode for i4x4 or i16x16, else 0 */ |
475 uint8_t *chroma_pred_mode_table; | 475 uint8_t *chroma_pred_mode_table; |
476 int last_qscale_diff; | 476 int last_qscale_diff; |
477 int16_t (*mvd_table[2])[2]; | 477 int16_t (*mvd_table[2])[2]; |
478 DECLARE_ALIGNED_8(int16_t, mvd_cache)[2][5*8][2]; | 478 DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2]; |
479 uint8_t *direct_table; | 479 uint8_t *direct_table; |
480 uint8_t direct_cache[5*8]; | 480 uint8_t direct_cache[5*8]; |
481 | 481 |
482 uint8_t zigzag_scan[16]; | 482 uint8_t zigzag_scan[16]; |
483 uint8_t zigzag_scan8x8[64]; | 483 uint8_t zigzag_scan8x8[64]; |
807 return 1; | 807 return 1; |
808 } | 808 } |
809 if(IS_INTRA(mb_type)) | 809 if(IS_INTRA(mb_type)) |
810 return 0; | 810 return 0; |
811 | 811 |
812 *((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]); | 812 AV_COPY64(&h->non_zero_count_cache[0+8*1], &h->non_zero_count[mb_xy][ 0]); |
813 *((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]); | 813 AV_COPY64(&h->non_zero_count_cache[0+8*2], &h->non_zero_count[mb_xy][ 8]); |
814 *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]); | 814 *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]); |
815 *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]); | 815 *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]); |
816 *((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]); | 816 AV_COPY64(&h->non_zero_count_cache[0+8*4], &h->non_zero_count[mb_xy][24]); |
817 | 817 |
818 h->cbp= h->cbp_table[mb_xy]; | 818 h->cbp= h->cbp_table[mb_xy]; |
819 | 819 |
820 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; | 820 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; |
821 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; | 821 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; |
823 | 823 |
824 { | 824 { |
825 int list; | 825 int list; |
826 for(list=0; list<h->list_count; list++){ | 826 for(list=0; list<h->list_count; list++){ |
827 int8_t *ref; | 827 int8_t *ref; |
828 int y, b_xy; | 828 int y, b_stride; |
829 int16_t (*mv_dst)[2]; | |
830 int16_t (*mv_src)[2]; | |
831 | |
829 if(!USES_LIST(mb_type, list)){ | 832 if(!USES_LIST(mb_type, list)){ |
830 fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); | 833 fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); |
831 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = | 834 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = |
832 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = | 835 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = |
833 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = | 836 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = |
843 ref += h->b8_stride; | 846 ref += h->b8_stride; |
844 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = | 847 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = |
845 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101; | 848 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101; |
846 } | 849 } |
847 | 850 |
848 b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; | 851 b_stride = h->b_stride; |
852 mv_dst = &h->mv_cache[list][scan8[0]]; | |
853 mv_src = &s->current_picture.motion_val[list][4*s->mb_x + 4*s->mb_y*b_stride]; | |
849 for(y=0; y<4; y++){ | 854 for(y=0; y<4; y++){ |
850 *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]; | 855 AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride); |
851 *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]; | |
852 } | 856 } |
853 | 857 |
854 } | 858 } |
855 } | 859 } |
856 }else{ | 860 }else{ |
1057 h->mv_cache_clean[list]= 0; | 1061 h->mv_cache_clean[list]= 0; |
1058 | 1062 |
1059 if(USES_LIST(top_type, list)){ | 1063 if(USES_LIST(top_type, list)){ |
1060 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; | 1064 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; |
1061 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride; | 1065 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride; |
1062 *(uint64_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0]; | 1066 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]); |
1063 *(uint64_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2]; | |
1064 if(for_deblock){ | 1067 if(for_deblock){ |
1065 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); | 1068 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); |
1066 h->ref_cache[list][scan8[0] + 0 - 1*8]= | 1069 h->ref_cache[list][scan8[0] + 0 - 1*8]= |
1067 h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]]; | 1070 h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]]; |
1068 h->ref_cache[list][scan8[0] + 2 - 1*8]= | 1071 h->ref_cache[list][scan8[0] + 2 - 1*8]= |
1072 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0]; | 1075 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0]; |
1073 h->ref_cache[list][scan8[0] + 2 - 1*8]= | 1076 h->ref_cache[list][scan8[0] + 2 - 1*8]= |
1074 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1]; | 1077 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1]; |
1075 } | 1078 } |
1076 }else{ | 1079 }else{ |
1077 *(uint64_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]= | 1080 AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]); |
1078 *(uint64_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 0; | |
1079 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= (((for_deblock||top_type) ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101; | 1081 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= (((for_deblock||top_type) ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101; |
1080 } | 1082 } |
1081 | 1083 |
1082 for(i=0; i<2; i++){ | 1084 for(i=0; i<2; i++){ |
1083 int cache_idx = scan8[0] - 1 + i*2*8; | 1085 int cache_idx = scan8[0] - 1 + i*2*8; |
1141 | 1143 |
1142 if( CABAC ) { | 1144 if( CABAC ) { |
1143 /* XXX beurk, Load mvd */ | 1145 /* XXX beurk, Load mvd */ |
1144 if(USES_LIST(top_type, list)){ | 1146 if(USES_LIST(top_type, list)){ |
1145 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; | 1147 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; |
1146 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0]; | 1148 AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]); |
1147 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1]; | |
1148 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2]; | |
1149 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3]; | |
1150 }else{ | 1149 }else{ |
1151 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]= | 1150 AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]); |
1152 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]= | |
1153 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]= | |
1154 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0; | |
1155 } | 1151 } |
1156 if(USES_LIST(left_type[0], list)){ | 1152 if(USES_LIST(left_type[0], list)){ |
1157 const int b_xy= h->mb2b_xy[left_xy[0]] + 3; | 1153 const int b_xy= h->mb2b_xy[left_xy[0]] + 3; |
1158 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]]; | 1154 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]]; |
1159 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]]; | 1155 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]]; |
1273 } | 1269 } |
1274 | 1270 |
1275 static inline void write_back_non_zero_count(H264Context *h){ | 1271 static inline void write_back_non_zero_count(H264Context *h){ |
1276 const int mb_xy= h->mb_xy; | 1272 const int mb_xy= h->mb_xy; |
1277 | 1273 |
1278 *((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]); | 1274 AV_COPY64(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[0+8*1]); |
1279 *((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]); | 1275 AV_COPY64(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[0+8*2]); |
1280 *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]); | 1276 *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]); |
1281 *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]); | 1277 *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]); |
1282 *((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]); | 1278 AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]); |
1283 } | 1279 } |
1284 | 1280 |
1285 static inline void write_back_motion(H264Context *h, int mb_type){ | 1281 static inline void write_back_motion(H264Context *h, int mb_type){ |
1286 MpegEncContext * const s = &h->s; | 1282 MpegEncContext * const s = &h->s; |
1287 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; | 1283 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; |
1290 | 1286 |
1291 if(!USES_LIST(mb_type, 0)) | 1287 if(!USES_LIST(mb_type, 0)) |
1292 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1); | 1288 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1); |
1293 | 1289 |
1294 for(list=0; list<h->list_count; list++){ | 1290 for(list=0; list<h->list_count; list++){ |
1295 int y; | 1291 int y, b_stride; |
1292 int16_t (*mv_dst)[2]; | |
1293 int16_t (*mv_src)[2]; | |
1294 | |
1296 if(!USES_LIST(mb_type, list)) | 1295 if(!USES_LIST(mb_type, list)) |
1297 continue; | 1296 continue; |
1298 | 1297 |
1298 b_stride = h->b_stride; | |
1299 mv_dst = &s->current_picture.motion_val[list][b_xy]; | |
1300 mv_src = &h->mv_cache[list][scan8[0]]; | |
1299 for(y=0; y<4; y++){ | 1301 for(y=0; y<4; y++){ |
1300 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]; | 1302 AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); |
1301 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]; | |
1302 } | 1303 } |
1303 if( CABAC ) { | 1304 if( CABAC ) { |
1305 int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy]; | |
1306 int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; | |
1304 if(IS_SKIP(mb_type)) | 1307 if(IS_SKIP(mb_type)) |
1305 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4); | 1308 fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4); |
1306 else | 1309 else |
1307 for(y=0; y<4; y++){ | 1310 for(y=0; y<4; y++){ |
1308 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y]; | 1311 AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y); |
1309 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y]; | |
1310 } | 1312 } |
1311 } | 1313 } |
1312 | 1314 |
1313 { | 1315 { |
1314 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy]; | 1316 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy]; |