Mercurial > libavcodec.hg
comparison h264.h @ 10909:f4cf3960b8c6 libavcodec
Reorganize how values are stored in h->non_zero_count.
~1% faster
author | michael |
---|---|
date | Sun, 17 Jan 2010 23:44:23 +0000 |
parents | 28840dfd4b52 |
children | 7cecaa3a6b38 |
comparison
equal
deleted
inserted
replaced
10908:28840dfd4b52 | 10909:f4cf3960b8c6 |
---|---|
298 /** | 298 /** |
299 * non zero coeff count cache. | 299 * non zero coeff count cache. |
300 * is 64 if not available. | 300 * is 64 if not available. |
301 */ | 301 */ |
302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); | 302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); |
303 | |
304 /* | |
305 .UU.YYYY | |
306 .UU.YYYY | |
307 .vv.YYYY | |
308 .VV.YYYY | |
309 */ | |
303 uint8_t (*non_zero_count)[32]; | 310 uint8_t (*non_zero_count)[32]; |
304 | 311 |
305 /** | 312 /** |
306 * Motion vector cache. | 313 * Motion vector cache. |
307 */ | 314 */ |
725 int topleft_xy, top_xy, topright_xy, left_xy[2]; | 732 int topleft_xy, top_xy, topright_xy, left_xy[2]; |
726 int topleft_type, top_type, topright_type, left_type[2]; | 733 int topleft_type, top_type, topright_type, left_type[2]; |
727 const uint8_t * left_block; | 734 const uint8_t * left_block; |
728 int topleft_partition= -1; | 735 int topleft_partition= -1; |
729 int i; | 736 int i; |
730 static const uint8_t left_block_options[4][8]={ | 737 static const uint8_t left_block_options[4][16]={ |
731 {0,1,2,3,7,10,8,11}, | 738 {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, |
732 {2,2,3,3,8,11,8,11}, | 739 {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, |
733 {0,0,1,1,7,10,7,10}, | 740 {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, |
734 {0,2,0,2,7,10,7,10} | 741 {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} |
735 }; | 742 }; |
736 | 743 |
737 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); | 744 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); |
738 | 745 |
739 //FIXME deblocking could skip the intra and nnz parts. | 746 //FIXME deblocking could skip the intra and nnz parts. |
786 | 793 |
787 h->top_mb_xy = top_xy; | 794 h->top_mb_xy = top_xy; |
788 h->left_mb_xy[0] = left_xy[0]; | 795 h->left_mb_xy[0] = left_xy[0]; |
789 h->left_mb_xy[1] = left_xy[1]; | 796 h->left_mb_xy[1] = left_xy[1]; |
790 if(for_deblock){ | 797 if(for_deblock){ |
798 *((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]); | |
799 *((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]); | |
800 *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]); | |
801 *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]); | |
802 *((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]); | |
803 | |
791 topleft_type = 0; | 804 topleft_type = 0; |
792 topright_type = 0; | 805 topright_type = 0; |
793 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; | 806 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; |
794 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; | 807 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; |
795 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; | 808 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; |
920 4 L . .L . . . . | 933 4 L . .L . . . . |
921 5 L . .. . . . . | 934 5 L . .. . . . . |
922 */ | 935 */ |
923 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) | 936 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) |
924 if(top_type){ | 937 if(top_type){ |
925 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4]; | 938 *(uint32_t*)&h->non_zero_count_cache[4+8*0]= *(uint32_t*)&h->non_zero_count[top_xy][4+3*8]; |
926 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5]; | 939 |
927 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6]; | 940 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8]; |
928 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3]; | 941 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8]; |
929 | 942 |
930 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9]; | 943 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8]; |
931 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8]; | 944 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8]; |
932 | |
933 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12]; | |
934 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11]; | |
935 | 945 |
936 }else{ | 946 }else{ |
937 h->non_zero_count_cache[4+8*0]= | 947 h->non_zero_count_cache[4+8*0]= |
938 h->non_zero_count_cache[5+8*0]= | 948 h->non_zero_count_cache[5+8*0]= |
939 h->non_zero_count_cache[6+8*0]= | 949 h->non_zero_count_cache[6+8*0]= |
947 | 957 |
948 } | 958 } |
949 | 959 |
950 for (i=0; i<2; i++) { | 960 for (i=0; i<2; i++) { |
951 if(left_type[i]){ | 961 if(left_type[i]){ |
952 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]]; | 962 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]]; |
953 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]]; | 963 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]]; |
954 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]]; | 964 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]]; |
955 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]]; | 965 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]]; |
956 }else{ | 966 }else{ |
957 h->non_zero_count_cache[3+8*1 + 2*8*i]= | 967 h->non_zero_count_cache[3+8*1 + 2*8*i]= |
958 h->non_zero_count_cache[3+8*2 + 2*8*i]= | 968 h->non_zero_count_cache[3+8*2 + 2*8*i]= |
959 h->non_zero_count_cache[0+8*1 + 8*i]= | 969 h->non_zero_count_cache[0+8*1 + 8*i]= |
960 h->non_zero_count_cache[0+8*4 + 8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64; | 970 h->non_zero_count_cache[0+8*4 + 8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64; |
1202 } | 1212 } |
1203 | 1213 |
1204 static inline void write_back_non_zero_count(H264Context *h){ | 1214 static inline void write_back_non_zero_count(H264Context *h){ |
1205 const int mb_xy= h->mb_xy; | 1215 const int mb_xy= h->mb_xy; |
1206 | 1216 |
1207 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1]; | 1217 *((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]); |
1208 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2]; | 1218 *((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]); |
1209 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3]; | 1219 *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]); |
1210 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4]; | 1220 *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]); |
1211 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4]; | 1221 *((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]); |
1212 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4]; | |
1213 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4]; | |
1214 | |
1215 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2]; | |
1216 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2]; | |
1217 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1]; | |
1218 | |
1219 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5]; | |
1220 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5]; | |
1221 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4]; | |
1222 | |
1223 //FIXME sort better how things are stored in non_zero_count | |
1224 | |
1225 | |
1226 h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1]; | |
1227 h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2]; | |
1228 h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3]; | |
1229 h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1]; | |
1230 h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2]; | |
1231 h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3]; | |
1232 h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1]; | |
1233 h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2]; | |
1234 h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3]; | |
1235 | |
1236 h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1]; | |
1237 h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4]; | |
1238 | |
1239 } | 1222 } |
1240 | 1223 |
1241 static inline void write_back_motion(H264Context *h, int mb_type){ | 1224 static inline void write_back_motion(H264Context *h, int mb_type){ |
1242 MpegEncContext * const s = &h->s; | 1225 MpegEncContext * const s = &h->s; |
1243 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; | 1226 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; |