comparison h264.h @ 10909:f4cf3960b8c6 libavcodec

Reorganize how values are stored in h->non_zero_count. ~1% faster
author michael
date Sun, 17 Jan 2010 23:44:23 +0000
parents 28840dfd4b52
children 7cecaa3a6b38
comparison
equal deleted inserted replaced
10908:28840dfd4b52 10909:f4cf3960b8c6
298 /** 298 /**
299 * non zero coeff count cache. 299 * non zero coeff count cache.
300 * is 64 if not available. 300 * is 64 if not available.
301 */ 301 */
302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); 302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
303
304 /*
305 .UU.YYYY
306 .UU.YYYY
307 .vv.YYYY
308 .VV.YYYY
309 */
303 uint8_t (*non_zero_count)[32]; 310 uint8_t (*non_zero_count)[32];
304 311
305 /** 312 /**
306 * Motion vector cache. 313 * Motion vector cache.
307 */ 314 */
725 int topleft_xy, top_xy, topright_xy, left_xy[2]; 732 int topleft_xy, top_xy, topright_xy, left_xy[2];
726 int topleft_type, top_type, topright_type, left_type[2]; 733 int topleft_type, top_type, topright_type, left_type[2];
727 const uint8_t * left_block; 734 const uint8_t * left_block;
728 int topleft_partition= -1; 735 int topleft_partition= -1;
729 int i; 736 int i;
730 static const uint8_t left_block_options[4][8]={ 737 static const uint8_t left_block_options[4][16]={
731 {0,1,2,3,7,10,8,11}, 738 {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
732 {2,2,3,3,8,11,8,11}, 739 {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
733 {0,0,1,1,7,10,7,10}, 740 {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
734 {0,2,0,2,7,10,7,10} 741 {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
735 }; 742 };
736 743
737 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); 744 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
738 745
739 //FIXME deblocking could skip the intra and nnz parts. 746 //FIXME deblocking could skip the intra and nnz parts.
786 793
787 h->top_mb_xy = top_xy; 794 h->top_mb_xy = top_xy;
788 h->left_mb_xy[0] = left_xy[0]; 795 h->left_mb_xy[0] = left_xy[0];
789 h->left_mb_xy[1] = left_xy[1]; 796 h->left_mb_xy[1] = left_xy[1];
790 if(for_deblock){ 797 if(for_deblock){
798 *((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]);
799 *((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]);
800 *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]);
801 *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]);
802 *((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]);
803
791 topleft_type = 0; 804 topleft_type = 0;
792 topright_type = 0; 805 topright_type = 0;
793 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; 806 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
794 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; 807 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
795 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; 808 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
920 4 L . .L . . . . 933 4 L . .L . . . .
921 5 L . .. . . . . 934 5 L . .. . . . .
922 */ 935 */
923 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) 936 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
924 if(top_type){ 937 if(top_type){
925 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4]; 938 *(uint32_t*)&h->non_zero_count_cache[4+8*0]= *(uint32_t*)&h->non_zero_count[top_xy][4+3*8];
926 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5]; 939
927 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6]; 940 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8];
928 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3]; 941 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8];
929 942
930 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9]; 943 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8];
931 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8]; 944 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8];
932
933 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
934 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
935 945
936 }else{ 946 }else{
937 h->non_zero_count_cache[4+8*0]= 947 h->non_zero_count_cache[4+8*0]=
938 h->non_zero_count_cache[5+8*0]= 948 h->non_zero_count_cache[5+8*0]=
939 h->non_zero_count_cache[6+8*0]= 949 h->non_zero_count_cache[6+8*0]=
947 957
948 } 958 }
949 959
950 for (i=0; i<2; i++) { 960 for (i=0; i<2; i++) {
951 if(left_type[i]){ 961 if(left_type[i]){
952 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]]; 962 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
953 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]]; 963 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
954 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]]; 964 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
955 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]]; 965 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
956 }else{ 966 }else{
957 h->non_zero_count_cache[3+8*1 + 2*8*i]= 967 h->non_zero_count_cache[3+8*1 + 2*8*i]=
958 h->non_zero_count_cache[3+8*2 + 2*8*i]= 968 h->non_zero_count_cache[3+8*2 + 2*8*i]=
959 h->non_zero_count_cache[0+8*1 + 8*i]= 969 h->non_zero_count_cache[0+8*1 + 8*i]=
960 h->non_zero_count_cache[0+8*4 + 8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64; 970 h->non_zero_count_cache[0+8*4 + 8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64;
1202 } 1212 }
1203 1213
1204 static inline void write_back_non_zero_count(H264Context *h){ 1214 static inline void write_back_non_zero_count(H264Context *h){
1205 const int mb_xy= h->mb_xy; 1215 const int mb_xy= h->mb_xy;
1206 1216
1207 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1]; 1217 *((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]);
1208 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2]; 1218 *((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]);
1209 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3]; 1219 *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]);
1210 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4]; 1220 *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]);
1211 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4]; 1221 *((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]);
1212 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1213 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
1214
1215 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1216 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1217 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1218
1219 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1220 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1221 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1222
1223 //FIXME sort better how things are stored in non_zero_count
1224
1225
1226 h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
1227 h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
1228 h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
1229 h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
1230 h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
1231 h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
1232 h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
1233 h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
1234 h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
1235
1236 h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
1237 h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
1238
1239 } 1222 }
1240 1223
1241 static inline void write_back_motion(H264Context *h, int mb_type){ 1224 static inline void write_back_motion(H264Context *h, int mb_type){
1242 MpegEncContext * const s = &h->s; 1225 MpegEncContext * const s = &h->s;
1243 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; 1226 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;