# HG changeset patch # User michael # Date 1263771863 0 # Node ID f4cf3960b8c638aa771bbfb11ce5b8e6b0be11e5 # Parent 28840dfd4b52a23c6f7aa5285d9e006f467fcaec Reorganize how values are stored in h->non_zero_count. ~1% faster diff -r 28840dfd4b52 -r f4cf3960b8c6 h264.h --- a/h264.h Sun Jan 17 22:05:36 2010 +0000 +++ b/h264.h Sun Jan 17 23:44:23 2010 +0000 @@ -300,6 +300,13 @@ * is 64 if not available. */ DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); + + /* + .UU.YYYY + .UU.YYYY + .vv.YYYY + .VV.YYYY + */ uint8_t (*non_zero_count)[32]; /** @@ -727,11 +734,11 @@ const uint8_t * left_block; int topleft_partition= -1; int i; - static const uint8_t left_block_options[4][8]={ - {0,1,2,3,7,10,8,11}, - {2,2,3,3,8,11,8,11}, - {0,0,1,1,7,10,7,10}, - {0,2,0,2,7,10,7,10} + static const uint8_t left_block_options[4][16]={ + {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, + {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, + {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, + {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} }; top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); @@ -788,6 +795,12 @@ h->left_mb_xy[0] = left_xy[0]; h->left_mb_xy[1] = left_xy[1]; if(for_deblock){ + *((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]); + *((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]); + *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]); + *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]); + *((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]); + topleft_type = 0; topright_type = 0; top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; @@ -922,16 +935,13 @@ */ //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) if(top_type){ - h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4]; - h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5]; - h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6]; - h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3]; + *(uint32_t*)&h->non_zero_count_cache[4+8*0]= *(uint32_t*)&h->non_zero_count[top_xy][4+3*8]; - h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9]; - h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8]; + h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8]; + h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8]; - h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12]; - h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11]; + h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8]; + h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8]; }else{ h->non_zero_count_cache[4+8*0]= @@ -949,10 +959,10 @@ for (i=0; i<2; i++) { if(left_type[i]){ - h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]]; - h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]]; - h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]]; - h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]]; + h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]]; + h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]]; + h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]]; + h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]]; }else{ h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count_cache[3+8*2 + 2*8*i]= @@ -1204,38 +1214,11 @@ static inline void write_back_non_zero_count(H264Context *h){ const int mb_xy= h->mb_xy; - h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1]; - h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2]; - h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3]; - h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4]; - h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4]; - h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4]; - h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4]; - - h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2]; - h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2]; - h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1]; - - h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5]; - h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5]; - h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4]; - - //FIXME sort better how things are stored in non_zero_count - - - h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1]; - h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2]; - h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3]; - h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1]; - h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2]; - h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3]; - h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1]; - h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2]; - h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3]; - - h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1]; - h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4]; - + *((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]); + *((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]); + *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]); + *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]); + *((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]); } static inline void write_back_motion(H264Context *h, int mb_type){ diff -r 28840dfd4b52 -r f4cf3960b8c6 h264_loopfilter.c --- a/h264_loopfilter.c Sun Jan 17 22:05:36 2010 +0000 +++ b/h264_loopfilter.c Sun Jan 17 23:44:23 2010 +0000 @@ -472,7 +472,6 @@ // be done twice (one each of the field) even if we are in a // frame macroblock. // - static const int nnz_idx[4] = {4,5,6,3}; unsigned int tmp_linesize = 2 * linesize; unsigned int tmp_uvlinesize = 2 * uvlinesize; int mbn_xy = mb_xy - 2 * s->mb_stride; @@ -488,7 +487,7 @@ const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy]; for( i = 0; i < 4; i++ ) { if( h->non_zero_count_cache[scan8[0]+i] != 0 || - mbn_nnz[nnz_idx[i]] != 0 ) + mbn_nnz[i+4+3*8] != 0 ) bS[i] = 2; else bS[i] = 1; @@ -663,36 +662,6 @@ return; } } - - h->non_zero_count_cache[7+8*1]=h->non_zero_count[mb_xy][0]; - h->non_zero_count_cache[7+8*2]=h->non_zero_count[mb_xy][1]; - h->non_zero_count_cache[7+8*3]=h->non_zero_count[mb_xy][2]; - h->non_zero_count_cache[7+8*4]=h->non_zero_count[mb_xy][3]; - h->non_zero_count_cache[4+8*4]=h->non_zero_count[mb_xy][4]; - h->non_zero_count_cache[5+8*4]=h->non_zero_count[mb_xy][5]; - h->non_zero_count_cache[6+8*4]=h->non_zero_count[mb_xy][6]; - - h->non_zero_count_cache[1+8*2]=h->non_zero_count[mb_xy][9]; - h->non_zero_count_cache[2+8*2]=h->non_zero_count[mb_xy][8]; - h->non_zero_count_cache[2+8*1]=h->non_zero_count[mb_xy][7]; - - h->non_zero_count_cache[1+8*5]=h->non_zero_count[mb_xy][12]; - h->non_zero_count_cache[2+8*5]=h->non_zero_count[mb_xy][11]; - h->non_zero_count_cache[2+8*4]=h->non_zero_count[mb_xy][10]; - - h->non_zero_count_cache[6+8*1]=h->non_zero_count[mb_xy][13]; - h->non_zero_count_cache[6+8*2]=h->non_zero_count[mb_xy][14]; - h->non_zero_count_cache[6+8*3]=h->non_zero_count[mb_xy][15]; - h->non_zero_count_cache[5+8*1]=h->non_zero_count[mb_xy][16]; - h->non_zero_count_cache[5+8*2]=h->non_zero_count[mb_xy][17]; - h->non_zero_count_cache[5+8*3]=h->non_zero_count[mb_xy][18]; - h->non_zero_count_cache[4+8*1]=h->non_zero_count[mb_xy][19]; - h->non_zero_count_cache[4+8*2]=h->non_zero_count[mb_xy][20]; - h->non_zero_count_cache[4+8*3]=h->non_zero_count[mb_xy][21]; - - h->non_zero_count_cache[1+8*1]=h->non_zero_count[mb_xy][22]; - h->non_zero_count_cache[1+8*4]=h->non_zero_count[mb_xy][23]; - // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs if(!h->pps.cabac && h->pps.transform_8x8_mode){ int top_type, left_type[2]; @@ -762,7 +731,7 @@ ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ? (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2)) : - h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2])) + h->non_zero_count[mbn_xy][7+(MB_FIELD ? (i&3) : (i>>2)+(mb_y&1)*2)*8])) bS[i] = 2; else bS[i] = 1;