comparison h264.c @ 3212:5fb704618ec4 libavcodec

1.5x faster write_back_motion, 1-3% overall
author lorenm
date Wed, 22 Mar 2006 12:41:02 +0000
parents 15157293beea
children 06f98047ff26
comparison
equal deleted inserted replaced
3211:b77b5e7072d6 3212:5fb704618ec4
713 }*/ 713 }*/
714 continue; 714 continue;
715 } 715 }
716 h->mv_cache_clean[list]= 0; 716 h->mv_cache_clean[list]= 0;
717 717
718 if(IS_INTER(top_type)){ 718 if(USES_LIST(top_type, list)){
719 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; 719 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
720 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride; 720 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
721 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0]; 721 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
722 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1]; 722 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
723 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2]; 723 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
733 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0; 733 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
734 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101; 734 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
735 } 735 }
736 736
737 //FIXME unify cleanup or sth 737 //FIXME unify cleanup or sth
738 if(IS_INTER(left_type[0])){ 738 if(USES_LIST(left_type[0], list)){
739 const int b_xy= h->mb2b_xy[left_xy[0]] + 3; 739 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
740 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1; 740 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
741 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]]; 741 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
742 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]]; 742 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
743 h->ref_cache[list][scan8[0] - 1 + 0*8]= 743 h->ref_cache[list][scan8[0] - 1 + 0*8]=
747 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0; 747 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
748 h->ref_cache[list][scan8[0] - 1 + 0*8]= 748 h->ref_cache[list][scan8[0] - 1 + 0*8]=
749 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE; 749 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
750 } 750 }
751 751
752 if(IS_INTER(left_type[1])){ 752 if(USES_LIST(left_type[1], list)){
753 const int b_xy= h->mb2b_xy[left_xy[1]] + 3; 753 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
754 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1; 754 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
755 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]]; 755 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
756 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]]; 756 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
757 h->ref_cache[list][scan8[0] - 1 + 2*8]= 757 h->ref_cache[list][scan8[0] - 1 + 2*8]=
765 } 765 }
766 766
767 if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) 767 if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
768 continue; 768 continue;
769 769
770 if(IS_INTER(topleft_type)){ 770 if(USES_LIST(topleft_type, list)){
771 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride; 771 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
772 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride; 772 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
773 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy]; 773 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
774 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy]; 774 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
775 }else{ 775 }else{
776 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0; 776 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
777 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; 777 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
778 } 778 }
779 779
780 if(IS_INTER(topright_type)){ 780 if(USES_LIST(topright_type, list)){
781 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride; 781 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
782 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride; 782 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
783 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy]; 783 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
784 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy]; 784 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
785 }else{ 785 }else{
799 *(uint32_t*)h->mv_cache [list][scan8[4 ]]= 799 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
800 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0; 800 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
801 801
802 if( h->pps.cabac ) { 802 if( h->pps.cabac ) {
803 /* XXX beurk, Load mvd */ 803 /* XXX beurk, Load mvd */
804 if(IS_INTER(topleft_type)){ 804 if(USES_LIST(topleft_type, list)){
805 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride; 805 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
806 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy]; 806 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
807 }else{ 807 }else{
808 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0; 808 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
809 } 809 }
810 810
811 if(IS_INTER(top_type)){ 811 if(USES_LIST(top_type, list)){
812 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; 812 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
813 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0]; 813 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
814 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1]; 814 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
815 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2]; 815 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
816 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3]; 816 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
818 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]= 818 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
819 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]= 819 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
820 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]= 820 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
821 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0; 821 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
822 } 822 }
823 if(IS_INTER(left_type[0])){ 823 if(USES_LIST(left_type[0], list)){
824 const int b_xy= h->mb2b_xy[left_xy[0]] + 3; 824 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
825 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]]; 825 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
826 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]]; 826 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
827 }else{ 827 }else{
828 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]= 828 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
829 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0; 829 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
830 } 830 }
831 if(IS_INTER(left_type[1])){ 831 if(USES_LIST(left_type[1], list)){
832 const int b_xy= h->mb2b_xy[left_xy[1]] + 3; 832 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
833 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]]; 833 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
834 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]]; 834 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
835 }else{ 835 }else{
836 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]= 836 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
1422 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride; 1422 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1423 int list; 1423 int list;
1424 1424
1425 for(list=0; list<2; list++){ 1425 for(list=0; list<2; list++){
1426 int y; 1426 int y;
1427 if(!USES_LIST(mb_type, list)){ 1427 if(!USES_LIST(mb_type, list))
1428 if(1){ //FIXME skip or never read if mb_type doesn't use it
1429 for(y=0; y<4; y++){
1430 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
1431 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
1432 }
1433 if( h->pps.cabac ) {
1434 /* FIXME needed ? */
1435 for(y=0; y<4; y++){
1436 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
1437 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
1438 }
1439 }
1440 for(y=0; y<2; y++){
1441 s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]=
1442 s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= LIST_NOT_USED;
1443 }
1444 }
1445 continue; 1428 continue;
1446 }
1447 1429
1448 for(y=0; y<4; y++){ 1430 for(y=0; y<4; y++){
1449 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]; 1431 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1450 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]; 1432 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1451 } 1433 }
1453 for(y=0; y<4; y++){ 1435 for(y=0; y<4; y++){
1454 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y]; 1436 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1455 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y]; 1437 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1456 } 1438 }
1457 } 1439 }
1458 for(y=0; y<2; y++){ 1440
1459 s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y]; 1441 {
1460 s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y]; 1442 uint8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1443 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1444 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1445 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1446 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1461 } 1447 }
1462 } 1448 }
1463 1449
1464 if(h->slice_type == B_TYPE && h->pps.cabac){ 1450 if(h->slice_type == B_TYPE && h->pps.cabac){
1465 if(IS_8X8(mb_type)){ 1451 if(IS_8X8(mb_type)){
1466 h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0; 1452 uint8_t *direct_table = &h->direct_table[b8_xy];
1467 h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0; 1453 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1468 h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0; 1454 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1455 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1469 } 1456 }
1470 } 1457 }
1471 } 1458 }
1472 1459
1473 /** 1460 /**