comparison h264.c @ 11002:1c8892d7a090 libavcodec

H.264: Use 64-/128-bit write-combining macros for copies. 2-3% faster decode on x86-32 core2.
author astrange
date Mon, 25 Jan 2010 00:30:44 +0000
parents 79f2e73f3714
children 72c026446d67
comparison
equal deleted inserted replaced
11001:621268959a5c 11002:1c8892d7a090
943 return 0; 943 return 0;
944 } 944 }
945 945
946 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ 946 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
947 MpegEncContext * const s = &h->s; 947 MpegEncContext * const s = &h->s;
948 uint8_t *top_border;
948 int top_idx = 1; 949 int top_idx = 1;
949 950
950 src_y -= linesize; 951 src_y -= linesize;
951 src_cb -= uvlinesize; 952 src_cb -= uvlinesize;
952 src_cr -= uvlinesize; 953 src_cr -= uvlinesize;
953 954
954 if(!simple && FRAME_MBAFF){ 955 if(!simple && FRAME_MBAFF){
955 if(s->mb_y&1){ 956 if(s->mb_y&1){
956 if(!MB_MBAFF){ 957 if(!MB_MBAFF){
957 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize); 958 top_border = h->top_borders[0][s->mb_x];
958 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize); 959 AV_COPY128(top_border, src_y + 15*linesize);
959 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ 960 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
960 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize); 961 AV_COPY64(top_border+16, src_cb+7*uvlinesize);
961 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize); 962 AV_COPY64(top_border+24, src_cr+7*uvlinesize);
962 } 963 }
963 } 964 }
964 }else if(MB_MBAFF){ 965 }else if(MB_MBAFF){
965 top_idx = 0; 966 top_idx = 0;
966 }else 967 }else
967 return; 968 return;
968 } 969 }
969 970
971 top_border = h->top_borders[top_idx][s->mb_x];
970 // There are two lines saved, the line above the top macroblock of a pair, 972 // There are two lines saved, the line above the top macroblock of a pair,
971 // and the line above the bottom macroblock 973 // and the line above the bottom macroblock
972 974 AV_COPY128(top_border, src_y + 16*linesize);
973 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
974 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
975 975
976 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ 976 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
977 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); 977 AV_COPY64(top_border+16, src_cb+8*uvlinesize);
978 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize); 978 AV_COPY64(top_border+24, src_cr+8*uvlinesize);
979 } 979 }
980 } 980 }
981 981
982 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){ 982 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
983 MpegEncContext * const s = &h->s; 983 MpegEncContext * const s = &h->s;
985 uint64_t temp64; 985 uint64_t temp64;
986 int deblock_left; 986 int deblock_left;
987 int deblock_top; 987 int deblock_top;
988 int mb_xy; 988 int mb_xy;
989 int top_idx = 1; 989 int top_idx = 1;
990 uint8_t *top_border_m1 = h->top_borders[top_idx][s->mb_x-1];
991 uint8_t *top_border = h->top_borders[top_idx][s->mb_x];
990 992
991 if(!simple && FRAME_MBAFF){ 993 if(!simple && FRAME_MBAFF){
992 if(s->mb_y&1){ 994 if(s->mb_y&1){
993 if(!MB_MBAFF) 995 if(!MB_MBAFF)
994 return; 996 return;
1008 1010
1009 src_y -= linesize + 1; 1011 src_y -= linesize + 1;
1010 src_cb -= uvlinesize + 1; 1012 src_cb -= uvlinesize + 1;
1011 src_cr -= uvlinesize + 1; 1013 src_cr -= uvlinesize + 1;
1012 1014
1013 #define XCHG(a,b,t,xchg)\ 1015 #define XCHG(a,b,xchg)\
1014 t= a;\ 1016 if (xchg) AV_SWAP64(b,a);\
1015 if(xchg)\ 1017 else AV_COPY64(b,a);
1016 a= b;\
1017 b= t;
1018 1018
1019 if(deblock_top){ 1019 if(deblock_top){
1020 if(deblock_left){ 1020 if(deblock_left){
1021 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+8), *(uint64_t*)(src_y -7), temp64, 1); 1021 XCHG(top_border_m1+8, src_y -7, 1);
1022 } 1022 }
1023 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg); 1023 XCHG(top_border+0, src_y +1, xchg);
1024 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1); 1024 XCHG(top_border+8, src_y +9, 1);
1025 if(s->mb_x+1 < s->mb_width){ 1025 if(s->mb_x+1 < s->mb_width){
1026 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1); 1026 XCHG(h->top_borders[top_idx][s->mb_x+1], src_y +17, 1);
1027 } 1027 }
1028 } 1028 }
1029 1029
1030 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ 1030 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1031 if(deblock_top){ 1031 if(deblock_top){
1032 if(deblock_left){ 1032 if(deblock_left){
1033 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+16), *(uint64_t*)(src_cb -7), temp64, 1); 1033 XCHG(top_border_m1+16, src_cb -7, 1);
1034 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+24), *(uint64_t*)(src_cr -7), temp64, 1); 1034 XCHG(top_border_m1+24, src_cr -7, 1);
1035 } 1035 }
1036 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1); 1036 XCHG(top_border+16, src_cb+1, 1);
1037 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1); 1037 XCHG(top_border+24, src_cr+1, 1);
1038 } 1038 }
1039 } 1039 }
1040 } 1040 }
1041 1041
1042 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ 1042 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){