libavcodec.hg: cabac.h annotate

annotate cabac.h @ 4046:8bbc695c9603 libavcodec

factorize get_cabac asm (0.5% slower but its much cleaner)

author	michael
date	Fri, 20 Oct 2006 00:35:54 +0000
parents	5ccdefd60f61
children	61a4e7218a45

rev	line source
1287 9211fbd31353 CABAC michaelni parents: diff changeset	1 /*
9211fbd31353 CABAC michaelni parents: diff changeset	2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
9211fbd31353 CABAC michaelni parents: diff changeset	3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
9211fbd31353 CABAC michaelni parents: diff changeset	4 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	7 * FFmpeg is free software; you can redistribute it and/or
1287 9211fbd31353 CABAC michaelni parents: diff changeset	8 * modify it under the terms of the GNU Lesser General Public
9211fbd31353 CABAC michaelni parents: diff changeset	9 * License as published by the Free Software Foundation; either
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	10 * version 2.1 of the License, or (at your option) any later version.
1287 9211fbd31353 CABAC michaelni parents: diff changeset	11 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	12 * FFmpeg is distributed in the hope that it will be useful,
1287 9211fbd31353 CABAC michaelni parents: diff changeset	13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9211fbd31353 CABAC michaelni parents: diff changeset	14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9211fbd31353 CABAC michaelni parents: diff changeset	15 * Lesser General Public License for more details.
9211fbd31353 CABAC michaelni parents: diff changeset	16 *
9211fbd31353 CABAC michaelni parents: diff changeset	17 * You should have received a copy of the GNU Lesser General Public
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	18 * License along with FFmpeg; if not, write to the Free Software
3036 0b546eab515d Update licensing information: The FSF changed postal address. diego parents: 2967 diff changeset	19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1287 9211fbd31353 CABAC michaelni parents: diff changeset	20 *
9211fbd31353 CABAC michaelni parents: diff changeset	21 */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	22
1287 9211fbd31353 CABAC michaelni parents: diff changeset	23 /**
9211fbd31353 CABAC michaelni parents: diff changeset	24 * @file cabac.h
9211fbd31353 CABAC michaelni parents: diff changeset	25 * Context Adaptive Binary Arithmetic Coder.
9211fbd31353 CABAC michaelni parents: diff changeset	26 */
9211fbd31353 CABAC michaelni parents: diff changeset	27
9211fbd31353 CABAC michaelni parents: diff changeset	28
3284 a224d9752912 don't force asserts in release builds. 2% faster h264. lorenm parents: 3036 diff changeset	29 //#undef NDEBUG
1287 9211fbd31353 CABAC michaelni parents: diff changeset	30 #include <assert.h>
9211fbd31353 CABAC michaelni parents: diff changeset	31
3948 3edbf131ee44 refill cabac variables in 16bit steps, 3% faster get_cabac() michael parents: 3947 diff changeset	32 #define CABAC_BITS 16
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	33 #define CABAC_MASK ((1<<CABAC_BITS)-1)
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	34 #define BRANCHLESS_CABAC_DECODER 1
3990 746a60ba3177 enable CMOV_IS_FAST as its faster or equal speed on every cpu (duron, athlon, PM, P3) from which ive seen benchmarks, it might be slower on P4 but noone has posted benchmarks ... michael parents: 3984 diff changeset	35 #define CMOV_IS_FAST 1
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	36 //#define ARCH_X86_DISABLED 1
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	37
1287 9211fbd31353 CABAC michaelni parents: diff changeset	38 typedef struct CABACContext{
9211fbd31353 CABAC michaelni parents: diff changeset	39 int low;
9211fbd31353 CABAC michaelni parents: diff changeset	40 int range;
9211fbd31353 CABAC michaelni parents: diff changeset	41 int outstanding_count;
9211fbd31353 CABAC michaelni parents: diff changeset	42 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	43 int symCount;
9211fbd31353 CABAC michaelni parents: diff changeset	44 #endif
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	45 const uint8_t *bytestream_start;
f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	46 const uint8_t *bytestream;
2116 48d9f86fb047 overread fix michael parents: 2024 diff changeset	47 const uint8_t *bytestream_end;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	48 PutBitContext pb;
9211fbd31353 CABAC michaelni parents: diff changeset	49 }CABACContext;
9211fbd31353 CABAC michaelni parents: diff changeset	50
4014 b2582438effe dehack ps_state indexing in the branchless decoder michael* parents: 4012 diff changeset	51 extern uint8_t ff_h264_mlps_state[4*64];
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	52 extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	53 extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	54 extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	55 extern const uint8_t ff_h264_norm_shift[512];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	56
1287 9211fbd31353 CABAC michaelni parents: diff changeset	57
9211fbd31353 CABAC michaelni parents: diff changeset	58 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	59 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	60 void ff_init_cabac_states(CABACContext *c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	61
9211fbd31353 CABAC michaelni parents: diff changeset	62
9211fbd31353 CABAC michaelni parents: diff changeset	63 static inline void put_cabac_bit(CABACContext *c, int b){
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	64 put_bits(&c->pb, 1, b);
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	65 for(;c->outstanding_count; c->outstanding_count--){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	66 put_bits(&c->pb, 1, 1-b);
9211fbd31353 CABAC michaelni parents: diff changeset	67 }
9211fbd31353 CABAC michaelni parents: diff changeset	68 }
9211fbd31353 CABAC michaelni parents: diff changeset	69
9211fbd31353 CABAC michaelni parents: diff changeset	70 static inline void renorm_cabac_encoder(CABACContext *c){
9211fbd31353 CABAC michaelni parents: diff changeset	71 while(c->range < 0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	72 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	73 if(c->low<0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	74 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	75 }else if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	76 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	77 c->low -= 0x100;
9211fbd31353 CABAC michaelni parents: diff changeset	78 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	79 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	80 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	81 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	82
1287 9211fbd31353 CABAC michaelni parents: diff changeset	83 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	84 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	85 }
9211fbd31353 CABAC michaelni parents: diff changeset	86 }
9211fbd31353 CABAC michaelni parents: diff changeset	87
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	88 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	89 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state];
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	90
1287 9211fbd31353 CABAC michaelni parents: diff changeset	91 if(bit == ((*state)&1)){
9211fbd31353 CABAC michaelni parents: diff changeset	92 c->range -= RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	93 *state= ff_h264_mps_state[*state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	94 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	95 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	96 c->range = RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	97 *state= ff_h264_lps_state[*state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	98 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	99
1287 9211fbd31353 CABAC michaelni parents: diff changeset	100 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	101
9211fbd31353 CABAC michaelni parents: diff changeset	102 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	103 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	104 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	105 }
9211fbd31353 CABAC michaelni parents: diff changeset	106
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	107 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	108 assert(c->range > RangeLPS);
9211fbd31353 CABAC michaelni parents: diff changeset	109
9211fbd31353 CABAC michaelni parents: diff changeset	110 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	111 c->range -= RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	112 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	113 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	114 c->range = RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	115 }
9211fbd31353 CABAC michaelni parents: diff changeset	116
9211fbd31353 CABAC michaelni parents: diff changeset	117 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	118
9211fbd31353 CABAC michaelni parents: diff changeset	119 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	120 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	121 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	122 }
9211fbd31353 CABAC michaelni parents: diff changeset	123
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	124 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	125 * @param bit 0 -> write zero bit, !=0 write one bit
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	126 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	127 static void put_cabac_bypass(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	128 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	129
9211fbd31353 CABAC michaelni parents: diff changeset	130 if(bit){
9211fbd31353 CABAC michaelni parents: diff changeset	131 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	132 }
9211fbd31353 CABAC michaelni parents: diff changeset	133 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	134 if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	135 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	136 }else if(c->low<0x400){
9211fbd31353 CABAC michaelni parents: diff changeset	137 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	138 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	139 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	140 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	141 c->low -= 0x400;
9211fbd31353 CABAC michaelni parents: diff changeset	142 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	143
1287 9211fbd31353 CABAC michaelni parents: diff changeset	144 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	145 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	146 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	147 }
9211fbd31353 CABAC michaelni parents: diff changeset	148
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	149 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	150 *
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	151 * @return the number of bytes written
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	152 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	153 static int put_cabac_terminate(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	154 c->range -= 2;
9211fbd31353 CABAC michaelni parents: diff changeset	155
9211fbd31353 CABAC michaelni parents: diff changeset	156 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	157 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	158 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	159 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	160 c->range= 2;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	161
1287 9211fbd31353 CABAC michaelni parents: diff changeset	162 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	163
9211fbd31353 CABAC michaelni parents: diff changeset	164 assert(c->low <= 0x1FF);
9211fbd31353 CABAC michaelni parents: diff changeset	165 put_cabac_bit(c, c->low>>9);
9211fbd31353 CABAC michaelni parents: diff changeset	166 put_bits(&c->pb, 2, ((c->low>>7)&3)|1);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	167
1287 9211fbd31353 CABAC michaelni parents: diff changeset	168 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong
9211fbd31353 CABAC michaelni parents: diff changeset	169 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	170
1287 9211fbd31353 CABAC michaelni parents: diff changeset	171 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	172 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	173 #endif
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	174
1787 752b51a3c8ed get_bit_count -> put_bits_count alex parents: 1301 diff changeset	175 return (put_bits_count(&c->pb)+7)>>3;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	176 }
9211fbd31353 CABAC michaelni parents: diff changeset	177
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	178 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	179 * put (truncated) unary binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	180 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	181 static void put_cabac_u(CABACContext *c, uint8_t *state, int v, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	182 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	183
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	184 assert(v <= max);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	185
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	186 #if 1
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	187 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	188 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	189 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	190 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	191 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	192 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	193 #else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	194 if(v <= max_index){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	195 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	196 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	197 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	198 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	199 put_cabac(c, state+i, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	200 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	201 for(i=0; i<=max_index; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	202 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	203 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	204 for(; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	205 put_cabac(c, state+max_index, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	206 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	207 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	208 put_cabac(c, state+max_index, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	209 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	210 #endif
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	211 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	212
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	213 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	214 * put unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	215 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	216 static void put_cabac_ueg(CABACContext *c, uint8_t *state, int v, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	217 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	218
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	219 if(v==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	220 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	221 else{
1298 5bc3184810dc cleanup michaelni parents: 1290 diff changeset	222 const int sign= v < 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	223
4001 34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3999 diff changeset	224 if(is_signed) v= FFABS(v);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	225
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	226 if(v<max){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	227 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	228 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	229 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	230 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	231
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	232 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	233 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	234 int m= 1<<k;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	235
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	236 for(i=0; i<max; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	237 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	238 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	239 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	240
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	241 v -= max;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	242 while(v >= m){ //FIXME optimize
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	243 put_cabac_bypass(c, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	244 v-= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	245 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	246 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	247 put_cabac_bypass(c, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	248 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	249 put_cabac_bypass(c, v&m);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	250 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	251 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	252
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	253 if(is_signed)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	254 put_cabac_bypass(c, sign);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	255 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	256 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	257
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	258 static void refill(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	259 #if CABAC_BITS == 16
3946 ab0797f2e397 () 10l michael parents: 3943 diff changeset	260 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	261 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	262 c->low+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	263 #endif
1c39d9786efd optimization michael parents: 2116 diff changeset	264 c->low -= CABAC_MASK;
1c39d9786efd optimization michael parents: 2116 diff changeset	265 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	266 }
1c39d9786efd optimization michael parents: 2116 diff changeset	267
1c39d9786efd optimization michael parents: 2116 diff changeset	268 static void refill2(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	269 int i, x;
1c39d9786efd optimization michael parents: 2116 diff changeset	270
1c39d9786efd optimization michael parents: 2116 diff changeset	271 x= c->low ^ (c->low-1);
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	272 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	273
1c39d9786efd optimization michael parents: 2116 diff changeset	274 x= -CABAC_MASK;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	275
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	276 #if CABAC_BITS == 16
1c39d9786efd optimization michael parents: 2116 diff changeset	277 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
1c39d9786efd optimization michael parents: 2116 diff changeset	278 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	279 x+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	280 #endif
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	281
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	282 c->low += x<<i;
1c39d9786efd optimization michael parents: 2116 diff changeset	283 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	284 }
1c39d9786efd optimization michael parents: 2116 diff changeset	285
1287 9211fbd31353 CABAC michaelni parents: diff changeset	286 static inline void renorm_cabac_decoder(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	287 while(c->range < 0x100){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	288 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	289 c->low+= c->low;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	290 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	291 refill(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	292 }
9211fbd31353 CABAC michaelni parents: diff changeset	293 }
9211fbd31353 CABAC michaelni parents: diff changeset	294
/**
 * Renormalize the decoder state by at most one bit position, then refill.
 *
 * When c->range is below 0x100, both c->range and c->low are shifted left by
 * one; otherwise both are left unchanged. Afterwards refill(c) is called if
 * (c->low & CABAC_MASK) is zero.
 *
 * All x86 inline-asm variants are guarded by ARCH_X86_DISABLED (presumably
 * never defined by the build -- TODO confirm), so the plain C path in the
 * final #else is what normally gets compiled. The "P3:/athlon:" remarks next
 * to each variant appear to be START/STOP_TIMER benchmark scores.
 *
 * @param c decoder context; its range and low fields are updated in place
 */
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	295 static inline void renorm_cabac_decoder_once(CABACContext *c){
3951 751bfc30df72 disable benchmarking code michael parents: 3950 diff changeset	296 #ifdef ARCH_X86_DISABLED
// temp (and temp2 in the later variants) are never read from C; they only
// reserve ecx / eax / edx as scratch registers through the operand constraints.
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	297 int temp;
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	298 #if 0
// Variant 1 (compiled out): shift count = top bit of (range - 0x100), obtained
// with lea + shr, then both operands are shifted by cl.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	299 //P3:683 athlon:475
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	300 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	301 "lea -0x100(%0), %2 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	302 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	303 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	304 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	305 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	306 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	307 #elif 0
// Variant 2 (compiled out): shift count produced in cl by cmp + setb
// (cl = 1 when range is below 0x100).
// NOTE(review): the FIXME on the setb line mentions a 31->63 change although
// this variant contains no shift by 31 -- looks copied from variant 1; confirm.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	308 //P3:680 athlon:474
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	309 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	310 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	311 "setb %%cl \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	312 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	313 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	314 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	315 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	316 #elif 1
// Variant 3 (the one selected if ARCH_X86_DISABLED were defined): cdq turns the
// sign of (range - 0x100) into an all-ones/zero mask in edx, then
// range += range & mask and low += low & mask, i.e. a conditional doubling
// without any shift instruction.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	317 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	318 //P3:665 athlon:517
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	319 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	320 "lea -0x100(%0), %%eax \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	321 "cdq \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	322 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	323 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	324 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	325 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	326 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	327 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	328 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	329 #elif 0
// Variant 4 (compiled out): same mask-and-add scheme as variant 3, but the
// mask in edx comes from cmp + sbb (carry set when range is below 0x100).
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	330 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	331 //P3:673 athlon:509
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	332 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	333 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	334 "sbb %%edx, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	335 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	336 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	337 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	338 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	339 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	340 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	341 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	342 #else
// Variant 5 (unreachable because of the "#elif 1" above): precompute the
// doubled range/low with lea and pick them with cmovb when range is below 0x100.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	343 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	344 //P3:677 athlon:511
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	345 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	346 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	347 "lea (%0, %0), %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	348 "lea (%1, %1), %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	349 "cmovb %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	350 "cmovb %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	351 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	352 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	353 #endif
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	354 #else
// Portable C path: shift is 1 when (c->range - 0x100), viewed as a 32-bit
// unsigned value, has its top bit set (i.e. the subtraction went negative),
// and 0 otherwise; range and low are then shifted by that amount, branch-free.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	355 //P3:675 athlon:476
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	356 int shift= (uint32_t)(c->range - 0x100)>>31;
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	357 c->range<<= shift;
0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	358 c->low <<= shift;
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	359 #endif
// Pull in more input once the bits of low selected by CABAC_MASK are all zero.
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	360 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	361 refill(c);
1c39d9786efd optimization michael parents: 2116 diff changeset	362 }
1c39d9786efd optimization michael parents: 2116 diff changeset	363
1c39d9786efd optimization michael parents: 2116 diff changeset	363
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	364 static int always_inline get_cabac_inline(CABACContext c, uint8_t const state){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	365 //FIXME gcc generates duplicate load/stores for c->low and c->range
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	366 #define LOW "0"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	367 #define RANGE "4"
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	368 #define BYTESTART "12"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	369 #define BYTE "16"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	370 #define BYTEEND "20"
4044 5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	371 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	372 int bit;
5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	373
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	374 #ifndef BRANCHLESS_CABAC_DECODER
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	375 asm volatile(
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	376 "movzbl (%1), %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	377 "movl "RANGE "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	378 "movl "RANGE "(%2), %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	379 "andl $0xC0, %%ebx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	380 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	381 "movl "LOW "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	382 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	383 "subl %%esi, %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	384 "movl %%edx, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	385 "shll $17, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	386 "cmpl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	387 " ja 1f \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	388
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	389 #if 1
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	390 //athlon:4067 P3:4110
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	391 "lea -0x100(%%edx), %%ecx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	392 "shr $31, %%ecx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	393 "shl %%cl, %%edx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	394 "shl %%cl, %%ebx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	395 #else
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	396 //athlon:4057 P3:4130
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	397 "cmp $0x100, %%edx \n\t" //FIXME avoidable
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	398 "setb %%cl \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	399 "shl %%cl, %%edx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	400 "shl %%cl, %%ebx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	401 #endif
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	402 "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	403 "movb %%cl, (%1) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	404 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	405 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	406 " jnz 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	407 "movl "BYTE "(%2), %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	408 "subl $0xFFFF, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	409 "movzwl (%%esi), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	410 "bswap %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	411 "shrl $15, %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	412 "addl $2, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	413 "addl %%ecx, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	414 "movl %%esi, "BYTE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	415 "jmp 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	416 "1: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	417 //eax:state ebx:low, edx:range, esi:RangeLPS
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	418 "subl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	419 "movl %%esi, %%edx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	420 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	421 "shll %%cl, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	422 "shll %%cl, %%edx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	423 "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	424 "movb %%cl, (%1) \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	425 "addl $1, %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	426 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	427 " jnz 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	428
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	429 "movl "BYTE "(%2), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	430 "movzwl (%%ecx), %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	431 "bswap %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	432 "shrl $15, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	433 "subl $0xFFFF, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	434 "addl $2, %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	435 "movl %%ecx, "BYTE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	436
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	437 "leal -1(%%ebx), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	438 "xorl %%ebx, %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	439 "shrl $15, %%ecx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	440 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"
3994 2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	441 "neg %%ecx \n\t"
2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	442 "add $7, %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	443
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	444 "shll %%cl , %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	445 "addl %%esi, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	446 "2: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	447 "movl %%edx, "RANGE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	448 "movl %%ebx, "LOW "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	449 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	450 :"r"(state), "r"(c)
4012 f8c649ac09dd add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn michael parents: 4008 diff changeset	451 : "%ecx", "%ebx", "%edx", "%esi", "memory"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	452 );
3982 af16271634c2 moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop michael parents: 3981 diff changeset	453 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	454 #else /* BRANCHLESS_CABAC_DECODER */
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	455
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	456
4034 fbe263601152 Fix crash with illegal instruction, cmov is available on 686 and later only. diego parents: 4033 diff changeset	457 #if (defined CMOV_IS_FAST && __CPU__ >= 686)
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	458 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	459 "mov "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	460 "shl $17 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	461 "cmp "low" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	462 "cmova %%ecx , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	463 "sbb %%ecx , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	464 "and %%ecx , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	465 "sub "tmp" , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	466 "xor %%ecx , "ret" \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	467 #else /* CMOV_IS_FAST */
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	468 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	469 "mov "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	470 "shl $17 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	471 "sub "low" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	472 "sar $31 , "tmp" \n\t" /*lps_mask*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	473 "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	474 "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	475 "add %%ecx , "range" \n\t" /*new range*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	476 "shl $17 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	477 "and "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	478 "sub %%ecx , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	479 "xor "tmp" , "ret" \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	480 #endif /* CMOV_IS_FAST */
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	481
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	482
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	483 #define BRANCHLESS_GET_CABAC(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	484 "movzbl "statep" , "ret" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	485 "mov "range" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	486 "and $0xC0 , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	487 "movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	488 "sub "range" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	489 BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	490 "movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	491 "shl %%cl , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	492 "movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	493 "mov "tmpbyte" , "statep" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	494 "shl %%cl , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	495 "test "lowword" , "lowword" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	496 " jnz 1f \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	497 "mov "BYTE"("cabac"), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	498 "movzwl (%%ecx) , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	499 "bswap "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	500 "shr $15 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	501 "sub $0xFFFF , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	502 "add $2 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	503 "mov %%ecx , "BYTE "("cabac") \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	504 "lea -1("low") , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	505 "xor "low" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	506 "shr $15 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	507 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	508 "neg %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	509 "add $7 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	510 "shl %%cl , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	511 "add "tmp" , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	512 "1: \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	513
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	514 asm volatile(
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	515 "movl "RANGE "(%2), %%esi \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	516 "movl "LOW "(%2), %%ebx \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	517 BRANCHLESS_GET_CABAC("%0", "%2", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl")
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	518 "movl %%esi, "RANGE "(%2) \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	519 "movl %%ebx, "LOW "(%2) \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	520
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	521 :"=&a"(bit)
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	522 :"r"(state), "r"(c)
4012 f8c649ac09dd add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn michael parents: 4008 diff changeset	523 : "%ecx", "%ebx", "%edx", "%esi", "memory"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	524 );
3981 9854f686ba79 move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster) michael parents: 3980 diff changeset	525 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	526 #endif /* BRANCHLESS_CABAC_DECODER */
4033 f7a6b2bb3a2f Expand some #endif comments. diego parents: 4027 diff changeset	527 #else /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	528 int s = *state;
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	529 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
2522 e25782262d7d kill warnings patch by (M��ns Rullg��rd <mru inprovide com>) michael parents: 2323 diff changeset	530 int bit, lps_mask attribute_unused;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	531
1287 9211fbd31353 CABAC michaelni parents: diff changeset	532 c->range -= RangeLPS;
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	533 #ifndef BRANCHLESS_CABAC_DECODER
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	534 if(c->low < (c->range<<17)){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	535 bit= s&1;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	536 *state= ff_h264_mps_state[s];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	537 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	538 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	539 bit= ff_h264_norm_shift[RangeLPS];
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	540 c->low -= (c->range<<17);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	541 *state= ff_h264_lps_state[s];
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	542 c->range = RangeLPS<<bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	543 c->low <<= bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	544 bit= (s&1)^1;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	545
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	546 if(!(c->low & 0xFFFF)){
1c39d9786efd optimization michael parents: 2116 diff changeset	547 refill2(c);
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	548 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	549 }
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	550 #else /* BRANCHLESS_CABAC_DECODER */
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	551 lps_mask= ((c->range<<17) - c->low)>>31;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	552
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	553 c->low -= (c->range<<17) & lps_mask;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	554 c->range += (RangeLPS - c->range) & lps_mask;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	555
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	556 s^=lps_mask;
4014 b2582438effe dehack ps_state indexing in the branchless decoder michael* parents: 4012 diff changeset	557 *state= (ff_h264_mlps_state+128)[s];
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	558 bit= s&1;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	559
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	560 lps_mask= ff_h264_norm_shift[c->range];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	561 c->range<<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	562 c->low <<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	563 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	564 refill2(c);
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	565 #endif /* BRANCHLESS_CABAC_DECODER */
4033 f7a6b2bb3a2f Expand some #endif comments. diego parents: 4027 diff changeset	566 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	567 return bit;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	568 }
9211fbd31353 CABAC michaelni parents: diff changeset	569
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	570 static int __attribute((noinline)) get_cabac_noinline(CABACContext c, uint8_t const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	571 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	572 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	573
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	574 static int get_cabac(CABACContext c, uint8_t const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	575 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	576 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	577
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	578 static int get_cabac_bypass(CABACContext *c){
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	579 #if 0 //not faster
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	580 int bit;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	581 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	582 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	583 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	584 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	585 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	586 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	587 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	588 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	589 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	590 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	591 " jnz 1f \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	592 "movl "BYTE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	593 "subl $0xFFFF, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	594 "movzwl (%%ebx), %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	595 "bswap %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	596 "shrl $15, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	597 "addl $2, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	598 "addl %%ecx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	599 "movl %%ebx, "BYTE "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	600 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	601 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	602
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	603 :"=&d"(bit)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	604 :"r"(c)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	605 : "%eax", "%ebx", "%ecx", "memory"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	606 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	607 return bit+1;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	608 #else
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	609 int range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	610 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	611
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	612 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	613 refill(c);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	614
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	615 range= c->range<<17;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	616 if(c->low < range){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	617 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	618 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	619 c->low -= range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	620 return 1;
9211fbd31353 CABAC michaelni parents: diff changeset	621 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	622 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	623 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	624
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	625
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	626 static always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	627 #ifdef ARCH_X86
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	628 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	629 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	630 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	631 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	632 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	633 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	634 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	635 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	636 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	637 "xor %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	638 "sub %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	639 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	640 " jnz 1f \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	641 "movl "BYTE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	642 "subl $0xFFFF, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	643 "movzwl (%%ebx), %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	644 "bswap %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	645 "shrl $15, %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	646 "addl $2, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	647 "addl %%edx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	648 "movl %%ebx, "BYTE "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	649 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	650 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	651
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	652 :"+c"(val)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	653 :"r"(c)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	654 : "%eax", "%ebx", "%edx", "memory"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	655 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	656 return val;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	657 #else
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	658 int range, mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	659 c->low += c->low;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	660
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	661 if(!(c->low & CABAC_MASK))
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	662 refill(c);
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	663
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	664 range= c->range<<17;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	665 c->low -= range;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	666 mask= c->low >> 31;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	667 range &= mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	668 c->low += range;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	669 return (val^mask)-mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	670 #endif
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	671 }
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	672
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	673 //FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note: I'll kill you if you move my code away from under my fingers before I am finished with it!)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	674 //FIXME use some macros to avoid duplicating get_cabac (can't be done yet as that would make optimization work hard)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	675 #ifdef ARCH_X86
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	676 static int decode_significance_x86(CABACContext c, int max_coeff, uint8_t significant_coeff_ctx_base, int *index){
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	677 void *end= significant_coeff_ctx_base + max_coeff - 1;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	678 int minusstart= -(int)significant_coeff_ctx_base;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	679 int minusindex= -(int)index;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	680 int coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	681 asm volatile(
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	682 "movl "RANGE "(%3), %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	683 "movl "LOW "(%3), %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	684
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	685 "2: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	686
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	687 BRANCHLESS_GET_CABAC("%0", "%3", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl")
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	688
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	689 "test $1, %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	690 " jz 3f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	691
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	692 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	693 "movl %4, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	694 "addl %1, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	695 "movl %%ecx, (%%eax) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	696 "addl $4, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	697 "movl %%eax, %2 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	698
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	699 BRANCHLESS_GET_CABAC("%0", "%3", "61(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl")
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	700
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	701 "test $1, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	702 " jnz 4f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	703
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	704 "3: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	705 "addl $1, %1 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	706 "cmpl %5, %1 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	707 " jb 2b \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	708 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	709 "movl %4, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	710 "addl %1, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	711 "movl %%ecx, (%%eax) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	712 "addl $4, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	713 "movl %%eax, %2 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	714 "4: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	715 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	716 "addl %6, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	717 "shr $2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	718
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	719 "movl %%esi, "RANGE "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	720 "movl %%ebx, "LOW "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	721 :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	722 :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	723 : "%ecx", "%ebx", "%edx", "%esi", "memory"\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	724 );
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	725 return coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	726 }
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	727 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	728
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	729 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	730 *
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	731 * @return the number of bytes read or 0 if no end
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	732 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	733 static int get_cabac_terminate(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	734 c->range -= 2;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	735 if(c->low < c->range<<17){
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	736 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	737 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	738 }else{
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	739 return c->bytestream - c->bytestream_start;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	740 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	741 }
9211fbd31353 CABAC michaelni parents: diff changeset	742
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	743 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	744 * get (truncated) unnary binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	745 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	746 static int get_cabac_u(CABACContext c, uint8_t state, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	747 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	748
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	749 for(i=0; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	750 if(get_cabac(c, state)==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	751 return i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	752
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	753 if(i< max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	754 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	755
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	756 return truncated ? max : -1;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	757 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	758
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	759 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	760 * get unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	761 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	762 static int get_cabac_ueg(CABACContext c, uint8_t state, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	763 int i, v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	764 int m= 1<<k;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	765
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	766 if(get_cabac(c, state)==0)
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	767 return 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	768
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	769 if(0 < max_index) state++;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	770
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	771 for(i=1; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	772 if(get_cabac(c, state)==0){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	773 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	774 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	775 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	776 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	777 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	778
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	779 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	780 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	781
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	782 while(get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	783 i+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	784 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	785 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	786
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	787 v=0;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	788 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	789 v+= v + get_cabac_bypass(c);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	790 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	791 i += v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	792
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	793 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	794 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	795 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	796 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	797 }

Mercurial > libavcodec.hg

annotate cabac.h @ 4046:8bbc695c9603 libavcodec