libavcodec.hg: cabac.h annotate

annotate cabac.h @ 4040:9eaea06c5ba6 libavcodec

optimize sign decoding code in decode_residual() x86 is 4% faster on P3 C sign stuff + x86 code for everything else is also faster then before (sorry forgot to test pure C) ... and if i replace the second occurance of the sign decoding in decode_residual by the asm too then everything gets slower iam starting to think that it might be best to write the whole function in asm, playing this avoid random deoptimizations game with gcc is not fun at all

author	michael
date	Thu, 19 Oct 2006 01:19:03 +0000
parents	866a83726985
children	87694a28120c

rev	line source
1287 9211fbd31353 CABAC michaelni parents: diff changeset	1 /*
9211fbd31353 CABAC michaelni parents: diff changeset	2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
9211fbd31353 CABAC michaelni parents: diff changeset	3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
9211fbd31353 CABAC michaelni parents: diff changeset	4 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	7 * FFmpeg is free software; you can redistribute it and/or
1287 9211fbd31353 CABAC michaelni parents: diff changeset	8 * modify it under the terms of the GNU Lesser General Public
9211fbd31353 CABAC michaelni parents: diff changeset	9 * License as published by the Free Software Foundation; either
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	10 * version 2.1 of the License, or (at your option) any later version.
1287 9211fbd31353 CABAC michaelni parents: diff changeset	11 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	12 * FFmpeg is distributed in the hope that it will be useful,
1287 9211fbd31353 CABAC michaelni parents: diff changeset	13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9211fbd31353 CABAC michaelni parents: diff changeset	14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9211fbd31353 CABAC michaelni parents: diff changeset	15 * Lesser General Public License for more details.
9211fbd31353 CABAC michaelni parents: diff changeset	16 *
9211fbd31353 CABAC michaelni parents: diff changeset	17 * You should have received a copy of the GNU Lesser General Public
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	18 * License along with FFmpeg; if not, write to the Free Software
3036 0b546eab515d Update licensing information: The FSF changed postal address. diego parents: 2967 diff changeset	19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1287 9211fbd31353 CABAC michaelni parents: diff changeset	20 *
9211fbd31353 CABAC michaelni parents: diff changeset	21 */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	22
1287 9211fbd31353 CABAC michaelni parents: diff changeset	23 /**
9211fbd31353 CABAC michaelni parents: diff changeset	24 * @file cabac.h
9211fbd31353 CABAC michaelni parents: diff changeset	25 * Context Adaptive Binary Arithmetic Coder.
9211fbd31353 CABAC michaelni parents: diff changeset	26 */
9211fbd31353 CABAC michaelni parents: diff changeset	27
9211fbd31353 CABAC michaelni parents: diff changeset	28
3284 a224d9752912 don't force asserts in release builds. 2% faster h264. lorenm parents: 3036 diff changeset	29 //#undef NDEBUG
1287 9211fbd31353 CABAC michaelni parents: diff changeset	30 #include <assert.h>
9211fbd31353 CABAC michaelni parents: diff changeset	31
3948 3edbf131ee44 refill cabac variables in 16bit steps, 3% faster get_cabac() michael parents: 3947 diff changeset	32 #define CABAC_BITS 16
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	33 #define CABAC_MASK ((1<<CABAC_BITS)-1)
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	34 #define BRANCHLESS_CABAC_DECODER 1
3990 746a60ba3177 enable CMOV_IS_FAST as its faster or equal speed on every cpu (duron, athlon, PM, P3) from which ive seen benchmarks, it might be slower on P4 but noone has posted benchmarks ... michael parents: 3984 diff changeset	35 #define CMOV_IS_FAST 1
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	36 //#define ARCH_X86_DISABLED 1
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	37
1287 9211fbd31353 CABAC michaelni parents: diff changeset	38 typedef struct CABACContext{
9211fbd31353 CABAC michaelni parents: diff changeset	39 int low;
9211fbd31353 CABAC michaelni parents: diff changeset	40 int range;
9211fbd31353 CABAC michaelni parents: diff changeset	41 int outstanding_count;
9211fbd31353 CABAC michaelni parents: diff changeset	42 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	43 int symCount;
9211fbd31353 CABAC michaelni parents: diff changeset	44 #endif
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	45 const uint8_t *bytestream_start;
f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	46 const uint8_t *bytestream;
2116 48d9f86fb047 overread fix michael parents: 2024 diff changeset	47 const uint8_t *bytestream_end;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	48 PutBitContext pb;
9211fbd31353 CABAC michaelni parents: diff changeset	49 }CABACContext;
9211fbd31353 CABAC michaelni parents: diff changeset	50
4014 b2582438effe dehack ps_state indexing in the branchless decoder michael* parents: 4012 diff changeset	51 extern uint8_t ff_h264_mlps_state[4*64];
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	52 extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	53 extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	54 extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	55 extern const uint8_t ff_h264_norm_shift[512];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	56
1287 9211fbd31353 CABAC michaelni parents: diff changeset	57
9211fbd31353 CABAC michaelni parents: diff changeset	58 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	59 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	60 void ff_init_cabac_states(CABACContext *c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	61
9211fbd31353 CABAC michaelni parents: diff changeset	62
9211fbd31353 CABAC michaelni parents: diff changeset	63 static inline void put_cabac_bit(CABACContext *c, int b){
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	64 put_bits(&c->pb, 1, b);
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	65 for(;c->outstanding_count; c->outstanding_count--){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	66 put_bits(&c->pb, 1, 1-b);
9211fbd31353 CABAC michaelni parents: diff changeset	67 }
9211fbd31353 CABAC michaelni parents: diff changeset	68 }
9211fbd31353 CABAC michaelni parents: diff changeset	69
9211fbd31353 CABAC michaelni parents: diff changeset	70 static inline void renorm_cabac_encoder(CABACContext *c){
9211fbd31353 CABAC michaelni parents: diff changeset	71 while(c->range < 0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	72 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	73 if(c->low<0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	74 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	75 }else if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	76 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	77 c->low -= 0x100;
9211fbd31353 CABAC michaelni parents: diff changeset	78 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	79 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	80 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	81 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	82
1287 9211fbd31353 CABAC michaelni parents: diff changeset	83 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	84 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	85 }
9211fbd31353 CABAC michaelni parents: diff changeset	86 }
9211fbd31353 CABAC michaelni parents: diff changeset	87
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	88 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	89 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state];
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	90
1287 9211fbd31353 CABAC michaelni parents: diff changeset	91 if(bit == ((*state)&1)){
9211fbd31353 CABAC michaelni parents: diff changeset	92 c->range -= RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	93 *state= ff_h264_mps_state[*state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	94 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	95 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	96 c->range = RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	97 *state= ff_h264_lps_state[*state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	98 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	99
1287 9211fbd31353 CABAC michaelni parents: diff changeset	100 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	101
9211fbd31353 CABAC michaelni parents: diff changeset	102 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	103 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	104 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	105 }
9211fbd31353 CABAC michaelni parents: diff changeset	106
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	107 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	108 assert(c->range > RangeLPS);
9211fbd31353 CABAC michaelni parents: diff changeset	109
9211fbd31353 CABAC michaelni parents: diff changeset	110 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	111 c->range -= RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	112 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	113 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	114 c->range = RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	115 }
9211fbd31353 CABAC michaelni parents: diff changeset	116
9211fbd31353 CABAC michaelni parents: diff changeset	117 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	118
9211fbd31353 CABAC michaelni parents: diff changeset	119 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	120 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	121 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	122 }
9211fbd31353 CABAC michaelni parents: diff changeset	123
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	124 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	125 * @param bit 0 -> write zero bit, !=0 write one bit
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	126 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	127 static void put_cabac_bypass(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	128 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	129
9211fbd31353 CABAC michaelni parents: diff changeset	130 if(bit){
9211fbd31353 CABAC michaelni parents: diff changeset	131 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	132 }
9211fbd31353 CABAC michaelni parents: diff changeset	133 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	134 if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	135 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	136 }else if(c->low<0x400){
9211fbd31353 CABAC michaelni parents: diff changeset	137 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	138 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	139 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	140 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	141 c->low -= 0x400;
9211fbd31353 CABAC michaelni parents: diff changeset	142 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	143
1287 9211fbd31353 CABAC michaelni parents: diff changeset	144 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	145 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	146 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	147 }
9211fbd31353 CABAC michaelni parents: diff changeset	148
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	149 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	150 *
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	151 * @return the number of bytes written
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	152 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	153 static int put_cabac_terminate(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	154 c->range -= 2;
9211fbd31353 CABAC michaelni parents: diff changeset	155
9211fbd31353 CABAC michaelni parents: diff changeset	156 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	157 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	158 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	159 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	160 c->range= 2;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	161
1287 9211fbd31353 CABAC michaelni parents: diff changeset	162 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	163
9211fbd31353 CABAC michaelni parents: diff changeset	164 assert(c->low <= 0x1FF);
9211fbd31353 CABAC michaelni parents: diff changeset	165 put_cabac_bit(c, c->low>>9);
9211fbd31353 CABAC michaelni parents: diff changeset	166 put_bits(&c->pb, 2, ((c->low>>7)&3)|1);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	167
1287 9211fbd31353 CABAC michaelni parents: diff changeset	168 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong
9211fbd31353 CABAC michaelni parents: diff changeset	169 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	170
1287 9211fbd31353 CABAC michaelni parents: diff changeset	171 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	172 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	173 #endif
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	174
1787 752b51a3c8ed get_bit_count -> put_bits_count alex parents: 1301 diff changeset	175 return (put_bits_count(&c->pb)+7)>>3;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	176 }
9211fbd31353 CABAC michaelni parents: diff changeset	177
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	178 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	179 * put (truncated) unary binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	180 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	181 static void put_cabac_u(CABACContext *c, uint8_t *state, int v, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	182 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	183
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	184 assert(v <= max);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	185
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	186 #if 1
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	187 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	188 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	189 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	190 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	191 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	192 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	193 #else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	194 if(v <= max_index){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	195 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	196 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	197 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	198 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	199 put_cabac(c, state+i, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	200 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	201 for(i=0; i<=max_index; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	202 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	203 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	204 for(; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	205 put_cabac(c, state+max_index, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	206 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	207 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	208 put_cabac(c, state+max_index, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	209 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	210 #endif
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	211 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	212
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	213 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	214 * put unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	215 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	216 static void put_cabac_ueg(CABACContext *c, uint8_t *state, int v, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	217 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	218
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	219 if(v==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	220 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	221 else{
1298 5bc3184810dc cleanup michaelni parents: 1290 diff changeset	222 const int sign= v < 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	223
4001 34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3999 diff changeset	224 if(is_signed) v= FFABS(v);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	225
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	226 if(v<max){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	227 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	228 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	229 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	230 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	231
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	232 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	233 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	234 int m= 1<<k;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	235
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	236 for(i=0; i<max; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	237 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	238 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	239 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	240
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	241 v -= max;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	242 while(v >= m){ //FIXME optimize
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	243 put_cabac_bypass(c, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	244 v-= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	245 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	246 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	247 put_cabac_bypass(c, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	248 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	249 put_cabac_bypass(c, v&m);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	250 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	251 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	252
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	253 if(is_signed)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	254 put_cabac_bypass(c, sign);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	255 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	256 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	257
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	258 static void refill(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	259 #if CABAC_BITS == 16
3946 ab0797f2e397 () 10l michael parents: 3943 diff changeset	260 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	261 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	262 c->low+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	263 #endif
1c39d9786efd optimization michael parents: 2116 diff changeset	264 c->low -= CABAC_MASK;
1c39d9786efd optimization michael parents: 2116 diff changeset	265 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	266 }
1c39d9786efd optimization michael parents: 2116 diff changeset	267
1c39d9786efd optimization michael parents: 2116 diff changeset	268 static void refill2(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	269 int i, x;
1c39d9786efd optimization michael parents: 2116 diff changeset	270
1c39d9786efd optimization michael parents: 2116 diff changeset	271 x= c->low ^ (c->low-1);
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	272 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	273
1c39d9786efd optimization michael parents: 2116 diff changeset	274 x= -CABAC_MASK;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	275
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	276 #if CABAC_BITS == 16
1c39d9786efd optimization michael parents: 2116 diff changeset	277 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
1c39d9786efd optimization michael parents: 2116 diff changeset	278 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	279 x+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	280 #endif
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	281
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	282 c->low += x<<i;
1c39d9786efd optimization michael parents: 2116 diff changeset	283 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	284 }
1c39d9786efd optimization michael parents: 2116 diff changeset	285
1287 9211fbd31353 CABAC michaelni parents: diff changeset	286 static inline void renorm_cabac_decoder(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	287 while(c->range < 0x100){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	288 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	289 c->low+= c->low;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	290 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	291 refill(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	292 }
9211fbd31353 CABAC michaelni parents: diff changeset	293 }
9211fbd31353 CABAC michaelni parents: diff changeset	294
/**
 * Renormalize the CABAC decoder state by at most one bit.
 *
 * When (c->range - 0x100) is negative, c->range and c->low are both shifted
 * left by one; otherwise both are left untouched. For the small non-negative
 * range values presumably held here this means range < 0x100 (TODO confirm
 * the value range against the callers).
 * Afterwards refill(c) is called if none of the CABAC_MASK bits of c->low
 * are set.
 *
 * @param c decoder context, its range and low fields are updated in place
 */
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	295 static inline void renorm_cabac_decoder_once(CABACContext *c){
// The x86 asm variants below are only compiled when ARCH_X86_DISABLED is
// defined; otherwise the plain C version in the matching else branch is used.
// Among the asm variants the constant 0/1 preprocessor conditions pick exactly one.
// The P3:/athlon: figures are the START/STOP_TIMER scores noted for each variant.
3951 751bfc30df72 disable benchmarking code michael parents: 3950 diff changeset	296 #ifdef ARCH_X86_DISABLED
// Scratch operand for the asm blocks (bound to ecx or eax by the constraints).
// NOTE(review): it is passed as a read-write operand without being initialized;
// the asm appears to overwrite it before reading it, so an output-only
// constraint may describe it better -- confirm before changing.
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	297 int temp;
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	298 #if 0
// Variant: temp = (range - 0x100) >> 31, i.e. 0 or 1, then shift range and
// low left by that amount (same computation as the C version).
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	299 //P3:683 athlon:475
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	300 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	301 "lea -0x100(%0), %2 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	302 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	303 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	304 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	305 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	306 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	307 #elif 0
// Variant: cl = (range < 0x100) via cmp/setb, then shift range and low left by cl.
// NOTE(review): the FIXME on the setb line mentions 31->63 although this
// variant contains no 31 constant; it looks copied from the variant above.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	308 //P3:680 athlon:474
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	309 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	310 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	311 "setb %%cl \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	312 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	313 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	314 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	315 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	316 #elif 1
// Variant selected when the asm path is enabled (its condition is the constant 1):
// eax = range - 0x100, cdq spreads its sign into edx as an all-ones/zero mask,
// then range += range & mask and low += low & mask, which doubles both
// values exactly when the mask is all ones.
// temp2 is a second uninitialized scratch operand (bound to edx), see the note on temp.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	317 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	318 //P3:665 athlon:517
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	319 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	320 "lea -0x100(%0), %%eax \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	321 "cdq \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	322 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	323 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	324 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	325 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	326 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	327 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	328 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	329 #elif 0
// Variant: same mask-and-add scheme as above, but the mask in edx is built
// from the carry flag of the cmp with sbb instead of lea/cdq.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	330 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	331 //P3:673 athlon:509
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	332 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	333 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	334 "sbb %%edx, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	335 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	336 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	337 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	338 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	339 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	340 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	341 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	342 #else
// Variant: compute 2*range and 2*low into eax/edx with lea (which leaves the
// flags of the cmp intact), then cmovb copies them back only when range < 0x100.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	343 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	344 //P3:677 athlon:511
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	345 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	346 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	347 "lea (%0, %0), %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	348 "lea (%1, %1), %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	349 "cmovb %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	350 "cmovb %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	351 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	352 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	353 #endif
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	354 #else
// Portable C version, used whenever ARCH_X86_DISABLED is not defined.
// shift is 1 when (range - 0x100) has its top bit set as a 32-bit value, else 0,
// so the two shifts below either double range and low or leave them unchanged.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	355 //P3:675 athlon:476
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	356 int shift= (uint32_t)(c->range - 0x100)>>31;
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	357 c->range<<= shift;
0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	358 c->low <<= shift;
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	359 #endif
// Common tail for every variant: call refill() once all CABAC_MASK bits of low are zero.
// refill() is defined outside this block; presumably it loads further
// bitstream bytes into low -- confirm against its definition.
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	360 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	361 refill(c);
1c39d9786efd optimization michael parents: 2116 diff changeset	362 }
1c39d9786efd optimization michael parents: 2116 diff changeset	363
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	364 static int always_inline get_cabac_inline(CABACContext c, uint8_t const state){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	365 //FIXME gcc generates duplicate load/stores for c->low and c->range
4026 2e7133456c56 10l michael parents: 4025 diff changeset	366 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	367 int bit;
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	368
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	369 #define LOW "0"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	370 #define RANGE "4"
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	371 #define BYTESTART "12"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	372 #define BYTE "16"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	373 #define BYTEEND "20"
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	374 #ifndef BRANCHLESS_CABAC_DECODER
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	375 asm volatile(
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	376 "movzbl (%1), %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	377 "movl "RANGE "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	378 "movl "RANGE "(%2), %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	379 "andl $0xC0, %%ebx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	380 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	381 "movl "LOW "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	382 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	383 "subl %%esi, %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	384 "movl %%edx, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	385 "shll $17, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	386 "cmpl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	387 " ja 1f \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	388
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	389 #if 1
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	390 //athlon:4067 P3:4110
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	391 "lea -0x100(%%edx), %%ecx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	392 "shr $31, %%ecx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	393 "shl %%cl, %%edx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	394 "shl %%cl, %%ebx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	395 #else
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	396 //athlon:4057 P3:4130
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	397 "cmp $0x100, %%edx \n\t" //FIXME avoidable
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	398 "setb %%cl \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	399 "shl %%cl, %%edx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	400 "shl %%cl, %%ebx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	401 #endif
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	402 "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	403 "movb %%cl, (%1) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	404 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	405 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	406 " jnz 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	407 "movl "BYTE "(%2), %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	408 "subl $0xFFFF, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	409 "movzwl (%%esi), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	410 "bswap %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	411 "shrl $15, %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	412 "addl $2, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	413 "addl %%ecx, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	414 "movl %%esi, "BYTE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	415 "jmp 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	416 "1: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	417 //eax:state ebx:low, edx:range, esi:RangeLPS
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	418 "subl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	419 "movl %%esi, %%edx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	420 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	421 "shll %%cl, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	422 "shll %%cl, %%edx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	423 "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	424 "movb %%cl, (%1) \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	425 "addl $1, %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	426 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	427 " jnz 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	428
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	429 "movl "BYTE "(%2), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	430 "movzwl (%%ecx), %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	431 "bswap %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	432 "shrl $15, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	433 "subl $0xFFFF, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	434 "addl $2, %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	435 "movl %%ecx, "BYTE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	436
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	437 "leal -1(%%ebx), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	438 "xorl %%ebx, %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	439 "shrl $15, %%ecx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	440 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"
3994 2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	441 "neg %%ecx \n\t"
2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	442 "add $7, %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	443
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	444 "shll %%cl , %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	445 "addl %%esi, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	446 "2: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	447 "movl %%edx, "RANGE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	448 "movl %%ebx, "LOW "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	449 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	450 :"r"(state), "r"(c)
4012 f8c649ac09dd add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn michael parents: 4008 diff changeset	451 : "%ecx", "%ebx", "%edx", "%esi", "memory"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	452 );
3982 af16271634c2 moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop michael parents: 3981 diff changeset	453 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	454 #else /* BRANCHLESS_CABAC_DECODER */
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	455 asm volatile(
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	456 "movzbl (%1), %0 \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	457 "movl "RANGE "(%2), %%ebx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	458 "movl "RANGE "(%2), %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	459 "andl $0xC0, %%ebx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	460 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	461 "movl "LOW "(%2), %%ebx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	462 //eax:state ebx:low, edx:range, esi:RangeLPS
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	463 "subl %%esi, %%edx \n\t"
4034 fbe263601152 Fix crash with illegal instruction, cmov is available on 686 and later only. diego parents: 4033 diff changeset	464 #if (defined CMOV_IS_FAST && __CPU__ >= 686)
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	465 "movl %%edx, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	466 "shl $17, %%edx \n\t"
3980 5afe4253a220 replace a few and/sub/... by cmov michael parents: 3979 diff changeset	467 "cmpl %%ebx, %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	468 "cmova %%ecx, %%esi \n\t"
3980 5afe4253a220 replace a few and/sub/... by cmov michael parents: 3979 diff changeset	469 "sbbl %%ecx, %%ecx \n\t"
5afe4253a220 replace a few and/sub/... by cmov michael parents: 3979 diff changeset	470 "andl %%ecx, %%edx \n\t"
5afe4253a220 replace a few and/sub/... by cmov michael parents: 3979 diff changeset	471 "subl %%edx, %%ebx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	472 "xorl %%ecx, %0 \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	473 #else /* CMOV_IS_FAST */
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	474 "movl %%edx, %%ecx \n\t"
4027 ca4a837b896d fix !CMOV_IS_FAST case (iam not really happy with the fix but i didnt come up with a better one quickly) michael parents: 4026 diff changeset	475 "shl $17, %%edx \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	476 "subl %%ebx, %%edx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	477 "sarl $31, %%edx \n\t" //lps_mask
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	478 "subl %%ecx, %%esi \n\t" //RangeLPS - range
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	479 "andl %%edx, %%esi \n\t" //(RangeLPS - range)&lps_mask
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	480 "addl %%ecx, %%esi \n\t" //new range
4027 ca4a837b896d fix !CMOV_IS_FAST case (iam not really happy with the fix but i didnt come up with a better one quickly) michael parents: 4026 diff changeset	481 "shl $17, %%ecx \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	482 "andl %%edx, %%ecx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	483 "subl %%ecx, %%ebx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	484 "xorl %%edx, %0 \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	485 #endif /* CMOV_IS_FAST */
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	486
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	487 //eax:state ebx:low edx:mask esi:range
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	488
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	489 //eax:bit ebx:low esi:range
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	490
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	491 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	492 "shll %%cl, %%esi \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	493 "movzbl "MANGLE(ff_h264_mlps_state)"+128(%0), %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	494 "movb %%dl, (%1) \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	495 "movl %%esi, "RANGE "(%2) \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	496 "shll %%cl, %%ebx \n\t"
3995 b00c06477dff write cabac low and range variables as early as possible to prevent stalls from reading them before they where written, the P4 is said to disslike that alot, on P3 its 2% faster (START/STOP_TIMER over decode_residual) michael parents: 3994 diff changeset	497 "movl %%ebx, "LOW "(%2) \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	498 "test %%bx, %%bx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	499 " jnz 1f \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	500
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	501 "movl "BYTE "(%2), %%ecx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	502 "movzwl (%%ecx), %%esi \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	503 "bswap %%esi \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	504 "shrl $15, %%esi \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	505 "subl $0xFFFF, %%esi \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	506 "addl $2, %%ecx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	507 "movl %%ecx, "BYTE "(%2) \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	508
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	509 "leal -1(%%ebx), %%ecx \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	510 "xorl %%ebx, %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	511 "shrl $15, %%ecx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	512 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"
3994 2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	513 "neg %%ecx \n\t"
2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	514 "add $7, %%ecx \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	515
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	516 "shll %%cl , %%esi \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	517 "addl %%esi, %%ebx \n\t"
3995 b00c06477dff write cabac low and range variables as early as possible to prevent stalls from reading them before they where written, the P4 is said to disslike that alot, on P3 its 2% faster (START/STOP_TIMER over decode_residual) michael parents: 3994 diff changeset	518 "movl %%ebx, "LOW "(%2) \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	519 "1: \n\t"
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	520 :"=&a"(bit)
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	521 :"r"(state), "r"(c)
4012 f8c649ac09dd add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn michael parents: 4008 diff changeset	522 : "%ecx", "%ebx", "%edx", "%esi", "memory"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	523 );
3981 9854f686ba79 move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster) michael parents: 3980 diff changeset	524 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	525 #endif /* BRANCHLESS_CABAC_DECODER */
4033 f7a6b2bb3a2f Expand some #endif comments. diego parents: 4027 diff changeset	526 #else /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	527 int s = *state;
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	528 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
2522 e25782262d7d kill warnings patch by (M��ns Rullg��rd <mru inprovide com>) michael parents: 2323 diff changeset	529 int bit, lps_mask attribute_unused;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	530
1287 9211fbd31353 CABAC michaelni parents: diff changeset	531 c->range -= RangeLPS;
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	532 #ifndef BRANCHLESS_CABAC_DECODER
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	533 if(c->low < (c->range<<17)){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	534 bit= s&1;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	535 *state= ff_h264_mps_state[s];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	536 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	537 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	538 bit= ff_h264_norm_shift[RangeLPS];
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	539 c->low -= (c->range<<17);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	540 *state= ff_h264_lps_state[s];
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	541 c->range = RangeLPS<<bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	542 c->low <<= bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	543 bit= (s&1)^1;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	544
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	545 if(!(c->low & 0xFFFF)){
1c39d9786efd optimization michael parents: 2116 diff changeset	546 refill2(c);
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	547 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	548 }
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	549 #else /* BRANCHLESS_CABAC_DECODER */
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	550 lps_mask= ((c->range<<17) - c->low)>>31;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	551
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	552 c->low -= (c->range<<17) & lps_mask;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	553 c->range += (RangeLPS - c->range) & lps_mask;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	554
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	555 s^=lps_mask;
4014 b2582438effe dehack ps_state indexing in the branchless decoder michael* parents: 4012 diff changeset	556 *state= (ff_h264_mlps_state+128)[s];
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	557 bit= s&1;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	558
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	559 lps_mask= ff_h264_norm_shift[c->range];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	560 c->range<<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	561 c->low <<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	562 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	563 refill2(c);
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	564 #endif /* BRANCHLESS_CABAC_DECODER */
4033 f7a6b2bb3a2f Expand some #endif comments. diego parents: 4027 diff changeset	565 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	566 return bit;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	567 }
9211fbd31353 CABAC michaelni parents: diff changeset	568
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	569 static int __attribute((noinline)) get_cabac_noinline(CABACContext c, uint8_t const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	570 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	571 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	572
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	573 static int get_cabac(CABACContext c, uint8_t const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	574 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	575 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	576
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	577 static int get_cabac_bypass(CABACContext *c){
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	578 #if 0 //not faster
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	579 int bit;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	580 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	581 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	582 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	583 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	584 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	585 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	586 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	587 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	588 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	589 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	590 " jnz 1f \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	591 "movl "BYTE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	592 "subl $0xFFFF, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	593 "movzwl (%%ebx), %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	594 "bswap %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	595 "shrl $15, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	596 "addl $2, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	597 "addl %%ecx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	598 "movl %%ebx, "BYTE "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	599 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	600 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	601
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	602 :"=&d"(bit)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	603 :"r"(c)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	604 : "%eax", "%ebx", "%ecx", "memory"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	605 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	606 return bit+1;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	607 #else
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	608 int range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	609 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	610
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	611 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	612 refill(c);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	613
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	614 range= c->range<<17;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	615 if(c->low < range){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	616 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	617 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	618 c->low -= range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	619 return 1;
9211fbd31353 CABAC michaelni parents: diff changeset	620 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	621 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	622 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	623
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	624
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	625 static always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	626 #ifdef ARCH_X86
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	627 int bit;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	628 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	629 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	630 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	631 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	632 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	633 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	634 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	635 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	636 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	637 "xor %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	638 "sub %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	639 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	640 " jnz 1f \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	641 "movl "BYTE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	642 "subl $0xFFFF, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	643 "movzwl (%%ebx), %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	644 "bswap %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	645 "shrl $15, %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	646 "addl $2, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	647 "addl %%edx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	648 "movl %%ebx, "BYTE "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	649 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	650 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	651
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	652 :"+c"(val)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	653 :"r"(c)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	654 : "%eax", "%ebx", "%edx", "memory"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	655 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	656 return val;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	657 #else
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	658 int range, mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	659 c->low += c->low;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	660
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	661 if(!(c->low & CABAC_MASK))
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	662 refill(c);
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	663
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	664 range= c->range<<17;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	665 c->low -= range;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	666 mask= c->low >> 31;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	667 range &= mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	668 c->low += range;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	669 return (val^mask)-mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	670 #endif
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	671 }
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	672
//FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note: I'll kill you if you move my code away from under my fingers before I am finished with it!)
//FIXME use some macros to avoid duplicating get_cabac (can't be done yet as that would make optimization work hard)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	675 #ifdef ARCH_X86
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	676 static int decode_significance_x86(CABACContext c, int max_coeff, uint8_t significant_coeff_ctx_base, int *index){
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	677 void *end= significant_coeff_ctx_base + max_coeff - 1;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	678 int minusstart= -(int)significant_coeff_ctx_base;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	679 int minusindex= -(int)index;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	680 int coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	681 asm volatile(
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	682 "movl "RANGE "(%3), %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	683 "movl "LOW "(%3), %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	684
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	685 "2: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	686
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	687 "movzbl (%1), %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	688 "movl %%esi, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	689 "andl $0xC0, %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	690 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%esi, 2), %%esi\n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	691 /*eax:state ebx:low, edx:range, esi:RangeLPS*/
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	692 "subl %%esi, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	693
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	694 #if (defined CMOV_IS_FAST && __CPU__ >= 686)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	695 "movl %%edx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	696 "shl $17, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	697 "cmpl %%ebx, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	698 "cmova %%ecx, %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	699 "sbbl %%ecx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	700 "andl %%ecx, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	701 "subl %%edx, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	702 "xorl %%ecx, %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	703 #else /* CMOV_IS_FAST */
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	704 "movl %%edx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	705 "shl $17, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	706 "subl %%ebx, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	707 "sarl $31, %%edx \n\t" //lps_mask
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	708 "subl %%ecx, %%esi \n\t" //RangeLPS - range
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	709 "andl %%edx, %%esi \n\t" //(RangeLPS - range)&lps_mask
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	710 "addl %%ecx, %%esi \n\t" //new range
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	711 "shl $17, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	712 "andl %%edx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	713 "subl %%ecx, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	714 "xorl %%edx, %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	715 #endif /* CMOV_IS_FAST */
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	716
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	717 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	718 "shll %%cl, %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	719 "movzbl "MANGLE(ff_h264_mlps_state)"+128(%0), %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	720 "movb %%dl, (%1) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	721 "shll %%cl, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	722 "test %%bx, %%bx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	723 " jnz 1f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	724
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	725 "movl "BYTE "(%3), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	726 "movzwl (%%ecx), %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	727 "bswap %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	728 "shrl $15, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	729 "subl $0xFFFF, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	730 "addl $2, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	731 "movl %%ecx, "BYTE "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	732
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	733 "leal -1(%%ebx), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	734 "xorl %%ebx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	735 "shrl $15, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	736 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	737 "neg %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	738 "add $7, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	739
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	740 "shll %%cl , %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	741 "addl %%edx, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	742 "1: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	743
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	744 "test $1, %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	745 " jz 3f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	746
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	747 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	748 "movl %4, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	749 "addl %1, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	750 "movl %%ecx, (%%eax) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	751 "addl $4, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	752 "movl %%eax, %2 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	753
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	754 "movzbl 61(%1), %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	755 "movl %%esi, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	756 "andl $0xC0, %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	757 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%esi, 2), %%esi\n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	758 /*eax:state ebx:low, edx:range, esi:RangeLPS*/
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	759 "subl %%esi, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	760
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	761 #if (defined CMOV_IS_FAST && __CPU__ >= 686)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	762 "movl %%edx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	763 "shl $17, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	764 "cmpl %%ebx, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	765 "cmova %%ecx, %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	766 "sbbl %%ecx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	767 "andl %%ecx, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	768 "subl %%edx, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	769 "xorl %%ecx, %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	770 #else /* CMOV_IS_FAST */
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	771 "movl %%edx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	772 "shl $17, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	773 "subl %%ebx, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	774 "sarl $31, %%edx \n\t" //lps_mask
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	775 "subl %%ecx, %%esi \n\t" //RangeLPS - range
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	776 "andl %%edx, %%esi \n\t" //(RangeLPS - range)&lps_mask
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	777 "addl %%ecx, %%esi \n\t" //new range
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	778 "shl $17, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	779 "andl %%edx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	780 "subl %%ecx, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	781 "xorl %%edx, %0 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	782 #endif /* CMOV_IS_FAST */
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	783
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	784 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	785 "shll %%cl, %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	786 "movzbl "MANGLE(ff_h264_mlps_state)"+128(%0), %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	787 "movb %%dl, 61(%1) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	788 "shll %%cl, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	789 "test %%bx, %%bx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	790 " jnz 1f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	791
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	792 "movl "BYTE "(%3), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	793 "movzwl (%%ecx), %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	794 "bswap %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	795 "shrl $15, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	796 "subl $0xFFFF, %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	797 "addl $2, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	798 "movl %%ecx, "BYTE "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	799
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	800 "leal -1(%%ebx), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	801 "xorl %%ebx, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	802 "shrl $15, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	803 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	804 "neg %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	805 "add $7, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	806
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	807 "shll %%cl , %%edx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	808 "addl %%edx, %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	809 "1: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	810
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	811 "test $1, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	812 " jnz 4f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	813
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	814 "3: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	815 "addl $1, %1 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	816 "cmpl %5, %1 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	817 " jb 2b \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	818 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	819 "movl %4, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	820 "addl %1, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	821 "movl %%ecx, (%%eax) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	822 "addl $4, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	823 "movl %%eax, %2 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	824 "4: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	825 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	826 "addl %6, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	827 "shr $2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	828
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	829 "movl %%esi, "RANGE "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	830 "movl %%ebx, "LOW "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	831 :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	832 :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	833 : "%ecx", "%ebx", "%edx", "%esi", "memory"\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	834 );
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	835 return coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	836 }
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	837 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	838
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	839 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	840 * Read the terminate symbol: reduces c->range by 2, then tests c->low against c->range<<17.
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	841 * @return the number of bytes read (c->bytestream - c->bytestream_start), or 0 if no end was signalled (c->low < c->range<<17; the decoder is then renormalized once)
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	842 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	843 static int get_cabac_terminate(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	844 c->range -= 2;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	845 if(c->low < c->range<<17){ /* range is shifted left by 17 bits before it is compared with low */
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	846 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	847 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	848 }else{
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	849 return c->bytestream - c->bytestream_start; /* offset of the read position from the start of the buffer */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	850 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	851 }
9211fbd31353 CABAC michaelni parents: diff changeset	852
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	853 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	854 * get (truncated) unary binarization: calls get_cabac(c, state) up to max times and returns the number of nonzero results seen before the first 0; after each nonzero result state is advanced by one while i < max_index. If all max results are nonzero, returns max when truncated is set, otherwise -1.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	855 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	856 static int get_cabac_u(CABACContext *c, uint8_t *state, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	857 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	858
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	859 for(i=0; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	860 if(get_cabac(c, state)==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	861 return i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	862
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	863 if(i< max_index) state++; /* state stops advancing once i reaches max_index */
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	864 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	865
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	866 return truncated ? max : -1;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	867 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	868
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	869 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	870 * get unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	871 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	872 static int get_cabac_ueg(CABACContext c, uint8_t state, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	873 int i, v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	874 int m= 1<<k;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	875
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	876 if(get_cabac(c, state)==0)
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	877 return 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	878
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	879 if(0 < max_index) state++;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	880
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	881 for(i=1; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	882 if(get_cabac(c, state)==0){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	883 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	884 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	885 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	886 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	887 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	888
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	889 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	890 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	891
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	892 while(get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	893 i+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	894 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	895 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	896
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	897 v=0;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	898 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	899 v+= v + get_cabac_bypass(c);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	900 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	901 i += v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	902
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	903 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	904 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	905 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	906 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	907 }

Mercurial > libavcodec.hg

annotate cabac.h @ 4040:9eaea06c5ba6 libavcodec