libavcodec.hg: cabac.h annotate

annotate cabac.h @ 4050:2c79a8281cb6 libavcodec

Protect code that uses CMOV instructions with HAVE_CMOV, Make configure set CMOV_IS_FAST on arches on which cmov has a low latency (typically non-Netburst based processor)

author	gpoirier
date	Fri, 20 Oct 2006 17:53:19 +0000
parents	8c1a5ed03a00
children	19f07b651d79

rev	line source
1287 9211fbd31353 CABAC michaelni parents: diff changeset	1 /*
9211fbd31353 CABAC michaelni parents: diff changeset	2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
9211fbd31353 CABAC michaelni parents: diff changeset	3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
9211fbd31353 CABAC michaelni parents: diff changeset	4 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	7 * FFmpeg is free software; you can redistribute it and/or
1287 9211fbd31353 CABAC michaelni parents: diff changeset	8 * modify it under the terms of the GNU Lesser General Public
9211fbd31353 CABAC michaelni parents: diff changeset	9 * License as published by the Free Software Foundation; either
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	10 * version 2.1 of the License, or (at your option) any later version.
1287 9211fbd31353 CABAC michaelni parents: diff changeset	11 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	12 * FFmpeg is distributed in the hope that it will be useful,
1287 9211fbd31353 CABAC michaelni parents: diff changeset	13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9211fbd31353 CABAC michaelni parents: diff changeset	14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9211fbd31353 CABAC michaelni parents: diff changeset	15 * Lesser General Public License for more details.
9211fbd31353 CABAC michaelni parents: diff changeset	16 *
9211fbd31353 CABAC michaelni parents: diff changeset	17 * You should have received a copy of the GNU Lesser General Public
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	18 * License along with FFmpeg; if not, write to the Free Software
3036 0b546eab515d Update licensing information: The FSF changed postal address. diego parents: 2967 diff changeset	19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1287 9211fbd31353 CABAC michaelni parents: diff changeset	20 *
9211fbd31353 CABAC michaelni parents: diff changeset	21 */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	22
1287 9211fbd31353 CABAC michaelni parents: diff changeset	23 /**
9211fbd31353 CABAC michaelni parents: diff changeset	24 * @file cabac.h
9211fbd31353 CABAC michaelni parents: diff changeset	25 * Context Adaptive Binary Arithmetic Coder.
9211fbd31353 CABAC michaelni parents: diff changeset	26 */
9211fbd31353 CABAC michaelni parents: diff changeset	27
9211fbd31353 CABAC michaelni parents: diff changeset	28
3284 a224d9752912 don't force asserts in release builds. 2% faster h264. lorenm parents: 3036 diff changeset	29 //#undef NDEBUG
1287 9211fbd31353 CABAC michaelni parents: diff changeset	30 #include <assert.h>
9211fbd31353 CABAC michaelni parents: diff changeset	31
3948 3edbf131ee44 refill cabac variables in 16bit steps, 3% faster get_cabac() michael parents: 3947 diff changeset	32 #define CABAC_BITS 16
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	33 #define CABAC_MASK ((1<<CABAC_BITS)-1)
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	34 #define BRANCHLESS_CABAC_DECODER 1
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	35 //#define ARCH_X86_DISABLED 1
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	36
1287 9211fbd31353 CABAC michaelni parents: diff changeset	37 typedef struct CABACContext{
9211fbd31353 CABAC michaelni parents: diff changeset	38 int low;
9211fbd31353 CABAC michaelni parents: diff changeset	39 int range;
9211fbd31353 CABAC michaelni parents: diff changeset	40 int outstanding_count;
9211fbd31353 CABAC michaelni parents: diff changeset	41 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	42 int symCount;
9211fbd31353 CABAC michaelni parents: diff changeset	43 #endif
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	44 const uint8_t *bytestream_start;
f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	45 const uint8_t *bytestream;
2116 48d9f86fb047 overread fix michael parents: 2024 diff changeset	46 const uint8_t *bytestream_end;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	47 PutBitContext pb;
9211fbd31353 CABAC michaelni parents: diff changeset	48 }CABACContext;
9211fbd31353 CABAC michaelni parents: diff changeset	49
4014 b2582438effe dehack ps_state indexing in the branchless decoder michael* parents: 4012 diff changeset	50 extern uint8_t ff_h264_mlps_state[4*64];
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	51 extern uint8_t ff_h264_lps_range[2*4*64]; ///< rangeTabLPS
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	52 extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	53 extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	54 extern const uint8_t ff_h264_norm_shift[512];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	55
1287 9211fbd31353 CABAC michaelni parents: diff changeset	56
9211fbd31353 CABAC michaelni parents: diff changeset	57 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	58 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	59 void ff_init_cabac_states(CABACContext *c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	60
9211fbd31353 CABAC michaelni parents: diff changeset	61
9211fbd31353 CABAC michaelni parents: diff changeset	62 static inline void put_cabac_bit(CABACContext *c, int b){
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	63 put_bits(&c->pb, 1, b);
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	64 for(;c->outstanding_count; c->outstanding_count--){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	65 put_bits(&c->pb, 1, 1-b);
9211fbd31353 CABAC michaelni parents: diff changeset	66 }
9211fbd31353 CABAC michaelni parents: diff changeset	67 }
9211fbd31353 CABAC michaelni parents: diff changeset	68
9211fbd31353 CABAC michaelni parents: diff changeset	69 static inline void renorm_cabac_encoder(CABACContext *c){
9211fbd31353 CABAC michaelni parents: diff changeset	70 while(c->range < 0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	71 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	72 if(c->low<0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	73 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	74 }else if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	75 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	76 c->low -= 0x100;
9211fbd31353 CABAC michaelni parents: diff changeset	77 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	78 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	79 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	80 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	81
1287 9211fbd31353 CABAC michaelni parents: diff changeset	82 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	83 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	84 }
9211fbd31353 CABAC michaelni parents: diff changeset	85 }
9211fbd31353 CABAC michaelni parents: diff changeset	86
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	87 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	88 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state];
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	89
1287 9211fbd31353 CABAC michaelni parents: diff changeset	90 if(bit == ((*state)&1)){
9211fbd31353 CABAC michaelni parents: diff changeset	91 c->range -= RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	92 *state= ff_h264_mps_state[*state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	93 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	94 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	95 c->range = RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	96 *state= ff_h264_lps_state[*state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	97 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	98
1287 9211fbd31353 CABAC michaelni parents: diff changeset	99 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	100
9211fbd31353 CABAC michaelni parents: diff changeset	101 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	102 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	103 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	104 }
9211fbd31353 CABAC michaelni parents: diff changeset	105
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	106 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	107 assert(c->range > RangeLPS);
9211fbd31353 CABAC michaelni parents: diff changeset	108
9211fbd31353 CABAC michaelni parents: diff changeset	109 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	110 c->range -= RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	111 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	112 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	113 c->range = RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	114 }
9211fbd31353 CABAC michaelni parents: diff changeset	115
9211fbd31353 CABAC michaelni parents: diff changeset	116 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	117
9211fbd31353 CABAC michaelni parents: diff changeset	118 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	119 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	120 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	121 }
9211fbd31353 CABAC michaelni parents: diff changeset	122
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	123 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	124 * @param bit 0 -> write zero bit, !=0 write one bit
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	125 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	126 static void put_cabac_bypass(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	127 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	128
9211fbd31353 CABAC michaelni parents: diff changeset	129 if(bit){
9211fbd31353 CABAC michaelni parents: diff changeset	130 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	131 }
9211fbd31353 CABAC michaelni parents: diff changeset	132 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	133 if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	134 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	135 }else if(c->low<0x400){
9211fbd31353 CABAC michaelni parents: diff changeset	136 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	137 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	138 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	139 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	140 c->low -= 0x400;
9211fbd31353 CABAC michaelni parents: diff changeset	141 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	142
1287 9211fbd31353 CABAC michaelni parents: diff changeset	143 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	144 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	145 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	146 }
9211fbd31353 CABAC michaelni parents: diff changeset	147
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	148 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	149 *
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	150 * @return the number of bytes written
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	151 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	152 static int put_cabac_terminate(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	153 c->range -= 2;
9211fbd31353 CABAC michaelni parents: diff changeset	154
9211fbd31353 CABAC michaelni parents: diff changeset	155 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	156 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	157 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	158 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	159 c->range= 2;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	160
1287 9211fbd31353 CABAC michaelni parents: diff changeset	161 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	162
9211fbd31353 CABAC michaelni parents: diff changeset	163 assert(c->low <= 0x1FF);
9211fbd31353 CABAC michaelni parents: diff changeset	164 put_cabac_bit(c, c->low>>9);
9211fbd31353 CABAC michaelni parents: diff changeset	165 put_bits(&c->pb, 2, ((c->low>>7)&3)|1);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	166
1287 9211fbd31353 CABAC michaelni parents: diff changeset	167 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong
9211fbd31353 CABAC michaelni parents: diff changeset	168 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	169
1287 9211fbd31353 CABAC michaelni parents: diff changeset	170 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	171 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	172 #endif
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	173
1787 752b51a3c8ed get_bit_count -> put_bits_count alex parents: 1301 diff changeset	174 return (put_bits_count(&c->pb)+7)>>3;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	175 }
9211fbd31353 CABAC michaelni parents: diff changeset	176
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	177 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	178 * put (truncated) unary binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	179 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	180 static void put_cabac_u(CABACContext *c, uint8_t *state, int v, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	181 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	182
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	183 assert(v <= max);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	184
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	185 #if 1
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	186 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	187 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	188 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	189 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	190 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	191 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	192 #else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	193 if(v <= max_index){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	194 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	195 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	196 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	197 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	198 put_cabac(c, state+i, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	199 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	200 for(i=0; i<=max_index; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	201 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	202 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	203 for(; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	204 put_cabac(c, state+max_index, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	205 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	206 if(truncated==0 || v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	207 put_cabac(c, state+max_index, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	208 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	209 #endif
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	210 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	211
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	212 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	213 * put unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	214 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	215 static void put_cabac_ueg(CABACContext *c, uint8_t *state, int v, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	216 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	217
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	218 if(v==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	219 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	220 else{
1298 5bc3184810dc cleanup michaelni parents: 1290 diff changeset	221 const int sign= v < 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	222
4001 34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3999 diff changeset	223 if(is_signed) v= FFABS(v);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	224
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	225 if(v<max){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	226 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	227 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	228 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	229 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	230
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	231 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	232 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	233 int m= 1<<k;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	234
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	235 for(i=0; i<max; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	236 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	237 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	238 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	239
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	240 v -= max;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	241 while(v >= m){ //FIXME optimize
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	242 put_cabac_bypass(c, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	243 v-= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	244 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	245 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	246 put_cabac_bypass(c, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	247 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	248 put_cabac_bypass(c, v&m);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	249 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	250 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	251
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	252 if(is_signed)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	253 put_cabac_bypass(c, sign);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	254 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	255 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	256
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	257 static void refill(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	258 #if CABAC_BITS == 16
3946 ab0797f2e397 () 10l michael parents: 3943 diff changeset	259 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	260 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	261 c->low+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	262 #endif
1c39d9786efd optimization michael parents: 2116 diff changeset	263 c->low -= CABAC_MASK;
1c39d9786efd optimization michael parents: 2116 diff changeset	264 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	265 }
1c39d9786efd optimization michael parents: 2116 diff changeset	266
1c39d9786efd optimization michael parents: 2116 diff changeset	267 static void refill2(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	268 int i, x;
1c39d9786efd optimization michael parents: 2116 diff changeset	269
1c39d9786efd optimization michael parents: 2116 diff changeset	270 x= c->low ^ (c->low-1);
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	271 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	272
1c39d9786efd optimization michael parents: 2116 diff changeset	273 x= -CABAC_MASK;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	274
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	275 #if CABAC_BITS == 16
1c39d9786efd optimization michael parents: 2116 diff changeset	276 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
1c39d9786efd optimization michael parents: 2116 diff changeset	277 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	278 x+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	279 #endif
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	280
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	281 c->low += x<<i;
1c39d9786efd optimization michael parents: 2116 diff changeset	282 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	283 }
1c39d9786efd optimization michael parents: 2116 diff changeset	284
1287 9211fbd31353 CABAC michaelni parents: diff changeset	285 static inline void renorm_cabac_decoder(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	286 while(c->range < 0x100){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	287 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	288 c->low+= c->low;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	289 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	290 refill(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	291 }
9211fbd31353 CABAC michaelni parents: diff changeset	292 }
9211fbd31353 CABAC michaelni parents: diff changeset	293
/**
 * Renormalizes the decoder state by at most one bit.
 * When the 32-bit difference c->range - 0x100 has its sign bit set, both
 * c->range and c->low are shifted left by one; otherwise both are left unchanged.
 * Afterwards refill(c) is called if (c->low & CABAC_MASK) is zero.
 *
 * The x86 inline-asm variants below are compiled only when ARCH_X86_DISABLED is
 * defined; otherwise the plain C path at the end is used.
 * NOTE(review): ARCH_X86_DISABLED is presumably never defined by the build, which
 * would leave the asm variants as disabled benchmarking code -- confirm.
 * The "P3:/athlon:" figures next to each variant are the recorded START/STOP_TIMER
 * scores for that variant.
 */
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	294 static inline void renorm_cabac_decoder_once(CABACContext *c){
3951 751bfc30df72 disable benchmarking code michael parents: 3950 diff changeset	295 #ifdef ARCH_X86_DISABLED
// temp (and temp2 in the later variants) are declared without an initializer and
// passed as read-write ("+") operands bound to fixed registers.
// NOTE(review): they appear to serve only as scratch registers -- confirm.
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	296 int temp;
// Variant selection: only the first branch of the #if/#elif chain whose condition
// is non-zero is compiled (currently the cdq-based variant under "#elif 1").
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	297 #if 0
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	298 //P3:683 athlon:475
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	299 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	300 "lea -0x100(%0), %2 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	301 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	302 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	303 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	304 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	305 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	306 #elif 0
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	307 //P3:680 athlon:474
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	308 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	309 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	310 "setb %%cl \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	311 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	312 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	313 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	314 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	315 #elif 1
// Selected asm variant: cdq expands the sign of (range - 0x100) into edx as an
// all-ones or all-zero mask; range and low are then each added to their masked
// copy, which doubles them when the mask is all ones and leaves them unchanged otherwise.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	316 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	317 //P3:665 athlon:517
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	318 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	319 "lea -0x100(%0), %%eax \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	320 "cdq \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	321 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	322 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	323 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	324 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	325 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	326 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	327 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	328 #elif 0
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	329 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	330 //P3:673 athlon:509
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	331 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	332 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	333 "sbb %%edx, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	334 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	335 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	336 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	337 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	338 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	339 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	340 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	341 #else
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	342 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	343 //P3:677 athlon:511
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	344 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	345 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	346 "lea (%0, %0), %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	347 "lea (%1, %1), %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	348 "cmovb %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	349 "cmovb %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	350 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	351 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	352 #endif
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	353 #else
// Portable C path: shift is 1 exactly when the 32-bit value c->range - 0x100 has
// its top bit set (the cast to uint32_t makes the >>31 a logical shift), else 0.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	354 //P3:675 athlon:476
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	355 int shift= (uint32_t)(c->range - 0x100)>>31;
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	356 c->range<<= shift;
0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	357 c->low <<= shift;
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	358 #endif
// Pull in more input once the bits of low selected by CABAC_MASK are all zero.
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	359 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	360 refill(c);
1c39d9786efd optimization michael parents: 2116 diff changeset	361 }
1c39d9786efd optimization michael parents: 2116 diff changeset	362
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	363 static int always_inline get_cabac_inline(CABACContext c, uint8_t const state){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	364 //FIXME gcc generates duplicate load/stores for c->low and c->range
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	365 #define LOW "0"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	366 #define RANGE "4"
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	367 #define BYTESTART "12"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	368 #define BYTE "16"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	369 #define BYTEEND "20"
4044 5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	370 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	371 int bit;
5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	372
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	373 #ifndef BRANCHLESS_CABAC_DECODER
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	374 asm volatile(
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	375 "movzbl (%1), %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	376 "movl "RANGE "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	377 "movl "RANGE "(%2), %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	378 "andl $0xC0, %%ebx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	379 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	380 "movl "LOW "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	381 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	382 "subl %%esi, %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	383 "movl %%edx, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	384 "shll $17, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	385 "cmpl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	386 " ja 1f \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	387
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	388 #if 1
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	389 //athlon:4067 P3:4110
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	390 "lea -0x100(%%edx), %%ecx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	391 "shr $31, %%ecx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	392 "shl %%cl, %%edx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	393 "shl %%cl, %%ebx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	394 #else
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	395 //athlon:4057 P3:4130
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	396 "cmp $0x100, %%edx \n\t" //FIXME avoidable
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	397 "setb %%cl \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	398 "shl %%cl, %%edx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	399 "shl %%cl, %%ebx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	400 #endif
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	401 "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	402 "movb %%cl, (%1) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	403 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	404 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	405 " jnz 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	406 "movl "BYTE "(%2), %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	407 "subl $0xFFFF, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	408 "movzwl (%%esi), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	409 "bswap %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	410 "shrl $15, %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	411 "addl $2, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	412 "addl %%ecx, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	413 "movl %%esi, "BYTE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	414 "jmp 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	415 "1: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	416 //eax:state ebx:low, edx:range, esi:RangeLPS
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	417 "subl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	418 "movl %%esi, %%edx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	419 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	420 "shll %%cl, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	421 "shll %%cl, %%edx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	422 "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	423 "movb %%cl, (%1) \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	424 "addl $1, %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	425 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	426 " jnz 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	427
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	428 "movl "BYTE "(%2), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	429 "movzwl (%%ecx), %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	430 "bswap %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	431 "shrl $15, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	432 "subl $0xFFFF, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	433 "addl $2, %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	434 "movl %%ecx, "BYTE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	435
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	436 "leal -1(%%ebx), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	437 "xorl %%ebx, %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	438 "shrl $15, %%ecx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	439 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"
3994 2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	440 "neg %%ecx \n\t"
2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	441 "add $7, %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	442
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	443 "shll %%cl , %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	444 "addl %%esi, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	445 "2: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	446 "movl %%edx, "RANGE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	447 "movl %%ebx, "LOW "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	448 :"=&a"(bit) //FIXME: this is fragile; gcc either runs out of registers or miscompiles it (for example if "+a"(bit) or "+m"(*state) is used)
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	449 :"r"(state), "r"(c)
4012 f8c649ac09dd add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn michael parents: 4008 diff changeset	450 : "%ecx", "%ebx", "%edx", "%esi", "memory"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	451 );
3982 af16271634c2 moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop michael parents: 3981 diff changeset	452 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	453 #else /* BRANCHLESS_CABAC_DECODER */
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	454
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	455
4050 2c79a8281cb6 Protect code that uses CMOV instructions with HAVE_CMOV, gpoirier parents: 4049 diff changeset	456 #if defined CMOV_IS_FAST
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	457 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	458 "mov "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	459 "shl $17 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	460 "cmp "low" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	461 "cmova %%ecx , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	462 "sbb %%ecx , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	463 "and %%ecx , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	464 "sub "tmp" , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	465 "xor %%ecx , "ret" \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	466 #else /* CMOV_IS_FAST */
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	467 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	468 "mov "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	469 "shl $17 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	470 "sub "low" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	471 "sar $31 , "tmp" \n\t" /*lps_mask*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	472 "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	473 "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	474 "add %%ecx , "range" \n\t" /*new range*/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	475 "shl $17 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	476 "and "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	477 "sub %%ecx , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	478 "xor "tmp" , "ret" \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	479 #endif /* CMOV_IS_FAST */
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	480
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	481
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	482 #define BRANCHLESS_GET_CABAC(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	483 "movzbl "statep" , "ret" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	484 "mov "range" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	485 "and $0xC0 , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	486 "movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	487 "sub "range" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	488 BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	489 "movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	490 "shl %%cl , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	491 "movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	492 "mov "tmpbyte" , "statep" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	493 "shl %%cl , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	494 "test "lowword" , "lowword" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	495 " jnz 1f \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	496 "mov "BYTE"("cabac"), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	497 "movzwl (%%ecx) , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	498 "bswap "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	499 "shr $15 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	500 "sub $0xFFFF , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	501 "add $2 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	502 "mov %%ecx , "BYTE "("cabac") \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	503 "lea -1("low") , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	504 "xor "low" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	505 "shr $15 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	506 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	507 "neg %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	508 "add $7 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	509 "shl %%cl , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	510 "add "tmp" , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	511 "1: \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	512
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	513 asm volatile(
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	514 "movl "RANGE "(%2), %%esi \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	515 "movl "LOW "(%2), %%ebx \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	516 BRANCHLESS_GET_CABAC("%0", "%2", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl")
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	517 "movl %%esi, "RANGE "(%2) \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	518 "movl %%ebx, "LOW "(%2) \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	519
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	520 :"=&a"(bit)
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	521 :"r"(state), "r"(c)
4012 f8c649ac09dd add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn michael parents: 4008 diff changeset	522 : "%ecx", "%ebx", "%edx", "%esi", "memory"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	523 );
3981 9854f686ba79 move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster) michael parents: 3980 diff changeset	524 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	525 #endif /* BRANCHLESS_CABAC_DECODER */
4033 f7a6b2bb3a2f Expand some #endif comments. diego parents: 4027 diff changeset	526 #else /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	527 int s = *state;
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	528 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
2522 e25782262d7d kill warnings patch by (M��ns Rullg��rd <mru inprovide com>) michael parents: 2323 diff changeset	529 int bit, lps_mask attribute_unused;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	530
1287 9211fbd31353 CABAC michaelni parents: diff changeset	531 c->range -= RangeLPS;
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	532 #ifndef BRANCHLESS_CABAC_DECODER
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	533 if(c->low < (c->range<<17)){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	534 bit= s&1;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	535 *state= ff_h264_mps_state[s];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	536 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	537 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	538 bit= ff_h264_norm_shift[RangeLPS];
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	539 c->low -= (c->range<<17);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	540 *state= ff_h264_lps_state[s];
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	541 c->range = RangeLPS<<bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	542 c->low <<= bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	543 bit= (s&1)^1;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	544
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	545 if(!(c->low & 0xFFFF)){
1c39d9786efd optimization michael parents: 2116 diff changeset	546 refill2(c);
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	547 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	548 }
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	549 #else /* BRANCHLESS_CABAC_DECODER */
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	550 lps_mask= ((c->range<<17) - c->low)>>31;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	551
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	552 c->low -= (c->range<<17) & lps_mask;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	553 c->range += (RangeLPS - c->range) & lps_mask;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	554
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	555 s^=lps_mask;
4014 b2582438effe dehack ps_state indexing in the branchless decoder michael* parents: 4012 diff changeset	556 *state= (ff_h264_mlps_state+128)[s];
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	557 bit= s&1;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	558
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	559 lps_mask= ff_h264_norm_shift[c->range];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	560 c->range<<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	561 c->low <<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	562 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	563 refill2(c);
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	564 #endif /* BRANCHLESS_CABAC_DECODER */
4033 f7a6b2bb3a2f Expand some #endif comments. diego parents: 4027 diff changeset	565 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	566 return bit;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	567 }
9211fbd31353 CABAC michaelni parents: diff changeset	568
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	569 static int __attribute((noinline)) get_cabac_noinline(CABACContext c, uint8_t const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	570 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	571 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	572
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	573 static int get_cabac(CABACContext c, uint8_t const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	574 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	575 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	576
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	577 static int get_cabac_bypass(CABACContext *c){
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	578 #if 0 //not faster
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	579 int bit;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	580 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	581 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	582 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	583 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	584 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	585 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	586 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	587 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	588 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	589 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	590 " jnz 1f \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	591 "movl "BYTE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	592 "subl $0xFFFF, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	593 "movzwl (%%ebx), %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	594 "bswap %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	595 "shrl $15, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	596 "addl $2, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	597 "addl %%ecx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	598 "movl %%ebx, "BYTE "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	599 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	600 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	601
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	602 :"=&d"(bit)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	603 :"r"(c)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	604 : "%eax", "%ebx", "%ecx", "memory"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	605 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	606 return bit+1;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	607 #else
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	608 int range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	609 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	610
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	611 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	612 refill(c);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	613
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	614 range= c->range<<17;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	615 if(c->low < range){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	616 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	617 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	618 c->low -= range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	619 return 1;
9211fbd31353 CABAC michaelni parents: diff changeset	620 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	621 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	622 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	623
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	624
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	625 static always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	626 #ifdef ARCH_X86
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	627 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	628 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	629 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	630 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	631 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	632 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	633 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	634 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	635 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	636 "xor %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	637 "sub %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	638 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	639 " jnz 1f \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	640 "movl "BYTE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	641 "subl $0xFFFF, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	642 "movzwl (%%ebx), %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	643 "bswap %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	644 "shrl $15, %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	645 "addl $2, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	646 "addl %%edx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	647 "movl %%ebx, "BYTE "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	648 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	649 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	650
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	651 :"+c"(val)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	652 :"r"(c)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	653 : "%eax", "%ebx", "%edx", "memory"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	654 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	655 return val;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	656 #else
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	657 int range, mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	658 c->low += c->low;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	659
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	660 if(!(c->low & CABAC_MASK))
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	661 refill(c);
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	662
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	663 range= c->range<<17;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	664 c->low -= range;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	665 mask= c->low >> 31;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	666 range &= mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	667 c->low += range;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	668 return (val^mask)-mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	669 #endif
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	670 }
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	671
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	672 //FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note I'll kill you if you move my code away from under my fingers before I am finished with it!)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	673 //FIXME use some macros to avoid duplicating get_cabac (can't be done yet as that would make optimization work hard)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	674 #ifdef ARCH_X86
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	675 static int decode_significance_x86(CABACContext c, int max_coeff, uint8_t significant_coeff_ctx_base, int *index){
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	676 void *end= significant_coeff_ctx_base + max_coeff - 1;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	677 int minusstart= -(int)significant_coeff_ctx_base;
4049 8c1a5ed03a00 another instruction less in decode_significance_x86() -> 1% faster ion P3 michael parents: 4048 diff changeset	678 int minusindex= 4-(int)index;
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	679 int coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	680 asm volatile(
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	681 "movl "RANGE "(%3), %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	682 "movl "LOW "(%3), %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	683
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	684 "2: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	685
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	686 BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	687
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	688 "test $1, %%edx \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	689 " jz 3f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	690
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	691 BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	692
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	693 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	694 "movl %4, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	695 "addl %1, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	696 "movl %%ecx, (%%eax) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	697
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	698 "test $1, %%edx \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	699 " jnz 4f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	700
4049 8c1a5ed03a00 another instruction less in decode_significance_x86() -> 1% faster ion P3 michael parents: 4048 diff changeset	701 "addl $4, %%eax \n\t"
4048 bf6791303fa0 1 instruction less michael parents: 4047 diff changeset	702 "movl %%eax, %2 \n\t"
bf6791303fa0 1 instruction less michael parents: 4047 diff changeset	703
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	704 "3: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	705 "addl $1, %1 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	706 "cmpl %5, %1 \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	707 " jb 2b \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	708 "movl %2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	709 "movl %4, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	710 "addl %1, %%ecx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	711 "movl %%ecx, (%%eax) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	712 "4: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	713 "addl %6, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	714 "shr $2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	715
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	716 "movl %%esi, "RANGE "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	717 "movl %%ebx, "LOW "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	718 :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	719 :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	720 : "%ecx", "%ebx", "%edx", "%esi", "memory"\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	721 );
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	722 return coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	723 }
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	724 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	725
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	726 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	727 *
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	728 * @return the number of bytes read or 0 if no end
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	729 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	730 static int get_cabac_terminate(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	731 c->range -= 2;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	732 if(c->low < c->range<<17){
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	733 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	734 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	735 }else{
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	736 return c->bytestream - c->bytestream_start;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	737 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	738 }
9211fbd31353 CABAC michaelni parents: diff changeset	739
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	740 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	741 * get (truncated) unnary binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	742 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	743 static int get_cabac_u(CABACContext c, uint8_t state, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	744 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	745
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	746 for(i=0; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	747 if(get_cabac(c, state)==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	748 return i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	749
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	750 if(i< max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	751 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	752
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	753 return truncated ? max : -1;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	754 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	755
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	756 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	757 * get unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	758 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	759 static int get_cabac_ueg(CABACContext c, uint8_t state, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	760 int i, v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	761 int m= 1<<k;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	762
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	763 if(get_cabac(c, state)==0)
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	764 return 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	765
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	766 if(0 < max_index) state++;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	767
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	768 for(i=1; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	769 if(get_cabac(c, state)==0){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	770 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	771 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	772 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	773 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	774 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	775
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	776 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	777 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	778
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	779 while(get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	780 i+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	781 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	782 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	783
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	784 v=0;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	785 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	786 v+= v + get_cabac_bypass(c);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	787 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	788 i += v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	789
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	790 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	791 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	792 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	793 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	794 }

Mercurial > libavcodec.hg

annotate cabac.h @ 4050:2c79a8281cb6 libavcodec