libavcodec.hg: cabac.h annotate

annotate cabac.h @ 4167:a3134db4857e libavcodec

store an identifier and the first header in extradata; with this, mp3 should be binary identical to what you had before header compression. support mp3 with crc (by dropping the crc and putting it back during header decompress; currently it's just random though, does any decoder even check it?)

author	michael
date	Fri, 10 Nov 2006 11:31:02 +0000
parents	4ce3923d5806
children	92f773cfebf5

rev	line source
1287 9211fbd31353 CABAC michaelni parents: diff changeset	1 /*
9211fbd31353 CABAC michaelni parents: diff changeset	2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
9211fbd31353 CABAC michaelni parents: diff changeset	3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
9211fbd31353 CABAC michaelni parents: diff changeset	4 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	7 * FFmpeg is free software; you can redistribute it and/or
1287 9211fbd31353 CABAC michaelni parents: diff changeset	8 * modify it under the terms of the GNU Lesser General Public
9211fbd31353 CABAC michaelni parents: diff changeset	9 * License as published by the Free Software Foundation; either
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	10 * version 2.1 of the License, or (at your option) any later version.
1287 9211fbd31353 CABAC michaelni parents: diff changeset	11 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	12 * FFmpeg is distributed in the hope that it will be useful,
1287 9211fbd31353 CABAC michaelni parents: diff changeset	13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9211fbd31353 CABAC michaelni parents: diff changeset	14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9211fbd31353 CABAC michaelni parents: diff changeset	15 * Lesser General Public License for more details.
9211fbd31353 CABAC michaelni parents: diff changeset	16 *
9211fbd31353 CABAC michaelni parents: diff changeset	17 * You should have received a copy of the GNU Lesser General Public
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3946 diff changeset	18 * License along with FFmpeg; if not, write to the Free Software
3036 0b546eab515d Update licensing information: The FSF changed postal address. diego parents: 2967 diff changeset	19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1287 9211fbd31353 CABAC michaelni parents: diff changeset	20 *
9211fbd31353 CABAC michaelni parents: diff changeset	21 */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	22
1287 9211fbd31353 CABAC michaelni parents: diff changeset	23 /**
9211fbd31353 CABAC michaelni parents: diff changeset	24 * @file cabac.h
9211fbd31353 CABAC michaelni parents: diff changeset	25 * Context Adaptive Binary Arithmetic Coder.
9211fbd31353 CABAC michaelni parents: diff changeset	26 */
9211fbd31353 CABAC michaelni parents: diff changeset	27
9211fbd31353 CABAC michaelni parents: diff changeset	28
3284 a224d9752912 don't force asserts in release builds. 2% faster h264. lorenm parents: 3036 diff changeset	29 //#undef NDEBUG
1287 9211fbd31353 CABAC michaelni parents: diff changeset	30 #include <assert.h>
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	31 #ifdef ARCH_X86
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	32 #include "x86_cpu.h"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	33 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	34
/** Number of bitstream bits pulled in per refill; refill() advances the read pointer by CABAC_BITS/8 bytes. */
#define CABAC_BITS 16
/** Mask selecting the low CABAC_BITS bits of CABACContext.low. */
#define CABAC_MASK ((1<<CABAC_BITS)-1)
/* NOTE(review): presumably selects the branchless decode path in code outside this view -- confirm. */
#define BRANCHLESS_CABAC_DECODER 1
/* When defined, renorm_cabac_decoder_once() is built from its ARCH_X86_DISABLED (inline asm) section. */
//#define ARCH_X86_DISABLED 1
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	39
1287 9211fbd31353 CABAC michaelni parents: diff changeset	40 typedef struct CABACContext{
9211fbd31353 CABAC michaelni parents: diff changeset	41 int low;
9211fbd31353 CABAC michaelni parents: diff changeset	42 int range;
9211fbd31353 CABAC michaelni parents: diff changeset	43 int outstanding_count;
9211fbd31353 CABAC michaelni parents: diff changeset	44 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	45 int symCount;
9211fbd31353 CABAC michaelni parents: diff changeset	46 #endif
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	47 const uint8_t *bytestream_start;
f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	48 const uint8_t *bytestream;
2116 48d9f86fb047 overread fix michael parents: 2024 diff changeset	49 const uint8_t *bytestream_end;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	50 PutBitContext pb;
9211fbd31353 CABAC michaelni parents: diff changeset	51 }CABACContext;
9211fbd31353 CABAC michaelni parents: diff changeset	52
/* Lookup tables defined elsewhere in libavcodec. */
extern uint8_t ff_h264_mlps_state[4*64];
/* put_cabac() indexes this table as 2*(range&0xC0) + *state, hence 2*4*64 entries. */
extern uint8_t ff_h264_lps_range[2*4*64];  ///< rangeTabLPS
extern uint8_t ff_h264_mps_state[2*64];     ///< transIdxMPS
extern uint8_t ff_h264_lps_state[2*64];     ///< transIdxLPS
/* refill2() uses this table to turn the position of the lowest set bit of low into a shift count. */
extern const uint8_t ff_h264_norm_shift[512];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	58
1287 9211fbd31353 CABAC michaelni parents: diff changeset	59
9211fbd31353 CABAC michaelni parents: diff changeset	60 void ff_init_cabac_encoder(CABACContext c, uint8_t buf, int buf_size);
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1787 diff changeset	61 void ff_init_cabac_decoder(CABACContext c, const uint8_t buf, int buf_size);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	62 void ff_init_cabac_states(CABACContext *c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	63
9211fbd31353 CABAC michaelni parents: diff changeset	64
9211fbd31353 CABAC michaelni parents: diff changeset	65 static inline void put_cabac_bit(CABACContext *c, int b){
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	66 put_bits(&c->pb, 1, b);
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	67 for(;c->outstanding_count; c->outstanding_count--){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	68 put_bits(&c->pb, 1, 1-b);
9211fbd31353 CABAC michaelni parents: diff changeset	69 }
9211fbd31353 CABAC michaelni parents: diff changeset	70 }
9211fbd31353 CABAC michaelni parents: diff changeset	71
9211fbd31353 CABAC michaelni parents: diff changeset	72 static inline void renorm_cabac_encoder(CABACContext *c){
9211fbd31353 CABAC michaelni parents: diff changeset	73 while(c->range < 0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	74 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	75 if(c->low<0x100){
9211fbd31353 CABAC michaelni parents: diff changeset	76 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	77 }else if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	78 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	79 c->low -= 0x100;
9211fbd31353 CABAC michaelni parents: diff changeset	80 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	81 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	82 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	83 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	84
1287 9211fbd31353 CABAC michaelni parents: diff changeset	85 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	86 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	87 }
9211fbd31353 CABAC michaelni parents: diff changeset	88 }
9211fbd31353 CABAC michaelni parents: diff changeset	89
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	90 static void put_cabac(CABACContext c, uint8_t const state, int bit){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	91 int RangeLPS= ff_h264_lps_range[2(c->range&0xC0) + state];
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	92
1287 9211fbd31353 CABAC michaelni parents: diff changeset	93 if(bit == ((*state)&1)){
9211fbd31353 CABAC michaelni parents: diff changeset	94 c->range -= RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	95 state= ff_h264_mps_state[state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	96 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	97 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	98 c->range = RangeLPS;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	99 state= ff_h264_lps_state[state];
1287 9211fbd31353 CABAC michaelni parents: diff changeset	100 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	101
1287 9211fbd31353 CABAC michaelni parents: diff changeset	102 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	103
9211fbd31353 CABAC michaelni parents: diff changeset	104 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	105 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	106 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	107 }
9211fbd31353 CABAC michaelni parents: diff changeset	108
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	109 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	110 assert(c->range > RangeLPS);
9211fbd31353 CABAC michaelni parents: diff changeset	111
9211fbd31353 CABAC michaelni parents: diff changeset	112 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	113 c->range -= RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	114 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	115 c->low += c->range - RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	116 c->range = RangeLPS;
9211fbd31353 CABAC michaelni parents: diff changeset	117 }
9211fbd31353 CABAC michaelni parents: diff changeset	118
9211fbd31353 CABAC michaelni parents: diff changeset	119 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	120
9211fbd31353 CABAC michaelni parents: diff changeset	121 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	122 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	123 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	124 }
9211fbd31353 CABAC michaelni parents: diff changeset	125
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	126 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	127 * @param bit 0 -> write zero bit, !=0 write one bit
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	128 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	129 static void put_cabac_bypass(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	130 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	131
9211fbd31353 CABAC michaelni parents: diff changeset	132 if(bit){
9211fbd31353 CABAC michaelni parents: diff changeset	133 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	134 }
9211fbd31353 CABAC michaelni parents: diff changeset	135 //FIXME optimize
9211fbd31353 CABAC michaelni parents: diff changeset	136 if(c->low<0x200){
9211fbd31353 CABAC michaelni parents: diff changeset	137 put_cabac_bit(c, 0);
9211fbd31353 CABAC michaelni parents: diff changeset	138 }else if(c->low<0x400){
9211fbd31353 CABAC michaelni parents: diff changeset	139 c->outstanding_count++;
9211fbd31353 CABAC michaelni parents: diff changeset	140 c->low -= 0x200;
9211fbd31353 CABAC michaelni parents: diff changeset	141 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	142 put_cabac_bit(c, 1);
9211fbd31353 CABAC michaelni parents: diff changeset	143 c->low -= 0x400;
9211fbd31353 CABAC michaelni parents: diff changeset	144 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	145
1287 9211fbd31353 CABAC michaelni parents: diff changeset	146 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	147 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	148 #endif
9211fbd31353 CABAC michaelni parents: diff changeset	149 }
9211fbd31353 CABAC michaelni parents: diff changeset	150
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	151 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	152 *
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	153 * @return the number of bytes written
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	154 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	155 static int put_cabac_terminate(CABACContext *c, int bit){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	156 c->range -= 2;
9211fbd31353 CABAC michaelni parents: diff changeset	157
9211fbd31353 CABAC michaelni parents: diff changeset	158 if(!bit){
9211fbd31353 CABAC michaelni parents: diff changeset	159 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	160 }else{
9211fbd31353 CABAC michaelni parents: diff changeset	161 c->low += c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	162 c->range= 2;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	163
1287 9211fbd31353 CABAC michaelni parents: diff changeset	164 renorm_cabac_encoder(c);
9211fbd31353 CABAC michaelni parents: diff changeset	165
9211fbd31353 CABAC michaelni parents: diff changeset	166 assert(c->low <= 0x1FF);
9211fbd31353 CABAC michaelni parents: diff changeset	167 put_cabac_bit(c, c->low>>9);
9211fbd31353 CABAC michaelni parents: diff changeset	168 put_bits(&c->pb, 2, ((c->low>>7)&3)\|1);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	169
1287 9211fbd31353 CABAC michaelni parents: diff changeset	170 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong
9211fbd31353 CABAC michaelni parents: diff changeset	171 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	172
1287 9211fbd31353 CABAC michaelni parents: diff changeset	173 #ifdef STRICT_LIMITS
9211fbd31353 CABAC michaelni parents: diff changeset	174 c->symCount++;
9211fbd31353 CABAC michaelni parents: diff changeset	175 #endif
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	176
1787 752b51a3c8ed get_bit_count -> put_bits_count alex parents: 1301 diff changeset	177 return (put_bits_count(&c->pb)+7)>>3;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	178 }
9211fbd31353 CABAC michaelni parents: diff changeset	179
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	180 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	181 * put (truncated) unary binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	182 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	183 static void put_cabac_u(CABACContext c, uint8_t state, int v, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	184 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	185
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	186 assert(v <= max);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	187
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	188 #if 1
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	189 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	190 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	191 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	192 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	193 if(truncated==0 \|\| v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	194 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	195 #else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	196 if(v <= max_index){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	197 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	198 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	199 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	200 if(truncated==0 \|\| v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	201 put_cabac(c, state+i, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	202 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	203 for(i=0; i<=max_index; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	204 put_cabac(c, state+i, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	205 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	206 for(; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	207 put_cabac(c, state+max_index, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	208 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	209 if(truncated==0 \|\| v<max)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	210 put_cabac(c, state+max_index, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	211 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	212 #endif
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	213 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	214
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	215 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	216 * put unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	217 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	218 static void put_cabac_ueg(CABACContext c, uint8_t state, int v, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	219 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	220
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	221 if(v==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	222 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	223 else{
1298 5bc3184810dc cleanup michaelni parents: 1290 diff changeset	224 const int sign= v < 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	225
4001 34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3999 diff changeset	226 if(is_signed) v= FFABS(v);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	227
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	228 if(v<max){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	229 for(i=0; i<v; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	230 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	231 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	232 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	233
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	234 put_cabac(c, state, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	235 }else{
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	236 int m= 1<<k;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	237
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	238 for(i=0; i<max; i++){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	239 put_cabac(c, state, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	240 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	241 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	242
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	243 v -= max;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	244 while(v >= m){ //FIXME optimize
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	245 put_cabac_bypass(c, 1);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	246 v-= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	247 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	248 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	249 put_cabac_bypass(c, 0);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	250 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	251 put_cabac_bypass(c, v&m);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	252 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	253 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	254
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	255 if(is_signed)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	256 put_cabac_bypass(c, sign);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	257 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	258 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	259
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	260 static void refill(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	261 #if CABAC_BITS == 16
3946 ab0797f2e397 () 10l michael parents: 3943 diff changeset	262 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	263 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	264 c->low+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	265 #endif
1c39d9786efd optimization michael parents: 2116 diff changeset	266 c->low -= CABAC_MASK;
1c39d9786efd optimization michael parents: 2116 diff changeset	267 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	268 }
1c39d9786efd optimization michael parents: 2116 diff changeset	269
1c39d9786efd optimization michael parents: 2116 diff changeset	270 static void refill2(CABACContext *c){
1c39d9786efd optimization michael parents: 2116 diff changeset	271 int i, x;
1c39d9786efd optimization michael parents: 2116 diff changeset	272
1c39d9786efd optimization michael parents: 2116 diff changeset	273 x= c->low ^ (c->low-1);
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	274 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	275
1c39d9786efd optimization michael parents: 2116 diff changeset	276 x= -CABAC_MASK;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	277
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	278 #if CABAC_BITS == 16
1c39d9786efd optimization michael parents: 2116 diff changeset	279 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
1c39d9786efd optimization michael parents: 2116 diff changeset	280 #else
1c39d9786efd optimization michael parents: 2116 diff changeset	281 x+= c->bytestream[0]<<1;
1c39d9786efd optimization michael parents: 2116 diff changeset	282 #endif
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	283
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	284 c->low += x<<i;
1c39d9786efd optimization michael parents: 2116 diff changeset	285 c->bytestream+= CABAC_BITS/8;
1c39d9786efd optimization michael parents: 2116 diff changeset	286 }
1c39d9786efd optimization michael parents: 2116 diff changeset	287
1287 9211fbd31353 CABAC michaelni parents: diff changeset	288 static inline void renorm_cabac_decoder(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	289 while(c->range < 0x100){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	290 c->range+= c->range;
9211fbd31353 CABAC michaelni parents: diff changeset	291 c->low+= c->low;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	292 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	293 refill(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	294 }
9211fbd31353 CABAC michaelni parents: diff changeset	295 }
9211fbd31353 CABAC michaelni parents: diff changeset	296
/**
 * Renormalize the CABAC decoder state by at most one bit position.
 *
 * c->range and c->low are both shifted left by one when (c->range - 0x100)
 * has its sign bit set (i.e. c->range is below 0x100), and are left unchanged
 * otherwise. Afterwards refill(c) is called if (c->low & CABAC_MASK) is zero.
 *
 * The body holds five alternative x86 inline-asm implementations plus a
 * portable C fallback. The asm is only compiled when ARCH_X86_DISABLED is
 * defined; of the five, the "#elif 1" variant is the one selected.
 * NOTE(review): ARCH_X86_DISABLED is presumably never defined, which would
 * make the C fallback the only live code -- confirm against the build system.
 *
 * The "//P3:... athlon:..." comments are the author's benchmark scores for
 * each variant (presumably lower is better -- confirm).
 *
 * @param c decoder context; c->range and c->low are read and updated in place
 */
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	297 static inline void renorm_cabac_decoder_once(CABACContext *c){
3951 751bfc30df72 disable benchmarking code michael parents: 3950 diff changeset	298 #ifdef ARCH_X86_DISABLED
// NOTE(review): temp (and temp2 below) are never initialized but are passed
// to the asm as "+" (read-write) operands; every variant overwrites them
// (or, for setb, uses only the byte it writes) before reading them.
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	299 int temp;
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	300 #if 0
// Variant 1 (disabled): %2 (ecx) = range - 0x100, keep only its sign bit,
// then shift range and low left by that 0/1 count held in cl.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	301 //P3:683 athlon:475
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	302 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	303 "lea -0x100(%0), %2 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	304 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	305 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	306 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	307 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	308 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	309 #elif 0
// Variant 2 (disabled): cl = 1 if range is below 0x100 (unsigned compare),
// else 0; range and low are then shifted left by cl.
// NOTE(review): the FIXME on the setb line mentions a 31->63 change, but this
// variant contains no 31 constant; it looks copied from variant 1 -- confirm.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	310 //P3:680 athlon:474
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	311 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	312 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	313 "setb %%cl \n\t" //FIXME 31->63 for x86-64
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	314 "shl %%cl, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	315 "shl %%cl, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	316 : "+r"(c->range), "+r"(c->low), "+c"(temp)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	317 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	318 #elif 1
// Variant 3 (the one selected when the asm path is compiled): shift-free.
// eax = range - 0x100; cdq spreads eax's sign bit across edx, giving an
// all-ones/all-zeros mask. eax is then reloaded with range, and
//   range = (range & mask) + range,  low = low + (low & mask),
// i.e. both values are doubled when the mask is set and unchanged otherwise.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	319 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	320 //P3:665 athlon:517
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	321 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	322 "lea -0x100(%0), %%eax \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	323 "cdq \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	324 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	325 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	326 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	327 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	328 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	329 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	330 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	331 #elif 0
// Variant 4 (disabled): same mask-and-add scheme as variant 3, but the mask
// comes from cmp + "sbb edx, edx" (edx = all ones when range is below 0x100).
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	332 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	333 //P3:673 athlon:509
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	334 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	335 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	336 "sbb %%edx, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	337 "mov %0, %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	338 "and %%edx, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	339 "and %1, %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	340 "add %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	341 "add %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	342 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	343 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	344 #else
// Variant 5 (disabled): precompute 2*range in eax and 2*low in edx with lea
// (lea leaves the flags from cmp intact), then cmovb copies the doubled
// values back only when range compared below 0x100.
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	345 int temp2;
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	346 //P3:677 athlon:511
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	347 asm(
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	348 "cmp $0x100, %0 \n\t"
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	349 "lea (%0, %0), %%eax \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	350 "lea (%1, %1), %%edx \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	351 "cmovb %%eax, %0 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	352 "cmovb %%edx, %1 \n\t"
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	353 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	354 );
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	355 #endif
811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	356 #else
// Portable C path: shift is bit 31 of (c->range - 0x100) viewed as uint32_t,
// so it is 1 when the subtraction goes negative and 0 otherwise; shifting by
// it doubles range and low without a branch.
3950 900d21b85dd6 renorm_cabac_decoder_once START/STOP_TIMER scores for athlon michael parents: 3948 diff changeset	357 //P3:675 athlon:476
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	358 int shift= (uint32_t)(c->range - 0x100)>>31;
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	359 c->range<<= shift;
0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	360 c->low <<= shift;
3943 811a9b0d9f32 several x86 renorm_cabac_decoder_once optimizations michael parents: 3928 diff changeset	361 #endif
// Common tail for every variant: call refill() once the bits of low selected
// by CABAC_MASK are all zero (both names are defined outside this view).
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	362 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	363 refill(c);
1c39d9786efd optimization michael parents: 2116 diff changeset	364 }
1c39d9786efd optimization michael parents: 2116 diff changeset	365
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	366 static int always_inline get_cabac_inline(CABACContext c, uint8_t const state){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	367 //FIXME gcc generates duplicate load/stores for c->low and c->range
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	368 #define LOW "0"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	369 #define RANGE "4"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	370 #ifdef ARCH_X86_64
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	371 #define BYTESTART "16"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	372 #define BYTE "24"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	373 #define BYTEEND "32"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	374 #else
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	375 #define BYTESTART "12"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	376 #define BYTE "16"
8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	377 #define BYTEEND "20"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	378 #endif
4113 4ce3923d5806 Reenable AMD64 optimizations for cabac accidentially disabled in r6852 reimar parents: 4112 diff changeset	379 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
4044 5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	380 int bit;
5ccdefd60f61 Fix PIC compilation, some defines were under #ifdef !PIC but used diego parents: 4043 diff changeset	381
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	382 #ifndef BRANCHLESS_CABAC_DECODER
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	383 asm volatile(
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	384 "movzbl (%1), %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	385 "movl "RANGE "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	386 "movl "RANGE "(%2), %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	387 "andl $0xC0, %%ebx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	388 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	389 "movl "LOW "(%2), %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	390 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	391 "subl %%esi, %%edx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	392 "movl %%edx, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	393 "shll $17, %%ecx \n\t"
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	394 "cmpl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	395 " ja 1f \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	396
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	397 #if 1
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	398 //athlon:4067 P3:4110
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	399 "lea -0x100(%%edx), %%ecx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	400 "shr $31, %%ecx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	401 "shl %%cl, %%edx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	402 "shl %%cl, %%ebx \n\t"
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	403 #else
6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	404 //athlon:4057 P3:4130
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	405 "cmp $0x100, %%edx \n\t" //FIXME avoidable
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	406 "setb %%cl \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	407 "shl %%cl, %%edx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	408 "shl %%cl, %%ebx \n\t"
3999 6cbad3675632 slightly faster on P3 slightly slower on athlon and probably faster on P4 michael parents: 3996 diff changeset	409 #endif
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	410 "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	411 "movb %%cl, (%1) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	412 //eax:state ebx:low, edx:range, esi:RangeLPS
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	413 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	414 " jnz 2f \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	415 "mov "BYTE "(%2), %%"REG_S" \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	416 "subl $0xFFFF, %%ebx \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	417 "movzwl (%%"REG_S"), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	418 "bswap %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	419 "shrl $15, %%ecx \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	420 "add $2, %%"REG_S" \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	421 "addl %%ecx, %%ebx \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	422 "mov %%"REG_S", "BYTE "(%2) \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	423 "jmp 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	424 "1: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	425 //eax:state ebx:low, edx:range, esi:RangeLPS
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	426 "subl %%ecx, %%ebx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	427 "movl %%esi, %%edx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	428 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	429 "shll %%cl, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	430 "shll %%cl, %%edx \n\t"
4035 b7f31a32bb30 cosmetic (%%eax->%0) michael parents: 4034 diff changeset	431 "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	432 "movb %%cl, (%1) \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	433 "add $1, %0 \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	434 "test %%bx, %%bx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	435 " jnz 2f \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	436
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	437 "mov "BYTE "(%2), %%"REG_c" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	438 "movzwl (%%"REG_c"), %%esi \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	439 "bswap %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	440 "shrl $15, %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	441 "subl $0xFFFF, %%esi \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	442 "add $2, %%"REG_c" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	443 "mov %%"REG_c", "BYTE "(%2) \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	444
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	445 "leal -1(%%ebx), %%ecx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	446 "xorl %%ebx, %%ecx \n\t"
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	447 "shrl $15, %%ecx \n\t"
3979 ce16f66a48ad reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1 michael parents: 3978 diff changeset	448 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"
3994 2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	449 "neg %%ecx \n\t"
2734b228fc87 use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus) michael parents: 3993 diff changeset	450 "add $7, %%ecx \n\t"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	451
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	452 "shll %%cl , %%esi \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	453 "addl %%esi, %%ebx \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	454 "2: \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	455 "movl %%edx, "RANGE "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	456 "movl %%ebx, "LOW "(%2) \n\t"
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	457 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used
fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	458 :"r"(state), "r"(c)
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	459 : "%"REG_c, "%ebx", "%edx", "%"REG_S, "memory"
3969 fc6e0942353b first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead michael parents: 3967 diff changeset	460 );
3982 af16271634c2 moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop michael parents: 3981 diff changeset	461 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	462 #else /* BRANCHLESS_CABAC_DECODER */
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	463
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	464
4050 2c79a8281cb6 Protect code that uses CMOV instructions with HAVE_CMOV, gpoirier parents: 4049 diff changeset	465 #if defined CMOV_IS_FAST
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	466 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	467 "mov "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	468 "shl $17 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	469 "cmp "low" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	470 "cmova %%ecx , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	471 "sbb %%ecx , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	472 "and %%ecx , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	473 "sub "tmp" , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	474 "xor %%ecx , "ret" \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	475 #else /* CMOV_IS_FAST */
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	476 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	477 "mov "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	478 "shl $17 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	479 "sub "low" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	480 "sar $31 , "tmp" \n\t" /lps_mask/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	481 "sub %%ecx , "range" \n\t" /RangeLPS - range/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	482 "and "tmp" , "range" \n\t" /(RangeLPS - range)&lps_mask/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	483 "add %%ecx , "range" \n\t" /new range/\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	484 "shl $17 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	485 "and "tmp" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	486 "sub %%ecx , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	487 "xor "tmp" , "ret" \n\t"
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	488 #endif /* CMOV_IS_FAST */
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	489
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	490
/*
 * BRANCHLESS_GET_CABAC: x86 inline-asm string fragment that decodes one
 * context-coded bin.  The only branch is the "jnz 1f" that skips the
 * bytestream refill.
 *
 * All arguments are asm operand strings:
 *   ret      32-bit register; loaded with the state byte.  The state table
 *            lookup below indexes with it after the UPDATE step, and callers
 *            in this file test/mask its bit 0 as the decoded bin.
 *   cabac    register holding the CABACContext pointer (used with BYTE).
 *   statep   memory operand of the context state byte (read, then rewritten).
 *   low      32-bit register holding "low"; lowword is its 16-bit alias.
 *   range    32-bit register holding "range".
 *   tmp      32-bit scratch register; tmpbyte is its 8-bit alias.
 *
 * Sequence:
 *   1. ret = state byte; tmp = range;
 *      range = ff_h264_lps_range[2*(range & 0xC0) + ret]   (RangeLPS);
 *      tmp  -= range                                       (range - RangeLPS).
 *   2. BRANCHLESS_GET_CABAC_UPDATE selects the new range/low and updates ret
 *      (the non-cmov variant xors ret with the LPS mask).
 *   3. Renormalise: shift range and low left by ff_h264_norm_shift[range];
 *      store (ff_h264_mlps_state+128)[ret] back through statep.
 *   4. If the low 16 bits of low are zero, refill: read a 16-bit word through
 *      the pointer stored at BYTE(cabac), byte-swap it, form (word<<1)-0xFFFF,
 *      advance the stored pointer by 2, shift the value left by
 *      7 - ff_h264_norm_shift[((low-1)^low)>>15] and add it to low.
 *
 * Clobbers ecx / REG_c and uses the local label "1", so it cannot be nested
 * with other code that relies on either across the fragment.
 * NOTE(review): BYTE is presumably the offset string of the bytestream
 * pointer inside CABACContext -- its definition is outside this view.
 */
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	491 #define BRANCHLESS_GET_CABAC(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	492 "movzbl "statep" , "ret" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	493 "mov "range" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	494 "and $0xC0 , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	495 "movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	496 "sub "range" , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	497 BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	498 "movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	499 "shl %%cl , "range" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	500 "movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	501 "mov "tmpbyte" , "statep" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	502 "shl %%cl , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	503 "test "lowword" , "lowword" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	504 " jnz 1f \n\t"\
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	505 "mov "BYTE"("cabac"), %%"REG_c" \n\t"\
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	506 "movzwl (%%"REG_c") , "tmp" \n\t"\
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	507 "bswap "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	508 "shr $15 , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	509 "sub $0xFFFF , "tmp" \n\t"\
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	510 "add $2 , %%"REG_c" \n\t"\
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	511 "mov %%"REG_c" , "BYTE "("cabac") \n\t"\
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	512 "lea -1("low") , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	513 "xor "low" , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	514 "shr $15 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	515 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	516 "neg %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	517 "add $7 , %%ecx \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	518 "shl %%cl , "tmp" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	519 "add "tmp" , "low" \n\t"\
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	520 "1: \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	521
4046 8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	522 asm volatile(
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	523 "movl "RANGE "(%2), %%esi \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	524 "movl "LOW "(%2), %%ebx \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	525 BRANCHLESS_GET_CABAC("%0", "%2", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl")
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	526 "movl %%esi, "RANGE "(%2) \n\t"
8bbc695c9603 factorize get_cabac asm (0.5% slower but its much cleaner) michael parents: 4044 diff changeset	527 "movl %%ebx, "LOW "(%2) \n\t"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	528
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	529 :"=&a"(bit)
6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	530 :"r"(state), "r"(c)
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	531 : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
3975 6cc9eb5ee5e3 x86 branchless cabac decoder michael parents: 3974 diff changeset	532 );
3981 9854f686ba79 move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster) michael parents: 3980 diff changeset	533 bit&=1;
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	534 #endif /* BRANCHLESS_CABAC_DECODER */
4113 4ce3923d5806 Reenable AMD64 optimizations for cabac accidentially disabled in r6852 reimar parents: 4112 diff changeset	535 #else /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	536 int s = *state;
4039 866a83726985 Kill a warning with MSVC gpoirier parents: 4037 diff changeset	537 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
2522 e25782262d7d kill warnings patch by (Måns Rullgård <mru inprovide com>) michael parents: 2323 diff changeset	538 int bit, lps_mask attribute_unused;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	539
1287 9211fbd31353 CABAC michaelni parents: diff changeset	540 c->range -= RangeLPS;
3984 bb186452e7da BRANCHLESS_CABAD --> BRANCHLESS_CABAC_DECODER diego parents: 3982 diff changeset	541 #ifndef BRANCHLESS_CABAC_DECODER
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	542 if(c->low < (c->range<<17)){
3642 0efda682253c tweak cabac. 0.5% faster h264. lorenm parents: 3284 diff changeset	543 bit= s&1;
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	544 *state= ff_h264_mps_state[s];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	545 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	546 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	547 bit= ff_h264_norm_shift[RangeLPS];
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	548 c->low -= (c->range<<17);
3993 8b7c59b7af01 make state transition tables global as they are constant and the code is slightly faster that way michael parents: 3992 diff changeset	549 *state= ff_h264_lps_state[s];
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	550 c->range = RangeLPS<<bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	551 c->low <<= bit;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	552 bit= (s&1)^1;
0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	553
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	554 if(!(c->low & 0xFFFF)){
1c39d9786efd optimization michael parents: 2116 diff changeset	555 refill2(c);
3956 0910f2844f9a branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now) michael parents: 3955 diff changeset	556 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	557 }
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	558 #else /* BRANCHLESS_CABAC_DECODER */
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	559 lps_mask= ((c->range<<17) - c->low)>>31;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	560
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	561 c->low -= (c->range<<17) & lps_mask;
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	562 c->range += (RangeLPS - c->range) & lps_mask;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	563
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	564 s^=lps_mask;
4014 b2582438effe dehack ps_state indexing in the branchless decoder michael* parents: 4012 diff changeset	565 *state= (ff_h264_mlps_state+128)[s];
3974 93746612bc78 optimize branchless C CABAC decoder michael parents: 3972 diff changeset	566 bit= s&1;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	567
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	568 lps_mask= ff_h264_norm_shift[c->range];
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	569 c->range<<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	570 c->low <<= lps_mask;
1c39d9786efd optimization michael parents: 2116 diff changeset	571 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	572 refill2(c);
4002 ec426fa57dfe adds some useful comments after some of the #else, #elseif, gpoirier parents: 4001 diff changeset	573 #endif /* BRANCHLESS_CABAC_DECODER */
4113 4ce3923d5806 Reenable AMD64 optimizations for cabac accidentially disabled in r6852 reimar parents: 4112 diff changeset	574 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	575 return bit;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	576 }
9211fbd31353 CABAC michaelni parents: diff changeset	577
/**
 * Out-of-line entry point for decoding one context-coded bin.
 * Forwards to get_cabac_inline(); the noinline attribute keeps this wrapper
 * from being expanded at its call sites.
 * @param c     CABAC decoder context (passed by pointer; the callee accesses
 *              it with c->...).
 * @param state pointer to the context state byte; the callee reads and
 *              rewrites *state.
 * @return whatever get_cabac_inline() returns for this bin.
 */
4008 b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	578 static int __attribute((noinline)) get_cabac_noinline(CABACContext *c, uint8_t * const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	579 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	580 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	581
/**
 * Decodes one context-coded bin by forwarding to get_cabac_inline().
 * @param c     CABAC decoder context (passed by pointer; the callee accesses
 *              it with c->...).
 * @param state pointer to the context state byte; the callee reads and
 *              rewrites *state.
 * @return whatever get_cabac_inline() returns for this bin.
 */
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	582 static int get_cabac(CABACContext *c, uint8_t * const state){
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	583 return get_cabac_inline(c,state);
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	584 }
b636f3d59283 prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3) michael parents: 4002 diff changeset	585
/**
 * Decodes one bypass bin (no context state is read or updated).
 *
 * Active C path: doubles c->low, calls refill(c) when the CABAC_MASK bits of
 * c->low are all zero, then compares c->low against c->range<<17.
 * @param c CABAC decoder context; c->low is updated (and refill() may update
 *          further fields).
 * @return 0 if c->low is below c->range<<17; otherwise that amount is
 *         subtracted from c->low and 1 is returned.
 *
 * The "#if 0" branch is a disabled x86 asm variant kept for reference (its
 * own comment says it was not faster).  It derives the result as bit+1,
 * where bit is the sign mask (-1 or 0) produced by cdq.
 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	586 static int get_cabac_bypass(CABACContext *c){
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	587 #if 0 //not faster
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	588 int bit;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	589 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	590 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	591 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	592 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	593 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	594 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	595 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	596 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	597 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	598 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	599 " jnz 1f \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	600 "movl "BYTE "(%1), %%"REG_b" \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	601 "subl $0xFFFF, %%eax \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	602 "movzwl (%%"REG_b"), %%ecx \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	603 "bswap %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	604 "shrl $15, %%ecx \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	605 "addl $2, %%"REG_b" \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	606 "addl %%ecx, %%eax \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	607 "movl %%"REG_b", "BYTE "(%1) \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	608 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	609 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	610
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	611 :"=&d"(bit)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	612 :"r"(c)
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	613 : "%eax", "%"REG_b, "%ecx", "memory"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	614 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	615 return bit+1;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	616 #else
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	617 int range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	618 c->low += c->low;
9211fbd31353 CABAC michaelni parents: diff changeset	619
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	620 if(!(c->low & CABAC_MASK))
1c39d9786efd optimization michael parents: 2116 diff changeset	621 refill(c);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	622
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	623 range= c->range<<17;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	624 if(c->low < range){
1287 9211fbd31353 CABAC michaelni parents: diff changeset	625 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	626 }else{
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	627 c->low -= range;
1287 9211fbd31353 CABAC michaelni parents: diff changeset	628 return 1;
9211fbd31353 CABAC michaelni parents: diff changeset	629 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	630 #endif
1287 9211fbd31353 CABAC michaelni parents: diff changeset	631 }
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	632
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	633
/**
 * Decodes one bypass bin and applies it as a sign to val, without branching
 * on the decoded value.
 *
 * Both paths compute mask = sign of (2*low - (range<<17)), i.e. -1 when the
 * doubled low is below range<<17 and 0 otherwise, add range<<17 back to low
 * when mask is -1, and return (val ^ mask) - mask.
 * @param c   CABAC decoder context; c->low is updated and the bytestream may
 *            be refilled.
 * @param val magnitude to be signed.
 * @return val when the doubled low was >= range<<17 (bin 1), -val otherwise.
 *
 * The asm path lists %ebx (REG_b) as a clobber, which is not usable when GCC
 * reserves it as the PIC register.  It is therefore guarded the same way as
 * the other x86 asm in this file, and PIC builds use the C path.
 * NOTE(review): the C path's "c->low >> 31" relies on an arithmetic right
 * shift of a negative int -- confirm c->low is a signed 32-bit int.
 */
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	634 static always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	635 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	636 asm volatile(
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	637 "movl "RANGE "(%1), %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	638 "movl "LOW "(%1), %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	639 "shl $17, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	640 "add %%eax, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	641 "sub %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	642 "cdq \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	643 "and %%edx, %%ebx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	644 "add %%ebx, %%eax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	645 "xor %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	646 "sub %%edx, %%ecx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	647 "test %%ax, %%ax \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	648 " jnz 1f \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	649 "mov "BYTE "(%1), %%"REG_b" \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	650 "subl $0xFFFF, %%eax \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	651 "movzwl (%%"REG_b"), %%edx \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	652 "bswap %%edx \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	653 "shrl $15, %%edx \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	654 "add $2, %%"REG_b" \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	655 "addl %%edx, %%eax \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	656 "mov %%"REG_b", "BYTE "(%1) \n\t"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	657 "1: \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	658 "movl %%eax, "LOW "(%1) \n\t"
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	659
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	660 :"+c"(val)
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	661 :"r"(c)
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	662 : "%eax", "%"REG_b, "%edx", "memory"
4040 9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	663 );
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	664 return val;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	665 #else /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	666 int range, mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	667 c->low += c->low;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	668
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	669 if(!(c->low & CABAC_MASK))
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	670 refill(c);
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	671
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	672 range= c->range<<17;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	673 c->low -= range;
/* mask is all ones when the subtraction went negative, zero otherwise */
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	674 mask= c->low >> 31;
/* undo the subtraction only in the negative case */
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	675 range &= mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	676 c->low += range;
/* conditional negate: yields -val when mask is -1, val when mask is 0 */
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	677 return (val^mask)-mask;
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	678 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	679 }
9eaea06c5ba6 optimize sign decoding code in decode_residual() michael parents: 4039 diff changeset	680
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	681 //FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note I'll kill you if you move my code away from under my fingers before I am finished with it!)
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	682 //FIXME use some macros to avoid duplicating get_cabac (can't be done yet as that would make optimization work hard)
4113 4ce3923d5806 Reenable AMD64 optimizations for cabac accidentially disabled in r6852 reimar parents: 4112 diff changeset	683 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	684 static int decode_significance_x86(CABACContext c, int max_coeff, uint8_t significant_coeff_ctx_base, int *index){
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	685 void *end= significant_coeff_ctx_base + max_coeff - 1;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	686 int minusstart= -(int)significant_coeff_ctx_base;
4049 8c1a5ed03a00 another instruction less in decode_significance_x86() -> 1% faster ion P3 michael parents: 4048 diff changeset	687 int minusindex= 4-(int)index;
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	688 int coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	689 asm volatile(
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	690 "movl "RANGE "(%3), %%esi \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	691 "movl "LOW "(%3), %%ebx \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	692
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	693 "2: \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	694
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	695 BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	696
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	697 "test $1, %%edx \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	698 " jz 3f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	699
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	700 BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	701
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	702 "mov %2, %%"REG_a" \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	703 "movl %4, %%ecx \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	704 "add %1, %%"REG_c" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	705 "movl %%ecx, (%%"REG_a") \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	706
4047 61a4e7218a45 reordering instructions a little in decode_significance_x86() -> 2 instructions less / 1% faster decode_residual on P3 michael parents: 4046 diff changeset	707 "test $1, %%edx \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	708 " jnz 4f \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	709
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	710 "add $4, %%"REG_a" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	711 "mov %%"REG_a", %2 \n\t"
4048 bf6791303fa0 1 instruction less michael parents: 4047 diff changeset	712
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	713 "3: \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	714 "add $1, %1 \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	715 "cmp %5, %1 \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	716 " jb 2b \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	717 "mov %2, %%"REG_a" \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	718 "movl %4, %%ecx \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	719 "add %1, %%"REG_c" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	720 "movl %%ecx, (%%"REG_a") \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	721 "4: \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	722 "add %6, %%eax \n\t"
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	723 "shr $2, %%eax \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	724
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	725 "movl %%esi, "RANGE "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	726 "movl %%ebx, "LOW "(%3) \n\t"
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	727 :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	728 :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	729 : "%"REG_c, "%ebx", "%edx", "%esi", "memory"\
4037 53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	730 );
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	731 return coeff_count;
53be304c7f54 x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3 michael parents: 4035 diff changeset	732 }
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	733
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	734 static int decode_significance_8x8_x86(CABACContext c, uint8_t significant_coeff_ctx_base, int index, uint8_t sig_off){
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	735 int minusindex= 4-(int)index;
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	736 int coeff_count;
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	737 long last=0;
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	738 asm volatile(
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	739 "movl "RANGE "(%3), %%esi \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	740 "movl "LOW "(%3), %%ebx \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	741
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	742 "mov %1, %%"REG_D" \n\t"
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	743 "2: \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	744
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	745 "mov %6, %%"REG_a" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	746 "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	747 "add %5, %%"REG_D" \n\t"
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	748
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	749 BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	750
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	751 "mov %1, %%edi \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	752 "test $1, %%edx \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	753 " jz 3f \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	754
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	755 "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	756 "add %5, %%"REG_D" \n\t"
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	757
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	758 BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	759
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	760 "mov %2, %%"REG_a" \n\t"
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	761 "mov %1, %%edi \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	762 "movl %%edi, (%%"REG_a") \n\t"
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	763
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	764 "test $1, %%edx \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	765 " jnz 4f \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	766
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	767 "add $4, %%"REG_a" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	768 "mov %%"REG_a", %2 \n\t"
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	769
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	770 "3: \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	771 "addl $1, %%edi \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	772 "mov %%edi, %1 \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	773 "cmpl $63, %%edi \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	774 " jb 2b \n\t"
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	775 "mov %2, %%"REG_a" \n\t"
d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	776 "movl %%edi, (%%"REG_a") \n\t"
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	777 "4: \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	778 "addl %4, %%eax \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	779 "shr $2, %%eax \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	780
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	781 "movl %%esi, "RANGE "(%3) \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	782 "movl %%ebx, "LOW "(%3) \n\t"
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	783 :"=&a"(coeff_count),"+m"(last), "+m"(index)\
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	784 :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)\
4064 d3afe4505951 CABAC assembler optimizations ported to AMD64 reimar parents: 4051 diff changeset	785 : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"\
4051 19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	786 );
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	787 return coeff_count;
19f07b651d79 decode_significance_8x8_x86() michael parents: 4050 diff changeset	788 }
4113 4ce3923d5806 Reenable AMD64 optimizations for cabac accidentially disabled in r6852 reimar parents: 4112 diff changeset	789 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */
1287 9211fbd31353 CABAC michaelni parents: diff changeset	790
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	791 /**
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	792 *
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	793 * @return the number of bytes read or 0 if no end
e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	794 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	795 static int get_cabac_terminate(CABACContext *c){
4024 d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	796 c->range -= 2;
d550343b5dac shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds) michael parents: 4014 diff changeset	797 if(c->low < c->range<<17){
2323 1c39d9786efd optimization michael parents: 2116 diff changeset	798 renorm_cabac_decoder_once(c);
1287 9211fbd31353 CABAC michaelni parents: diff changeset	799 return 0;
9211fbd31353 CABAC michaelni parents: diff changeset	800 }else{
1300 e18667d1e94d FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv) michaelni parents: 1298 diff changeset	801 return c->bytestream - c->bytestream_start;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	802 }
1287 9211fbd31353 CABAC michaelni parents: diff changeset	803 }
9211fbd31353 CABAC michaelni parents: diff changeset	804
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	805 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	806 * get (truncated) unnary binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	807 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	808 static int get_cabac_u(CABACContext c, uint8_t state, int max, int max_index, int truncated){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	809 int i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	810
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	811 for(i=0; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	812 if(get_cabac(c, state)==0)
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	813 return i;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	814
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	815 if(i< max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	816 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	817
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	818 return truncated ? max : -1;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	819 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	820
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	821 /**
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	822 * get unary exp golomb k-th order binarization.
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	823 */
3928 987fffdf6ae7 don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower. lorenm parents: 3642 diff changeset	824 static int get_cabac_ueg(CABACContext c, uint8_t state, int max, int is_signed, int k, int max_index){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	825 int i, v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	826 int m= 1<<k;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	827
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	828 if(get_cabac(c, state)==0)
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	829 return 0;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	830
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	831 if(0 < max_index) state++;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	832
ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	833 for(i=1; i<max; i++){
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	834 if(get_cabac(c, state)==0){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	835 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	836 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	837 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	838 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	839 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	840
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	841 if(i < max_index) state++;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	842 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	843
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	844 while(get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	845 i+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	846 m+= m;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	847 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2736 diff changeset	848
1290 dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	849 v=0;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	850 while(m>>=1){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	851 v+= v + get_cabac_bypass(c);
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	852 }
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	853 i += v;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	854
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	855 if(is_signed && get_cabac_bypass(c)){
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	856 return -i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	857 }else
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	858 return i;
dae280b939ca (truncated) unary binerization michaelni parents: 1287 diff changeset	859 }

Mercurial > libavcodec.hg

annotate cabac.h @ 4167:a3134db4857e libavcodec