Mercurial > libavcodec.hg
annotate cabac.h @ 3992:a09b5b667229 libavcodec
10l
author | michael |
---|---|
date | Wed, 11 Oct 2006 13:25:29 +0000 |
parents | 72bae00a317f |
children | 8b7c59b7af01 |
rev | line source |
---|---|
1287 | 1 /* |
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
1287 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
1287 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
1287 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1287 | 20 * |
21 */ | |
2967 | 22 |
1287 | 23 /** |
24 * @file cabac.h | |
25 * Context Adaptive Binary Arithmetic Coder. | |
26 */ | |
27 | |
28 | |
3284
a224d9752912
don't force asserts in release builds. 2% faster h264.
lorenm
parents:
3036
diff
changeset
|
29 //#undef NDEBUG |
1287 | 30 #include <assert.h> |
31 | |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
/* Number of bits the decoder pulls in per refill; refill() advances the bytestream by CABAC_BITS/8 bytes. */
32 #define CABAC_BITS 16 |
/* Mask covering the low CABAC_BITS bits; the decoder refills when (low & CABAC_MASK) is zero. */
2323 | 33 #define CABAC_MASK ((1<<CABAC_BITS)-1) |
/* get_cabac() checks this with #ifndef, so defining it to any value (even 0) disables the code guarded by that #ifndef. */
3984 | 34 #define BRANCHLESS_CABAC_DECODER 1 |
3990
746a60ba3177
enable CMOV_IS_FAST as its faster or equal speed on every cpu (duron, athlon, PM, P3) from which ive seen benchmarks, it might be slower on P4 but noone has posted benchmarks ...
michael
parents:
3984
diff
changeset
|
/* NOTE(review): no use of CMOV_IS_FAST is visible in this part of the file; presumably it selects a cmov-based decoder variant -- confirm. */
35 #define CMOV_IS_FAST 1 |
2323 | 36 |
/**
 * State shared by the CABAC encoder and decoder routines in this header.
 *
 * low / range          - current interval registers updated by every put_ and get_ routine.
 * outstanding_count    - number of inverted bits put_cabac_bit() still has to emit after the next bit.
 * symCount             - only present when STRICT_LIMITS is defined; incremented once per coded symbol.
 * lps_state/mps_state  - next-state tables indexed by the current state byte.
 * bytestream*          - input pointers used by the decoder refill routines.
 * pb                   - bit writer used by the encoder.
 *
 * NOTE(review): the x86 asm in get_cabac() addresses these fields through hard-coded
 * byte offsets (LOW 0, RANGE 4, LPS_STATE 12, MPS_STATE 12+2*64, BYTESTART 12+4*64, ...).
 * Those offsets match this layout only with 4-byte int and pointers and with
 * STRICT_LIMITS undefined, so do not reorder or resize the leading fields without
 * updating them.
 */
1287 | 37 typedef struct CABACContext{ |
38 int low; | |
39 int range; | |
40 int outstanding_count; | |
41 #ifdef STRICT_LIMITS | |
42 int symCount; | |
43 #endif | |
3976
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
44 uint8_t lps_state[2*64]; ///< transIdxLPS |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
45 uint8_t mps_state[2*64]; ///< transIdxMPS |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
46 const uint8_t *bytestream_start; |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
47 const uint8_t *bytestream; |
2116 | 48 const uint8_t *bytestream_end; |
1287 | 49 PutBitContext pb; |
50 }CABACContext; | |
51 | |
3991
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
/*
 * Shared lookup tables. ff_h264_lps_range is the only one declared without const.
 * NOTE(review): presumably it is filled in at runtime by one of the init functions below -- confirm.
 * It has 2*65 rows although the per-context state tables have 2*64 entries, and put_cabac()
 * indexes its second dimension with an unmasked range>>6.
 */
52 extern uint8_t ff_h264_lps_range[2*65][4]; ///< rangeTabLPS |
1301 | 53 extern const uint8_t ff_h264_mps_state[64]; |
54 extern const uint8_t ff_h264_lps_state[64]; | |
3964 | 55 extern const uint8_t ff_h264_norm_shift[128]; |
2323 | 56 |
1287 | 57 |
/* Initialisation entry points; their definitions are not part of this header. */
58 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
59 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
3991
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
60 void ff_init_cabac_states(CABACContext *c, |
1287 | 61 uint8_t const *mps_state, uint8_t const *lps_state, int state_count); |
62 | |
63 | |
/**
 * Write bit b to the bit writer, then flush every outstanding bit as the
 * inverse value (1-b). outstanding_count is zero on return.
 */
64 static inline void put_cabac_bit(CABACContext *c, int b){ | |
2967 | 65 put_bits(&c->pb, 1, b); |
66 for(;c->outstanding_count; c->outstanding_count--){ | |
1287 | 67 put_bits(&c->pb, 1, 1-b); |
68 } | |
69 } | |
70 | |
/**
 * Encoder renormalisation: repeat until range is at least 0x100.
 * Each iteration resolves one bit from low and then doubles range and low:
 *   low <  0x100 : emit a 0 bit
 *   low <  0x200 : cannot decide yet, count one more outstanding bit and drop 0x100 from low
 *   otherwise    : emit a 1 bit and drop 0x200 from low
 */
71 static inline void renorm_cabac_encoder(CABACContext *c){ | |
72 while(c->range < 0x100){ | |
73 //FIXME optimize | |
74 if(c->low<0x100){ | |
75 put_cabac_bit(c, 0); | |
76 }else if(c->low<0x200){ | |
77 c->outstanding_count++; | |
78 c->low -= 0x100; | |
79 }else{ | |
80 put_cabac_bit(c, 1); | |
81 c->low -= 0x200; | |
82 } | |
2967 | 83 |
1287 | 84 c->range+= c->range; |
85 c->low += c->low; | |
86 } | |
87 } | |
88 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
89 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
/*
 * Encode one bit with the adaptive context pointed to by state.
 * The low bit of *state is the currently most probable symbol: when bit matches it,
 * range shrinks by RangeLPS and the state follows mps_state; otherwise low jumps past
 * the MPS sub-interval, range becomes RangeLPS and the state follows lps_state.
 * The interval is renormalised afterwards.
 * NOTE(review): the second table index is range>>6 without an &3 mask, so it can exceed
 * the declared inner dimension of 4; presumably the table is laid out with a matching
 * offset (it has 2*65 rows) -- confirm against the table initialisation.
 */
3991
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
90 int RangeLPS= ff_h264_lps_range[*state][c->range>>6]; |
2967 | 91 |
1287 | 92 if(bit == ((*state)&1)){ |
93 c->range -= RangeLPS; | |
94 *state= c->mps_state[*state]; | |
95 }else{ | |
96 c->low += c->range - RangeLPS; | |
97 c->range = RangeLPS; | |
98 *state= c->lps_state[*state]; | |
99 } | |
2967 | 100 |
1287 | 101 renorm_cabac_encoder(c); |
102 | |
103 #ifdef STRICT_LIMITS | |
104 c->symCount++; | |
105 #endif | |
106 } | |
107 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
108 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
/*
 * Encode one bit with a caller-supplied, fixed RangeLPS and no adaptive state.
 * bit==0 takes the larger (range - RangeLPS) sub-interval, any other value takes
 * the RangeLPS sub-interval. The caller must pass RangeLPS smaller than the
 * current range (checked by the assert only when asserts are enabled).
 */
1287 | 109 assert(c->range > RangeLPS); |
110 | |
111 if(!bit){ | |
112 c->range -= RangeLPS; | |
113 }else{ | |
114 c->low += c->range - RangeLPS; | |
115 c->range = RangeLPS; | |
116 } | |
117 | |
118 renorm_cabac_encoder(c); | |
119 | |
120 #ifdef STRICT_LIMITS | |
121 c->symCount++; | |
122 #endif | |
123 } | |
124 | |
1290 | 125 /** |
126 * @param bit 0 -> write zero bit, !=0 write one bit | |
127 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
128 static void put_cabac_bypass(CABACContext *c, int bit){ |
/*
 * Bypass coding: range is left untouched and no context state is used.
 * low is doubled, range is added when bit is non-zero, and exactly one output
 * decision is made using thresholds twice as large as in renorm_cabac_encoder()
 * (0x200 / 0x400 instead of 0x100 / 0x200).
 */
1287 | 129 c->low += c->low; |
130 | |
131 if(bit){ | |
132 c->low += c->range; | |
133 } | |
134 //FIXME optimize | |
135 if(c->low<0x200){ | |
136 put_cabac_bit(c, 0); | |
137 }else if(c->low<0x400){ | |
138 c->outstanding_count++; | |
139 c->low -= 0x200; | |
140 }else{ | |
141 put_cabac_bit(c, 1); | |
142 c->low -= 0x400; | |
143 } | |
2967 | 144 |
1287 | 145 #ifdef STRICT_LIMITS |
146 c->symCount++; | |
147 #endif | |
148 } | |
149 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
150 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
151 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
152 * @return the number of bytes written |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
153 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
154 static int put_cabac_terminate(CABACContext *c, int bit){ |
/*
 * Encode the terminate bin. range is first reduced by 2.
 * bit==0: only renormalise and continue coding.
 * bit!=0: select the 2-wide sub-interval at the top, renormalise, write the
 *         closing bits and flush the bit writer.
 * The return value is the bit writer position rounded up to whole bytes, and it
 * is computed for both values of bit.
 */
1287 | 155 c->range -= 2; |
156 | |
157 if(!bit){ | |
158 renorm_cabac_encoder(c); | |
159 }else{ | |
160 c->low += c->range; | |
161 c->range= 2; | |
2967 | 162 |
1287 | 163 renorm_cabac_encoder(c); |
164 | |
/* Given the assert below, low>>9 is 0 for any non-negative low that passes it. */
165 assert(c->low <= 0x1FF); | |
166 put_cabac_bit(c, c->low>>9); | |
/* Two more bits taken from bits 8..7 of low, with the last one forced to 1. */
167 put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
2967 | 168 |
1287 | 169 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
170 } | |
2967 | 171 |
1287 | 172 #ifdef STRICT_LIMITS |
173 c->symCount++; | |
174 #endif | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
175 |
1787 | 176 return (put_bits_count(&c->pb)+7)>>3; |
1287 | 177 } |
178 | |
1290 | 179 /** |
180 * put (truncated) unary binarization. | |
181 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
182 static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
/*
 * Writes v one-bins followed by a terminating zero-bin. The zero-bin is omitted
 * only when truncated is non-zero and v equals max.
 * The context pointer advances by one after each of the first max_index bins and
 * then stays on the last context for all remaining bins.
 * v must not exceed max (assert only). The #else branch is a disabled alternative
 * formulation kept for reference.
 */
1290 | 183 int i; |
2967 | 184 |
1290 | 185 assert(v <= max); |
2967 | 186 |
1290 | 187 #if 1 |
188 for(i=0; i<v; i++){ | |
189 put_cabac(c, state, 1); | |
190 if(i < max_index) state++; | |
191 } | |
192 if(truncated==0 || v<max) | |
193 put_cabac(c, state, 0); | |
194 #else | |
195 if(v <= max_index){ | |
196 for(i=0; i<v; i++){ | |
197 put_cabac(c, state+i, 1); | |
198 } | |
199 if(truncated==0 || v<max) | |
200 put_cabac(c, state+i, 0); | |
201 }else{ | |
202 for(i=0; i<=max_index; i++){ | |
203 put_cabac(c, state+i, 1); | |
204 } | |
205 for(; i<v; i++){ | |
206 put_cabac(c, state+max_index, 1); | |
207 } | |
208 if(truncated==0 || v<max) | |
209 put_cabac(c, state+max_index, 0); | |
210 } | |
211 #endif | |
212 } | |
213 | |
214 /** | |
215 * put unary exp golomb k-th order binarization. | |
216 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
217 static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
/*
 * v == 0 is coded as a single zero-bin.
 * Otherwise the magnitude is coded as a context-coded unary prefix of at most max
 * one-bins (context pointer advancing for the first max_index bins). When the
 * magnitude reaches max, the remainder (v - max) follows as a bypass-coded
 * Exp-Golomb suffix starting at order k. With is_signed set, the sign is
 * appended last as one bypass bin (1 for negative input).
 */
1290 | 218 int i; |
2967 | 219 |
1290 | 220 if(v==0) |
221 put_cabac(c, state, 0); | |
222 else{ | |
/* The sign is captured before v is replaced by its absolute value. */
1298 | 223 const int sign= v < 0; |
2967 | 224 |
1298 | 225 if(is_signed) v= ABS(v); |
2967 | 226 |
1290 | 227 if(v<max){ |
228 for(i=0; i<v; i++){ | |
229 put_cabac(c, state, 1); | |
230 if(i < max_index) state++; | |
231 } | |
232 | |
233 put_cabac(c, state, 0); | |
234 }else{ | |
235 int m= 1<<k; | |
236 | |
237 for(i=0; i<max; i++){ | |
238 put_cabac(c, state, 1); | |
239 if(i < max_index) state++; | |
240 } | |
241 | |
242 v -= max; | |
/* Suffix prefix part: one 1-bin per doubling of m while the remainder still covers m. */
243 while(v >= m){ //FIXME optimize | |
244 put_cabac_bypass(c, 1); | |
245 v-= m; | |
246 m+= m; | |
247 } | |
248 put_cabac_bypass(c, 0); | |
/* Remaining bits of v from the bit below m downwards; v&m is passed as-is since bypass treats any non-zero value as 1. */
249 while(m>>=1){ | |
250 put_cabac_bypass(c, v&m); | |
251 } | |
252 } | |
253 | |
254 if(is_signed) | |
255 put_cabac_bypass(c, sign); | |
256 } | |
257 } | |
258 | |
/**
 * Feed the next CABAC_BITS/8 input bytes into low and advance the bytestream pointer.
 * With CABAC_BITS == 16 two bytes are combined in big-endian order; the value is
 * shifted left by one and CABAC_MASK is subtracted from low.
 * NOTE(review): nothing here compares bytestream against bytestream_end; presumably
 * callers guarantee enough padding after the input -- confirm.
 */
2323 | 259 static void refill(CABACContext *c){ |
260 #if CABAC_BITS == 16 | |
3946 | 261 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
2323 | 262 #else |
263 c->low+= c->bytestream[0]<<1; | |
264 #endif | |
265 c->low -= CABAC_MASK; | |
266 c->bytestream+= CABAC_BITS/8; | |
267 } | |
268 | |
/**
 * Variant of refill() that inserts the new input at a computed bit position
 * instead of at a fixed one. The shift amount i is derived from the lowest set
 * bit of low through the ff_h264_norm_shift table.
 * NOTE(review): like refill(), this reads the bytestream without an end check.
 */
269 static void refill2(CABACContext *c){ | |
270 int i, x; | |
271 | |
/* low ^ (low-1) sets every bit up to and including the lowest set bit of low. */
272 x= c->low ^ (c->low-1); | |
3964 | 273 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS+1)]; |
2323 | 274 |
275 x= -CABAC_MASK; | |
2967 | 276 |
2323 | 277 #if CABAC_BITS == 16 |
278 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
279 #else | |
280 x+= c->bytestream[0]<<1; | |
281 #endif | |
2967 | 282 |
2323 | 283 c->low += x<<i; |
284 c->bytestream+= CABAC_BITS/8; | |
285 } | |
286 | |
/**
 * Decoder renormalisation: double range and low until range is at least
 * (0x200 << CABAC_BITS). After each doubling, refill() is called when the low
 * CABAC_BITS bits of low are all zero.
 */
1287 | 287 static inline void renorm_cabac_decoder(CABACContext *c){ |
2323 | 288 while(c->range < (0x200 << CABAC_BITS)){ |
1287 | 289 c->range+= c->range; |
290 c->low+= c->low; | |
2323 | 291 if(!(c->low & CABAC_MASK)) |
292 refill(c); | |
1287 | 293 } |
294 } | |
295 | |
/**
 * Single-step decoder renormalisation: range and low are doubled at most once,
 * namely when range is below (0x200 << CABAC_BITS), followed by the same refill
 * check as renorm_cabac_decoder().
 *
 * The inline-asm variants are compiled only when ARCH_X86_DISABLED is defined;
 * among them the preprocessor picks the one under the elif 1 line. Otherwise the
 * portable C code near the end of the function is used.
 * NOTE(review): the P3/athlon numbers in the comments look like per-variant timing
 * scores (the changeset message mentions START/STOP_TIMER) -- confirm.
 * NOTE(review): the asm variants hard-code 0x2000000, which equals
 * 0x200 << CABAC_BITS only for CABAC_BITS == 16.
 */
2323 | 296 static inline void renorm_cabac_decoder_once(CABACContext *c){ |
3951 | 297 #ifdef ARCH_X86_DISABLED |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
298 int temp; |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
299 #if 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
300 //P3:683 athlon:475 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
301 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
302 "lea -0x2000000(%0), %2 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
303 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
304 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
305 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
306 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
307 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
308 #elif 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
309 //P3:680 athlon:474 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
310 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
311 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
312 "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
313 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
314 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
315 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
316 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
317 #elif 1 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
318 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
319 //P3:665 athlon:517 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
320 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
321 "lea -0x2000000(%0), %%eax \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
322 "cdq \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
323 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
324 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
325 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
326 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
327 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
328 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
329 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
330 #elif 0 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
331 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
332 //P3:673 athlon:509 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
333 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
334 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
335 "sbb %%edx, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
336 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
337 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
338 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
339 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
340 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
341 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
342 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
343 #else |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
344 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
345 //P3:677 athlon:511 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
346 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
347 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
348 "lea (%0, %0), %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
349 "lea (%1, %1), %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
350 "cmovb %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
351 "cmovb %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
352 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
353 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
354 #endif |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
355 #else |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
356 //P3:675 athlon:476 |
/* Portable path: shift is the top bit of the 32-bit difference, i.e. 1 when range is below the threshold and 0 otherwise. */
3642 | 357 int shift= (uint32_t)(c->range - (0x200 << CABAC_BITS))>>31; |
358 c->range<<= shift; | |
359 c->low <<= shift; | |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
360 #endif |
/* Same refill condition as in renorm_cabac_decoder(): low CABAC_BITS bits of low all zero. */
2323 | 361 if(!(c->low & CABAC_MASK)) |
362 refill(c); | |
363 } | |
364 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
365 static int get_cabac(CABACContext *c, uint8_t * const state){ |
3642 | 366 //FIXME gcc generates duplicate load/stores for c->low and c->range |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
367 #ifdef ARCH_X86 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
368 int bit; |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
369 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
370 #define LOW "0" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
371 #define RANGE "4" |
3991
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
372 #define LPS_STATE "12" |
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
373 #define MPS_STATE "12+2*64" |
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
374 #define BYTESTART "12+4*64" |
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
375 #define BYTE "16+4*64" |
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
376 #define BYTEEND "20+4*64" |
3984 | 377 #ifndef BRANCHLESS_CABAC_DECODER |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
378 asm volatile( |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
379 "movzbl (%1), %%eax \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
380 "movl "RANGE "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
381 "movl "RANGE "(%2), %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
382 "shrl $23, %%ebx \n\t" |
3992 | 383 "movzbl "MANGLE(ff_h264_lps_range)"(%%ebx, %%eax, 4), %%esi\n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
384 "shll $17, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
385 "movl "LOW "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
386 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
387 "subl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
388 "cmpl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
389 " ja 1f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
390 "cmp $0x2000000, %%edx \n\t" //FIXME avoidable |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
391 "setb %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
392 "shl %%cl, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
393 "shl %%cl, %%ebx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
394 "movzbl "MPS_STATE"(%2, %%eax), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
395 "movb %%cl, (%1) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
396 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
397 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
398 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
399 "movl "BYTE "(%2), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
400 "subl $0xFFFF, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
401 "movzwl (%%esi), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
402 "bswap %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
403 "shrl $15, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
404 "addl $2, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
405 "addl %%ecx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
406 "movl %%esi, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
407 "jmp 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
408 "1: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
409 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
410 "subl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
411 "movl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
412 "shr $19, %%esi \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
413 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
414 "shll %%cl, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
415 "shll %%cl, %%edx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
416 "movzbl "LPS_STATE"(%2, %%eax), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
417 "movb %%cl, (%1) \n\t" |
3978 | 418 "addl $1, %%eax \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
419 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
420 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
421 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
422 "movl "BYTE "(%2), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
423 "movzwl (%%ecx), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
424 "bswap %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
425 "shrl $15, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
426 "subl $0xFFFF, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
427 "addl $2, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
428 "movl %%ecx, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
429 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
430 "leal -1(%%ebx), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
431 "xorl %%ebx, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
432 "shrl $17, %%ecx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
433 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
434 "neg %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
435 "add $7, %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
436 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
437 "shll %%cl , %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
438 "addl %%esi, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
439 "2: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
440 "movl %%edx, "RANGE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
441 "movl %%ebx, "LOW "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
442 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
443 :"r"(state), "r"(c) |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
444 : "%ecx", "%ebx", "%edx", "%esi" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
445 ); |
3982
af16271634c2
moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop
michael
parents:
3981
diff
changeset
|
446 bit&=1; |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
447 #else |
3975 | 448 asm volatile( |
449 "movzbl (%1), %%eax \n\t" | |
450 "movl "RANGE "(%2), %%ebx \n\t" | |
451 "movl "RANGE "(%2), %%edx \n\t" | |
452 "shrl $23, %%ebx \n\t" | |
3992 | 453 "movzbl "MANGLE(ff_h264_lps_range)"(%%ebx, %%eax, 4), %%esi\n\t" |
3975 | 454 "shll $17, %%esi \n\t" |
455 "movl "LOW "(%2), %%ebx \n\t" | |
456 //eax:state ebx:low, edx:range, esi:RangeLPS | |
457 "subl %%esi, %%edx \n\t" | |
3980 | 458 #ifdef CMOV_IS_FAST //FIXME actually define this somewhere |
459 "cmpl %%ebx, %%edx \n\t" | |
460 "cmova %%edx, %%esi \n\t" | |
461 "sbbl %%ecx, %%ecx \n\t" | |
462 "andl %%ecx, %%edx \n\t" | |
463 "subl %%edx, %%ebx \n\t" | |
464 "xorl %%ecx, %%eax \n\t" | |
465 #else | |
3975 | 466 "movl %%edx, %%ecx \n\t" |
467 "subl %%ebx, %%edx \n\t" | |
468 "sarl $31, %%edx \n\t" //lps_mask | |
469 "subl %%ecx, %%esi \n\t" //RangeLPS - range | |
470 "andl %%edx, %%esi \n\t" //(RangeLPS - range)&lps_mask | |
471 "addl %%ecx, %%esi \n\t" //new range | |
472 "andl %%edx, %%ecx \n\t" | |
473 "subl %%ecx, %%ebx \n\t" | |
3980 | 474 "xorl %%edx, %%eax \n\t" |
475 #endif | |
3975 | 476 |
477 //eax:state ebx:low edx:mask esi:range | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
478 "movzbl "MPS_STATE"(%2, %%eax), %%ecx \n\t" |
3975 | 479 "movb %%cl, (%1) \n\t" |
480 | |
481 "movl %%esi, %%edx \n\t" | |
482 //eax:bit ebx:low edx:range esi:range | |
483 | |
484 "shr $19, %%esi \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
485 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3975 | 486 "shll %%cl, %%ebx \n\t" |
487 "shll %%cl, %%edx \n\t" | |
488 "test %%bx, %%bx \n\t" | |
489 " jnz 1f \n\t" | |
490 | |
491 "movl "BYTE "(%2), %%ecx \n\t" | |
492 "movzwl (%%ecx), %%esi \n\t" | |
493 "bswap %%esi \n\t" | |
494 "shrl $15, %%esi \n\t" | |
495 "subl $0xFFFF, %%esi \n\t" | |
496 "addl $2, %%ecx \n\t" | |
497 "movl %%ecx, "BYTE "(%2) \n\t" | |
498 | |
499 "leal -1(%%ebx), %%ecx \n\t" | |
500 "xorl %%ebx, %%ecx \n\t" | |
501 "shrl $17, %%ecx \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
502 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3975 | 503 "neg %%cl \n\t" |
504 "add $7, %%cl \n\t" | |
505 | |
506 "shll %%cl , %%esi \n\t" | |
507 "addl %%esi, %%ebx \n\t" | |
508 "1: \n\t" | |
509 "movl %%edx, "RANGE "(%2) \n\t" | |
510 "movl %%ebx, "LOW "(%2) \n\t" | |
511 :"=&a"(bit) | |
512 :"r"(state), "r"(c) | |
513 : "%ecx", "%ebx", "%edx", "%esi" | |
514 ); | |
3981
9854f686ba79
move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster)
michael
parents:
3980
diff
changeset
|
515 bit&=1; |
3975 | 516 #endif |
517 #else | |
3642 | 518 int s = *state; |
3991
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
519 int RangeLPS= ff_h264_lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1); |
2522
e25782262d7d
kill warnings patch by (M«©ns Rullg«©rd <mru inprovide com>)
michael
parents:
2323
diff
changeset
|
520 int bit, lps_mask attribute_unused; |
2967 | 521 |
1287 | 522 c->range -= RangeLPS; |
3984 | 523 #ifndef BRANCHLESS_CABAC_DECODER |
1287 | 524 if(c->low < c->range){ |
3642 | 525 bit= s&1; |
3971
e8a6d5c1ab0b
drop failed attempt to optimize *state= c->mps_state[s];
michael
parents:
3970
diff
changeset
|
526 *state= c->mps_state[s]; |
2323 | 527 renorm_cabac_decoder_once(c); |
1287 | 528 }else{ |
3964 | 529 bit= ff_h264_norm_shift[RangeLPS>>19]; |
1287 | 530 c->low -= c->range; |
3642 | 531 *state= c->lps_state[s]; |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
532 c->range = RangeLPS<<bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
533 c->low <<= bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
534 bit= (s&1)^1; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
535 |
2323 | 536 if(!(c->low & 0xFFFF)){ |
537 refill2(c); | |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
538 } |
1287 | 539 } |
2323 | 540 #else |
541 lps_mask= (c->range - c->low)>>31; | |
2967 | 542 |
2323 | 543 c->low -= c->range & lps_mask; |
544 c->range += (RangeLPS - c->range) & lps_mask; | |
2967 | 545 |
3974 | 546 s^=lps_mask; |
547 *state= c->mps_state[s]; | |
548 bit= s&1; | |
2967 | 549 |
3970 | 550 lps_mask= ff_h264_norm_shift[c->range>>(CABAC_BITS+3)]; |
2323 | 551 c->range<<= lps_mask; |
552 c->low <<= lps_mask; | |
553 if(!(c->low & CABAC_MASK)) | |
554 refill2(c); | |
555 #endif | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
556 #endif |
2967 | 557 return bit; |
1287 | 558 } |
559 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
560 static int get_cabac_bypass(CABACContext *c){ |
1287 | 561 c->low += c->low; |
562 | |
2323 | 563 if(!(c->low & CABAC_MASK)) |
564 refill(c); | |
2967 | 565 |
1287 | 566 if(c->low < c->range){ |
567 return 0; | |
568 }else{ | |
569 c->low -= c->range; | |
570 return 1; | |
571 } | |
572 } | |
573 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
574 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
575 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
576 * @return the number of bytes read, or 0 if the end has not been reached |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
577 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
578 static int get_cabac_terminate(CABACContext *c){ |
2323 | 579 c->range -= 4<<CABAC_BITS; |
1287 | 580 if(c->low < c->range){ |
2323 | 581 renorm_cabac_decoder_once(c); |
1287 | 582 return 0; |
583 }else{ | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
584 return c->bytestream - c->bytestream_start; |
2967 | 585 } |
1287 | 586 } |
587 | |
1290 | 588 /** |
589 * get (truncated) unary binarization. | |
590 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
591 static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
1290 | 592 int i; |
2967 | 593 |
594 for(i=0; i<max; i++){ | |
1290 | 595 if(get_cabac(c, state)==0) |
596 return i; | |
2967 | 597 |
1290 | 598 if(i< max_index) state++; |
599 } | |
600 | |
601 return truncated ? max : -1; | |
602 } | |
603 | |
604 /** | |
605 * get unary exp golomb k-th order binarization. | |
606 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
607 static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
1290 | 608 int i, v; |
609 int m= 1<<k; | |
2967 | 610 |
611 if(get_cabac(c, state)==0) | |
1290 | 612 return 0; |
2967 | 613 |
1290 | 614 if(0 < max_index) state++; |
2967 | 615 |
616 for(i=1; i<max; i++){ | |
1290 | 617 if(get_cabac(c, state)==0){ |
618 if(is_signed && get_cabac_bypass(c)){ | |
619 return -i; | |
620 }else | |
621 return i; | |
622 } | |
623 | |
624 if(i < max_index) state++; | |
625 } | |
2967 | 626 |
1290 | 627 while(get_cabac_bypass(c)){ |
628 i+= m; | |
629 m+= m; | |
630 } | |
2967 | 631 |
1290 | 632 v=0; |
633 while(m>>=1){ | |
634 v+= v + get_cabac_bypass(c); | |
635 } | |
636 i += v; | |
637 | |
638 if(is_signed && get_cabac_bypass(c)){ | |
639 return -i; | |
640 }else | |
641 return i; | |
642 } |