Mercurial > libavcodec.hg
annotate cabac.h @ 3990:746a60ba3177 libavcodec
enable CMOV_IS_FAST as its faster or equal speed on every cpu (duron, athlon, PM, P3) from which ive seen benchmarks, it might be slower on P4 but noone has posted benchmarks ...
author | michael |
---|---|
date | Wed, 11 Oct 2006 12:23:40 +0000 |
parents | bb186452e7da |
children | 72bae00a317f |
rev | line source |
---|---|
1287 | 1 /* |
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
1287 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
1287 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
1287 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1287 | 20 * |
21 */ | |
2967 | 22 |
1287 | 23 /** |
24 * @file cabac.h | |
25 * Context Adaptive Binary Arithmetic Coder. | |
26 */ | |
27 | |
28 | |
3284
a224d9752912
don't force asserts in release builds. 2% faster h264.
lorenm
parents:
3036
diff
changeset
|
29 //#undef NDEBUG |
1287 | 30 #include <assert.h> |
31 | |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
/** Number of fresh bitstream bits the decoder pulls in per refill (refill()/refill2() advance by CABAC_BITS/8 bytes). */
#define CABAC_BITS 16
/** All-ones mask over the low CABAC_BITS bits; the decoder refills when these bits of CABACContext.low are all zero. */
#define CABAC_MASK ((1 << CABAC_BITS) - 1)
/** Tested with #ifndef in the x86 get_cabac() code to choose between the branching and branchless asm. */
#define BRANCHLESS_CABAC_DECODER 1
/** Enabled because cmov benchmarked faster or equal on the CPUs tested (may be slower on P4, unbenchmarked). */
#define CMOV_IS_FAST 1
2323 | 36 |
1287 | 37 typedef struct CABACContext{ |
38 int low; | |
39 int range; | |
40 int outstanding_count; | |
41 #ifdef STRICT_LIMITS | |
42 int symCount; | |
43 #endif | |
3976
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
44 uint8_t lps_range[2*65][4]; ///< rangeTabLPS |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
45 uint8_t lps_state[2*64]; ///< transIdxLPS |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
46 uint8_t mps_state[2*64]; ///< transIdxMPS |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
47 const uint8_t *bytestream_start; |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
48 const uint8_t *bytestream; |
2116 | 49 const uint8_t *bytestream_end; |
1287 | 50 PutBitContext pb; |
51 }CABACContext; | |
52 | |
1301 | 53 extern const uint8_t ff_h264_lps_range[64][4]; |
54 extern const uint8_t ff_h264_mps_state[64]; | |
55 extern const uint8_t ff_h264_lps_state[64]; | |
3964 | 56 extern const uint8_t ff_h264_norm_shift[128]; |
2323 | 57 |
1287 | 58 |
59 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
60 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
2967 | 61 void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4], |
1287 | 62 uint8_t const *mps_state, uint8_t const *lps_state, int state_count); |
63 | |
64 | |
65 static inline void put_cabac_bit(CABACContext *c, int b){ | |
2967 | 66 put_bits(&c->pb, 1, b); |
67 for(;c->outstanding_count; c->outstanding_count--){ | |
1287 | 68 put_bits(&c->pb, 1, 1-b); |
69 } | |
70 } | |
71 | |
72 static inline void renorm_cabac_encoder(CABACContext *c){ | |
73 while(c->range < 0x100){ | |
74 //FIXME optimize | |
75 if(c->low<0x100){ | |
76 put_cabac_bit(c, 0); | |
77 }else if(c->low<0x200){ | |
78 c->outstanding_count++; | |
79 c->low -= 0x100; | |
80 }else{ | |
81 put_cabac_bit(c, 1); | |
82 c->low -= 0x200; | |
83 } | |
2967 | 84 |
1287 | 85 c->range+= c->range; |
86 c->low += c->low; | |
87 } | |
88 } | |
89 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
90 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
2323 | 91 int RangeLPS= c->lps_range[*state][c->range>>6]; |
2967 | 92 |
1287 | 93 if(bit == ((*state)&1)){ |
94 c->range -= RangeLPS; | |
95 *state= c->mps_state[*state]; | |
96 }else{ | |
97 c->low += c->range - RangeLPS; | |
98 c->range = RangeLPS; | |
99 *state= c->lps_state[*state]; | |
100 } | |
2967 | 101 |
1287 | 102 renorm_cabac_encoder(c); |
103 | |
104 #ifdef STRICT_LIMITS | |
105 c->symCount++; | |
106 #endif | |
107 } | |
108 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
109 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
1287 | 110 assert(c->range > RangeLPS); |
111 | |
112 if(!bit){ | |
113 c->range -= RangeLPS; | |
114 }else{ | |
115 c->low += c->range - RangeLPS; | |
116 c->range = RangeLPS; | |
117 } | |
118 | |
119 renorm_cabac_encoder(c); | |
120 | |
121 #ifdef STRICT_LIMITS | |
122 c->symCount++; | |
123 #endif | |
124 } | |
125 | |
1290 | 126 /** |
127 * @param bit 0 -> write zero bit, !=0 write one bit | |
128 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
129 static void put_cabac_bypass(CABACContext *c, int bit){ |
1287 | 130 c->low += c->low; |
131 | |
132 if(bit){ | |
133 c->low += c->range; | |
134 } | |
135 //FIXME optimize | |
136 if(c->low<0x200){ | |
137 put_cabac_bit(c, 0); | |
138 }else if(c->low<0x400){ | |
139 c->outstanding_count++; | |
140 c->low -= 0x200; | |
141 }else{ | |
142 put_cabac_bit(c, 1); | |
143 c->low -= 0x400; | |
144 } | |
2967 | 145 |
1287 | 146 #ifdef STRICT_LIMITS |
147 c->symCount++; | |
148 #endif | |
149 } | |
150 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
151 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
152 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
153 * @return the number of bytes written |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
154 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
155 static int put_cabac_terminate(CABACContext *c, int bit){ |
1287 | 156 c->range -= 2; |
157 | |
158 if(!bit){ | |
159 renorm_cabac_encoder(c); | |
160 }else{ | |
161 c->low += c->range; | |
162 c->range= 2; | |
2967 | 163 |
1287 | 164 renorm_cabac_encoder(c); |
165 | |
166 assert(c->low <= 0x1FF); | |
167 put_cabac_bit(c, c->low>>9); | |
168 put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
2967 | 169 |
1287 | 170 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
171 } | |
2967 | 172 |
1287 | 173 #ifdef STRICT_LIMITS |
174 c->symCount++; | |
175 #endif | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
176 |
1787 | 177 return (put_bits_count(&c->pb)+7)>>3; |
1287 | 178 } |
179 | |
1290 | 180 /** |
181 * put (truncated) unary binarization. | |
182 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
183 static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
1290 | 184 int i; |
2967 | 185 |
1290 | 186 assert(v <= max); |
2967 | 187 |
1290 | 188 #if 1 |
189 for(i=0; i<v; i++){ | |
190 put_cabac(c, state, 1); | |
191 if(i < max_index) state++; | |
192 } | |
193 if(truncated==0 || v<max) | |
194 put_cabac(c, state, 0); | |
195 #else | |
196 if(v <= max_index){ | |
197 for(i=0; i<v; i++){ | |
198 put_cabac(c, state+i, 1); | |
199 } | |
200 if(truncated==0 || v<max) | |
201 put_cabac(c, state+i, 0); | |
202 }else{ | |
203 for(i=0; i<=max_index; i++){ | |
204 put_cabac(c, state+i, 1); | |
205 } | |
206 for(; i<v; i++){ | |
207 put_cabac(c, state+max_index, 1); | |
208 } | |
209 if(truncated==0 || v<max) | |
210 put_cabac(c, state+max_index, 0); | |
211 } | |
212 #endif | |
213 } | |
214 | |
215 /** | |
216 * put unary exp golomb k-th order binarization. | |
217 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
218 static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
1290 | 219 int i; |
2967 | 220 |
1290 | 221 if(v==0) |
222 put_cabac(c, state, 0); | |
223 else{ | |
1298 | 224 const int sign= v < 0; |
2967 | 225 |
1298 | 226 if(is_signed) v= ABS(v); |
2967 | 227 |
1290 | 228 if(v<max){ |
229 for(i=0; i<v; i++){ | |
230 put_cabac(c, state, 1); | |
231 if(i < max_index) state++; | |
232 } | |
233 | |
234 put_cabac(c, state, 0); | |
235 }else{ | |
236 int m= 1<<k; | |
237 | |
238 for(i=0; i<max; i++){ | |
239 put_cabac(c, state, 1); | |
240 if(i < max_index) state++; | |
241 } | |
242 | |
243 v -= max; | |
244 while(v >= m){ //FIXME optimize | |
245 put_cabac_bypass(c, 1); | |
246 v-= m; | |
247 m+= m; | |
248 } | |
249 put_cabac_bypass(c, 0); | |
250 while(m>>=1){ | |
251 put_cabac_bypass(c, v&m); | |
252 } | |
253 } | |
254 | |
255 if(is_signed) | |
256 put_cabac_bypass(c, sign); | |
257 } | |
258 } | |
259 | |
2323 | 260 static void refill(CABACContext *c){ |
261 #if CABAC_BITS == 16 | |
3946 | 262 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
2323 | 263 #else |
264 c->low+= c->bytestream[0]<<1; | |
265 #endif | |
266 c->low -= CABAC_MASK; | |
267 c->bytestream+= CABAC_BITS/8; | |
268 } | |
269 | |
270 static void refill2(CABACContext *c){ | |
271 int i, x; | |
272 | |
273 x= c->low ^ (c->low-1); | |
3964 | 274 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS+1)]; |
2323 | 275 |
276 x= -CABAC_MASK; | |
2967 | 277 |
2323 | 278 #if CABAC_BITS == 16 |
279 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
280 #else | |
281 x+= c->bytestream[0]<<1; | |
282 #endif | |
2967 | 283 |
2323 | 284 c->low += x<<i; |
285 c->bytestream+= CABAC_BITS/8; | |
286 } | |
287 | |
1287 | 288 static inline void renorm_cabac_decoder(CABACContext *c){ |
2323 | 289 while(c->range < (0x200 << CABAC_BITS)){ |
1287 | 290 c->range+= c->range; |
291 c->low+= c->low; | |
2323 | 292 if(!(c->low & CABAC_MASK)) |
293 refill(c); | |
1287 | 294 } |
295 } | |
296 | |
2323 | 297 static inline void renorm_cabac_decoder_once(CABACContext *c){ |
3951 | 298 #ifdef ARCH_X86_DISABLED |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
299 int temp; |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
300 #if 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
301 //P3:683 athlon:475 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
302 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
303 "lea -0x2000000(%0), %2 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
304 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
305 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
306 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
307 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
308 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
309 #elif 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
310 //P3:680 athlon:474 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
311 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
312 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
313 "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
314 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
315 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
316 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
317 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
318 #elif 1 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
319 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
320 //P3:665 athlon:517 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
321 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
322 "lea -0x2000000(%0), %%eax \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
323 "cdq \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
324 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
325 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
326 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
327 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
328 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
329 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
330 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
331 #elif 0 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
332 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
333 //P3:673 athlon:509 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
334 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
335 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
336 "sbb %%edx, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
337 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
338 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
339 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
340 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
341 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
342 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
343 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
344 #else |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
345 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
346 //P3:677 athlon:511 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
347 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
348 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
349 "lea (%0, %0), %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
350 "lea (%1, %1), %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
351 "cmovb %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
352 "cmovb %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
353 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
354 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
355 #endif |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
356 #else |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
357 //P3:675 athlon:476 |
3642 | 358 int shift= (uint32_t)(c->range - (0x200 << CABAC_BITS))>>31; |
359 c->range<<= shift; | |
360 c->low <<= shift; | |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
361 #endif |
2323 | 362 if(!(c->low & CABAC_MASK)) |
363 refill(c); | |
364 } | |
365 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
366 static int get_cabac(CABACContext *c, uint8_t * const state){ |
3642 | 367 //FIXME gcc generates duplicate load/stores for c->low and c->range |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
368 #ifdef ARCH_X86 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
369 int bit; |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
370 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
371 #define LOW "0" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
372 #define RANGE "4" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
373 #define LPS_RANGE "12" |
3976
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
374 #define LPS_STATE "12+2*65*4" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
375 #define MPS_STATE "12+2*65*4+2*64" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
376 #define BYTESTART "12+2*65*4+4*64" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
377 #define BYTE "16+2*65*4+4*64" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
378 #define BYTEEND "20+2*65*4+4*64" |
3984 | 379 #ifndef BRANCHLESS_CABAC_DECODER |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
380 asm volatile( |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
381 "movzbl (%1), %%eax \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
382 "movl "RANGE "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
383 "movl "RANGE "(%2), %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
384 "shrl $23, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
385 "leal "LPS_RANGE"(%2, %%eax, 4), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
386 "movzbl (%%ebx, %%esi), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
387 "shll $17, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
388 "movl "LOW "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
389 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
390 "subl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
391 "cmpl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
392 " ja 1f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
393 "cmp $0x2000000, %%edx \n\t" //FIXME avoidable |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
394 "setb %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
395 "shl %%cl, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
396 "shl %%cl, %%ebx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
397 "movzbl "MPS_STATE"(%2, %%eax), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
398 "movb %%cl, (%1) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
399 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
400 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
401 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
402 "movl "BYTE "(%2), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
403 "subl $0xFFFF, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
404 "movzwl (%%esi), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
405 "bswap %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
406 "shrl $15, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
407 "addl $2, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
408 "addl %%ecx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
409 "movl %%esi, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
410 "jmp 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
411 "1: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
412 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
413 "subl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
414 "movl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
415 "shr $19, %%esi \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
416 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
417 "shll %%cl, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
418 "shll %%cl, %%edx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
419 "movzbl "LPS_STATE"(%2, %%eax), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
420 "movb %%cl, (%1) \n\t" |
3978 | 421 "addl $1, %%eax \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
422 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
423 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
424 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
425 "movl "BYTE "(%2), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
426 "movzwl (%%ecx), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
427 "bswap %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
428 "shrl $15, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
429 "subl $0xFFFF, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
430 "addl $2, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
431 "movl %%ecx, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
432 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
433 "leal -1(%%ebx), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
434 "xorl %%ebx, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
435 "shrl $17, %%ecx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
436 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
437 "neg %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
438 "add $7, %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
439 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
440 "shll %%cl , %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
441 "addl %%esi, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
442 "2: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
443 "movl %%edx, "RANGE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
444 "movl %%ebx, "LOW "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
445 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
446 :"r"(state), "r"(c) |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
447 : "%ecx", "%ebx", "%edx", "%esi" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
448 ); |
3982
af16271634c2
moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop
michael
parents:
3981
diff
changeset
|
449 bit&=1; |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
450 #else |
3975 | 451 asm volatile( |
452 "movzbl (%1), %%eax \n\t" | |
453 "movl "RANGE "(%2), %%ebx \n\t" | |
454 "movl "RANGE "(%2), %%edx \n\t" | |
455 "shrl $23, %%ebx \n\t" | |
456 "leal "LPS_RANGE"(%2, %%eax, 4), %%esi \n\t" | |
457 "movzbl (%%ebx, %%esi), %%esi \n\t" | |
458 "shll $17, %%esi \n\t" | |
459 "movl "LOW "(%2), %%ebx \n\t" | |
460 //eax:state ebx:low, edx:range, esi:RangeLPS | |
461 "subl %%esi, %%edx \n\t" | |
3980 | 462 #ifdef CMOV_IS_FAST //FIXME actually define this somewhere |
463 "cmpl %%ebx, %%edx \n\t" | |
464 "cmova %%edx, %%esi \n\t" | |
465 "sbbl %%ecx, %%ecx \n\t" | |
466 "andl %%ecx, %%edx \n\t" | |
467 "subl %%edx, %%ebx \n\t" | |
468 "xorl %%ecx, %%eax \n\t" | |
469 #else | |
3975 | 470 "movl %%edx, %%ecx \n\t" |
471 "subl %%ebx, %%edx \n\t" | |
472 "sarl $31, %%edx \n\t" //lps_mask | |
473 "subl %%ecx, %%esi \n\t" //RangeLPS - range | |
474 "andl %%edx, %%esi \n\t" //(RangeLPS - range)&lps_mask | |
475 "addl %%ecx, %%esi \n\t" //new range | |
476 "andl %%edx, %%ecx \n\t" | |
477 "subl %%ecx, %%ebx \n\t" | |
3980 | 478 "xorl %%edx, %%eax \n\t" |
479 #endif | |
3975 | 480 |
481 //eax:state ebx:low edx:mask esi:range | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
482 "movzbl "MPS_STATE"(%2, %%eax), %%ecx \n\t" |
3975 | 483 "movb %%cl, (%1) \n\t" |
484 | |
485 "movl %%esi, %%edx \n\t" | |
486 //eax:bit ebx:low edx:range esi:range | |
487 | |
488 "shr $19, %%esi \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
489 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3975 | 490 "shll %%cl, %%ebx \n\t" |
491 "shll %%cl, %%edx \n\t" | |
492 "test %%bx, %%bx \n\t" | |
493 " jnz 1f \n\t" | |
494 | |
495 "movl "BYTE "(%2), %%ecx \n\t" | |
496 "movzwl (%%ecx), %%esi \n\t" | |
497 "bswap %%esi \n\t" | |
498 "shrl $15, %%esi \n\t" | |
499 "subl $0xFFFF, %%esi \n\t" | |
500 "addl $2, %%ecx \n\t" | |
501 "movl %%ecx, "BYTE "(%2) \n\t" | |
502 | |
503 "leal -1(%%ebx), %%ecx \n\t" | |
504 "xorl %%ebx, %%ecx \n\t" | |
505 "shrl $17, %%ecx \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
506 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3975 | 507 "neg %%cl \n\t" |
508 "add $7, %%cl \n\t" | |
509 | |
510 "shll %%cl , %%esi \n\t" | |
511 "addl %%esi, %%ebx \n\t" | |
512 "1: \n\t" | |
513 "movl %%edx, "RANGE "(%2) \n\t" | |
514 "movl %%ebx, "LOW "(%2) \n\t" | |
515 :"=&a"(bit) | |
516 :"r"(state), "r"(c) | |
517 : "%ecx", "%ebx", "%edx", "%esi" | |
518 ); | |
3981
9854f686ba79
move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster)
michael
parents:
3980
diff
changeset
|
519 bit&=1; |
3975 | 520 #endif |
521 #else | |
3642 | 522 int s = *state; |
523 int RangeLPS= c->lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1); | |
2522
e25782262d7d
kill warnings patch by (Måns Rullgård <mru inprovide com>)
michael
parents:
2323
diff
changeset
|
524 int bit, lps_mask attribute_unused; |
2967 | 525 |
1287 | 526 c->range -= RangeLPS; |
3984 | 527 #ifndef BRANCHLESS_CABAC_DECODER |
1287 | 528 if(c->low < c->range){ |
3642 | 529 bit= s&1; |
3971
e8a6d5c1ab0b
drop failed attempt to optimize *state= c->mps_state[s];
michael
parents:
3970
diff
changeset
|
530 *state= c->mps_state[s]; |
2323 | 531 renorm_cabac_decoder_once(c); |
1287 | 532 }else{ |
3964 | 533 bit= ff_h264_norm_shift[RangeLPS>>19]; |
1287 | 534 c->low -= c->range; |
3642 | 535 *state= c->lps_state[s]; |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
536 c->range = RangeLPS<<bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
537 c->low <<= bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
538 bit= (s&1)^1; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
539 |
2323 | 540 if(!(c->low & 0xFFFF)){ |
541 refill2(c); | |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
542 } |
1287 | 543 } |
2323 | 544 #else |
545 lps_mask= (c->range - c->low)>>31; | |
2967 | 546 |
2323 | 547 c->low -= c->range & lps_mask; |
548 c->range += (RangeLPS - c->range) & lps_mask; | |
2967 | 549 |
3974 | 550 s^=lps_mask; |
551 *state= c->mps_state[s]; | |
552 bit= s&1; | |
2967 | 553 |
3970 | 554 lps_mask= ff_h264_norm_shift[c->range>>(CABAC_BITS+3)]; |
2323 | 555 c->range<<= lps_mask; |
556 c->low <<= lps_mask; | |
557 if(!(c->low & CABAC_MASK)) | |
558 refill2(c); | |
559 #endif | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
560 #endif |
2967 | 561 return bit; |
1287 | 562 } |
563 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
564 static int get_cabac_bypass(CABACContext *c){ |
1287 | 565 c->low += c->low; |
566 | |
2323 | 567 if(!(c->low & CABAC_MASK)) |
568 refill(c); | |
2967 | 569 |
1287 | 570 if(c->low < c->range){ |
571 return 0; | |
572 }else{ | |
573 c->low -= c->range; | |
574 return 1; | |
575 } | |
576 } | |
577 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
578 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
579 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
580 * @return the number of bytes read or 0 if no end |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
581 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
582 static int get_cabac_terminate(CABACContext *c){ |
2323 | 583 c->range -= 4<<CABAC_BITS; |
1287 | 584 if(c->low < c->range){ |
2323 | 585 renorm_cabac_decoder_once(c); |
1287 | 586 return 0; |
587 }else{ | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
588 return c->bytestream - c->bytestream_start; |
2967 | 589 } |
1287 | 590 } |
591 | |
1290 | 592 /** |
593 * get (truncated) unnary binarization. | |
594 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
595 static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
1290 | 596 int i; |
2967 | 597 |
598 for(i=0; i<max; i++){ | |
1290 | 599 if(get_cabac(c, state)==0) |
600 return i; | |
2967 | 601 |
1290 | 602 if(i< max_index) state++; |
603 } | |
604 | |
605 return truncated ? max : -1; | |
606 } | |
607 | |
608 /** | |
609 * get unary exp golomb k-th order binarization. | |
610 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
611 static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
1290 | 612 int i, v; |
613 int m= 1<<k; | |
2967 | 614 |
615 if(get_cabac(c, state)==0) | |
1290 | 616 return 0; |
2967 | 617 |
1290 | 618 if(0 < max_index) state++; |
2967 | 619 |
620 for(i=1; i<max; i++){ | |
1290 | 621 if(get_cabac(c, state)==0){ |
622 if(is_signed && get_cabac_bypass(c)){ | |
623 return -i; | |
624 }else | |
625 return i; | |
626 } | |
627 | |
628 if(i < max_index) state++; | |
629 } | |
2967 | 630 |
1290 | 631 while(get_cabac_bypass(c)){ |
632 i+= m; | |
633 m+= m; | |
634 } | |
2967 | 635 |
1290 | 636 v=0; |
637 while(m>>=1){ | |
638 v+= v + get_cabac_bypass(c); | |
639 } | |
640 i += v; | |
641 | |
642 if(is_signed && get_cabac_bypass(c)){ | |
643 return -i; | |
644 }else | |
645 return i; | |
646 } |