Mercurial > libavcodec.hg
annotate cabac.h @ 4167:a3134db4857e libavcodec
store an identifier and the first header in extradata
with this, mp3 should be binary identical to what you had before header compression
support mp3 with crc (by dropping the crc and putting it back during header decompress; currently it's just random though, does any decoder even check it?)
author | michael |
---|---|
date | Fri, 10 Nov 2006 11:31:02 +0000 |
parents | 4ce3923d5806 |
children | 92f773cfebf5 |
rev | line source |
---|---|
1287 | 1 /* |
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
1287 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
1287 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
1287 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1287 | 20 * |
21 */ | |
2967 | 22 |
1287 | 23 /** |
24 * @file cabac.h | |
25 * Context Adaptive Binary Arithmetic Coder. | |
26 */ | |
27 | |
28 | |
3284
a224d9752912
don't force asserts in release builds. 2% faster h264.
lorenm
parents:
3036
diff
changeset
|
29 //#undef NDEBUG |
1287 | 30 #include <assert.h> |
4064 | 31 #ifdef ARCH_X86 |
32 #include "x86_cpu.h" | |
33 #endif | |
1287 | 34 |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
35 #define CABAC_BITS 16 |
2323 | 36 #define CABAC_MASK ((1<<CABAC_BITS)-1) |
3984 | 37 #define BRANCHLESS_CABAC_DECODER 1 |
4039 | 38 //#define ARCH_X86_DISABLED 1 |
2323 | 39 |
1287 | 40 typedef struct CABACContext{ |
41 int low; | |
42 int range; | |
43 int outstanding_count; | |
44 #ifdef STRICT_LIMITS | |
45 int symCount; | |
46 #endif | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
47 const uint8_t *bytestream_start; |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
48 const uint8_t *bytestream; |
2116 | 49 const uint8_t *bytestream_end; |
1287 | 50 PutBitContext pb; |
51 }CABACContext; | |
52 | |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
53 extern uint8_t ff_h264_mlps_state[4*64]; |
4039 | 54 extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
55 extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
56 extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
57 extern const uint8_t ff_h264_norm_shift[512]; |
2323 | 58 |
1287 | 59 |
60 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
61 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
62 void ff_init_cabac_states(CABACContext *c); |
1287 | 63 |
64 | |
65 static inline void put_cabac_bit(CABACContext *c, int b){ | |
2967 | 66 put_bits(&c->pb, 1, b); |
67 for(;c->outstanding_count; c->outstanding_count--){ | |
1287 | 68 put_bits(&c->pb, 1, 1-b); |
69 } | |
70 } | |
71 | |
72 static inline void renorm_cabac_encoder(CABACContext *c){ | |
73 while(c->range < 0x100){ | |
74 //FIXME optimize | |
75 if(c->low<0x100){ | |
76 put_cabac_bit(c, 0); | |
77 }else if(c->low<0x200){ | |
78 c->outstanding_count++; | |
79 c->low -= 0x100; | |
80 }else{ | |
81 put_cabac_bit(c, 1); | |
82 c->low -= 0x200; | |
83 } | |
2967 | 84 |
1287 | 85 c->range+= c->range; |
86 c->low += c->low; | |
87 } | |
88 } | |
89 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
90 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
91 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state]; |
2967 | 92 |
1287 | 93 if(bit == ((*state)&1)){ |
94 c->range -= RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
95 *state= ff_h264_mps_state[*state]; |
1287 | 96 }else{ |
97 c->low += c->range - RangeLPS; | |
98 c->range = RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
99 *state= ff_h264_lps_state[*state]; |
1287 | 100 } |
2967 | 101 |
1287 | 102 renorm_cabac_encoder(c); |
103 | |
104 #ifdef STRICT_LIMITS | |
105 c->symCount++; | |
106 #endif | |
107 } | |
108 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
109 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
1287 | 110 assert(c->range > RangeLPS); |
111 | |
112 if(!bit){ | |
113 c->range -= RangeLPS; | |
114 }else{ | |
115 c->low += c->range - RangeLPS; | |
116 c->range = RangeLPS; | |
117 } | |
118 | |
119 renorm_cabac_encoder(c); | |
120 | |
121 #ifdef STRICT_LIMITS | |
122 c->symCount++; | |
123 #endif | |
124 } | |
125 | |
1290 | 126 /** |
127 * @param bit 0 -> write zero bit, !=0 write one bit | |
128 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
129 static void put_cabac_bypass(CABACContext *c, int bit){ |
1287 | 130 c->low += c->low; |
131 | |
132 if(bit){ | |
133 c->low += c->range; | |
134 } | |
135 //FIXME optimize | |
136 if(c->low<0x200){ | |
137 put_cabac_bit(c, 0); | |
138 }else if(c->low<0x400){ | |
139 c->outstanding_count++; | |
140 c->low -= 0x200; | |
141 }else{ | |
142 put_cabac_bit(c, 1); | |
143 c->low -= 0x400; | |
144 } | |
2967 | 145 |
1287 | 146 #ifdef STRICT_LIMITS |
147 c->symCount++; | |
148 #endif | |
149 } | |
150 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
151 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
152 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
153 * @return the number of bytes written |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
154 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
155 static int put_cabac_terminate(CABACContext *c, int bit){ |
1287 | 156 c->range -= 2; |
157 | |
158 if(!bit){ | |
159 renorm_cabac_encoder(c); | |
160 }else{ | |
161 c->low += c->range; | |
162 c->range= 2; | |
2967 | 163 |
1287 | 164 renorm_cabac_encoder(c); |
165 | |
166 assert(c->low <= 0x1FF); | |
167 put_cabac_bit(c, c->low>>9); | |
168 put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
2967 | 169 |
1287 | 170 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
171 } | |
2967 | 172 |
1287 | 173 #ifdef STRICT_LIMITS |
174 c->symCount++; | |
175 #endif | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
176 |
1787 | 177 return (put_bits_count(&c->pb)+7)>>3; |
1287 | 178 } |
179 | |
1290 | 180 /** |
181 * put (truncated) unary binarization. | |
182 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
183 static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
1290 | 184 int i; |
2967 | 185 |
1290 | 186 assert(v <= max); |
2967 | 187 |
1290 | 188 #if 1 |
189 for(i=0; i<v; i++){ | |
190 put_cabac(c, state, 1); | |
191 if(i < max_index) state++; | |
192 } | |
193 if(truncated==0 || v<max) | |
194 put_cabac(c, state, 0); | |
195 #else | |
196 if(v <= max_index){ | |
197 for(i=0; i<v; i++){ | |
198 put_cabac(c, state+i, 1); | |
199 } | |
200 if(truncated==0 || v<max) | |
201 put_cabac(c, state+i, 0); | |
202 }else{ | |
203 for(i=0; i<=max_index; i++){ | |
204 put_cabac(c, state+i, 1); | |
205 } | |
206 for(; i<v; i++){ | |
207 put_cabac(c, state+max_index, 1); | |
208 } | |
209 if(truncated==0 || v<max) | |
210 put_cabac(c, state+max_index, 0); | |
211 } | |
212 #endif | |
213 } | |
214 | |
215 /** | |
216 * put unary exp golomb k-th order binarization. | |
217 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
218 static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
1290 | 219 int i; |
2967 | 220 |
1290 | 221 if(v==0) |
222 put_cabac(c, state, 0); | |
223 else{ | |
1298 | 224 const int sign= v < 0; |
2967 | 225 |
4001 | 226 if(is_signed) v= FFABS(v); |
2967 | 227 |
1290 | 228 if(v<max){ |
229 for(i=0; i<v; i++){ | |
230 put_cabac(c, state, 1); | |
231 if(i < max_index) state++; | |
232 } | |
233 | |
234 put_cabac(c, state, 0); | |
235 }else{ | |
236 int m= 1<<k; | |
237 | |
238 for(i=0; i<max; i++){ | |
239 put_cabac(c, state, 1); | |
240 if(i < max_index) state++; | |
241 } | |
242 | |
243 v -= max; | |
244 while(v >= m){ //FIXME optimize | |
245 put_cabac_bypass(c, 1); | |
246 v-= m; | |
247 m+= m; | |
248 } | |
249 put_cabac_bypass(c, 0); | |
250 while(m>>=1){ | |
251 put_cabac_bypass(c, v&m); | |
252 } | |
253 } | |
254 | |
255 if(is_signed) | |
256 put_cabac_bypass(c, sign); | |
257 } | |
258 } | |
259 | |
2323 | 260 static void refill(CABACContext *c){ |
261 #if CABAC_BITS == 16 | |
3946 | 262 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
2323 | 263 #else |
264 c->low+= c->bytestream[0]<<1; | |
265 #endif | |
266 c->low -= CABAC_MASK; | |
267 c->bytestream+= CABAC_BITS/8; | |
268 } | |
269 | |
270 static void refill2(CABACContext *c){ | |
271 int i, x; | |
272 | |
273 x= c->low ^ (c->low-1); | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
274 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; |
2323 | 275 |
276 x= -CABAC_MASK; | |
2967 | 277 |
2323 | 278 #if CABAC_BITS == 16 |
279 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
280 #else | |
281 x+= c->bytestream[0]<<1; | |
282 #endif | |
2967 | 283 |
2323 | 284 c->low += x<<i; |
285 c->bytestream+= CABAC_BITS/8; | |
286 } | |
287 | |
1287 | 288 static inline void renorm_cabac_decoder(CABACContext *c){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
289 while(c->range < 0x100){ |
1287 | 290 c->range+= c->range; |
291 c->low+= c->low; | |
2323 | 292 if(!(c->low & CABAC_MASK)) |
293 refill(c); | |
1287 | 294 } |
295 } | |
296 | |
2323 | 297 static inline void renorm_cabac_decoder_once(CABACContext *c){ |
3951 | 298 #ifdef ARCH_X86_DISABLED |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
299 int temp; |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
300 #if 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
301 //P3:683 athlon:475 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
302 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
303 "lea -0x100(%0), %2 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
304 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
305 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
306 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
307 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
308 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
309 #elif 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
310 //P3:680 athlon:474 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
311 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
312 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
313 "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
314 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
315 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
316 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
317 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
318 #elif 1 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
319 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
320 //P3:665 athlon:517 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
321 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
322 "lea -0x100(%0), %%eax \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
323 "cdq \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
324 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
325 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
326 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
327 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
328 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
329 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
330 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
331 #elif 0 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
332 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
333 //P3:673 athlon:509 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
334 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
335 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
336 "sbb %%edx, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
337 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
338 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
339 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
340 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
341 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
342 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
343 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
344 #else |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
345 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
346 //P3:677 athlon:511 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
347 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
348 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
349 "lea (%0, %0), %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
350 "lea (%1, %1), %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
351 "cmovb %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
352 "cmovb %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
353 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
354 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
355 #endif |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
356 #else |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
357 //P3:675 athlon:476 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
358 int shift= (uint32_t)(c->range - 0x100)>>31; |
3642 | 359 c->range<<= shift; |
360 c->low <<= shift; | |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
361 #endif |
2323 | 362 if(!(c->low & CABAC_MASK)) |
363 refill(c); | |
364 } | |
365 | |
4008
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
366 static int always_inline get_cabac_inline(CABACContext *c, uint8_t * const state){ |
3642 | 367 //FIXME gcc generates duplicate load/stores for c->low and c->range |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
368 #define LOW "0" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
369 #define RANGE "4" |
4064 | 370 #ifdef ARCH_X86_64 |
371 #define BYTESTART "16" | |
372 #define BYTE "24" | |
373 #define BYTEEND "32" | |
374 #else | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
375 #define BYTESTART "12" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
376 #define BYTE "16" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
377 #define BYTEEND "20" |
4064 | 378 #endif |
4113
4ce3923d5806
Reenable AMD64 optimizations for cabac accidentially disabled in r6852
reimar
parents:
4112
diff
changeset
|
379 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) |
4044
5ccdefd60f61
Fix PIC compilation, some defines were under #ifdef !PIC but used
diego
parents:
4043
diff
changeset
|
380 int bit; |
5ccdefd60f61
Fix PIC compilation, some defines were under #ifdef !PIC but used
diego
parents:
4043
diff
changeset
|
381 |
3984 | 382 #ifndef BRANCHLESS_CABAC_DECODER |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
383 asm volatile( |
4035 | 384 "movzbl (%1), %0 \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
385 "movl "RANGE "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
386 "movl "RANGE "(%2), %%edx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
387 "andl $0xC0, %%ebx \n\t" |
4035 | 388 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
389 "movl "LOW "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
390 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
391 "subl %%esi, %%edx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
392 "movl %%edx, %%ecx \n\t" |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
393 "shll $17, %%ecx \n\t" |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
394 "cmpl %%ecx, %%ebx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
395 " ja 1f \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
396 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
397 #if 1 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
398 //athlon:4067 P3:4110 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
399 "lea -0x100(%%edx), %%ecx \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
400 "shr $31, %%ecx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
401 "shl %%cl, %%edx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
402 "shl %%cl, %%ebx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
403 #else |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
404 //athlon:4057 P3:4130 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
405 "cmp $0x100, %%edx \n\t" //FIXME avoidable |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
406 "setb %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
407 "shl %%cl, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
408 "shl %%cl, %%ebx \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
409 #endif |
4035 | 410 "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
411 "movb %%cl, (%1) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
412 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
413 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
414 " jnz 2f \n\t" |
4064 | 415 "mov "BYTE "(%2), %%"REG_S" \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
416 "subl $0xFFFF, %%ebx \n\t" |
4064 | 417 "movzwl (%%"REG_S"), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
418 "bswap %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
419 "shrl $15, %%ecx \n\t" |
4064 | 420 "add $2, %%"REG_S" \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
421 "addl %%ecx, %%ebx \n\t" |
4064 | 422 "mov %%"REG_S", "BYTE "(%2) \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
423 "jmp 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
424 "1: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
425 //eax:state ebx:low, edx:range, esi:RangeLPS |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
426 "subl %%ecx, %%ebx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
427 "movl %%esi, %%edx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
428 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
429 "shll %%cl, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
430 "shll %%cl, %%edx \n\t" |
4035 | 431 "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
432 "movb %%cl, (%1) \n\t" |
4064 | 433 "add $1, %0 \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
434 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
435 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
436 |
4064 | 437 "mov "BYTE "(%2), %%"REG_c" \n\t" |
438 "movzwl (%%"REG_c"), %%esi \n\t" | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
439 "bswap %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
440 "shrl $15, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
441 "subl $0xFFFF, %%esi \n\t" |
4064 | 442 "add $2, %%"REG_c" \n\t" |
443 "mov %%"REG_c", "BYTE "(%2) \n\t" | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
444 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
445 "leal -1(%%ebx), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
446 "xorl %%ebx, %%ecx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
447 "shrl $15, %%ecx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
448 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3994
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
449 "neg %%ecx \n\t" |
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
450 "add $7, %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
451 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
452 "shll %%cl , %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
453 "addl %%esi, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
454 "2: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
455 "movl %%edx, "RANGE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
456 "movl %%ebx, "LOW "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
457 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
458 :"r"(state), "r"(c) |
4064 | 459 : "%"REG_c, "%ebx", "%edx", "%"REG_S, "memory" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
460 ); |
3982
af16271634c2
moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop
michael
parents:
3981
diff
changeset
|
461 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
462 #else /* BRANCHLESS_CABAC_DECODER */ |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
463 |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
464 |
4050
2c79a8281cb6
Protect code that uses CMOV instructions with HAVE_CMOV,
gpoirier
parents:
4049
diff
changeset
|
465 #if defined CMOV_IS_FAST |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
466 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
467 "mov "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
468 "shl $17 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
469 "cmp "low" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
470 "cmova %%ecx , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
471 "sbb %%ecx , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
472 "and %%ecx , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
473 "sub "tmp" , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
474 "xor %%ecx , "ret" \n\t" |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
475 #else /* CMOV_IS_FAST */ |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
476 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
477 "mov "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
478 "shl $17 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
479 "sub "low" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
480 "sar $31 , "tmp" \n\t" /*lps_mask*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
481 "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
482 "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
483 "add %%ecx , "range" \n\t" /*new range*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
484 "shl $17 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
485 "and "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
486 "sub %%ecx , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
487 "xor "tmp" , "ret" \n\t" |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
488 #endif /* CMOV_IS_FAST */ |
3975 | 489 |
490 | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
491 #define BRANCHLESS_GET_CABAC(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
492 "movzbl "statep" , "ret" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
493 "mov "range" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
494 "and $0xC0 , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
495 "movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
496 "sub "range" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
497 BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
498 "movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
499 "shl %%cl , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
500 "movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
501 "mov "tmpbyte" , "statep" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
502 "shl %%cl , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
503 "test "lowword" , "lowword" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
504 " jnz 1f \n\t"\ |
4064 | 505 "mov "BYTE"("cabac"), %%"REG_c" \n\t"\ |
506 "movzwl (%%"REG_c") , "tmp" \n\t"\ | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
507 "bswap "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
508 "shr $15 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
509 "sub $0xFFFF , "tmp" \n\t"\ |
4064 | 510 "add $2 , %%"REG_c" \n\t"\ |
511 "mov %%"REG_c" , "BYTE "("cabac") \n\t"\ | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
512 "lea -1("low") , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
513 "xor "low" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
514 "shr $15 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
515 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
516 "neg %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
517 "add $7 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
518 "shl %%cl , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
519 "add "tmp" , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
520 "1: \n\t" |
3975 | 521 |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
522 asm volatile( |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
523 "movl "RANGE "(%2), %%esi \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
524 "movl "LOW "(%2), %%ebx \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
525 BRANCHLESS_GET_CABAC("%0", "%2", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl") |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
526 "movl %%esi, "RANGE "(%2) \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
527 "movl %%ebx, "LOW "(%2) \n\t" |
3975 | 528 |
529 :"=&a"(bit) | |
530 :"r"(state), "r"(c) | |
4064 | 531 : "%"REG_c, "%ebx", "%edx", "%esi", "memory" |
3975 | 532 ); |
3981
9854f686ba79
move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster)
michael
parents:
3980
diff
changeset
|
533 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
534 #endif /* BRANCHLESS_CABAC_DECODER */ |
4113
4ce3923d5806
Reenable AMD64 optimizations for cabac accidentially disabled in r6852
reimar
parents:
4112
diff
changeset
|
535 #else /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */ |
3642 | 536 int s = *state; |
4039 | 537 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; |
2522
e25782262d7d
kill warnings patch by (M«©ns Rullg«©rd <mru inprovide com>)
michael
parents:
2323
diff
changeset
|
538 int bit, lps_mask attribute_unused; |
2967 | 539 |
1287 | 540 c->range -= RangeLPS; |
3984 | 541 #ifndef BRANCHLESS_CABAC_DECODER |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
542 if(c->low < (c->range<<17)){ |
3642 | 543 bit= s&1; |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
544 *state= ff_h264_mps_state[s]; |
2323 | 545 renorm_cabac_decoder_once(c); |
1287 | 546 }else{ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
547 bit= ff_h264_norm_shift[RangeLPS]; |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
548 c->low -= (c->range<<17); |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
549 *state= ff_h264_lps_state[s]; |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
550 c->range = RangeLPS<<bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
551 c->low <<= bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
552 bit= (s&1)^1; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
553 |
2323 | 554 if(!(c->low & 0xFFFF)){ |
555 refill2(c); | |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
556 } |
1287 | 557 } |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
558 #else /* BRANCHLESS_CABAC_DECODER */ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
559 lps_mask= ((c->range<<17) - c->low)>>31; |
2967 | 560 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
561 c->low -= (c->range<<17) & lps_mask; |
2323 | 562 c->range += (RangeLPS - c->range) & lps_mask; |
2967 | 563 |
3974 | 564 s^=lps_mask; |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
565 *state= (ff_h264_mlps_state+128)[s]; |
3974 | 566 bit= s&1; |
2967 | 567 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
568 lps_mask= ff_h264_norm_shift[c->range]; |
2323 | 569 c->range<<= lps_mask; |
570 c->low <<= lps_mask; | |
571 if(!(c->low & CABAC_MASK)) | |
572 refill2(c); | |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
573 #endif /* BRANCHLESS_CABAC_DECODER */ |
4113
4ce3923d5806
Reenable AMD64 optimizations for cabac accidentially disabled in r6852
reimar
parents:
4112
diff
changeset
|
574 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */ |
2967 | 575 return bit; |
1287 | 576 } |
577 | |
4008
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
578 static int __attribute((noinline)) get_cabac_noinline(CABACContext *c, uint8_t * const state){ |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
579 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
580 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
581 |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
582 static int get_cabac(CABACContext *c, uint8_t * const state){ |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
583 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
584 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
585 |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
586 static int get_cabac_bypass(CABACContext *c){ |
4040 | 587 #if 0 //not faster |
588 int bit; | |
589 asm volatile( | |
590 "movl "RANGE "(%1), %%ebx \n\t" | |
591 "movl "LOW "(%1), %%eax \n\t" | |
592 "shl $17, %%ebx \n\t" | |
593 "add %%eax, %%eax \n\t" | |
594 "sub %%ebx, %%eax \n\t" | |
595 "cdq \n\t" | |
596 "and %%edx, %%ebx \n\t" | |
597 "add %%ebx, %%eax \n\t" | |
598 "test %%ax, %%ax \n\t" | |
599 " jnz 1f \n\t" | |
4064 | 600 "movl "BYTE "(%1), %%"REG_b" \n\t" |
4040 | 601 "subl $0xFFFF, %%eax \n\t" |
4064 | 602 "movzwl (%%"REG_b"), %%ecx \n\t" |
4040 | 603 "bswap %%ecx \n\t" |
604 "shrl $15, %%ecx \n\t" | |
4064 | 605 "addl $2, %%"REG_b" \n\t" |
4040 | 606 "addl %%ecx, %%eax \n\t" |
4064 | 607 "movl %%"REG_b", "BYTE "(%1) \n\t" |
4040 | 608 "1: \n\t" |
609 "movl %%eax, "LOW "(%1) \n\t" | |
610 | |
611 :"=&d"(bit) | |
612 :"r"(c) | |
4064 | 613 : "%eax", "%"REG_b, "%ecx", "memory" |
4040 | 614 ); |
615 return bit+1; | |
616 #else | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
617 int range; |
1287 | 618 c->low += c->low; |
619 | |
2323 | 620 if(!(c->low & CABAC_MASK)) |
621 refill(c); | |
2967 | 622 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
623 range= c->range<<17; |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
624 if(c->low < range){ |
1287 | 625 return 0; |
626 }else{ | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
627 c->low -= range; |
1287 | 628 return 1; |
629 } | |
4040 | 630 #endif |
1287 | 631 } |
4040 | 632 |
633 | |
634 static always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ | |
635 #ifdef ARCH_X86 | |
636 asm volatile( | |
637 "movl "RANGE "(%1), %%ebx \n\t" | |
638 "movl "LOW "(%1), %%eax \n\t" | |
639 "shl $17, %%ebx \n\t" | |
640 "add %%eax, %%eax \n\t" | |
641 "sub %%ebx, %%eax \n\t" | |
642 "cdq \n\t" | |
643 "and %%edx, %%ebx \n\t" | |
644 "add %%ebx, %%eax \n\t" | |
645 "xor %%edx, %%ecx \n\t" | |
646 "sub %%edx, %%ecx \n\t" | |
647 "test %%ax, %%ax \n\t" | |
648 " jnz 1f \n\t" | |
4064 | 649 "mov "BYTE "(%1), %%"REG_b" \n\t" |
4040 | 650 "subl $0xFFFF, %%eax \n\t" |
4064 | 651 "movzwl (%%"REG_b"), %%edx \n\t" |
4040 | 652 "bswap %%edx \n\t" |
653 "shrl $15, %%edx \n\t" | |
4064 | 654 "add $2, %%"REG_b" \n\t" |
4040 | 655 "addl %%edx, %%eax \n\t" |
4064 | 656 "mov %%"REG_b", "BYTE "(%1) \n\t" |
4040 | 657 "1: \n\t" |
658 "movl %%eax, "LOW "(%1) \n\t" | |
659 | |
660 :"+c"(val) | |
661 :"r"(c) | |
4064 | 662 : "%eax", "%"REG_b, "%edx", "memory" |
4040 | 663 ); |
664 return val; | |
665 #else | |
666 int range, mask; | |
667 c->low += c->low; | |
668 | |
669 if(!(c->low & CABAC_MASK)) | |
670 refill(c); | |
671 | |
672 range= c->range<<17; | |
673 c->low -= range; | |
674 mask= c->low >> 31; | |
675 range &= mask; | |
676 c->low += range; | |
677 return (val^mask)-mask; | |
678 #endif | |
679 } | |
680 | |
4037
53be304c7f54
x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3
michael
parents:
4035
diff
changeset
|
681 //FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note: I'll kill you if you move my code away from under my fingers before I am finished with it!) |
53be304c7f54
x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3
michael
parents:
4035
diff
changeset
|
682 //FIXME use some macros to avoid duplicating get_cabac (can't be done yet as that would make optimization work hard) |
4113
4ce3923d5806
Reenable AMD64 optimizations for cabac accidentially disabled in r6852
reimar
parents:
4112
diff
changeset
|
683 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) |
4037
53be304c7f54
x86 asm version of the decode significance loop (not 8x8) of decode_residual() 5% faster decode_residual() on P3
michael
parents:
4035
diff
changeset
|
/**
 * x86 inline-asm scan over a run of per-position CABAC contexts.
 *
 * Starting at significant_coeff_ctx_base, one bin is decoded per context byte
 * with BRANCHLESS_GET_CABAC. When bit 0 of that bin is set, a second bin is
 * decoded with the context 61 bytes further on, and the current position
 * (offset from the initial significant_coeff_ctx_base) is appended to index[].
 * If bit 0 of the second bin is set the scan stops; otherwise it continues
 * with the next context. If the scan reaches base + max_coeff - 1 without
 * stopping, that final position is appended unconditionally.
 *
 * NOTE(review): presumably the first bin is significant_coeff_flag and the
 * second is last_significant_coeff_flag -- confirm against the caller.
 *
 * @param c                          decoder context; its RANGE and LOW fields are
 *                                   loaded into registers on entry and written back on exit
 * @param max_coeff                  number of positions; the loop visits max_coeff-1 contexts
 * @param significant_coeff_ctx_base first context byte of the run
 * @param index                      output array receiving the recorded positions
 * @return number of entries written to index[]
 */
static int decode_significance_x86(CABACContext *c, int max_coeff, uint8_t *significant_coeff_ctx_base, int *index){
    /* loop bound: the scan runs while the context pointer is below this address */
    void *end= significant_coeff_ctx_base + max_coeff - 1;
    /* negated (low 32 bits of the) start address; adding the current context
       pointer to it yields the position relative to the start */
    int minusstart= -(int)significant_coeff_ctx_base;
    /* 4 - (low 32 bits of the) index[] address; added to the address of the last
       written entry and shifted right by 2 this gives the entry count */
    int minusindex= 4-(int)index;
    int coeff_count;
    asm volatile(
        /* keep range in esi and low in ebx for the whole loop */
        "movl "RANGE "(%3), %%esi \n\t"
        "movl "LOW "(%3), %%ebx \n\t"

        "2: \n\t"

        /* first bin, context at (%1); result is tested in edx below, eax/al are
           handed to the macro as scratch */
        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        /* bit 0 clear: nothing to record at this position, go advance */
        "test $1, %%edx \n\t"
        " jz 3f \n\t"

        /* second bin, context 61 bytes after the current one */
        BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        /* *index = current context pointer - start (computed as minusstart + %1) */
        "mov %2, %%"REG_a" \n\t"
        "movl %4, %%ecx \n\t"
        "add %1, %%"REG_c" \n\t"
        "movl %%ecx, (%%"REG_a") \n\t"

        /* bit 0 of the second bin set: stop scanning */
        "test $1, %%edx \n\t"
        " jnz 4f \n\t"

        /* otherwise advance the index[] write pointer by one int */
        "add $4, %%"REG_a" \n\t"
        "mov %%"REG_a", %2 \n\t"

        "3: \n\t"
        /* next context; loop while below end (unsigned compare) */
        "add $1, %1 \n\t"
        "cmp %5, %1 \n\t"
        " jb 2b \n\t"
        /* ran off the end: record the final position unconditionally */
        "mov %2, %%"REG_a" \n\t"
        "movl %4, %%ecx \n\t"
        "add %1, %%"REG_c" \n\t"
        "movl %%ecx, (%%"REG_a") \n\t"
        "4: \n\t"
        /* eax addresses the last written entry: (eax + 4 - index) >> 2 = entry count */
        "add %6, %%eax \n\t"
        "shr $2, %%eax \n\t"

        /* write the cached range/low back into the context */
        "movl %%esi, "RANGE "(%3) \n\t"
        "movl %%ebx, "LOW "(%3) \n\t"
        :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\
        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\
        : "%"REG_c, "%ebx", "%edx", "%esi", "memory"\
    );
    return coeff_count;
}
4051 | 733 |
/**
 * x86 inline-asm scan over positions 0..62 using per-position context offset tables.
 *
 * For each position a bin is decoded with the context at
 * significant_coeff_ctx_base + sig_off[position]. When bit 0 of that bin is
 * set, a second bin is decoded with the context at
 * significant_coeff_ctx_base + 15 + last_coeff_flag_offset_8x8[position], and
 * the position is appended to index[]. If bit 0 of the second bin is set the
 * scan stops; otherwise it continues. If the position counter reaches 63
 * without stopping, 63 is appended unconditionally.
 *
 * NOTE(review): presumably the first bin is significant_coeff_flag and the
 * second is last_significant_coeff_flag for 8x8 blocks -- confirm against the caller.
 *
 * @param c                          decoder context; its RANGE and LOW fields are
 *                                   loaded into registers on entry and written back on exit
 * @param significant_coeff_ctx_base base address the table offsets are added to
 * @param index                      output array receiving the recorded positions
 * @param sig_off                    byte table indexed by position, giving the
 *                                   context offset for the first bin
 * @return number of entries written to index[]
 */
static int decode_significance_8x8_x86(CABACContext *c, uint8_t *significant_coeff_ctx_base, int *index, uint8_t *sig_off){
    /* 4 - (low 32 bits of the) index[] address; added to the address of the last
       written entry and shifted right by 2 this gives the entry count */
    int minusindex= 4-(int)index;
    int coeff_count;
    /* current position, kept in memory because edi is reused for table lookups */
    long last=0;
    asm volatile(
        /* keep range in esi and low in ebx for the whole loop */
        "movl "RANGE "(%3), %%esi \n\t"
        "movl "LOW "(%3), %%ebx \n\t"

        /* REG_D = position (0 on entry) */
        "mov %1, %%"REG_D" \n\t"
        "2: \n\t"

        /* REG_D = significant_coeff_ctx_base + sig_off[position] */
        "mov %6, %%"REG_a" \n\t"
        "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t"
        "add %5, %%"REG_D" \n\t"

        /* first bin; result is tested in edx below, eax/al are scratch for the macro */
        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        /* reload the position; bit 0 clear means nothing to record here */
        "mov %1, %%edi \n\t"
        "test $1, %%edx \n\t"
        " jz 3f \n\t"

        /* REG_D = significant_coeff_ctx_base + last_coeff_flag_offset_8x8[position];
           the extra 15 is applied as the displacement of the context operand */
        "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
        "add %5, %%"REG_D" \n\t"

        /* second bin */
        BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        /* *index = position */
        "mov %2, %%"REG_a" \n\t"
        "mov %1, %%edi \n\t"
        "movl %%edi, (%%"REG_a") \n\t"

        /* bit 0 of the second bin set: stop scanning */
        "test $1, %%edx \n\t"
        " jnz 4f \n\t"

        /* otherwise advance the index[] write pointer by one int */
        "add $4, %%"REG_a" \n\t"
        "mov %%"REG_a", %2 \n\t"

        "3: \n\t"
        /* position++, stored back to 'last'; loop while position < 63 */
        "addl $1, %%edi \n\t"
        "mov %%edi, %1 \n\t"
        "cmpl $63, %%edi \n\t"
        " jb 2b \n\t"
        /* ran off the end: record position 63 unconditionally */
        "mov %2, %%"REG_a" \n\t"
        "movl %%edi, (%%"REG_a") \n\t"
        "4: \n\t"
        /* eax addresses the last written entry: (eax + 4 - index) >> 2 = entry count */
        "addl %4, %%eax \n\t"
        "shr $2, %%eax \n\t"

        /* write the cached range/low back into the context */
        "movl %%esi, "RANGE "(%3) \n\t"
        "movl %%ebx, "LOW "(%3) \n\t"
        :"=&a"(coeff_count),"+m"(last), "+m"(index)\
        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)\
        : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"\
    );
    return coeff_count;
}
4113
4ce3923d5806
Reenable AMD64 optimizations for cabac accidentially disabled in r6852
reimar
parents:
4112
diff
changeset
|
789 #endif /* defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) */ |
1287 | 790 |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
791 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
792 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
793 * @return the number of bytes read or 0 if no end |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
794 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
795 static int get_cabac_terminate(CABACContext *c){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
796 c->range -= 2; |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
797 if(c->low < c->range<<17){ |
2323 | 798 renorm_cabac_decoder_once(c); |
1287 | 799 return 0; |
800 }else{ | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
801 return c->bytestream - c->bytestream_start; |
2967 | 802 } |
1287 | 803 } |
804 | |
1290 | 805 /** |
806 * get (truncated) unnary binarization. | |
807 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
808 static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
1290 | 809 int i; |
2967 | 810 |
811 for(i=0; i<max; i++){ | |
1290 | 812 if(get_cabac(c, state)==0) |
813 return i; | |
2967 | 814 |
1290 | 815 if(i< max_index) state++; |
816 } | |
817 | |
818 return truncated ? max : -1; | |
819 } | |
820 | |
821 /** | |
822 * get unary exp golomb k-th order binarization. | |
823 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
824 static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
1290 | 825 int i, v; |
826 int m= 1<<k; | |
2967 | 827 |
828 if(get_cabac(c, state)==0) | |
1290 | 829 return 0; |
2967 | 830 |
1290 | 831 if(0 < max_index) state++; |
2967 | 832 |
833 for(i=1; i<max; i++){ | |
1290 | 834 if(get_cabac(c, state)==0){ |
835 if(is_signed && get_cabac_bypass(c)){ | |
836 return -i; | |
837 }else | |
838 return i; | |
839 } | |
840 | |
841 if(i < max_index) state++; | |
842 } | |
2967 | 843 |
1290 | 844 while(get_cabac_bypass(c)){ |
845 i+= m; | |
846 m+= m; | |
847 } | |
2967 | 848 |
1290 | 849 v=0; |
850 while(m>>=1){ | |
851 v+= v + get_cabac_bypass(c); | |
852 } | |
853 i += v; | |
854 | |
855 if(is_signed && get_cabac_bypass(c)){ | |
856 return -i; | |
857 }else | |
858 return i; | |
859 } |