Mercurial > libavcodec.hg
annotate dnxhdenc.c @ 10960:10759fd39860 libavcodec
Gcc idiocy fixes related to filter_mb_edge*.
Change order of operands as gcc uses a hardcoded register per operand it seems
even for static functions
thus reducing unneeded moves (now functions try to pass the same argument in
the same spot).
Change signed int to unsigned int for array indexes as signed requires sign
extension while unsigned is free.
move the +52 up and merge it where it will end as a lea instruction, gcc always
splits the 52 out there turning the free +52 into an expensive one otherwise.
The changed code becomes a little faster.
author | michael |
---|---|
date | Fri, 22 Jan 2010 01:59:17 +0000 |
parents | 5a298a61c6cc |
children | 34a65026fa06 |
rev | line source |
---|---|
5790 | 1 /* |
2 * VC3/DNxHD encoder | |
3 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com> | |
4 * | |
5 * VC-3 encoder funded by the British Broadcasting Corporation | |
6 * | |
7 * This file is part of FFmpeg. | |
8 * | |
9 * FFmpeg is free software; you can redistribute it and/or | |
10 * modify it under the terms of the GNU Lesser General Public | |
11 * License as published by the Free Software Foundation; either | |
12 * version 2.1 of the License, or (at your option) any later version. | |
13 * | |
14 * FFmpeg is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 * Lesser General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU Lesser General Public | |
20 * License along with FFmpeg; if not, write to the Free Software | |
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 */ | |
23 | |
24 //#define DEBUG | |
25 #define RC_VARIANCE 1 // use variance or ssd for fast rc | |
26 | |
27 #include "avcodec.h" | |
28 #include "dsputil.h" | |
29 #include "mpegvideo.h" | |
8294 | 30 #include "dnxhdenc.h" |
5790 | 31 |
32 int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow); | |
33 | |
34 #define LAMBDA_FRAC_BITS 10 | |
35 | |
8302
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
36 static av_always_inline void dnxhd_get_pixels_8x4(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
37 { |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
38 int i; |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
39 for (i = 0; i < 4; i++) { |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
40 block[0] = pixels[0]; block[1] = pixels[1]; |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
41 block[2] = pixels[2]; block[3] = pixels[3]; |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
42 block[4] = pixels[4]; block[5] = pixels[5]; |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
43 block[6] = pixels[6]; block[7] = pixels[7]; |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
44 pixels += line_size; |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
45 block += 8; |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
46 } |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
47 memcpy(block , block- 8, sizeof(*block)*8); |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
48 memcpy(block+ 8, block-16, sizeof(*block)*8); |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
49 memcpy(block+16, block-24, sizeof(*block)*8); |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
50 memcpy(block+24, block-32, sizeof(*block)*8); |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
51 } |
f54976d6a8a5
Move get_pixels_8x4 before init func to avoid useless forward declaration.
bcoudurier
parents:
8301
diff
changeset
|
52 |
5790 | 53 static int dnxhd_init_vlc(DNXHDEncContext *ctx) |
54 { | |
6978 | 55 int i, j, level, run; |
56 int max_level = 1<<(ctx->cid_table->bit_depth+2); | |
5790 | 57 |
10137
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
58 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_codes, max_level*4*sizeof(*ctx->vlc_codes), fail); |
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
59 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_bits , max_level*4*sizeof(*ctx->vlc_bits ), fail); |
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
60 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes, 63*2 , fail); |
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
61 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits , 63 , fail); |
6978 | 62 |
6981 | 63 ctx->vlc_codes += max_level*2; |
64 ctx->vlc_bits += max_level*2; | |
6978 | 65 for (level = -max_level; level < max_level; level++) { |
66 for (run = 0; run < 2; run++) { | |
67 int index = (level<<1)|run; | |
68 int sign, offset = 0, alevel = level; | |
5790 | 69 |
6978 | 70 MASK_ABS(sign, alevel); |
71 if (alevel > 64) { | |
72 offset = (alevel-1)>>6; | |
73 alevel -= offset<<6; | |
74 } | |
75 for (j = 0; j < 257; j++) { | |
76 if (ctx->cid_table->ac_level[j] == alevel && | |
77 (!offset || (ctx->cid_table->ac_index_flag[j] && offset)) && | |
78 (!run || (ctx->cid_table->ac_run_flag [j] && run))) { | |
6981 | 79 assert(!ctx->vlc_codes[index]); |
6978 | 80 if (alevel) { |
6981 | 81 ctx->vlc_codes[index] = (ctx->cid_table->ac_codes[j]<<1)|(sign&1); |
82 ctx->vlc_bits [index] = ctx->cid_table->ac_bits[j]+1; | |
6978 | 83 } else { |
6981 | 84 ctx->vlc_codes[index] = ctx->cid_table->ac_codes[j]; |
85 ctx->vlc_bits [index] = ctx->cid_table->ac_bits [j]; | |
6978 | 86 } |
87 break; | |
88 } | |
89 } | |
90 assert(!alevel || j < 257); | |
91 if (offset) { | |
6981 | 92 ctx->vlc_codes[index] = (ctx->vlc_codes[index]<<ctx->cid_table->index_bits)|offset; |
93 ctx->vlc_bits [index]+= ctx->cid_table->index_bits; | |
6978 | 94 } |
95 } | |
5790 | 96 } |
97 for (i = 0; i < 62; i++) { | |
98 int run = ctx->cid_table->run[i]; | |
99 assert(run < 63); | |
6981 | 100 ctx->run_codes[run] = ctx->cid_table->run_codes[i]; |
101 ctx->run_bits [run] = ctx->cid_table->run_bits[i]; | |
5790 | 102 } |
103 return 0; | |
104 fail: | |
105 return -1; | |
106 } | |
107 | |
108 static int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias) | |
109 { | |
110 // init first elem to 1 to avoid div by 0 in convert_matrix | |
111 uint16_t weight_matrix[64] = {1,}; // convert_matrix needs uint16_t* | |
112 int qscale, i; | |
113 | |
10137
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
114 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l, (ctx->m.avctx->qmax+1) * 64 * sizeof(int) , fail); |
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
115 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c, (ctx->m.avctx->qmax+1) * 64 * sizeof(int) , fail); |
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
116 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail); |
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
117 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail); |
5790 | 118 |
119 for (i = 1; i < 64; i++) { | |
120 int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]]; | |
5795 | 121 weight_matrix[j] = ctx->cid_table->luma_weight[i]; |
5790 | 122 } |
123 ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix, | |
124 ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1); | |
125 for (i = 1; i < 64; i++) { | |
126 int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]]; | |
5795 | 127 weight_matrix[j] = ctx->cid_table->chroma_weight[i]; |
5790 | 128 } |
129 ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix, | |
130 ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1); | |
131 for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) { | |
132 for (i = 0; i < 64; i++) { | |
133 ctx->qmatrix_l [qscale] [i] <<= 2; ctx->qmatrix_c [qscale] [i] <<= 2; | |
134 ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2; | |
135 ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2; | |
136 } | |
137 } | |
138 return 0; | |
139 fail: | |
140 return -1; | |
141 } | |
142 | |
143 static int dnxhd_init_rc(DNXHDEncContext *ctx) | |
144 { | |
10137
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
145 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_rc, 8160*ctx->m.avctx->qmax*sizeof(RCEntry), fail); |
5790 | 146 if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD) |
10137
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
147 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_cmp, ctx->m.mb_num*sizeof(RCCMPEntry), fail); |
5790 | 148 |
149 ctx->frame_bits = (ctx->cid_table->coding_unit_size - 640 - 4) * 8; | |
150 ctx->qscale = 1; | |
151 ctx->lambda = 2<<LAMBDA_FRAC_BITS; // qscale 2 | |
152 return 0; | |
153 fail: | |
154 return -1; | |
155 } | |
156 | |
157 static int dnxhd_encode_init(AVCodecContext *avctx) | |
158 { | |
159 DNXHDEncContext *ctx = avctx->priv_data; | |
160 int i, index; | |
161 | |
6041
bb4b486c6775
add bitrate helper to choose all dnxhd variants
bcoudurier
parents:
5972
diff
changeset
|
162 ctx->cid = ff_dnxhd_find_cid(avctx); |
5971 | 163 if (!ctx->cid || avctx->pix_fmt != PIX_FMT_YUV422P) { |
5790 | 164 av_log(avctx, AV_LOG_ERROR, "video parameters incompatible with DNxHD\n"); |
165 return -1; | |
166 } | |
6041
bb4b486c6775
add bitrate helper to choose all dnxhd variants
bcoudurier
parents:
5972
diff
changeset
|
167 av_log(avctx, AV_LOG_DEBUG, "cid %d\n", ctx->cid); |
5790 | 168 |
169 index = ff_dnxhd_get_cid_table(ctx->cid); | |
170 ctx->cid_table = &ff_dnxhd_cid_table[index]; | |
171 | |
172 ctx->m.avctx = avctx; | |
173 ctx->m.mb_intra = 1; | |
174 ctx->m.h263_aic = 1; | |
175 | |
8303 | 176 ctx->get_pixels_8x4_sym = dnxhd_get_pixels_8x4; |
177 | |
5790 | 178 dsputil_init(&ctx->m.dsp, avctx); |
179 ff_dct_common_init(&ctx->m); | |
8590 | 180 #if HAVE_MMX |
8303 | 181 ff_dnxhd_init_mmx(ctx); |
182 #endif | |
5790 | 183 if (!ctx->m.dct_quantize) |
184 ctx->m.dct_quantize = dct_quantize_c; | |
185 | |
186 ctx->m.mb_height = (avctx->height + 15) / 16; | |
187 ctx->m.mb_width = (avctx->width + 15) / 16; | |
188 | |
189 if (avctx->flags & CODEC_FLAG_INTERLACED_DCT) { | |
190 ctx->interlaced = 1; | |
191 ctx->m.mb_height /= 2; | |
192 } | |
193 | |
194 ctx->m.mb_num = ctx->m.mb_height * ctx->m.mb_width; | |
195 | |
196 if (avctx->intra_quant_bias != FF_DEFAULT_QUANT_BIAS) | |
197 ctx->m.intra_quant_bias = avctx->intra_quant_bias; | |
198 if (dnxhd_init_qmat(ctx, ctx->m.intra_quant_bias, 0) < 0) // XXX tune lbias/cbias | |
199 return -1; | |
200 | |
201 if (dnxhd_init_vlc(ctx) < 0) | |
202 return -1; | |
203 if (dnxhd_init_rc(ctx) < 0) | |
204 return -1; | |
205 | |
10137
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
206 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail); |
10387 | 207 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail); |
10137
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
208 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits, ctx->m.mb_num *sizeof(uint16_t), fail); |
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
209 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale, ctx->m.mb_num *sizeof(uint8_t) , fail); |
5790 | 210 |
211 ctx->frame.key_frame = 1; | |
212 ctx->frame.pict_type = FF_I_TYPE; | |
213 ctx->m.avctx->coded_frame = &ctx->frame; | |
214 | |
10387 | 215 if (avctx->thread_count > MAX_THREADS) { |
5790 | 216 av_log(avctx, AV_LOG_ERROR, "too many threads\n"); |
217 return -1; | |
218 } | |
219 | |
220 ctx->thread[0] = ctx; | |
221 for (i = 1; i < avctx->thread_count; i++) { | |
222 ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext)); | |
223 memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext)); | |
224 } | |
225 | |
226 return 0; | |
10137
9a670cfd1941
Rename CHECKED_ALLOC(Z) to FF_ALLOC(Z)_OR_GOTO and add context and label
ramiro
parents:
9618
diff
changeset
|
227 fail: //for FF_ALLOCZ_OR_GOTO |
5790 | 228 return -1; |
229 } | |
230 | |
231 static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf) | |
232 { | |
233 DNXHDEncContext *ctx = avctx->priv_data; | |
234 const uint8_t header_prefix[5] = { 0x00,0x00,0x02,0x80,0x01 }; | |
235 | |
10186 | 236 memset(buf, 0, 640); |
237 | |
5790 | 238 memcpy(buf, header_prefix, 5); |
239 buf[5] = ctx->interlaced ? ctx->cur_field+2 : 0x01; | |
240 buf[6] = 0x80; // crc flag off | |
241 buf[7] = 0xa0; // reserved | |
242 AV_WB16(buf + 0x18, avctx->height); // ALPF | |
243 AV_WB16(buf + 0x1a, avctx->width); // SPL | |
244 AV_WB16(buf + 0x1d, avctx->height); // NAL | |
245 | |
246 buf[0x21] = 0x38; // FIXME 8 bit per comp | |
247 buf[0x22] = 0x88 + (ctx->frame.interlaced_frame<<2); | |
248 AV_WB32(buf + 0x28, ctx->cid); // CID | |
249 buf[0x2c] = ctx->interlaced ? 0 : 0x80; | |
250 | |
251 buf[0x5f] = 0x01; // UDL | |
252 | |
253 buf[0x167] = 0x02; // reserved | |
254 AV_WB16(buf + 0x16a, ctx->m.mb_height * 4 + 4); // MSIPS | |
255 buf[0x16d] = ctx->m.mb_height; // Ns | |
256 buf[0x16f] = 0x10; // reserved | |
257 | |
258 ctx->msip = buf + 0x170; | |
259 return 0; | |
260 } | |
261 | |
262 static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff) | |
263 { | |
264 int nbits; | |
265 if (diff < 0) { | |
266 nbits = av_log2_16bit(-2*diff); | |
267 diff--; | |
268 } else { | |
269 nbits = av_log2_16bit(2*diff); | |
270 } | |
271 put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits, | |
272 (ctx->cid_table->dc_codes[nbits]<<nbits) + (diff & ((1 << nbits) - 1))); | |
273 } | |
274 | |
275 static av_always_inline void dnxhd_encode_block(DNXHDEncContext *ctx, DCTELEM *block, int last_index, int n) | |
276 { | |
277 int last_non_zero = 0; | |
278 int slevel, i, j; | |
279 | |
280 dnxhd_encode_dc(ctx, block[0] - ctx->m.last_dc[n]); | |
281 ctx->m.last_dc[n] = block[0]; | |
282 | |
283 for (i = 1; i <= last_index; i++) { | |
284 j = ctx->m.intra_scantable.permutated[i]; | |
285 slevel = block[j]; | |
286 if (slevel) { | |
287 int run_level = i - last_non_zero - 1; | |
6978 | 288 int rlevel = (slevel<<1)|!!run_level; |
6981 | 289 put_bits(&ctx->m.pb, ctx->vlc_bits[rlevel], ctx->vlc_codes[rlevel]); |
5790 | 290 if (run_level) |
6981 | 291 put_bits(&ctx->m.pb, ctx->run_bits[run_level], ctx->run_codes[run_level]); |
5790 | 292 last_non_zero = i; |
293 } | |
294 } | |
6981 | 295 put_bits(&ctx->m.pb, ctx->vlc_bits[0], ctx->vlc_codes[0]); // EOB |
5790 | 296 } |
297 | |
298 static av_always_inline void dnxhd_unquantize_c(DNXHDEncContext *ctx, DCTELEM *block, int n, int qscale, int last_index) | |
299 { | |
5795 | 300 const uint8_t *weight_matrix; |
5790 | 301 int level; |
302 int i; | |
303 | |
5795 | 304 weight_matrix = (n&2) ? ctx->cid_table->chroma_weight : ctx->cid_table->luma_weight; |
5790 | 305 |
306 for (i = 1; i <= last_index; i++) { | |
307 int j = ctx->m.intra_scantable.permutated[i]; | |
308 level = block[j]; | |
309 if (level) { | |
310 if (level < 0) { | |
5795 | 311 level = (1-2*level) * qscale * weight_matrix[i]; |
312 if (weight_matrix[i] != 32) | |
5790 | 313 level += 32; |
314 level >>= 6; | |
315 level = -level; | |
316 } else { | |
5795 | 317 level = (2*level+1) * qscale * weight_matrix[i]; |
318 if (weight_matrix[i] != 32) | |
5790 | 319 level += 32; |
320 level >>= 6; | |
321 } | |
322 block[j] = level; | |
323 } | |
324 } | |
325 } | |
326 | |
327 static av_always_inline int dnxhd_ssd_block(DCTELEM *qblock, DCTELEM *block) | |
328 { | |
329 int score = 0; | |
330 int i; | |
331 for (i = 0; i < 64; i++) | |
332 score += (block[i]-qblock[i])*(block[i]-qblock[i]); | |
333 return score; | |
334 } | |
335 | |
336 static av_always_inline int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, DCTELEM *block, int last_index) | |
337 { | |
338 int last_non_zero = 0; | |
339 int bits = 0; | |
340 int i, j, level; | |
341 for (i = 1; i <= last_index; i++) { | |
342 j = ctx->m.intra_scantable.permutated[i]; | |
343 level = block[j]; | |
344 if (level) { | |
345 int run_level = i - last_non_zero - 1; | |
6981 | 346 bits += ctx->vlc_bits[(level<<1)|!!run_level]+ctx->run_bits[run_level]; |
5790 | 347 last_non_zero = i; |
348 } | |
349 } | |
350 return bits; | |
351 } | |
352 | |
353 static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y) | |
354 { | |
355 const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize) + (mb_x << 4); | |
356 const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3); | |
357 const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3); | |
358 DSPContext *dsp = &ctx->m.dsp; | |
359 | |
360 dsp->get_pixels(ctx->blocks[0], ptr_y , ctx->m.linesize); | |
361 dsp->get_pixels(ctx->blocks[1], ptr_y + 8, ctx->m.linesize); | |
362 dsp->get_pixels(ctx->blocks[2], ptr_u , ctx->m.uvlinesize); | |
363 dsp->get_pixels(ctx->blocks[3], ptr_v , ctx->m.uvlinesize); | |
364 | |
5971 | 365 if (mb_y+1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) { |
5790 | 366 if (ctx->interlaced) { |
8303 | 367 ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset , ctx->m.linesize); |
368 ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize); | |
369 ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset , ctx->m.uvlinesize); | |
370 ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset , ctx->m.uvlinesize); | |
8292 | 371 } else { |
372 dsp->clear_block(ctx->blocks[4]); dsp->clear_block(ctx->blocks[5]); | |
373 dsp->clear_block(ctx->blocks[6]); dsp->clear_block(ctx->blocks[7]); | |
374 } | |
5790 | 375 } else { |
376 dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset , ctx->m.linesize); | |
377 dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize); | |
378 dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset , ctx->m.uvlinesize); | |
379 dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset , ctx->m.uvlinesize); | |
380 } | |
381 } | |
382 | |
383 static av_always_inline int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i) | |
384 { | |
385 if (i&2) { | |
386 ctx->m.q_intra_matrix16 = ctx->qmatrix_c16; | |
387 ctx->m.q_intra_matrix = ctx->qmatrix_c; | |
388 return 1 + (i&1); | |
389 } else { | |
390 ctx->m.q_intra_matrix16 = ctx->qmatrix_l16; | |
391 ctx->m.q_intra_matrix = ctx->qmatrix_l; | |
392 return 0; | |
393 } | |
394 } | |
395 | |
10387 | 396 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) |
5790 | 397 { |
10387 | 398 DNXHDEncContext *ctx = avctx->priv_data; |
399 int mb_y = jobnr, mb_x; | |
400 int qscale = ctx->qscale; | |
401 ctx = ctx->thread[threadnr]; | |
5790 | 402 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
403 ctx->m.last_dc[0] = |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
404 ctx->m.last_dc[1] = |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
405 ctx->m.last_dc[2] = 1024; |
5790 | 406 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
407 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
408 unsigned mb = mb_y * ctx->m.mb_width + mb_x; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
409 int ssd = 0; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
410 int ac_bits = 0; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
411 int dc_bits = 0; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
412 int i; |
5790 | 413 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
414 dnxhd_get_blocks(ctx, mb_x, mb_y); |
5790 | 415 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
416 for (i = 0; i < 8; i++) { |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
417 DECLARE_ALIGNED_16(DCTELEM, block[64]); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
418 DCTELEM *src_block = ctx->blocks[i]; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
419 int overflow, nbits, diff, last_index; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
420 int n = dnxhd_switch_matrix(ctx, i); |
5790 | 421 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
422 memcpy(block, src_block, sizeof(block)); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
423 last_index = ctx->m.dct_quantize((MpegEncContext*)ctx, block, i, qscale, &overflow); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
424 ac_bits += dnxhd_calc_ac_bits(ctx, block, last_index); |
5790 | 425 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
426 diff = block[0] - ctx->m.last_dc[n]; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
427 if (diff < 0) nbits = av_log2_16bit(-2*diff); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
428 else nbits = av_log2_16bit( 2*diff); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
429 dc_bits += ctx->cid_table->dc_bits[nbits] + nbits; |
5790 | 430 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
431 ctx->m.last_dc[n] = block[0]; |
5790 | 432 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
433 if (avctx->mb_decision == FF_MB_DECISION_RD || !RC_VARIANCE) { |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
434 dnxhd_unquantize_c(ctx, block, i, qscale, last_index); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
435 ctx->m.dsp.idct(block); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
436 ssd += dnxhd_ssd_block(block, src_block); |
5790 | 437 } |
438 } | |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
439 ctx->mb_rc[qscale][mb].ssd = ssd; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
440 ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0]; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
441 } |
5790 | 442 return 0; |
443 } | |
444 | |
10387 | 445 static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) |
5790 | 446 { |
10387 | 447 DNXHDEncContext *ctx = avctx->priv_data; |
448 int mb_y = jobnr, mb_x; | |
449 ctx = ctx->thread[threadnr]; | |
450 init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]); | |
5790 | 451 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
452 ctx->m.last_dc[0] = |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
453 ctx->m.last_dc[1] = |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
454 ctx->m.last_dc[2] = 1024; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
455 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
456 unsigned mb = mb_y * ctx->m.mb_width + mb_x; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
457 int qscale = ctx->mb_qscale[mb]; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
458 int i; |
5790 | 459 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
460 put_bits(&ctx->m.pb, 12, qscale<<1); |
5790 | 461 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
462 dnxhd_get_blocks(ctx, mb_x, mb_y); |
5790 | 463 |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
464 for (i = 0; i < 8; i++) { |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
465 DCTELEM *block = ctx->blocks[i]; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
466 int last_index, overflow; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
467 int n = dnxhd_switch_matrix(ctx, i); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
468 last_index = ctx->m.dct_quantize((MpegEncContext*)ctx, block, i, qscale, &overflow); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
469 //START_TIMER; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
470 dnxhd_encode_block(ctx, block, last_index, n); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
471 //STOP_TIMER("encode_block"); |
5790 | 472 } |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
473 } |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
474 if (put_bits_count(&ctx->m.pb)&31) |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
475 put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0); |
5790 | 476 flush_put_bits(&ctx->m.pb); |
477 return 0; | |
478 } | |
479 | |
10387 | 480 static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx) |
5790 | 481 { |
482 int mb_y, mb_x; | |
10387 | 483 int offset = 0; |
484 for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) { | |
485 int thread_size; | |
486 ctx->slice_offs[mb_y] = offset; | |
5790 | 487 ctx->slice_size[mb_y] = 0; |
488 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { | |
489 unsigned mb = mb_y * ctx->m.mb_width + mb_x; | |
490 ctx->slice_size[mb_y] += ctx->mb_bits[mb]; | |
491 } | |
492 ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31; | |
493 ctx->slice_size[mb_y] >>= 3; | |
10387 | 494 thread_size = ctx->slice_size[mb_y]; |
5790 | 495 offset += thread_size; |
496 } | |
497 } | |
498 | |
10387 | 499 static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) |
5790 | 500 { |
10387 | 501 DNXHDEncContext *ctx = avctx->priv_data; |
502 int mb_y = jobnr, mb_x; | |
503 ctx = ctx->thread[threadnr]; | |
10388
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
504 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
505 unsigned mb = mb_y * ctx->m.mb_width + mb_x; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
506 uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
507 int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize); |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
508 int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
509 ctx->mb_cmp[mb].value = varc; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
510 ctx->mb_cmp[mb].mb = mb; |
08e50bcdcbf1
Reindent after removing the outer for loops in the execute2 patch
reimar
parents:
10387
diff
changeset
|
511 } |
5790 | 512 return 0; |
513 } | |
514 | |
515 static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx) | |
516 { | |
5802 | 517 int lambda, up_step, down_step; |
518 int last_lower = INT_MAX, last_higher = 0; | |
5790 | 519 int x, y, q; |
520 | |
521 for (q = 1; q < avctx->qmax; q++) { | |
522 ctx->qscale = q; | |
10387 | 523 avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height); |
5790 | 524 } |
5802 | 525 up_step = down_step = 2<<LAMBDA_FRAC_BITS; |
5790 | 526 lambda = ctx->lambda; |
527 | |
528 for (;;) { | |
529 int bits = 0; | |
530 int end = 0; | |
5802 | 531 if (lambda == last_higher) { |
532 lambda++; | |
5790 | 533 end = 1; // need to set final qscales/bits |
534 } | |
535 for (y = 0; y < ctx->m.mb_height; y++) { | |
536 for (x = 0; x < ctx->m.mb_width; x++) { | |
537 unsigned min = UINT_MAX; | |
538 int qscale = 1; | |
539 int mb = y*ctx->m.mb_width+x; | |
540 for (q = 1; q < avctx->qmax; q++) { | |
541 unsigned score = ctx->mb_rc[q][mb].bits*lambda+(ctx->mb_rc[q][mb].ssd<<LAMBDA_FRAC_BITS); | |
542 if (score < min) { | |
543 min = score; | |
544 qscale = q; | |
545 } | |
546 } | |
547 bits += ctx->mb_rc[qscale][mb].bits; | |
548 ctx->mb_qscale[mb] = qscale; | |
549 ctx->mb_bits[mb] = ctx->mb_rc[qscale][mb].bits; | |
550 } | |
551 bits = (bits+31)&~31; // padding | |
552 if (bits > ctx->frame_bits) | |
553 break; | |
554 } | |
5802 | 555 //dprintf(ctx->m.avctx, "lambda %d, up %u, down %u, bits %d, frame %d\n", |
556 // lambda, last_higher, last_lower, bits, ctx->frame_bits); | |
5790 | 557 if (end) { |
558 if (bits > ctx->frame_bits) | |
559 return -1; | |
560 break; | |
561 } | |
562 if (bits < ctx->frame_bits) { | |
5802 | 563 last_lower = FFMIN(lambda, last_lower); |
564 if (last_higher != 0) | |
565 lambda = (lambda+last_higher)>>1; | |
566 else | |
567 lambda -= down_step; | |
568 down_step *= 5; // XXX tune ? | |
569 up_step = 1<<LAMBDA_FRAC_BITS; | |
570 lambda = FFMAX(1, lambda); | |
571 if (lambda == last_lower) | |
572 break; | |
5790 | 573 } else { |
5802 | 574 last_higher = FFMAX(lambda, last_higher); |
575 if (last_lower != INT_MAX) | |
576 lambda = (lambda+last_lower)>>1; | |
10542
5a298a61c6cc
avoid integer overflow in dnxhd encoder, fixes #1557
bcoudurier
parents:
10541
diff
changeset
|
577 else if ((int64_t)lambda + up_step > INT_MAX) |
5a298a61c6cc
avoid integer overflow in dnxhd encoder, fixes #1557
bcoudurier
parents:
10541
diff
changeset
|
578 return -1; |
5802 | 579 else |
580 lambda += up_step; | |
10542
5a298a61c6cc
avoid integer overflow in dnxhd encoder, fixes #1557
bcoudurier
parents:
10541
diff
changeset
|
581 up_step = FFMIN((int64_t)up_step*5, INT_MAX); |
5802 | 582 down_step = 1<<LAMBDA_FRAC_BITS; |
5790 | 583 } |
584 } | |
5802 | 585 //dprintf(ctx->m.avctx, "out lambda %d\n", lambda); |
5790 | 586 ctx->lambda = lambda; |
587 return 0; | |
588 } | |
589 | |
590 static int dnxhd_find_qscale(DNXHDEncContext *ctx) | |
591 { | |
592 int bits = 0; | |
593 int up_step = 1; | |
594 int down_step = 1; | |
595 int last_higher = 0; | |
596 int last_lower = INT_MAX; | |
597 int qscale; | |
598 int x, y; | |
599 | |
600 qscale = ctx->qscale; | |
601 for (;;) { | |
602 bits = 0; | |
603 ctx->qscale = qscale; | |
604 // XXX avoid recalculating bits | |
10387 | 605 ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height); |
5790 | 606 for (y = 0; y < ctx->m.mb_height; y++) { |
607 for (x = 0; x < ctx->m.mb_width; x++) | |
608 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits; | |
609 bits = (bits+31)&~31; // padding | |
610 if (bits > ctx->frame_bits) | |
611 break; | |
612 } | |
613 //dprintf(ctx->m.avctx, "%d, qscale %d, bits %d, frame %d, higher %d, lower %d\n", | |
614 // ctx->m.avctx->frame_number, qscale, bits, ctx->frame_bits, last_higher, last_lower); | |
615 if (bits < ctx->frame_bits) { | |
616 if (qscale == 1) | |
5969
c5d11f6f6a3d
fix corner case when qscale 1 bits < frame bits but max bits with worst padding > frame bits
bcoudurier
parents:
5802
diff
changeset
|
617 return 1; |
5790 | 618 if (last_higher == qscale - 1) { |
619 qscale = last_higher; | |
620 break; | |
621 } | |
622 last_lower = FFMIN(qscale, last_lower); | |
623 if (last_higher != 0) | |
624 qscale = (qscale+last_higher)>>1; | |
625 else | |
626 qscale -= down_step++; | |
627 if (qscale < 1) | |
628 qscale = 1; | |
629 up_step = 1; | |
630 } else { | |
631 if (last_lower == qscale + 1) | |
632 break; | |
633 last_higher = FFMAX(qscale, last_higher); | |
634 if (last_lower != INT_MAX) | |
635 qscale = (qscale+last_lower)>>1; | |
636 else | |
637 qscale += up_step++; | |
638 down_step = 1; | |
639 if (qscale >= ctx->m.avctx->qmax) | |
640 return -1; | |
641 } | |
642 } | |
643 //dprintf(ctx->m.avctx, "out qscale %d\n", qscale); | |
644 ctx->qscale = qscale; | |
645 return 0; | |
646 } | |
647 | |
10214
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
#define BUCKET_BITS 8                // bits consumed by one radix pass
#define RADIX_PASSES 4               // passes needed to cover RADIX_PASSES*BUCKET_BITS bits
#define NBUCKETS (1 << BUCKET_BITS)  // number of buckets per pass

/**
 * Return the bucket for the BUCKET_BITS-wide digit of value found at bit
 * offset shift. The digit is mirrored (NBUCKETS-1-digit) so that larger
 * digits land in lower-numbered buckets.
 */
static inline int get_bucket(int value, int shift)
{
    const int mask = NBUCKETS - 1;
    return mask - ((value >> shift) & mask);
}
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
658 |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
659 static void radix_count(const RCCMPEntry *data, int size, int buckets[RADIX_PASSES][NBUCKETS]) |
5790 | 660 { |
10214
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
661 int i, j; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
662 memset(buckets, 0, sizeof(buckets[0][0]) * RADIX_PASSES * NBUCKETS); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
663 for (i = 0; i < size; i++) { |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
664 int v = data[i].value; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
665 for (j = 0; j < RADIX_PASSES; j++) { |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
666 buckets[j][get_bucket(v, 0)]++; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
667 v >>= BUCKET_BITS; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
668 } |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
669 assert(!v); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
670 } |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
671 for (j = 0; j < RADIX_PASSES; j++) { |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
672 int offset = size; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
673 for (i = NBUCKETS - 1; i >= 0; i--) |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
674 buckets[j][i] = offset -= buckets[j][i]; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
675 assert(!buckets[j][0]); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
676 } |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
677 } |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
678 |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
679 static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data, int size, int buckets[NBUCKETS], int pass) |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
680 { |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
681 int shift = pass * BUCKET_BITS; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
682 int i; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
683 for (i = 0; i < size; i++) { |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
684 int v = get_bucket(data[i].value, shift); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
685 int pos = buckets[v]++; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
686 dst[pos] = data[i]; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
687 } |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
688 } |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
689 |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
690 static void radix_sort(RCCMPEntry *data, int size) |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
691 { |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
692 int buckets[RADIX_PASSES][NBUCKETS]; |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
693 RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
694 radix_count(data, size, buckets); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
695 radix_sort_pass(tmp, data, size, buckets[0], 0); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
696 radix_sort_pass(data, tmp, size, buckets[1], 1); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
697 if (buckets[2][NBUCKETS - 1] || buckets[3][NBUCKETS - 1]) { |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
698 radix_sort_pass(tmp, data, size, buckets[2], 2); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
699 radix_sort_pass(data, tmp, size, buckets[3], 3); |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
700 } |
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
701 av_free(tmp); |
5790 | 702 } |
703 | |
5970 | 704 static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx) |
5790 | 705 { |
706 int max_bits = 0; | |
5969
c5d11f6f6a3d
fix corner case when qscale 1 bits < frame bits but max bits with worst padding > frame bits
bcoudurier
parents:
5802
diff
changeset
|
707 int ret, x, y; |
c5d11f6f6a3d
fix corner case when qscale 1 bits < frame bits but max bits with worst padding > frame bits
bcoudurier
parents:
5802
diff
changeset
|
708 if ((ret = dnxhd_find_qscale(ctx)) < 0) |
5790 | 709 return -1; |
710 for (y = 0; y < ctx->m.mb_height; y++) { | |
711 for (x = 0; x < ctx->m.mb_width; x++) { | |
712 int mb = y*ctx->m.mb_width+x; | |
713 int delta_bits; | |
714 ctx->mb_qscale[mb] = ctx->qscale; | |
715 ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale][mb].bits; | |
716 max_bits += ctx->mb_rc[ctx->qscale][mb].bits; | |
717 if (!RC_VARIANCE) { | |
718 delta_bits = ctx->mb_rc[ctx->qscale][mb].bits-ctx->mb_rc[ctx->qscale+1][mb].bits; | |
719 ctx->mb_cmp[mb].mb = mb; | |
720 ctx->mb_cmp[mb].value = delta_bits ? | |
721 ((ctx->mb_rc[ctx->qscale][mb].ssd-ctx->mb_rc[ctx->qscale+1][mb].ssd)*100)/delta_bits | |
722 : INT_MIN; //avoid increasing qscale | |
723 } | |
724 } | |
725 max_bits += 31; //worst padding | |
726 } | |
5969
c5d11f6f6a3d
fix corner case when qscale 1 bits < frame bits but max bits with worst padding > frame bits
bcoudurier
parents:
5802
diff
changeset
|
727 if (!ret) { |
5790 | 728 if (RC_VARIANCE) |
10387 | 729 avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height); |
10214
97f38ca4ed14
Use a custom radix sort implementation instead of qsort in dnxhd encoder.
reimar
parents:
10186
diff
changeset
|
730 radix_sort(ctx->mb_cmp, ctx->m.mb_num); |
5790 | 731 for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) { |
732 int mb = ctx->mb_cmp[x].mb; | |
733 max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits; | |
734 ctx->mb_qscale[mb] = ctx->qscale+1; | |
735 ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale+1][mb].bits; | |
736 } | |
737 } | |
738 return 0; | |
739 } | |
740 | |
6218 | 741 static void dnxhd_load_picture(DNXHDEncContext *ctx, const AVFrame *frame) |
5790 | 742 { |
743 int i; | |
744 | |
745 for (i = 0; i < 3; i++) { | |
746 ctx->frame.data[i] = frame->data[i]; | |
747 ctx->frame.linesize[i] = frame->linesize[i]; | |
748 } | |
749 | |
750 for (i = 0; i < ctx->m.avctx->thread_count; i++) { | |
751 ctx->thread[i]->m.linesize = ctx->frame.linesize[0]<<ctx->interlaced; | |
752 ctx->thread[i]->m.uvlinesize = ctx->frame.linesize[1]<<ctx->interlaced; | |
753 ctx->thread[i]->dct_y_offset = ctx->m.linesize *8; | |
754 ctx->thread[i]->dct_uv_offset = ctx->m.uvlinesize*8; | |
755 } | |
756 | |
757 ctx->frame.interlaced_frame = frame->interlaced_frame; | |
758 ctx->cur_field = frame->interlaced_frame && !frame->top_field_first; | |
759 } | |
760 | |
9618
85ad5d68ec98
data parameter of dnxhd_encode_picture() should not be const.
diego
parents:
8590
diff
changeset
|
761 static int dnxhd_encode_picture(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data) |
5790 | 762 { |
763 DNXHDEncContext *ctx = avctx->priv_data; | |
764 int first_field = 1; | |
765 int offset, i, ret; | |
766 | |
767 if (buf_size < ctx->cid_table->frame_size) { | |
768 av_log(avctx, AV_LOG_ERROR, "output buffer is too small to compress picture\n"); | |
769 return -1; | |
770 } | |
771 | |
772 dnxhd_load_picture(ctx, data); | |
773 | |
774 encode_coding_unit: | |
775 for (i = 0; i < 3; i++) { | |
776 ctx->src[i] = ctx->frame.data[i]; | |
777 if (ctx->interlaced && ctx->cur_field) | |
778 ctx->src[i] += ctx->frame.linesize[i]; | |
779 } | |
780 | |
781 dnxhd_write_header(avctx, buf); | |
782 | |
783 if (avctx->mb_decision == FF_MB_DECISION_RD) | |
784 ret = dnxhd_encode_rdo(avctx, ctx); | |
785 else | |
5970 | 786 ret = dnxhd_encode_fast(avctx, ctx); |
5790 | 787 if (ret < 0) { |
10541 | 788 av_log(avctx, AV_LOG_ERROR, |
789 "picture could not fit ratecontrol constraints, increase qmax\n"); | |
5790 | 790 return -1; |
791 } | |
792 | |
10387 | 793 dnxhd_setup_threads_slices(ctx); |
5790 | 794 |
795 offset = 0; | |
796 for (i = 0; i < ctx->m.mb_height; i++) { | |
797 AV_WB32(ctx->msip + i * 4, offset); | |
798 offset += ctx->slice_size[i]; | |
799 assert(!(ctx->slice_size[i] & 3)); | |
800 } | |
801 | |
10387 | 802 avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height); |
5790 | 803 |
10186 | 804 assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size); |
805 memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640); | |
806 | |
5790 | 807 AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF |
808 | |
809 if (ctx->interlaced && first_field) { | |
810 first_field = 0; | |
811 ctx->cur_field ^= 1; | |
812 buf += ctx->cid_table->coding_unit_size; | |
813 buf_size -= ctx->cid_table->coding_unit_size; | |
814 goto encode_coding_unit; | |
815 } | |
816 | |
6752 | 817 ctx->frame.quality = ctx->qscale*FF_QP2LAMBDA; |
818 | |
5790 | 819 return ctx->cid_table->frame_size; |
820 } | |
821 | |
822 static int dnxhd_encode_end(AVCodecContext *avctx) | |
823 { | |
824 DNXHDEncContext *ctx = avctx->priv_data; | |
6978 | 825 int max_level = 1<<(ctx->cid_table->bit_depth+2); |
5790 | 826 int i; |
827 | |
6981 | 828 av_free(ctx->vlc_codes-max_level*2); |
829 av_free(ctx->vlc_bits -max_level*2); | |
830 av_freep(&ctx->run_codes); | |
831 av_freep(&ctx->run_bits); | |
5790 | 832 |
833 av_freep(&ctx->mb_bits); | |
834 av_freep(&ctx->mb_qscale); | |
835 av_freep(&ctx->mb_rc); | |
836 av_freep(&ctx->mb_cmp); | |
837 av_freep(&ctx->slice_size); | |
10387 | 838 av_freep(&ctx->slice_offs); |
5790 | 839 |
840 av_freep(&ctx->qmatrix_c); | |
841 av_freep(&ctx->qmatrix_l); | |
842 av_freep(&ctx->qmatrix_c16); | |
843 av_freep(&ctx->qmatrix_l16); | |
844 | |
845 for (i = 1; i < avctx->thread_count; i++) | |
846 av_freep(&ctx->thread[i]); | |
847 | |
848 return 0; | |
849 } | |
850 | |
851 AVCodec dnxhd_encoder = { | |
852 "dnxhd", | |
853 CODEC_TYPE_VIDEO, | |
854 CODEC_ID_DNXHD, | |
855 sizeof(DNXHDEncContext), | |
856 dnxhd_encode_init, | |
857 dnxhd_encode_picture, | |
858 dnxhd_encode_end, | |
10146
38cfe222e1a4
Mark all pix_fmts and supported_framerates compound literals as const.
reimar
parents:
10137
diff
changeset
|
859 .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV422P, PIX_FMT_NONE}, |
7040
e943e1409077
Make AVCodec long_names definition conditional depending on CONFIG_SMALL.
stefano
parents:
6981
diff
changeset
|
860 .long_name = NULL_IF_CONFIG_SMALL("VC3/DNxHD"), |
5790 | 861 }; |