comparison h264_loopfilter.c @ 10960:10759fd39860 libavcodec

Gcc idiocy fixes related to filter_mb_edge*. Change the order of operands, as gcc seems to use a hardcoded register per operand even for static functions; this reduces unneeded moves (the functions now try to pass the same argument in the same position). Change signed int to unsigned int for array indexes, as signed indexes require sign extension while unsigned ones are free. Move the +52 up and merge it where it will end up as part of a lea instruction; otherwise gcc always splits the 52 out there, turning the free +52 into an expensive one. The changed code becomes a little faster.
author michael
date Fri, 22 Jan 2010 01:59:17 +0000
parents 304db572a69a
children 34a65026fa06
comparison
equal deleted inserted replaced
10959:d5320d1acaee 10960:10759fd39860
97 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 97 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
98 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 98 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
99 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 99 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
100 }; 100 };
101 101
102 static void av_noinline filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 102 static void av_noinline filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h) {
103 const int index_a = qp + h->slice_alpha_c0_offset; 103 const unsigned int index_a = 52 + qp + h->slice_alpha_c0_offset;
104 const int alpha = (alpha_table+52)[index_a]; 104 const int alpha = alpha_table[index_a];
105 const int beta = (beta_table+52)[qp + h->slice_beta_offset]; 105 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
106 if (alpha ==0 || beta == 0) return; 106 if (alpha ==0 || beta == 0) return;
107 107
108 if( bS[0] < 4 ) { 108 if( bS[0] < 4 ) {
109 int8_t tc[4]; 109 int8_t tc[4];
110 tc[0] = (tc0_table+52)[index_a][bS[0]]; 110 tc[0] = tc0_table[index_a][bS[0]];
111 tc[1] = (tc0_table+52)[index_a][bS[1]]; 111 tc[1] = tc0_table[index_a][bS[1]];
112 tc[2] = (tc0_table+52)[index_a][bS[2]]; 112 tc[2] = tc0_table[index_a][bS[2]];
113 tc[3] = (tc0_table+52)[index_a][bS[3]]; 113 tc[3] = tc0_table[index_a][bS[3]];
114 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); 114 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
115 } else { 115 } else {
116 h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); 116 h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
117 } 117 }
118 } 118 }
119 static void av_noinline filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 119 static void av_noinline filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
120 const int index_a = qp + h->slice_alpha_c0_offset; 120 const unsigned int index_a = 52 + qp + h->slice_alpha_c0_offset;
121 const int alpha = (alpha_table+52)[index_a]; 121 const int alpha = alpha_table[index_a];
122 const int beta = (beta_table+52)[qp + h->slice_beta_offset]; 122 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
123 if (alpha ==0 || beta == 0) return; 123 if (alpha ==0 || beta == 0) return;
124 124
125 if( bS[0] < 4 ) { 125 if( bS[0] < 4 ) {
126 int8_t tc[4]; 126 int8_t tc[4];
127 tc[0] = (tc0_table+52)[index_a][bS[0]]+1; 127 tc[0] = tc0_table[index_a][bS[0]]+1;
128 tc[1] = (tc0_table+52)[index_a][bS[1]]+1; 128 tc[1] = tc0_table[index_a][bS[1]]+1;
129 tc[2] = (tc0_table+52)[index_a][bS[2]]+1; 129 tc[2] = tc0_table[index_a][bS[2]]+1;
130 tc[3] = (tc0_table+52)[index_a][bS[3]]+1; 130 tc[3] = tc0_table[index_a][bS[3]]+1;
131 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); 131 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
132 } else { 132 } else {
133 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); 133 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
134 } 134 }
135 } 135 }
273 } 273 }
274 } 274 }
275 } 275 }
276 } 276 }
277 277
278 static void av_noinline filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 278 static void av_noinline filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
279 const int index_a = qp + h->slice_alpha_c0_offset; 279 const unsigned int index_a = 52 + qp + h->slice_alpha_c0_offset;
280 const int alpha = (alpha_table+52)[index_a]; 280 const int alpha = alpha_table[index_a];
281 const int beta = (beta_table+52)[qp + h->slice_beta_offset]; 281 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
282 if (alpha ==0 || beta == 0) return; 282 if (alpha ==0 || beta == 0) return;
283 283
284 if( bS[0] < 4 ) { 284 if( bS[0] < 4 ) {
285 int8_t tc[4]; 285 int8_t tc[4];
286 tc[0] = (tc0_table+52)[index_a][bS[0]]; 286 tc[0] = tc0_table[index_a][bS[0]];
287 tc[1] = (tc0_table+52)[index_a][bS[1]]; 287 tc[1] = tc0_table[index_a][bS[1]];
288 tc[2] = (tc0_table+52)[index_a][bS[2]]; 288 tc[2] = tc0_table[index_a][bS[2]];
289 tc[3] = (tc0_table+52)[index_a][bS[3]]; 289 tc[3] = tc0_table[index_a][bS[3]];
290 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); 290 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
291 } else { 291 } else {
292 h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); 292 h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
293 } 293 }
294 } 294 }
295 295
296 static void av_noinline filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 296 static void av_noinline filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
297 const int index_a = qp + h->slice_alpha_c0_offset; 297 const unsigned int index_a = 52 + qp + h->slice_alpha_c0_offset;
298 const int alpha = (alpha_table+52)[index_a]; 298 const int alpha = alpha_table[index_a];
299 const int beta = (beta_table+52)[qp + h->slice_beta_offset]; 299 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
300 if (alpha ==0 || beta == 0) return; 300 if (alpha ==0 || beta == 0) return;
301 301
302 if( bS[0] < 4 ) { 302 if( bS[0] < 4 ) {
303 int8_t tc[4]; 303 int8_t tc[4];
304 tc[0] = (tc0_table+52)[index_a][bS[0]]+1; 304 tc[0] = tc0_table[index_a][bS[0]]+1;
305 tc[1] = (tc0_table+52)[index_a][bS[1]]+1; 305 tc[1] = tc0_table[index_a][bS[1]]+1;
306 tc[2] = (tc0_table+52)[index_a][bS[2]]+1; 306 tc[2] = tc0_table[index_a][bS[2]]+1;
307 tc[3] = (tc0_table+52)[index_a][bS[3]]+1; 307 tc[3] = tc0_table[index_a][bS[3]]+1;
308 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); 308 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
309 } else { 309 } else {
310 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); 310 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
311 } 311 }
312 } 312 }
346 if( IS_INTRA(mb_type) ) { 346 if( IS_INTRA(mb_type) ) {
347 int16_t bS4[4] = {4,4,4,4}; 347 int16_t bS4[4] = {4,4,4,4};
348 int16_t bS3[4] = {3,3,3,3}; 348 int16_t bS3[4] = {3,3,3,3};
349 int16_t *bSH = FIELD_PICTURE ? bS3 : bS4; 349 int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
350 if( IS_8x8DCT(mb_type) ) { 350 if( IS_8x8DCT(mb_type) ) {
351 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 ); 351 filter_mb_edgev( &img_y[4*0], linesize, bS4, qp0, h);
352 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp ); 352 filter_mb_edgev( &img_y[4*2], linesize, bS3, qp, h);
353 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 ); 353 filter_mb_edgeh( &img_y[4*0*linesize], linesize, bSH, qp1, h);
354 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp ); 354 filter_mb_edgeh( &img_y[4*2*linesize], linesize, bS3, qp, h);
355 } else { 355 } else {
356 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 ); 356 filter_mb_edgev( &img_y[4*0], linesize, bS4, qp0, h);
357 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp ); 357 filter_mb_edgev( &img_y[4*1], linesize, bS3, qp, h);
358 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp ); 358 filter_mb_edgev( &img_y[4*2], linesize, bS3, qp, h);
359 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp ); 359 filter_mb_edgev( &img_y[4*3], linesize, bS3, qp, h);
360 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 ); 360 filter_mb_edgeh( &img_y[4*0*linesize], linesize, bSH, qp1, h);
361 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp ); 361 filter_mb_edgeh( &img_y[4*1*linesize], linesize, bS3, qp, h);
362 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp ); 362 filter_mb_edgeh( &img_y[4*2*linesize], linesize, bS3, qp, h);
363 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp ); 363 filter_mb_edgeh( &img_y[4*3*linesize], linesize, bS3, qp, h);
364 } 364 }
365 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 ); 365 filter_mb_edgecv( &img_cb[2*0], uvlinesize, bS4, qpc0, h);
366 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc ); 366 filter_mb_edgecv( &img_cb[2*2], uvlinesize, bS3, qpc, h);
367 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 ); 367 filter_mb_edgecv( &img_cr[2*0], uvlinesize, bS4, qpc0, h);
368 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc ); 368 filter_mb_edgecv( &img_cr[2*2], uvlinesize, bS3, qpc, h);
369 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 ); 369 filter_mb_edgech( &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
370 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc ); 370 filter_mb_edgech( &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
371 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 ); 371 filter_mb_edgech( &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
372 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc ); 372 filter_mb_edgech( &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
373 return; 373 return;
374 } else { 374 } else {
375 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]); 375 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
376 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS; 376 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
377 int edges; 377 int edges;
394 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) ) 394 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
395 bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL; 395 bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
396 396
397 #define FILTER(hv,dir,edge)\ 397 #define FILTER(hv,dir,edge)\
398 if(bSv[dir][edge]) {\ 398 if(bSv[dir][edge]) {\
399 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\ 399 filter_mb_edge##hv( &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir, h );\
400 if(!(edge&1)) {\ 400 if(!(edge&1)) {\
401 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\ 401 filter_mb_edgec##hv( &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
402 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\ 402 filter_mb_edgec##hv( &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
403 }\ 403 }\
404 } 404 }
405 if( edges == 1 ) { 405 if( edges == 1 ) {
406 FILTER(v,0,0); 406 FILTER(v,0,0);
407 FILTER(h,1,0); 407 FILTER(h,1,0);
471 // Do not use s->qscale as luma quantizer because it has not the same 471 // Do not use s->qscale as luma quantizer because it has not the same
472 // value in IPCM macroblocks. 472 // value in IPCM macroblocks.
473 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; 473 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
474 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize); 474 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
475 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } 475 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
476 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp ); 476 filter_mb_edgeh( &img_y[j*linesize], tmp_linesize, bS, qp, h );
477 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, 477 filter_mb_edgech( &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
478 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1); 478 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
479 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, 479 filter_mb_edgech( &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
480 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1); 480 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
481 } 481 }
482 482
483 start = 1; 483 start = 1;
484 } 484 }
485 485
588 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; 588 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
589 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]); 589 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
590 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); 590 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
591 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } 591 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
592 if( dir == 0 ) { 592 if( dir == 0 ) {
593 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp ); 593 filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, h );
594 if( (edge&1) == 0 ) { 594 if( (edge&1) == 0 ) {
595 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; 595 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
596 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, qp); 596 filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, qp, h);
597 if(h->pps.chroma_qp_diff) 597 if(h->pps.chroma_qp_diff)
598 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; 598 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
599 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, qp); 599 filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, qp, h);
600 } 600 }
601 } else { 601 } else {
602 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp ); 602 filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
603 if( (edge&1) == 0 ) { 603 if( (edge&1) == 0 ) {
604 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; 604 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
605 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, qp); 605 filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, qp, h);
606 if(h->pps.chroma_qp_diff) 606 if(h->pps.chroma_qp_diff)
607 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; 607 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
608 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, qp); 608 filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, qp, h);
609 } 609 }
610 } 610 }
611 } 611 }
612 } 612 }
613 613