comparison vp3.c @ 11133:cd2956d08cc1 libavcodec

Don't pre-calculate first_pixel. 3.6% faster on Elephants_Dream_HD-q7-aq7.ogg on my penryn.
author conrad
date Fri, 12 Feb 2010 22:01:38 +0000
parents 449c12b6c3a0
children 68e16ac22032
comparison
equal deleted inserted replaced
11132:449c12b6c3a0 11133:cd2956d08cc1
51 } Coeff; 51 } Coeff;
52 52
53 //FIXME split things out into their own arrays 53 //FIXME split things out into their own arrays
54 typedef struct Vp3Fragment { 54 typedef struct Vp3Fragment {
55 Coeff *next_coeff; 55 Coeff *next_coeff;
56 /* address of first pixel taking into account which plane the fragment
57 * lives on as well as the plane stride */
58 int first_pixel;
59 /* this is the macroblock that the fragment belongs to */ 56 /* this is the macroblock that the fragment belongs to */
60 uint16_t macroblock; 57 uint16_t macroblock;
61 uint8_t coding_method; 58 uint8_t coding_method;
62 int8_t motion_x; 59 int8_t motion_x;
63 int8_t motion_y; 60 int8_t motion_y;
161 Vp3Fragment *all_fragments; 158 Vp3Fragment *all_fragments;
162 uint8_t *coeff_counts; 159 uint8_t *coeff_counts;
163 Coeff *coeffs; 160 Coeff *coeffs;
164 Coeff *next_coeff; 161 Coeff *next_coeff;
165 int fragment_start[3]; 162 int fragment_start[3];
163 int data_offset[3];
166 164
167 ScanTable scantable; 165 ScanTable scantable;
168 166
169 /* tables */ 167 /* tables */
170 uint16_t coded_dc_scale_factor[64]; 168 uint16_t coded_dc_scale_factor[64];
176 174
177 /* this is a list of indexes into the all_fragments array indicating 175 /* this is a list of indexes into the all_fragments array indicating
178 * which of the fragments are coded */ 176 * which of the fragments are coded */
179 int *coded_fragment_list; 177 int *coded_fragment_list;
180 int coded_fragment_list_index; 178 int coded_fragment_list_index;
181 int pixel_addresses_initialized;
182 179
183 /* track which fragments have already been decoded; called 'fast' 180 /* track which fragments have already been decoded; called 'fast'
184 * because this data structure avoids having to iterate through every 181 * because this data structure avoids having to iterate through every
185 * fragment in coded_fragment_list; once a fragment has been fully 182 * fragment in coded_fragment_list; once a fragment has been fully
186 * decoded, it is removed from this list */ 183 * decoded, it is removed from this list */
1399 int height = s->fragment_height >> !!plane; 1396 int height = s->fragment_height >> !!plane;
1400 int fragment = s->fragment_start [plane] + ystart * width; 1397 int fragment = s->fragment_start [plane] + ystart * width;
1401 int stride = s->current_frame.linesize[plane]; 1398 int stride = s->current_frame.linesize[plane];
1402 uint8_t *plane_data = s->current_frame.data [plane]; 1399 uint8_t *plane_data = s->current_frame.data [plane];
1403 if (!s->flipped_image) stride = -stride; 1400 if (!s->flipped_image) stride = -stride;
1401 plane_data += s->data_offset[plane] + 8*ystart*stride;
1404 1402
1405 for (y = ystart; y < yend; y++) { 1403 for (y = ystart; y < yend; y++) {
1406 1404
1407 for (x = 0; x < width; x++) { 1405 for (x = 0; x < width; x++) {
1408 /* This code basically just deblocks on the edges of coded blocks. 1406 /* This code basically just deblocks on the edges of coded blocks.
1412 if( s->all_fragments[fragment].coding_method != MODE_COPY ) 1410 if( s->all_fragments[fragment].coding_method != MODE_COPY )
1413 { 1411 {
1414 /* do not perform left edge filter for left columns frags */ 1412 /* do not perform left edge filter for left columns frags */
1415 if (x > 0) { 1413 if (x > 0) {
1416 s->dsp.vp3_h_loop_filter( 1414 s->dsp.vp3_h_loop_filter(
1417 plane_data + s->all_fragments[fragment].first_pixel, 1415 plane_data + 8*x,
1418 stride, bounding_values); 1416 stride, bounding_values);
1419 } 1417 }
1420 1418
1421 /* do not perform top edge filter for top row fragments */ 1419 /* do not perform top edge filter for top row fragments */
1422 if (y > 0) { 1420 if (y > 0) {
1423 s->dsp.vp3_v_loop_filter( 1421 s->dsp.vp3_v_loop_filter(
1424 plane_data + s->all_fragments[fragment].first_pixel, 1422 plane_data + 8*x,
1425 stride, bounding_values); 1423 stride, bounding_values);
1426 } 1424 }
1427 1425
1428 /* do not perform right edge filter for right column 1426 /* do not perform right edge filter for right column
1429 * fragments or if right fragment neighbor is also coded 1427 * fragments or if right fragment neighbor is also coded
1430 * in this frame (it will be filtered in next iteration) */ 1428 * in this frame (it will be filtered in next iteration) */
1431 if ((x < width - 1) && 1429 if ((x < width - 1) &&
1432 (s->all_fragments[fragment + 1].coding_method == MODE_COPY)) { 1430 (s->all_fragments[fragment + 1].coding_method == MODE_COPY)) {
1433 s->dsp.vp3_h_loop_filter( 1431 s->dsp.vp3_h_loop_filter(
1434 plane_data + s->all_fragments[fragment + 1].first_pixel, 1432 plane_data + 8*x + 8,
1435 stride, bounding_values); 1433 stride, bounding_values);
1436 } 1434 }
1437 1435
1438 /* do not perform bottom edge filter for bottom row 1436 /* do not perform bottom edge filter for bottom row
1439 * fragments or if bottom fragment neighbor is also coded 1437 * fragments or if bottom fragment neighbor is also coded
1440 * in this frame (it will be filtered in the next row) */ 1438 * in this frame (it will be filtered in the next row) */
1441 if ((y < height - 1) && 1439 if ((y < height - 1) &&
1442 (s->all_fragments[fragment + width].coding_method == MODE_COPY)) { 1440 (s->all_fragments[fragment + width].coding_method == MODE_COPY)) {
1443 s->dsp.vp3_v_loop_filter( 1441 s->dsp.vp3_v_loop_filter(
1444 plane_data + s->all_fragments[fragment + width].first_pixel, 1442 plane_data + 8*x + 8*stride,
1445 stride, bounding_values); 1443 stride, bounding_values);
1446 } 1444 }
1447 } 1445 }
1448 1446
1449 fragment++; 1447 fragment++;
1450 } 1448 }
1449 plane_data += 8*stride;
1451 } 1450 }
1452 } 1451 }
1453 1452
1454 /** 1453 /**
1455 * called when all pixels up to row y are complete 1454 * called when all pixels up to row y are complete
1499 1498
1500 if (slice >= s->macroblock_height) 1499 if (slice >= s->macroblock_height)
1501 return; 1500 return;
1502 1501
1503 for (plane = 0; plane < 3; plane++) { 1502 for (plane = 0; plane < 3; plane++) {
1504 uint8_t *output_plane = s->current_frame.data [plane]; 1503 uint8_t *output_plane = s->current_frame.data [plane] + s->data_offset[plane];
1505 uint8_t * last_plane = s-> last_frame.data [plane]; 1504 uint8_t * last_plane = s-> last_frame.data [plane] + s->data_offset[plane];
1506 uint8_t *golden_plane = s-> golden_frame.data [plane]; 1505 uint8_t *golden_plane = s-> golden_frame.data [plane] + s->data_offset[plane];
1507 int stride = s->current_frame.linesize[plane]; 1506 int stride = s->current_frame.linesize[plane];
1508 int plane_width = s->width >> !!plane; 1507 int plane_width = s->width >> !!plane;
1509 int plane_height = s->height >> !!plane; 1508 int plane_height = s->height >> !!plane;
1510 int y = slice * FRAGMENT_PIXELS << !plane ; 1509 int y = slice * FRAGMENT_PIXELS << !plane ;
1511 int slice_height = y + (FRAGMENT_PIXELS << !plane); 1510 int slice_height = y + (FRAGMENT_PIXELS << !plane);
1520 /* for each fragment row in the slice (both of them)... */ 1519 /* for each fragment row in the slice (both of them)... */
1521 for (; y < slice_height; y += 8) { 1520 for (; y < slice_height; y += 8) {
1522 1521
1523 /* for each fragment in a row... */ 1522 /* for each fragment in a row... */
1524 for (x = 0; x < plane_width; x += 8, i++) { 1523 for (x = 0; x < plane_width; x += 8, i++) {
1524 int first_pixel = y*stride + x;
1525 1525
1526 if ((i < 0) || (i >= s->fragment_count)) { 1526 if ((i < 0) || (i >= s->fragment_count)) {
1527 av_log(s->avctx, AV_LOG_ERROR, " vp3:render_slice(): bad fragment number (%d)\n", i); 1527 av_log(s->avctx, AV_LOG_ERROR, " vp3:render_slice(): bad fragment number (%d)\n", i);
1528 return; 1528 return;
1529 } 1529 }
1536 (s->all_fragments[i].coding_method == MODE_GOLDEN_MV)) 1536 (s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
1537 motion_source= golden_plane; 1537 motion_source= golden_plane;
1538 else 1538 else
1539 motion_source= last_plane; 1539 motion_source= last_plane;
1540 1540
1541 motion_source += s->all_fragments[i].first_pixel; 1541 motion_source += first_pixel;
1542 motion_halfpel_index = 0; 1542 motion_halfpel_index = 0;
1543 1543
1544 /* sort out the motion vector if this fragment is coded 1544 /* sort out the motion vector if this fragment is coded
1545 * using a motion vector method */ 1545 * using a motion vector method */
1546 if ((s->all_fragments[i].coding_method > MODE_INTRA) && 1546 if ((s->all_fragments[i].coding_method > MODE_INTRA) &&
1582 put_no_rnd_pixels_l2 which would look more like the 1582 put_no_rnd_pixels_l2 which would look more like the
1583 VP3 source but this would be slower as 1583 VP3 source but this would be slower as
1584 put_no_rnd_pixels_tab is better optimzed */ 1584 put_no_rnd_pixels_tab is better optimzed */
1585 if(motion_halfpel_index != 3){ 1585 if(motion_halfpel_index != 3){
1586 s->dsp.put_no_rnd_pixels_tab[1][motion_halfpel_index]( 1586 s->dsp.put_no_rnd_pixels_tab[1][motion_halfpel_index](
1587 output_plane + s->all_fragments[i].first_pixel, 1587 output_plane + first_pixel,
1588 motion_source, stride, 8); 1588 motion_source, stride, 8);
1589 }else{ 1589 }else{
1590 int d= (motion_x ^ motion_y)>>31; // d is 0 if motion_x and _y have the same sign, else -1 1590 int d= (motion_x ^ motion_y)>>31; // d is 0 if motion_x and _y have the same sign, else -1
1591 s->dsp.put_no_rnd_pixels_l2[1]( 1591 s->dsp.put_no_rnd_pixels_l2[1](
1592 output_plane + s->all_fragments[i].first_pixel, 1592 output_plane + first_pixel,
1593 motion_source - d, 1593 motion_source - d,
1594 motion_source + stride + 1 + d, 1594 motion_source + stride + 1 + d,
1595 stride, 8); 1595 stride, 8);
1596 } 1596 }
1597 dequantizer = s->qmat[s->all_fragments[i].qpi][1][plane]; 1597 dequantizer = s->qmat[s->all_fragments[i].qpi][1][plane];
1620 1620
1621 if (s->all_fragments[i].coding_method == MODE_INTRA) { 1621 if (s->all_fragments[i].coding_method == MODE_INTRA) {
1622 if(s->avctx->idct_algo!=FF_IDCT_VP3) 1622 if(s->avctx->idct_algo!=FF_IDCT_VP3)
1623 block[0] += 128<<3; 1623 block[0] += 128<<3;
1624 s->dsp.idct_put( 1624 s->dsp.idct_put(
1625 output_plane + s->all_fragments[i].first_pixel, 1625 output_plane + first_pixel,
1626 stride, 1626 stride,
1627 block); 1627 block);
1628 } else { 1628 } else {
1629 s->dsp.idct_add( 1629 s->dsp.idct_add(
1630 output_plane + s->all_fragments[i].first_pixel, 1630 output_plane + first_pixel,
1631 stride, 1631 stride,
1632 block); 1632 block);
1633 } 1633 }
1634 } else { 1634 } else {
1635 1635
1636 /* copy directly from the previous frame */ 1636 /* copy directly from the previous frame */
1637 s->dsp.put_pixels_tab[1][0]( 1637 s->dsp.put_pixels_tab[1][0](
1638 output_plane + s->all_fragments[i].first_pixel, 1638 output_plane + first_pixel,
1639 last_plane + s->all_fragments[i].first_pixel, 1639 last_plane + first_pixel,
1640 stride, 8); 1640 stride, 8);
1641 1641
1642 } 1642 }
1643 } 1643 }
1644 // Filter the previous block row. We can't filter the current row yet 1644 // Filter the previous block row. We can't filter the current row yet
1657 */ 1657 */
1658 1658
1659 // now that we've filtered the last rows, they're safe to display 1659 // now that we've filtered the last rows, they're safe to display
1660 if (slice) 1660 if (slice)
1661 vp3_draw_horiz_band(s, 16*slice); 1661 vp3_draw_horiz_band(s, 16*slice);
1662 }
1663
1664 /*
1665 * This function computes the first pixel addresses for each fragment.
1666 * This function needs to be invoked after the first frame is allocated
1667 * so that it has access to the plane strides.
1668 */
1669 static void vp3_calculate_pixel_addresses(Vp3DecodeContext *s)
1670 {
1671 #define Y_INITIAL(chroma_shift) s->flipped_image ? 1 : s->fragment_height >> chroma_shift
1672 #define Y_FINISHED(chroma_shift) s->flipped_image ? y <= s->fragment_height >> chroma_shift : y > 0
1673
1674 int i, x, y;
1675 const int y_inc = s->flipped_image ? 1 : -1;
1676
1677 /* figure out the first pixel addresses for each of the fragments */
1678 /* Y plane */
1679 i = 0;
1680 for (y = Y_INITIAL(0); Y_FINISHED(0); y += y_inc) {
1681 for (x = 0; x < s->fragment_width; x++) {
1682 s->all_fragments[i++].first_pixel =
1683 s->golden_frame.linesize[0] * y * FRAGMENT_PIXELS -
1684 s->golden_frame.linesize[0] +
1685 x * FRAGMENT_PIXELS;
1686 }
1687 }
1688
1689 /* U plane */
1690 i = s->fragment_start[1];
1691 for (y = Y_INITIAL(1); Y_FINISHED(1); y += y_inc) {
1692 for (x = 0; x < s->fragment_width / 2; x++) {
1693 s->all_fragments[i++].first_pixel =
1694 s->golden_frame.linesize[1] * y * FRAGMENT_PIXELS -
1695 s->golden_frame.linesize[1] +
1696 x * FRAGMENT_PIXELS;
1697 }
1698 }
1699
1700 /* V plane */
1701 i = s->fragment_start[2];
1702 for (y = Y_INITIAL(1); Y_FINISHED(1); y += y_inc) {
1703 for (x = 0; x < s->fragment_width / 2; x++) {
1704 s->all_fragments[i++].first_pixel =
1705 s->golden_frame.linesize[2] * y * FRAGMENT_PIXELS -
1706 s->golden_frame.linesize[2] +
1707 x * FRAGMENT_PIXELS;
1708 }
1709 }
1710 } 1662 }
1711 1663
1712 /* 1664 /*
1713 * This is the ffmpeg/libavcodec API init function. 1665 * This is the ffmpeg/libavcodec API init function.
1714 */ 1666 */
1773 s->all_fragments = av_malloc(s->fragment_count * sizeof(Vp3Fragment)); 1725 s->all_fragments = av_malloc(s->fragment_count * sizeof(Vp3Fragment));
1774 s->coeff_counts = av_malloc(s->fragment_count * sizeof(*s->coeff_counts)); 1726 s->coeff_counts = av_malloc(s->fragment_count * sizeof(*s->coeff_counts));
1775 s->coeffs = av_malloc(s->fragment_count * sizeof(Coeff) * 65); 1727 s->coeffs = av_malloc(s->fragment_count * sizeof(Coeff) * 65);
1776 s->coded_fragment_list = av_malloc(s->fragment_count * sizeof(int)); 1728 s->coded_fragment_list = av_malloc(s->fragment_count * sizeof(int));
1777 s->fast_fragment_list = av_malloc(s->fragment_count * sizeof(int)); 1729 s->fast_fragment_list = av_malloc(s->fragment_count * sizeof(int));
1778 s->pixel_addresses_initialized = 0;
1779 if (!s->superblock_coding || !s->all_fragments || !s->coeff_counts || 1730 if (!s->superblock_coding || !s->all_fragments || !s->coeff_counts ||
1780 !s->coeffs || !s->coded_fragment_list || !s->fast_fragment_list) { 1731 !s->coeffs || !s->coded_fragment_list || !s->fast_fragment_list) {
1781 vp3_decode_end(avctx); 1732 vp3_decode_end(avctx);
1782 return -1; 1733 return -1;
1783 } 1734 }
1994 return -1; 1945 return -1;
1995 } 1946 }
1996 1947
1997 /* golden frame is also the current frame */ 1948 /* golden frame is also the current frame */
1998 s->current_frame= s->golden_frame; 1949 s->current_frame= s->golden_frame;
1999
2000 /* time to figure out pixel addresses? */
2001 if (!s->pixel_addresses_initialized)
2002 {
2003 vp3_calculate_pixel_addresses(s);
2004 s->pixel_addresses_initialized = 1;
2005 }
2006 } else { 1950 } else {
2007 /* allocate a new current frame */ 1951 /* allocate a new current frame */
2008 s->current_frame.reference = 3; 1952 s->current_frame.reference = 3;
2009 if (!s->pixel_addresses_initialized) { 1953 if (!s->golden_frame.data[0]) {
2010 av_log(s->avctx, AV_LOG_ERROR, "vp3: first frame not a keyframe\n"); 1954 av_log(s->avctx, AV_LOG_ERROR, "vp3: first frame not a keyframe\n");
2011 return -1; 1955 return -1;
2012 } 1956 }
2013 if(avctx->get_buffer(avctx, &s->current_frame) < 0) { 1957 if(avctx->get_buffer(avctx, &s->current_frame) < 0) {
2014 av_log(s->avctx, AV_LOG_ERROR, "vp3: get_buffer() failed\n"); 1958 av_log(s->avctx, AV_LOG_ERROR, "vp3: get_buffer() failed\n");
2038 return -1; 1982 return -1;
2039 } 1983 }
2040 if (unpack_dct_coeffs(s, &gb)){ 1984 if (unpack_dct_coeffs(s, &gb)){
2041 av_log(s->avctx, AV_LOG_ERROR, "error in unpack_dct_coeffs\n"); 1985 av_log(s->avctx, AV_LOG_ERROR, "error in unpack_dct_coeffs\n");
2042 return -1; 1986 return -1;
1987 }
1988
1989 for (i = 0; i < 3; i++) {
1990 if (s->flipped_image)
1991 s->data_offset[i] = 0;
1992 else
1993 s->data_offset[i] = ((s->height>>!!i)-1) * s->current_frame.linesize[i];
2043 } 1994 }
2044 1995
2045 s->last_slice_end = 0; 1996 s->last_slice_end = 0;
2046 for (i = 0; i < s->macroblock_height; i++) 1997 for (i = 0; i < s->macroblock_height; i++)
2047 render_slice(s, i); 1998 render_slice(s, i);