Mercurial > libavcodec.hg
comparison snow.c @ 3198:6b9f0c4fbdbe libavcodec
First part of a series of speed-enhancing patches.
This one sets up a snow.h and makes snow use the dsputil function pointer
framework to access the three functions that will be implemented in asm
in the other parts of the patchset.
Patch by Robert Edele < yartrebo AH earthlink POIS net>
Original thread:
Subject: [Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations
Date: Sun, 05 Feb 2006 12:47:14 -0500
author | gpoirier |
---|---|
date | Thu, 16 Mar 2006 19:18:18 +0000 |
parents | 8f53630cd24e |
children | c1add9fe5c65 |
comparison
equal
deleted
inserted
replaced
3197:8f53630cd24e | 3198:6b9f0c4fbdbe |
---|---|
17 */ | 17 */ |
18 | 18 |
19 #include "avcodec.h" | 19 #include "avcodec.h" |
20 #include "common.h" | 20 #include "common.h" |
21 #include "dsputil.h" | 21 #include "dsputil.h" |
22 #include "snow.h" | |
22 | 23 |
23 #include "rangecoder.h" | 24 #include "rangecoder.h" |
24 #define MID_STATE 128 | |
25 | 25 |
26 #include "mpegvideo.h" | 26 #include "mpegvideo.h" |
27 | 27 |
28 #undef NDEBUG | 28 #undef NDEBUG |
29 #include <assert.h> | 29 #include <assert.h> |
30 | |
31 #define MAX_DECOMPOSITIONS 8 | |
32 #define MAX_PLANES 4 | |
33 #define DWTELEM int | |
34 #define QSHIFT 5 | |
35 #define QROOT (1<<QSHIFT) | |
36 #define LOSSLESS_QLOG -128 | |
37 #define FRAC_BITS 8 | |
38 | 30 |
39 static const int8_t quant3[256]={ | 31 static const int8_t quant3[256]={ |
40 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 32 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 34 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
179 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, | 171 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, |
180 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, | 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, |
181 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1, | 173 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1, |
182 }; | 174 }; |
183 | 175 |
184 #define LOG2_OBMC_MAX 6 | |
185 #define OBMC_MAX (1<<(LOG2_OBMC_MAX)) | |
186 #if 0 //64*cubic | 176 #if 0 //64*cubic |
187 static const uint8_t obmc32[1024]={ | 177 static const uint8_t obmc32[1024]={ |
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
189 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, | 179 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, |
190 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, | 180 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, |
423 int width; | 413 int width; |
424 int height; | 414 int height; |
425 SubBand band[MAX_DECOMPOSITIONS][4]; | 415 SubBand band[MAX_DECOMPOSITIONS][4]; |
426 }Plane; | 416 }Plane; |
427 | 417 |
428 /** Used to minimize the amount of memory used in order to optimize cache performance. **/ | |
429 typedef struct { | |
430 DWTELEM * * line; ///< For use by idwt and predict_slices. | |
431 DWTELEM * * data_stack; ///< Used for internal purposes. | |
432 int data_stack_top; | |
433 int line_count; | |
434 int line_width; | |
435 int data_count; | |
436 DWTELEM * base_buffer; ///< Buffer that this structure is caching. | |
437 } slice_buffer; | |
438 | |
439 typedef struct SnowContext{ | 418 typedef struct SnowContext{ |
440 // MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX) | 419 // MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX) |
441 | 420 |
442 AVCodecContext *avctx; | 421 AVCodecContext *avctx; |
443 RangeCoder c; | 422 RangeCoder c; |
739 if(mirror_right){ | 718 if(mirror_right){ |
740 dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse); | 719 dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse); |
741 } | 720 } |
742 } | 721 } |
743 | 722 |
723 #ifndef lift5 | |
744 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){ | 724 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){ |
745 const int mirror_left= !highpass; | 725 const int mirror_left= !highpass; |
746 const int mirror_right= (width&1) ^ highpass; | 726 const int mirror_right= (width&1) ^ highpass; |
747 const int w= (width>>1) - 1 + (highpass & width); | 727 const int w= (width>>1) - 1 + (highpass & width); |
748 int i; | 728 int i; |
768 r += r>>4; | 748 r += r>>4; |
769 r += r>>8; | 749 r += r>>8; |
770 dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse); | 750 dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse); |
771 } | 751 } |
772 } | 752 } |
773 | 753 #endif |
754 | |
755 #ifndef liftS | |
774 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){ | 756 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){ |
775 const int mirror_left= !highpass; | 757 const int mirror_left= !highpass; |
776 const int mirror_right= (width&1) ^ highpass; | 758 const int mirror_right= (width&1) ^ highpass; |
777 const int w= (width>>1) - 1 + (highpass & width); | 759 const int w= (width>>1) - 1 + (highpass & width); |
778 int i; | 760 int i; |
791 | 773 |
792 if(mirror_right){ | 774 if(mirror_right){ |
793 dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse); | 775 dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse); |
794 } | 776 } |
795 } | 777 } |
778 #endif | |
796 | 779 |
797 | 780 |
798 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){ | 781 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){ |
799 int x, i; | 782 int x, i; |
800 | 783 |
1109 b0=b2; | 1092 b0=b2; |
1110 b1=b3; | 1093 b1=b3; |
1111 } | 1094 } |
1112 } | 1095 } |
1113 | 1096 |
1114 #define liftS lift | |
1115 #define lift5 lift | |
1116 #if 1 | |
1117 #define W_AM 3 | |
1118 #define W_AO 0 | |
1119 #define W_AS 1 | |
1120 | |
1121 #undef liftS | |
1122 #define W_BM 1 | |
1123 #define W_BO 8 | |
1124 #define W_BS 4 | |
1125 | |
1126 #define W_CM 1 | |
1127 #define W_CO 0 | |
1128 #define W_CS 0 | |
1129 | |
1130 #define W_DM 3 | |
1131 #define W_DO 4 | |
1132 #define W_DS 3 | |
1133 #elif 0 | |
1134 #define W_AM 55 | |
1135 #define W_AO 16 | |
1136 #define W_AS 5 | |
1137 | |
1138 #define W_BM 3 | |
1139 #define W_BO 32 | |
1140 #define W_BS 6 | |
1141 | |
1142 #define W_CM 127 | |
1143 #define W_CO 64 | |
1144 #define W_CS 7 | |
1145 | |
1146 #define W_DM 7 | |
1147 #define W_DO 8 | |
1148 #define W_DS 4 | |
1149 #elif 0 | |
1150 #define W_AM 97 | |
1151 #define W_AO 32 | |
1152 #define W_AS 6 | |
1153 | |
1154 #define W_BM 63 | |
1155 #define W_BO 512 | |
1156 #define W_BS 10 | |
1157 | |
1158 #define W_CM 13 | |
1159 #define W_CO 8 | |
1160 #define W_CS 4 | |
1161 | |
1162 #define W_DM 15 | |
1163 #define W_DO 16 | |
1164 #define W_DS 5 | |
1165 | |
1166 #else | |
1167 | |
1168 #define W_AM 203 | |
1169 #define W_AO 64 | |
1170 #define W_AS 7 | |
1171 | |
1172 #define W_BM 217 | |
1173 #define W_BO 2048 | |
1174 #define W_BS 12 | |
1175 | |
1176 #define W_CM 113 | |
1177 #define W_CO 64 | |
1178 #define W_CS 7 | |
1179 | |
1180 #define W_DM 227 | |
1181 #define W_DO 128 | |
1182 #define W_DS 9 | |
1183 #endif | |
1184 static void horizontal_decompose97i(DWTELEM *b, int width){ | 1097 static void horizontal_decompose97i(DWTELEM *b, int width){ |
1185 DWTELEM temp[width]; | 1098 DWTELEM temp[width]; |
1186 const int w2= (width+1)>>1; | 1099 const int w2= (width+1)>>1; |
1187 | 1100 |
1188 lift (temp+w2, b +1, b , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0); | 1101 lift (temp+w2, b +1, b , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0); |
1408 while(cs.y <= height) | 1321 while(cs.y <= height) |
1409 spatial_compose53i_dy(&cs, buffer, width, height, stride); | 1322 spatial_compose53i_dy(&cs, buffer, width, height, stride); |
1410 } | 1323 } |
1411 | 1324 |
1412 | 1325 |
1413 static void horizontal_compose97i(DWTELEM *b, int width){ | 1326 void ff_snow_horizontal_compose97i(DWTELEM *b, int width){ |
1414 DWTELEM temp[width]; | 1327 DWTELEM temp[width]; |
1415 const int w2= (width+1)>>1; | 1328 const int w2= (width+1)>>1; |
1416 | 1329 |
1417 lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1); | 1330 lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1); |
1418 lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1); | 1331 lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1); |
1461 for(i=0; i<width; i++){ | 1374 for(i=0; i<width; i++){ |
1462 b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS; | 1375 b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS; |
1463 } | 1376 } |
1464 } | 1377 } |
1465 | 1378 |
1466 static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){ | 1379 void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){ |
1467 int i; | 1380 int i; |
1468 | 1381 |
1469 for(i=0; i<width; i++){ | 1382 for(i=0; i<width; i++){ |
1470 #ifndef lift5 | 1383 #ifndef lift5 |
1471 int r; | 1384 int r; |
1502 cs->b2 = buffer + mirror(-3+1, height-1)*stride; | 1415 cs->b2 = buffer + mirror(-3+1, height-1)*stride; |
1503 cs->b3 = buffer + mirror(-3+2, height-1)*stride; | 1416 cs->b3 = buffer + mirror(-3+2, height-1)*stride; |
1504 cs->y = -3; | 1417 cs->y = -3; |
1505 } | 1418 } |
1506 | 1419 |
1507 static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){ | 1420 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){ |
1508 int y = cs->y; | 1421 int y = cs->y; |
1509 | 1422 |
1510 DWTELEM *b0= cs->b0; | 1423 DWTELEM *b0= cs->b0; |
1511 DWTELEM *b1= cs->b1; | 1424 DWTELEM *b1= cs->b1; |
1512 DWTELEM *b2= cs->b2; | 1425 DWTELEM *b2= cs->b2; |
1514 DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line); | 1427 DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line); |
1515 DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line); | 1428 DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line); |
1516 | 1429 |
1517 {START_TIMER | 1430 {START_TIMER |
1518 if(y>0 && y+4<height){ | 1431 if(y>0 && y+4<height){ |
1519 vertical_compose97i(b0, b1, b2, b3, b4, b5, width); | 1432 dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width); |
1520 }else{ | 1433 }else{ |
1521 if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width); | 1434 if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width); |
1522 if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width); | 1435 if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width); |
1523 if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width); | 1436 if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width); |
1524 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width); | 1437 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width); |
1525 } | 1438 } |
1526 if(width>400){ | 1439 if(width>400){ |
1527 STOP_TIMER("vertical_compose97i")}} | 1440 STOP_TIMER("vertical_compose97i")}} |
1528 | 1441 |
1529 {START_TIMER | 1442 {START_TIMER |
1530 if(y-1<(unsigned)height) horizontal_compose97i(b0, width); | 1443 if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width); |
1531 if(y+0<(unsigned)height) horizontal_compose97i(b1, width); | 1444 if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width); |
1532 if(width>400 && y+0<(unsigned)height){ | 1445 if(width>400 && y+0<(unsigned)height){ |
1533 STOP_TIMER("horizontal_compose97i")}} | 1446 STOP_TIMER("horizontal_compose97i")}} |
1534 | 1447 |
1535 cs->b0=b2; | 1448 cs->b0=b2; |
1536 cs->b1=b3; | 1449 cs->b1=b3; |
1555 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width); | 1468 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width); |
1556 if(width>400){ | 1469 if(width>400){ |
1557 STOP_TIMER("vertical_compose97i")}} | 1470 STOP_TIMER("vertical_compose97i")}} |
1558 | 1471 |
1559 {START_TIMER | 1472 {START_TIMER |
1560 if(y-1<(unsigned)height) horizontal_compose97i(b0, width); | 1473 if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width); |
1561 if(y+0<(unsigned)height) horizontal_compose97i(b1, width); | 1474 if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width); |
1562 if(width>400 && b0 <= b2){ | 1475 if(width>400 && b0 <= b2){ |
1563 STOP_TIMER("horizontal_compose97i")}} | 1476 STOP_TIMER("horizontal_compose97i")}} |
1564 | 1477 |
1565 cs->b0=b2; | 1478 cs->b0=b2; |
1566 cs->b1=b3; | 1479 cs->b1=b3; |
1617 } | 1530 } |
1618 } | 1531 } |
1619 } | 1532 } |
1620 } | 1533 } |
1621 | 1534 |
1622 static void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){ | 1535 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){ |
1623 const int support = type==1 ? 3 : 5; | 1536 const int support = type==1 ? 3 : 5; |
1624 int level; | 1537 int level; |
1625 if(type==2) return; | 1538 if(type==2) return; |
1626 | 1539 |
1627 for(level=decomposition_count-1; level>=0; level--){ | 1540 for(level=decomposition_count-1; level>=0; level--){ |
1628 while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){ | 1541 while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){ |
1629 switch(type){ | 1542 switch(type){ |
1630 case 0: spatial_compose97i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level); | 1543 case 0: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level); |
1631 break; | 1544 break; |
1632 case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level); | 1545 case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level); |
1633 break; | 1546 break; |
1634 case 2: break; | 1547 case 2: break; |
1635 } | 1548 } |
2543 s->dsp.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride); | 2456 s->dsp.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride); |
2544 } | 2457 } |
2545 } | 2458 } |
2546 } | 2459 } |
2547 | 2460 |
2461 void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, | |
2462 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ | |
2463 int y, x; | |
2464 DWTELEM * dst; | |
2465 for(y=0; y<b_h; y++){ | |
2466 //FIXME ugly missue of obmc_stride | |
2467 uint8_t *obmc1= obmc + y*obmc_stride; | |
2468 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
2469 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
2470 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
2471 dst = slice_buffer_get_line(sb, src_y + y); | |
2472 for(x=0; x<b_w; x++){ | |
2473 int v= obmc1[x] * block[3][x + y*src_stride] | |
2474 +obmc2[x] * block[2][x + y*src_stride] | |
2475 +obmc3[x] * block[1][x + y*src_stride] | |
2476 +obmc4[x] * block[0][x + y*src_stride]; | |
2477 | |
2478 v <<= 8 - LOG2_OBMC_MAX; | |
2479 if(FRAC_BITS != 8){ | |
2480 v += 1<<(7 - FRAC_BITS); | |
2481 v >>= 8 - FRAC_BITS; | |
2482 } | |
2483 if(add){ | |
2484 v += dst[x + src_x]; | |
2485 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS; | |
2486 if(v&(~255)) v= ~(v>>31); | |
2487 dst8[x + y*src_stride] = v; | |
2488 }else{ | |
2489 dst[x + src_x] -= v; | |
2490 } | |
2491 } | |
2492 } | |
2493 } | |
2494 | |
2548 //FIXME name clenup (b_w, block_w, b_width stuff) | 2495 //FIXME name clenup (b_w, block_w, b_width stuff) |
2549 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){ | 2496 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){ |
2550 DWTELEM * dst = NULL; | 2497 DWTELEM * dst = NULL; |
2551 const int b_width = s->b_width << s->block_max_depth; | 2498 const int b_width = s->b_width << s->block_max_depth; |
2552 const int b_height= s->b_height << s->block_max_depth; | 2499 const int b_height= s->b_height << s->block_max_depth; |
2667 #else | 2614 #else |
2668 { | 2615 { |
2669 | 2616 |
2670 START_TIMER | 2617 START_TIMER |
2671 | 2618 |
2672 for(y=0; y<b_h; y++){ | 2619 s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
2673 //FIXME ugly missue of obmc_stride | |
2674 uint8_t *obmc1= obmc + y*obmc_stride; | |
2675 uint8_t *obmc2= obmc1+ (obmc_stride>>1); | |
2676 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); | |
2677 uint8_t *obmc4= obmc3+ (obmc_stride>>1); | |
2678 dst = slice_buffer_get_line(sb, src_y + y); | |
2679 for(x=0; x<b_w; x++){ | |
2680 int v= obmc1[x] * block[3][x + y*src_stride] | |
2681 +obmc2[x] * block[2][x + y*src_stride] | |
2682 +obmc3[x] * block[1][x + y*src_stride] | |
2683 +obmc4[x] * block[0][x + y*src_stride]; | |
2684 | |
2685 v <<= 8 - LOG2_OBMC_MAX; | |
2686 if(FRAC_BITS != 8){ | |
2687 v += 1<<(7 - FRAC_BITS); | |
2688 v >>= 8 - FRAC_BITS; | |
2689 } | |
2690 if(add){ | |
2691 // v += old_dst[x + y*dst_stride]; | |
2692 v += dst[x + src_x]; | |
2693 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS; | |
2694 if(v&(~255)) v= ~(v>>31); | |
2695 dst8[x + y*src_stride] = v; | |
2696 }else{ | |
2697 // old_dst[x + y*dst_stride] -= v; | |
2698 dst[x + src_x] -= v; | |
2699 } | |
2700 } | |
2701 } | |
2702 STOP_TIMER("Inner add y block") | 2620 STOP_TIMER("Inner add y block") |
2703 } | 2621 } |
2704 #endif | 2622 #endif |
2705 } | 2623 } |
2706 | 2624 |
4397 STOP_TIMER("decode_subband_slice"); | 4315 STOP_TIMER("decode_subband_slice"); |
4398 } | 4316 } |
4399 | 4317 |
4400 { START_TIMER | 4318 { START_TIMER |
4401 for(; yd<slice_h; yd+=4){ | 4319 for(; yd<slice_h; yd+=4){ |
4402 ff_spatial_idwt_buffered_slice(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd); | 4320 ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd); |
4403 } | 4321 } |
4404 STOP_TIMER("idwt slice");} | 4322 STOP_TIMER("idwt slice");} |
4405 | 4323 |
4406 | 4324 |
4407 if(s->qlog == LOSSLESS_QLOG){ | 4325 if(s->qlog == LOSSLESS_QLOG){ |