comparison snow.c @ 3198:6b9f0c4fbdbe libavcodec

First part of a series of speed-enhancing patches. This one sets up a snow.h and makes snow use the dsputil function pointer framework to access the three functions that will be implemented in asm in the other parts of the patchset. Patch by Robert Edele < yartrebo AH earthlink POIS net> Original thread: Subject: [Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations Date: Sun, 05 Feb 2006 12:47:14 -0500
author gpoirier
date Thu, 16 Mar 2006 19:18:18 +0000
parents 8f53630cd24e
children c1add9fe5c65
comparison
equal deleted inserted replaced
3197:8f53630cd24e 3198:6b9f0c4fbdbe
17 */ 17 */
18 18
19 #include "avcodec.h" 19 #include "avcodec.h"
20 #include "common.h" 20 #include "common.h"
21 #include "dsputil.h" 21 #include "dsputil.h"
22 #include "snow.h"
22 23
23 #include "rangecoder.h" 24 #include "rangecoder.h"
24 #define MID_STATE 128
25 25
26 #include "mpegvideo.h" 26 #include "mpegvideo.h"
27 27
28 #undef NDEBUG 28 #undef NDEBUG
29 #include <assert.h> 29 #include <assert.h>
30
31 #define MAX_DECOMPOSITIONS 8
32 #define MAX_PLANES 4
33 #define DWTELEM int
34 #define QSHIFT 5
35 #define QROOT (1<<QSHIFT)
36 #define LOSSLESS_QLOG -128
37 #define FRAC_BITS 8
38 30
39 static const int8_t quant3[256]={ 31 static const int8_t quant3[256]={
40 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 32 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 34 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
179 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 171 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
180 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
181 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1, 173 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
182 }; 174 };
183 175
184 #define LOG2_OBMC_MAX 6
185 #define OBMC_MAX (1<<(LOG2_OBMC_MAX))
186 #if 0 //64*cubic 176 #if 0 //64*cubic
187 static const uint8_t obmc32[1024]={ 177 static const uint8_t obmc32[1024]={
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
190 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 180 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
423 int width; 413 int width;
424 int height; 414 int height;
425 SubBand band[MAX_DECOMPOSITIONS][4]; 415 SubBand band[MAX_DECOMPOSITIONS][4];
426 }Plane; 416 }Plane;
427 417
428 /** Used to minimize the amount of memory used in order to optimize cache performance. **/
429 typedef struct {
430 DWTELEM * * line; ///< For use by idwt and predict_slices.
431 DWTELEM * * data_stack; ///< Used for internal purposes.
432 int data_stack_top;
433 int line_count;
434 int line_width;
435 int data_count;
436 DWTELEM * base_buffer; ///< Buffer that this structure is caching.
437 } slice_buffer;
438
439 typedef struct SnowContext{ 418 typedef struct SnowContext{
440 // MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX) 419 // MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
441 420
442 AVCodecContext *avctx; 421 AVCodecContext *avctx;
443 RangeCoder c; 422 RangeCoder c;
739 if(mirror_right){ 718 if(mirror_right){
740 dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse); 719 dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
741 } 720 }
742 } 721 }
743 722
723 #ifndef lift5
744 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){ 724 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
745 const int mirror_left= !highpass; 725 const int mirror_left= !highpass;
746 const int mirror_right= (width&1) ^ highpass; 726 const int mirror_right= (width&1) ^ highpass;
747 const int w= (width>>1) - 1 + (highpass & width); 727 const int w= (width>>1) - 1 + (highpass & width);
748 int i; 728 int i;
768 r += r>>4; 748 r += r>>4;
769 r += r>>8; 749 r += r>>8;
770 dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse); 750 dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
771 } 751 }
772 } 752 }
773 753 #endif
754
755 #ifndef liftS
774 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){ 756 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
775 const int mirror_left= !highpass; 757 const int mirror_left= !highpass;
776 const int mirror_right= (width&1) ^ highpass; 758 const int mirror_right= (width&1) ^ highpass;
777 const int w= (width>>1) - 1 + (highpass & width); 759 const int w= (width>>1) - 1 + (highpass & width);
778 int i; 760 int i;
791 773
792 if(mirror_right){ 774 if(mirror_right){
793 dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse); 775 dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
794 } 776 }
795 } 777 }
778 #endif
796 779
797 780
798 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){ 781 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
799 int x, i; 782 int x, i;
800 783
1109 b0=b2; 1092 b0=b2;
1110 b1=b3; 1093 b1=b3;
1111 } 1094 }
1112 } 1095 }
1113 1096
1114 #define liftS lift
1115 #define lift5 lift
1116 #if 1
1117 #define W_AM 3
1118 #define W_AO 0
1119 #define W_AS 1
1120
1121 #undef liftS
1122 #define W_BM 1
1123 #define W_BO 8
1124 #define W_BS 4
1125
1126 #define W_CM 1
1127 #define W_CO 0
1128 #define W_CS 0
1129
1130 #define W_DM 3
1131 #define W_DO 4
1132 #define W_DS 3
1133 #elif 0
1134 #define W_AM 55
1135 #define W_AO 16
1136 #define W_AS 5
1137
1138 #define W_BM 3
1139 #define W_BO 32
1140 #define W_BS 6
1141
1142 #define W_CM 127
1143 #define W_CO 64
1144 #define W_CS 7
1145
1146 #define W_DM 7
1147 #define W_DO 8
1148 #define W_DS 4
1149 #elif 0
1150 #define W_AM 97
1151 #define W_AO 32
1152 #define W_AS 6
1153
1154 #define W_BM 63
1155 #define W_BO 512
1156 #define W_BS 10
1157
1158 #define W_CM 13
1159 #define W_CO 8
1160 #define W_CS 4
1161
1162 #define W_DM 15
1163 #define W_DO 16
1164 #define W_DS 5
1165
1166 #else
1167
1168 #define W_AM 203
1169 #define W_AO 64
1170 #define W_AS 7
1171
1172 #define W_BM 217
1173 #define W_BO 2048
1174 #define W_BS 12
1175
1176 #define W_CM 113
1177 #define W_CO 64
1178 #define W_CS 7
1179
1180 #define W_DM 227
1181 #define W_DO 128
1182 #define W_DS 9
1183 #endif
1184 static void horizontal_decompose97i(DWTELEM *b, int width){ 1097 static void horizontal_decompose97i(DWTELEM *b, int width){
1185 DWTELEM temp[width]; 1098 DWTELEM temp[width];
1186 const int w2= (width+1)>>1; 1099 const int w2= (width+1)>>1;
1187 1100
1188 lift (temp+w2, b +1, b , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0); 1101 lift (temp+w2, b +1, b , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1408 while(cs.y <= height) 1321 while(cs.y <= height)
1409 spatial_compose53i_dy(&cs, buffer, width, height, stride); 1322 spatial_compose53i_dy(&cs, buffer, width, height, stride);
1410 } 1323 }
1411 1324
1412 1325
1413 static void horizontal_compose97i(DWTELEM *b, int width){ 1326 void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1414 DWTELEM temp[width]; 1327 DWTELEM temp[width];
1415 const int w2= (width+1)>>1; 1328 const int w2= (width+1)>>1;
1416 1329
1417 lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1); 1330 lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1);
1418 lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1); 1331 lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1);
1461 for(i=0; i<width; i++){ 1374 for(i=0; i<width; i++){
1462 b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS; 1375 b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1463 } 1376 }
1464 } 1377 }
1465 1378
1466 static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){ 1379 void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1467 int i; 1380 int i;
1468 1381
1469 for(i=0; i<width; i++){ 1382 for(i=0; i<width; i++){
1470 #ifndef lift5 1383 #ifndef lift5
1471 int r; 1384 int r;
1502 cs->b2 = buffer + mirror(-3+1, height-1)*stride; 1415 cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1503 cs->b3 = buffer + mirror(-3+2, height-1)*stride; 1416 cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1504 cs->y = -3; 1417 cs->y = -3;
1505 } 1418 }
1506 1419
1507 static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){ 1420 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1508 int y = cs->y; 1421 int y = cs->y;
1509 1422
1510 DWTELEM *b0= cs->b0; 1423 DWTELEM *b0= cs->b0;
1511 DWTELEM *b1= cs->b1; 1424 DWTELEM *b1= cs->b1;
1512 DWTELEM *b2= cs->b2; 1425 DWTELEM *b2= cs->b2;
1514 DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line); 1427 DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1515 DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line); 1428 DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1516 1429
1517 {START_TIMER 1430 {START_TIMER
1518 if(y>0 && y+4<height){ 1431 if(y>0 && y+4<height){
1519 vertical_compose97i(b0, b1, b2, b3, b4, b5, width); 1432 dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1520 }else{ 1433 }else{
1521 if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width); 1434 if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1522 if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width); 1435 if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1523 if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width); 1436 if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1524 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width); 1437 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1525 } 1438 }
1526 if(width>400){ 1439 if(width>400){
1527 STOP_TIMER("vertical_compose97i")}} 1440 STOP_TIMER("vertical_compose97i")}}
1528 1441
1529 {START_TIMER 1442 {START_TIMER
1530 if(y-1<(unsigned)height) horizontal_compose97i(b0, width); 1443 if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1531 if(y+0<(unsigned)height) horizontal_compose97i(b1, width); 1444 if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1532 if(width>400 && y+0<(unsigned)height){ 1445 if(width>400 && y+0<(unsigned)height){
1533 STOP_TIMER("horizontal_compose97i")}} 1446 STOP_TIMER("horizontal_compose97i")}}
1534 1447
1535 cs->b0=b2; 1448 cs->b0=b2;
1536 cs->b1=b3; 1449 cs->b1=b3;
1555 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width); 1468 if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1556 if(width>400){ 1469 if(width>400){
1557 STOP_TIMER("vertical_compose97i")}} 1470 STOP_TIMER("vertical_compose97i")}}
1558 1471
1559 {START_TIMER 1472 {START_TIMER
1560 if(y-1<(unsigned)height) horizontal_compose97i(b0, width); 1473 if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1561 if(y+0<(unsigned)height) horizontal_compose97i(b1, width); 1474 if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1562 if(width>400 && b0 <= b2){ 1475 if(width>400 && b0 <= b2){
1563 STOP_TIMER("horizontal_compose97i")}} 1476 STOP_TIMER("horizontal_compose97i")}}
1564 1477
1565 cs->b0=b2; 1478 cs->b0=b2;
1566 cs->b1=b3; 1479 cs->b1=b3;
1617 } 1530 }
1618 } 1531 }
1619 } 1532 }
1620 } 1533 }
1621 1534
1622 static void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){ 1535 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1623 const int support = type==1 ? 3 : 5; 1536 const int support = type==1 ? 3 : 5;
1624 int level; 1537 int level;
1625 if(type==2) return; 1538 if(type==2) return;
1626 1539
1627 for(level=decomposition_count-1; level>=0; level--){ 1540 for(level=decomposition_count-1; level>=0; level--){
1628 while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){ 1541 while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1629 switch(type){ 1542 switch(type){
1630 case 0: spatial_compose97i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level); 1543 case 0: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1631 break; 1544 break;
1632 case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level); 1545 case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1633 break; 1546 break;
1634 case 2: break; 1547 case 2: break;
1635 } 1548 }
2543 s->dsp.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride); 2456 s->dsp.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2544 } 2457 }
2545 } 2458 }
2546 } 2459 }
2547 2460
2461 void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2462 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2463 int y, x;
2464 DWTELEM * dst;
2465 for(y=0; y<b_h; y++){
2466 //FIXME ugly missue of obmc_stride
2467 uint8_t *obmc1= obmc + y*obmc_stride;
2468 uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2469 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2470 uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2471 dst = slice_buffer_get_line(sb, src_y + y);
2472 for(x=0; x<b_w; x++){
2473 int v= obmc1[x] * block[3][x + y*src_stride]
2474 +obmc2[x] * block[2][x + y*src_stride]
2475 +obmc3[x] * block[1][x + y*src_stride]
2476 +obmc4[x] * block[0][x + y*src_stride];
2477
2478 v <<= 8 - LOG2_OBMC_MAX;
2479 if(FRAC_BITS != 8){
2480 v += 1<<(7 - FRAC_BITS);
2481 v >>= 8 - FRAC_BITS;
2482 }
2483 if(add){
2484 v += dst[x + src_x];
2485 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2486 if(v&(~255)) v= ~(v>>31);
2487 dst8[x + y*src_stride] = v;
2488 }else{
2489 dst[x + src_x] -= v;
2490 }
2491 }
2492 }
2493 }
2494
2548 //FIXME name clenup (b_w, block_w, b_width stuff) 2495 //FIXME name clenup (b_w, block_w, b_width stuff)
2549 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){ 2496 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
2550 DWTELEM * dst = NULL; 2497 DWTELEM * dst = NULL;
2551 const int b_width = s->b_width << s->block_max_depth; 2498 const int b_width = s->b_width << s->block_max_depth;
2552 const int b_height= s->b_height << s->block_max_depth; 2499 const int b_height= s->b_height << s->block_max_depth;
2667 #else 2614 #else
2668 { 2615 {
2669 2616
2670 START_TIMER 2617 START_TIMER
2671 2618
2672 for(y=0; y<b_h; y++){ 2619 s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2673 //FIXME ugly missue of obmc_stride
2674 uint8_t *obmc1= obmc + y*obmc_stride;
2675 uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2676 uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2677 uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2678 dst = slice_buffer_get_line(sb, src_y + y);
2679 for(x=0; x<b_w; x++){
2680 int v= obmc1[x] * block[3][x + y*src_stride]
2681 +obmc2[x] * block[2][x + y*src_stride]
2682 +obmc3[x] * block[1][x + y*src_stride]
2683 +obmc4[x] * block[0][x + y*src_stride];
2684
2685 v <<= 8 - LOG2_OBMC_MAX;
2686 if(FRAC_BITS != 8){
2687 v += 1<<(7 - FRAC_BITS);
2688 v >>= 8 - FRAC_BITS;
2689 }
2690 if(add){
2691 // v += old_dst[x + y*dst_stride];
2692 v += dst[x + src_x];
2693 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2694 if(v&(~255)) v= ~(v>>31);
2695 dst8[x + y*src_stride] = v;
2696 }else{
2697 // old_dst[x + y*dst_stride] -= v;
2698 dst[x + src_x] -= v;
2699 }
2700 }
2701 }
2702 STOP_TIMER("Inner add y block") 2620 STOP_TIMER("Inner add y block")
2703 } 2621 }
2704 #endif 2622 #endif
2705 } 2623 }
2706 2624
4397 STOP_TIMER("decode_subband_slice"); 4315 STOP_TIMER("decode_subband_slice");
4398 } 4316 }
4399 4317
4400 { START_TIMER 4318 { START_TIMER
4401 for(; yd<slice_h; yd+=4){ 4319 for(; yd<slice_h; yd+=4){
4402 ff_spatial_idwt_buffered_slice(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd); 4320 ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4403 } 4321 }
4404 STOP_TIMER("idwt slice");} 4322 STOP_TIMER("idwt slice");}
4405 4323
4406 4324
4407 if(s->qlog == LOSSLESS_QLOG){ 4325 if(s->qlog == LOSSLESS_QLOG){