comparison postproc/swscale.c @ 4276:9199d15cb4e0

removed global vars so that multiple swscalers can be used experimental upscaling mode (-sws 3) general convolution filters support (unfinished) bugfix for bicubic upscaling assertion checking if defined MP_DEBUG checking of the input/output size instead of segfault if its very large
author michael
date Sun, 20 Jan 2002 05:30:23 +0000
parents 3cdb86beebce
children a7c8f1aec34a
comparison
equal deleted inserted replaced
4275:818be6ba8758 4276:9199d15cb4e0
2 // Software scaling and colorspace conversion routines for MPlayer 2 // Software scaling and colorspace conversion routines for MPlayer
3 3
4 // Orginal C implementation by A'rpi/ESP-team <arpi@thot.banki.hu> 4 // Orginal C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at) 5 // current version mostly by Michael Niedermayer (michaelni@gmx.at)
6 // the parts written by michael are under GNU GPL 6 // the parts written by michael are under GNU GPL
7
8 /*
9 supported Input formats: YV12 (grayscale soon too)
10 supported output formats: YV12, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
11 */
7 12
8 #include <inttypes.h> 13 #include <inttypes.h>
9 #include <string.h> 14 #include <string.h>
10 #include <math.h> 15 #include <math.h>
11 #include <stdio.h> 16 #include <stdio.h>
14 #ifdef HAVE_MALLOC_H 19 #ifdef HAVE_MALLOC_H
15 #include <malloc.h> 20 #include <malloc.h>
16 #endif 21 #endif
17 #include "swscale.h" 22 #include "swscale.h"
18 #include "../cpudetect.h" 23 #include "../cpudetect.h"
24 #include "../libvo/img_format.h"
19 #undef MOVNTQ 25 #undef MOVNTQ
20 #undef PAVGB 26 #undef PAVGB
21 27
22 //#undef HAVE_MMX2 28 //#undef HAVE_MMX2
23 //#undef HAVE_MMX 29 //#undef HAVE_MMX
24 //#undef ARCH_X86 30 //#undef ARCH_X86
25 #define DITHER1XBPP 31 #define DITHER1XBPP
26 int fullUVIpol=0;
27 //disables the unscaled height version
28 int allwaysIpol=0;
29 32
30 #define RET 0xC3 //near return opcode 33 #define RET 0xC3 //near return opcode
31 34
32 //#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; } 35 #ifdef MP_DEBUG
36 #define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
37 #else
33 #define ASSERT(x) ; 38 #define ASSERT(x) ;
39 #endif
40
41 #ifdef M_PI
42 #define PI M_PI
43 #else
44 #define PI 3.14159265358979323846
45 #endif
34 46
35 extern int verbose; // defined in mplayer.c 47 extern int verbose; // defined in mplayer.c
36 /* 48 /*
37 NOTES 49 NOTES
38 50
48 dither in C 60 dither in C
49 change the distance of the u & v buffer 61 change the distance of the u & v buffer
50 Move static / global vars into a struct so multiple scalers can be used 62 Move static / global vars into a struct so multiple scalers can be used
51 write special vertical cubic upscale version 63 write special vertical cubic upscale version
52 Optimize C code (yv12 / minmax) 64 Optimize C code (yv12 / minmax)
53 dstStride[3]
54 */ 65 */
55 66
56 #define ABS(a) ((a) > 0 ? (a) : (-(a))) 67 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
57 #define MIN(a,b) ((a) > (b) ? (b) : (a)) 68 #define MIN(a,b) ((a) > (b) ? (b) : (a))
58 #define MAX(a,b) ((a) < (b) ? (b) : (a)) 69 #define MAX(a,b) ((a) < (b) ? (b) : (a))
99 110
100 static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL; 111 static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
101 static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL; 112 static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
102 static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL; 113 static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
103 114
104 static uint64_t __attribute__((aligned(8))) temp0; 115 // FIXME remove
105 static uint64_t __attribute__((aligned(8))) asm_yalpha1; 116 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
106 static uint64_t __attribute__((aligned(8))) asm_uvalpha1; 117 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
107
108 static int16_t __attribute__((aligned(8))) *lumPixBuf[2000];
109 static int16_t __attribute__((aligned(8))) *chrPixBuf[2000];
110 static int16_t __attribute__((aligned(8))) hLumFilter[8000];
111 static int16_t __attribute__((aligned(8))) hLumFilterPos[2000];
112 static int16_t __attribute__((aligned(8))) hChrFilter[8000];
113 static int16_t __attribute__((aligned(8))) hChrFilterPos[2000];
114 static int16_t __attribute__((aligned(8))) vLumFilter[8000];
115 static int16_t __attribute__((aligned(8))) vLumFilterPos[2000];
116 static int16_t __attribute__((aligned(8))) vChrFilter[8000];
117 static int16_t __attribute__((aligned(8))) vChrFilterPos[2000];
118
119 // Contain simply the values from v(Lum|Chr)Filter just nicely packed for mmx
120 //FIXME these are very likely too small / 8000 caused problems with 480x480
121 static int16_t __attribute__((aligned(8))) lumMmxFilter[16000];
122 static int16_t __attribute__((aligned(8))) chrMmxFilter[16000];
123 #else
124 static int16_t *lumPixBuf[2000];
125 static int16_t *chrPixBuf[2000];
126 static int16_t hLumFilter[8000];
127 static int16_t hLumFilterPos[2000];
128 static int16_t hChrFilter[8000];
129 static int16_t hChrFilterPos[2000];
130 static int16_t vLumFilter[8000];
131 static int16_t vLumFilterPos[2000];
132 static int16_t vChrFilter[8000];
133 static int16_t vChrFilterPos[2000];
134 //FIXME just dummy vars
135 static int16_t lumMmxFilter[1];
136 static int16_t chrMmxFilter[1];
137 #endif 118 #endif
138 119
139 // clipping helper table for C implementations: 120 // clipping helper table for C implementations:
140 static unsigned char clip_table[768]; 121 static unsigned char clip_table[768];
141 122
157 static int clip_yuvtab_3343[768]; 138 static int clip_yuvtab_3343[768];
158 static int clip_yuvtab_0c92[768]; 139 static int clip_yuvtab_0c92[768];
159 static int clip_yuvtab_1a1e[768]; 140 static int clip_yuvtab_1a1e[768];
160 static int clip_yuvtab_40cf[768]; 141 static int clip_yuvtab_40cf[768];
161 142
162 static int hLumFilterSize=0; 143 //global sws_flags from the command line
163 static int hChrFilterSize=0;
164 static int vLumFilterSize=0;
165 static int vChrFilterSize=0;
166 static int vLumBufSize=0;
167 static int vChrBufSize=0;
168
169 int sws_flags=0; 144 int sws_flags=0;
170 145
171 #ifdef CAN_COMPILE_X86_ASM 146 /* cpuCaps combined from cpudetect and whats actually compiled in
172 static uint8_t funnyYCode[10000]; 147 (if there is no support for something compiled in it wont appear here) */
173 static uint8_t funnyUVCode[10000]; 148 static CpuCaps cpuCaps;
174 #endif 149
175 150 void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
176 static int canMMX2BeUsed=0; 151 int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
177 152
178 #ifdef CAN_COMPILE_X86_ASM 153 #ifdef CAN_COMPILE_X86_ASM
179 void in_asm_used_var_warning_killer() 154 void in_asm_used_var_warning_killer()
180 { 155 {
181 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+ 156 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
182 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+ 157 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
183 M24A+M24B+M24C+w02 + funnyYCode[0]+ funnyUVCode[0]+b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]; 158 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0];
184 if(i) i=0; 159 if(i) i=0;
185 } 160 }
186 #endif 161 #endif
187 162
188 static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, 163 static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
218 } 193 }
219 } 194 }
220 195
221 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, 196 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
222 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, 197 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
223 uint8_t *dest, int dstW, int dstbpp) 198 uint8_t *dest, int dstW, int dstFormat)
224 { 199 {
225 if(dstbpp==32) 200 if(dstFormat==IMGFMT_BGR32)
226 { 201 {
227 int i; 202 int i;
228 for(i=0; i<(dstW>>1); i++){ 203 for(i=0; i<(dstW>>1); i++){
229 int j; 204 int j;
230 int Y1=0; 205 int Y1=0;
258 dest[8*i+4]=clip_table[((Y2 + Cb) >>13)]; 233 dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
259 dest[8*i+5]=clip_table[((Y2 + Cg) >>13)]; 234 dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
260 dest[8*i+6]=clip_table[((Y2 + Cr) >>13)]; 235 dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
261 } 236 }
262 } 237 }
263 else if(dstbpp==24) 238 else if(dstFormat==IMGFMT_BGR24)
264 { 239 {
265 int i; 240 int i;
266 for(i=0; i<(dstW>>1); i++){ 241 for(i=0; i<(dstW>>1); i++){
267 int j; 242 int j;
268 int Y1=0; 243 int Y1=0;
297 dest[4]=clip_table[((Y2 + Cg) >>13)]; 272 dest[4]=clip_table[((Y2 + Cg) >>13)];
298 dest[5]=clip_table[((Y2 + Cr) >>13)]; 273 dest[5]=clip_table[((Y2 + Cr) >>13)];
299 dest+=6; 274 dest+=6;
300 } 275 }
301 } 276 }
302 else if(dstbpp==16) 277 else if(dstFormat==IMGFMT_BGR16)
303 { 278 {
304 int i; 279 int i;
305 for(i=0; i<(dstW>>1); i++){ 280 for(i=0; i<(dstW>>1); i++){
306 int j; 281 int j;
307 int Y1=0; 282 int Y1=0;
337 clip_table16b[(Y2 + Cb) >>13] | 312 clip_table16b[(Y2 + Cb) >>13] |
338 clip_table16g[(Y2 + Cg) >>13] | 313 clip_table16g[(Y2 + Cg) >>13] |
339 clip_table16r[(Y2 + Cr) >>13]; 314 clip_table16r[(Y2 + Cr) >>13];
340 } 315 }
341 } 316 }
342 else if(dstbpp==15) 317 else if(dstFormat==IMGFMT_BGR15)
343 { 318 {
344 int i; 319 int i;
345 for(i=0; i<(dstW>>1); i++){ 320 for(i=0; i<(dstW>>1); i++){
346 int j; 321 int j;
347 int Y1=0; 322 int Y1=0;
465 #endif //CAN_COMPILE_X86_ASM 440 #endif //CAN_COMPILE_X86_ASM
466 441
467 // minor note: the HAVE_xyz is messed up after that line so dont use it 442 // minor note: the HAVE_xyz is messed up after that line so dont use it
468 443
469 444
470 // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices: 445 // old global scaler, dont use for new code
471 // *** Note: it's called multiple times while decoding a frame, first time y==0 446 // will use sws_flags from the command line
472 // switching the cpu type during a sliced drawing can have bad effects, like sig11 447 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
473 void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int srcSliceY , 448 int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
474 int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
475 int srcW, int srcH, int dstW, int dstH){ 449 int srcW, int srcH, int dstW, int dstH){
476 450
477 #ifdef RUNTIME_CPUDETECT 451 static SwsContext *context=NULL;
478 #ifdef CAN_COMPILE_X86_ASM 452 int dstFormat;
479 // ordered per speed fasterst first 453 int flags=0;
480 if(gCpuCaps.hasMMX2) 454 static int firstTime=1;
481 SwScale_YV12slice_MMX2(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 455 int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
482 else if(gCpuCaps.has3DNow) 456
483 SwScale_YV12slice_3DNow(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 457 if(firstTime)
484 else if(gCpuCaps.hasMMX) 458 {
485 SwScale_YV12slice_MMX(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 459 flags= SWS_PRINT_INFO;
486 else 460 firstTime=0;
487 SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 461 }
488 #else 462
489 SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 463 switch(dstbpp)
490 #endif 464 {
491 #else //RUNTIME_CPUDETECT 465 case 8 : dstFormat= IMGFMT_Y8; break;
492 #ifdef HAVE_MMX2 466 case 12: dstFormat= IMGFMT_YV12; break;
493 SwScale_YV12slice_MMX2(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 467 case 15: dstFormat= IMGFMT_BGR15; break;
494 #elif defined (HAVE_3DNOW) 468 case 16: dstFormat= IMGFMT_BGR16; break;
495 SwScale_YV12slice_3DNow(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 469 case 24: dstFormat= IMGFMT_BGR24; break;
496 #elif defined (HAVE_MMX) 470 case 32: dstFormat= IMGFMT_BGR32; break;
497 SwScale_YV12slice_MMX(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 471 default: return;
498 #else 472 }
499 SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH); 473
500 #endif 474 switch(sws_flags)
501 #endif //!RUNTIME_CPUDETECT 475 {
502 476 case 0: flags|= SWS_FAST_BILINEAR; break;
503 } 477 case 1: flags|= SWS_BILINEAR; break;
504 478 case 2: flags|= SWS_BICUBIC; break;
479 case 3: flags|= SWS_X; break;
480 default:flags|= SWS_BILINEAR; break;
481 }
482
483 if(!context) context=getSwsContext(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat, flags, NULL, NULL);
484
485
486 swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
487 }
488
/**
 * Builds one 1-D scaling filter table.
 *
 * For every output sample i (0 <= i < dstW) this writes
 *   - filterPos[i]: index of the first source sample the filter row reads, and
 *   - *filterSize coefficients at dstFilter[i*(*filterSize) ...].
 * Coefficients are first computed as doubles in a local scratch array and then
 * scaled so that each row sums to roughly `one` before being truncated to int.
 *
 * @param dstFilter   receives dstW * (*filterSize) integer coefficients
 * @param filterPos   receives dstW start positions, clamped to [0, srcW - *filterSize]
 * @param filterSize  out: taps per output sample, rounded up with the filterAlign mask
 * @param xInc        source step per output sample; 0x10000 is treated as "unscaled",
 *                    i.e. a 16.16 fixed point ratio
 * @param srcW        number of source samples
 * @param dstW        number of output samples
 * @param filterAlign row length granularity; the mask arithmetic below is only a
 *                    round-up for powers of two (callers in this file pass 1 or 4)
 * @param one         target sum of each coefficient row
 * @param flags       SWS_* flags; SWS_FAST_BILINEAR, SWS_BICUBIC and SWS_X are tested here
 *
 * NOTE(review): rows are written to filter[i*(*filterSize)+j] for all i < dstW, but
 * nothing in this function checks dstW*(*filterSize) against the 10000-element
 * scratch array -- confirm the callers' size limits keep this in range.
 */
489 static inline void initFilter(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
490 int srcW, int dstW, int filterAlign, int one, int flags)
491 {
492 int i;
// scratch coefficients as doubles, one row of *filterSize entries per output sample
493 double filter[10000];
494 #ifdef ARCH_X86
495 if(gCpuCaps.hasMMX)
496 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
497 #endif
498
// Case 1: step is within 10 units of 1.0 -> each output sample copies source sample i
499 if(ABS(xInc - 0x10000) <10) // unscaled
500 {
501 int i;
502 *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normaly
503 for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;
504
505 for(i=0; i<dstW; i++)
506 {
507 filter[i*(*filterSize)]=1;
508 filterPos[i]=i;
509 }
510
511 }
// Case 2: upscaling; note this branch is also taken for any xInc when SWS_FAST_BILINEAR is set
512 else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
513 {
514 int i;
515 int xDstInSrc;
516 if (flags&SWS_BICUBIC) *filterSize= 4;
517 else if(flags&SWS_X ) *filterSize= 4;
518 else *filterSize= 2;
519 // printf("%d %d %d\n", filterSize, srcW, dstW);
520 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
521
// position of output sample 0 in source coordinates (16.16), advanced by xInc per sample
522 xDstInSrc= xInc/2 - 0x8000;
523 for(i=0; i<dstW; i++)
524 {
525 int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
526 int j;
527
528 filterPos[i]= xx;
529 if((flags & SWS_BICUBIC) || (flags & SWS_X))
530 {
// d: distance between source sample xx+1 and the output position, in source samples
531 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
532 double y1,y2,y3,y4;
533 double A= -0.6;
534 if(flags & SWS_BICUBIC){
535 // Equation is from VirtualDub
536 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
537 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
538 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
539 y4 = ( + A*d*d - A*d*d*d);
540 }else{
541 // cubic interpolation (derived it myself)
542 y1 = ( -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
543 y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
544 y3 = ( +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
545 y4 = ( -1.0*d + 1.0*d*d*d)/6.0;
546 }
547
548 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
// only the first four taps of the row are written here
549 filter[i*(*filterSize) + 0]= y1;
550 filter[i*(*filterSize) + 1]= y2;
551 filter[i*(*filterSize) + 2]= y3;
552 filter[i*(*filterSize) + 3]= y4;
553 // printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
554 }
555 else
556 {
// linear (triangle) weights: 1 - distance, clamped at 0
557 for(j=0; j<*filterSize; j++)
558 {
559 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
560 double coeff= 1.0 - d;
561 if(coeff<0) coeff=0;
562 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
563 filter[i*(*filterSize) + j]= coeff;
564 xx++;
565 }
566 }
567 xDstInSrc+= xInc;
568 }
569 }
// Case 3: downscaling; the tap count grows with the srcW/dstW ratio
570 else // downscale
571 {
572 int xDstInSrc;
573 if(flags&SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
574 else if(flags&SWS_X) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
575 else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
576 // printf("%d %d %d\n", *filterSize, srcW, dstW);
577 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
578
579 xDstInSrc= xInc/2 - 0x8000;
580 for(i=0; i<dstW; i++)
581 {
582 int xx= (int)((double)xDstInSrc/(double)(1<<16) - ((*filterSize)-1)*0.5 + 0.5);
583 int j;
584 filterPos[i]= xx;
585 for(j=0; j<*filterSize; j++)
586 {
// distance is measured in units of xInc here, i.e. in output samples
587 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
588 double coeff;
589 if((flags & SWS_BICUBIC) || (flags & SWS_X))
590 {
591 double A= -0.75;
592 // d*=2;
593 // Equation is from VirtualDub
594 if(d<1.0)
595 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
596 else if(d<2.0)
597 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
598 else
599 coeff=0.0;
600 }
601 /* else if(flags & SWS_X)
602 {
603 }*/
604 else
605 {
606 coeff= 1.0 - d;
607 if(coeff<0) coeff=0;
608 }
609 // printf("%1.3f %d %d \n", coeff, (int)d, xDstInSrc);
610 filter[i*(*filterSize) + j]= coeff;
611 xx++;
612 }
613 xDstInSrc+= xInc;
614 }
615 }
616
// Rows that would read before sample 0 or past sample srcW-1 are moved inside the
// source range; the weight of each tap is added to the tap it lands on after the move.
617 //fix borders
618 for(i=0; i<dstW; i++)
619 {
620 int j;
621 if(filterPos[i] < 0)
622 {
623 // Move filter coeffs left to compensate for filterPos
624 for(j=1; j<*filterSize; j++)
625 {
626 int left= MAX(j + filterPos[i], 0);
627 filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
628 filter[i*(*filterSize) + j]=0;
629 }
630 filterPos[i]= 0;
631 }
632
633 if(filterPos[i] + (*filterSize) > srcW)
634 {
635 int shift= filterPos[i] + (*filterSize) - srcW;
636 // Move filter coeffs right to compensate for filterPos
637 for(j=(*filterSize)-2; j>=0; j--)
638 {
639 int right= MIN(j + shift, (*filterSize)-1);
640 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
641 filter[i*(*filterSize) +j]=0;
642 }
643 filterPos[i]= srcW - (*filterSize);
644 }
645 }
646
647 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end
648 // and skip these than later
649
// Scale every row so its coefficients sum to `one`, truncating each to an integer.
// NOTE(review): a row whose coefficients sum to 0 divides by zero here -- confirm
// that cannot happen for the inputs the callers pass.
650 //Normalize
651 for(i=0; i<dstW; i++)
652 {
653 int j;
654 double sum=0;
655 double scale= one;
656 for(j=0; j<*filterSize; j++)
657 {
658 sum+= filter[i*(*filterSize) + j];
659 }
660 scale/= sum;
661 for(j=0; j<*filterSize; j++)
662 {
663 dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
664 }
665 }
666 }
667
668 #ifdef ARCH_X86
/**
 * Generates horizontal scaling code at run time into funnyCode.
 *
 * The asm statement below does not execute the code fragment (it jumps over it to
 * label 9); it only obtains
 *   - fragment:        address of label 0, the start of the fragment,
 *   - imm8OfPShufW1/2: offset, relative to label 0, of the byte directly before
 *                      labels 1 and 2, i.e. the last byte of each of the two
 *                      pshufw instructions (their imm8 operand),
 *   - fragmentLength:  distance from label 0 to label 9.
 * The loop then copies the fragment into funnyCode once for every fourth value of i,
 * patches both pshufw immediates with the per-sample source offsets, and writes a
 * RET opcode after the most recently written copy.
 *
 * @param dstW       output width; the loop runs dstW/8 times
 * @param xInc       source step per iteration, 16.16 fixed point (xpos>>16 is the
 *                   integer source position)
 * @param funnyCode  destination buffer for the generated code; its required size is
 *                   not checked here
 */
669 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
670 {
671 uint8_t *fragment;
672 int imm8OfPShufW1;
673 int imm8OfPShufW2;
674 int fragmentLength;
675
676 int xpos, i;
677
678 // create an optimized horizontal scaling routine
679
680 //code fragment
681
682 asm volatile(
683 "jmp 9f \n\t"
684 // Begin
685 "0: \n\t"
686 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
687 "movq %%mm0, %%mm1 \n\t"
688 "psrlq $8, %%mm0 \n\t"
689 "punpcklbw %%mm7, %%mm1 \n\t"
690 "movq %%mm2, %%mm3 \n\t"
691 "punpcklbw %%mm7, %%mm0 \n\t"
692 "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
693 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
694 "1: \n\t"
695 "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
696 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
697 "2: \n\t"
698 "psrlw $9, %%mm3 \n\t"
699 "psubw %%mm1, %%mm0 \n\t"
700 "pmullw %%mm3, %%mm0 \n\t"
701 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
702 "psllw $7, %%mm1 \n\t"
703 "paddw %%mm1, %%mm0 \n\t"
704
705 "movq %%mm0, (%%edi, %%eax) \n\t"
706
707 "addl $8, %%eax \n\t"
708 // End
709 "9: \n\t"
710 // "int $3\n\t"
711 "leal 0b, %0 \n\t"
712 "leal 1b, %1 \n\t"
713 "leal 2b, %2 \n\t"
714 "decl %1 \n\t"
715 "decl %2 \n\t"
716 "subl %0, %1 \n\t"
717 "subl %0, %2 \n\t"
718 "leal 9b, %3 \n\t"
719 "subl %0, %3 \n\t"
720 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
721 "=r" (fragmentLength)
722 );
723
724 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
725
726 for(i=0; i<dstW/8; i++)
727 {
728 int xx=xpos>>16;
729
// one fragment copy is emitted for every 4th i, at byte offset fragmentLength*i/4
730 if((i&3) == 0)
731 {
// a..d: integer source offsets of this and the next three steps relative to xx;
// they are packed two bits each into the pshufw immediate below
732 int a=0;
733 int b=((xpos+xInc)>>16) - xx;
734 int c=((xpos+xInc*2)>>16) - xx;
735 int d=((xpos+xInc*3)>>16) - xx;
736
737 memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
738
739 funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
740 funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
741 a | (b<<2) | (c<<4) | (d<<6);
742
743 // if we dont need to read 8 bytes than dont :), reduces the chance of
744 // crossing a cache line
// NOTE(review): overwrites the second byte of the copied fragment; presumably this
// changes the opcode of the leading movq load into a 4-byte load -- confirm against
// the instruction encoding
745 if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
746
// terminate the code generated so far; the next copy (if any) overwrites this byte
747 funnyCode[fragmentLength*(i+4)/4]= RET;
748 }
749 xpos+=xInc;
750 }
751 }
752 #endif // ARCH_X86
753
754 //FIXME remove
505 void SwScale_Init(){ 755 void SwScale_Init(){
756 }
757
758 static void globalInit(){
506 // generating tables: 759 // generating tables:
507 int i; 760 int i;
508 for(i=0; i<768; i++){ 761 for(i=0; i<768; i++){
509 int c= MIN(MAX(i-256, 0), 255); 762 int c= MIN(MAX(i-256, 0), 255);
510 clip_table[i]=c; 763 clip_table[i]=c;
515 yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128); 768 yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
516 } 769 }
517 770
518 for(i=0; i<768; i++) 771 for(i=0; i<768; i++)
519 { 772 {
520 int v= clip_table[i]; 773 int v= clip_table[i];
521 clip_table16b[i]= v>>3; 774 clip_table16b[i]= v>>3;
522 clip_table16g[i]= (v<<3)&0x07E0; 775 clip_table16g[i]= (v<<3)&0x07E0;
523 clip_table16r[i]= (v<<8)&0xF800; 776 clip_table16r[i]= (v<<8)&0xF800;
524 clip_table15b[i]= v>>3; 777 clip_table15b[i]= v>>3;
525 clip_table15g[i]= (v<<2)&0x03E0; 778 clip_table15g[i]= (v<<2)&0x03E0;
526 clip_table15r[i]= (v<<7)&0x7C00; 779 clip_table15r[i]= (v<<7)&0x7C00;
527 } 780 }
528 781
529 } 782 cpuCaps= gCpuCaps;
530 783
784 #ifdef RUNTIME_CPUDETECT
785 #ifdef CAN_COMPILE_X86_ASM
786 // ordered per speed fasterst first
787 if(gCpuCaps.hasMMX2)
788 swScale= swScale_MMX2;
789 else if(gCpuCaps.has3DNow)
790 swScale= swScale_3DNOW;
791 else if(gCpuCaps.hasMMX)
792 swScale= swScale_MMX;
793 else
794 swScale= swScale_C;
795
796 #else
797 swScale= swScale_C;
798 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
799 #endif
800 #else //RUNTIME_CPUDETECT
801 #ifdef HAVE_MMX2
802 swScale= swScale_MMX2;
803 cpuCaps.has3DNow = 0;
804 #elif defined (HAVE_3DNOW)
805 swScale= swScale_3DNOW;
806 cpuCaps.hasMMX2 = 0;
807 #elif defined (HAVE_MMX)
808 swScale= swScale_MMX;
809 cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
810 #else
811 swScale= swScale_C;
812 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
813 #endif
814 #endif //!RUNTIME_CPUDETECT
815 }
816
817
/**
 * Allocates and initializes a scaler context for converting a srcW x srcH image
 * into a dstW x dstH image.
 *
 * On the first call (swScale still NULL) globalInit() is run first. The function
 * then derives the horizontal/vertical step sizes, builds the four filter tables
 * with initFilter(), optionally generates the MMX2 horizontal scaler code, sizes and
 * allocates the line buffers, packs the vertical filters for the MMX code and, if
 * requested, prints a description of the chosen configuration.
 *
 * @param srcW,srcH   source size; each must be >= 1 and < SWS_MAX_SIZE
 * @param srcFormat   stored in the context; not otherwise examined here
 * @param dstW,dstH   destination size; each must be >= 1 and < SWS_MAX_SIZE
 * @param dstFormat   IMGFMT_* of the output; selects chroma subsampling of the output
 * @param flags       SWS_* flags (SWS_FAST_BILINEAR, SWS_FULL_UV_IPOL, SWS_PRINT_INFO, ...)
 * @param srcFilter   not referenced in this function body
 * @param dstFilter   not referenced in this function body
 * @return the new context, or NULL if a size check fails. Nothing in this function
 *         frees the context or its line buffers.
 *
 * NOTE(review): none of the memalign() results below are checked for NULL before use.
 */
818 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
819 SwsFilter *srcFilter, SwsFilter *dstFilter){
820
// only referenced from the disabled FIXME block below
821 const int widthAlign= dstFormat==IMGFMT_YV12 ? 16 : 8;
822 SwsContext *c;
823 int i;
824 //const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
825 //const int over= dstFormat==IMGFMT_YV12 ? (((dstW+15)&(~15))) - dststride
826 // : (((dstW+7)&(~7)))*bytespp - dststride;
// lazy one-time setup of the lookup tables, cpuCaps and the swScale function pointer
827 if(swScale==NULL) globalInit();
828
829 /* sanity check */
830 if(srcW<1 || srcH<1 || dstW<1 || dstH<1) return NULL;
831 if(srcW>=SWS_MAX_SIZE || dstW>=SWS_MAX_SIZE || srcH>=SWS_MAX_SIZE || dstH>=SWS_MAX_SIZE)
832 {
833 fprintf(stderr, "size is too large, increase SWS_MAX_SIZE\n");
834 return NULL;
835 }
836
837 /* FIXME
838 if(dstStride[0]%widthAlign !=0 )
839 {
840 if(flags & SWS_PRINT_INFO)
841 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
842 "SwScaler: ->cannot do aligned memory acesses anymore\n",
843 widthAlign);
844 }
845 */
// NOTE(review): c is dereferenced immediately; a failed allocation is not handled
846 c= memalign(64, sizeof(SwsContext));
847
848 c->srcW= srcW;
849 c->srcH= srcH;
850 c->dstW= dstW;
851 c->dstH= dstH;
// source step per output sample in 16.16 fixed point, with 1<<15 added before dividing
852 c->lumXInc= ((srcW<<16) + (1<<15))/dstW;
853 c->lumYInc= ((srcH<<16) + (1<<15))/dstH;
854 c->flags= flags;
855 c->dstFormat= dstFormat;
856 c->srcFormat= srcFormat;
857
// the MMX2 horizontal scaler is only enabled when not downscaling horizontally and
// dstW is a multiple of 32 and srcW a multiple of 16
858 if(cpuCaps.hasMMX2)
859 {
860 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
861 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
862 {
863 if(flags&SWS_PRINT_INFO)
864 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
865 }
866 }
867 else
868 c->canMMX2BeUsed=0;
869
870 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
871 // but only for the FAST_BILINEAR mode otherwise do correct scaling
872 // n-2 is the last chrominance sample available
873 // this is not perfect, but no one should notice the difference, the more correct variant
874 // would be like the vertical one, but that would require some special code for the
875 // first and last pixel
876 if(flags&SWS_FAST_BILINEAR)
877 {
878 if(c->canMMX2BeUsed) c->lumXInc+= 20;
879 //we dont use the x86asm scaler if mmx is available
// NOTE(review): divides by dstW-2; dstW==2 is not excluded by the checks above
880 else if(cpuCaps.hasMMX) c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
881 }
882
883 /* set chrXInc & chrDstW */
// full-width chroma for non-YV12 output when SWS_FULL_UV_IPOL is set, otherwise half width
884 if((flags&SWS_FULL_UV_IPOL) && dstFormat!=IMGFMT_YV12)
885 c->chrXInc= c->lumXInc>>1, c->chrDstW= dstW;
886 else
887 c->chrXInc= c->lumXInc, c->chrDstW= (dstW+1)>>1;
888
889 /* set chrYInc & chrDstH */
// YV12 output keeps half-height chroma; other outputs get one chroma line per output line
890 if(dstFormat==IMGFMT_YV12) c->chrYInc= c->lumYInc, c->chrDstH= (dstH+1)>>1;
891 else c->chrYInc= c->lumYInc>>1, c->chrDstH= dstH;
892
893 /* precalculate horizontal scaler filter coefficients */
894 {
// rows are padded to 4 taps when MMX is available
895 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
896
// horizontal filter rows are normalized to sum to 1<<14; chroma source width is (srcW+1)>>1
897 initFilter(c->hLumFilter, c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
898 srcW , dstW, filterAlign, 1<<14, flags);
899 initFilter(c->hChrFilter, c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
900 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags);
901
902 #ifdef ARCH_X86
903 // cant downscale !!!
904 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
905 {
906 initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode);
907 initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
908 }
909 #endif
910 } // Init Horizontal stuff
911
912
913
914 /* precalculate vertical scaler filter coefficients */
// vertical filter rows are normalized to sum to (1<<12)-4 and are not padded (align 1)
915 initFilter(c->vLumFilter, c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
916 srcH , dstH, 1, (1<<12)-4, flags);
917 initFilter(c->vChrFilter, c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
918 (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags);
919
920 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
// start from the filter sizes and grow each buffer count until, for every output line,
// the lines from its filter start up to the (even-aligned) last needed source line fit
921 c->vLumBufSize= c->vLumFilterSize;
922 c->vChrBufSize= c->vChrFilterSize;
923 for(i=0; i<dstH; i++)
924 {
925 int chrI= i*c->chrDstH / dstH;
926 int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
927 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
928 nextSlice&= ~1; // Slices start at even boundaries
929 if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
930 c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
931 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
932 c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
933 }
934
935 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
936 // allocate several megabytes to handle all possible cases)
// each line buffer pointer is stored twice, at index i and at i + buffer count;
// luma lines are 4000 bytes, chroma lines 8000 bytes
// NOTE(review): allocation results are not checked before the memset calls below
937 for(i=0; i<c->vLumBufSize; i++)
938 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
939 for(i=0; i<c->vChrBufSize; i++)
940 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
941
942 //try to avoid drawing green stuff between the right end and the stride end
943 for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
944 for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
945
// these checks compile to nothing unless MP_DEBUG is defined
946 ASSERT(c->chrDstH <= dstH)
947 ASSERT(c->vLumFilterSize* dstH*4 <= SWS_MAX_SIZE*20)
948 ASSERT(c->vChrFilterSize*c->chrDstH*4 <= SWS_MAX_SIZE*20)
949
950 // pack filter data for mmx code
// every vertical coefficient is replicated into four consecutive 16-bit entries
951 if(cpuCaps.hasMMX)
952 {
953 for(i=0; i<c->vLumFilterSize*dstH; i++)
954 c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
955 c->vLumFilter[i];
956 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
957 c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
958 c->vChrFilter[i];
959 }
960
// one-line summary on stderr: scaler type, output format, instruction set
961 if(flags&SWS_PRINT_INFO)
962 {
963 #ifdef DITHER1XBPP
964 char *dither= cpuCaps.hasMMX ? " dithered" : "";
965 #endif
// NOTE(review): there is no branch for SWS_X here, so a context created with only
// SWS_X set reaches the "flags invalid" message -- confirm whether that is intended
966 if(flags&SWS_FAST_BILINEAR)
967 fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
968 else if(flags&SWS_BILINEAR)
969 fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
970 else if(flags&SWS_BICUBIC)
971 fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
972 else
973 fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
974
975 if(dstFormat==IMGFMT_BGR15)
976 fprintf(stderr, "with%s BGR15 output ", dither);
977 else if(dstFormat==IMGFMT_BGR16)
978 fprintf(stderr, "with%s BGR16 output ", dither);
979 else if(dstFormat==IMGFMT_BGR24)
980 fprintf(stderr, "with BGR24 output ");
981 else if(dstFormat==IMGFMT_BGR32)
982 fprintf(stderr, "with BGR32 output ");
983 else if(dstFormat==IMGFMT_YV12)
984 fprintf(stderr, "with YV12 output ");
985 else
986 fprintf(stderr, "without output ");
987
988 if(cpuCaps.hasMMX2)
989 fprintf(stderr, "using MMX2\n");
990 else if(cpuCaps.has3DNow)
991 fprintf(stderr, "using 3DNOW\n");
992 else if(cpuCaps.hasMMX)
993 fprintf(stderr, "using MMX\n");
994 else
995 fprintf(stderr, "using C\n");
996 }
997
// detailed per-stage report on stdout, only when the global verbose flag is set too
998 if((flags & SWS_PRINT_INFO) && verbose)
999 {
1000 if(cpuCaps.hasMMX)
1001 {
1002 if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1003 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1004 else
1005 {
1006 if(c->hLumFilterSize==4)
1007 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1008 else if(c->hLumFilterSize==8)
1009 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1010 else
1011 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1012
1013 if(c->hChrFilterSize==4)
1014 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1015 else if(c->hChrFilterSize==8)
1016 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1017 else
1018 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1019 }
1020 }
1021 else
1022 {
1023 #ifdef ARCH_X86
1024 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1025 #else
1026 if(flags & SWS_FAST_BILINEAR)
1027 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1028 else
1029 printf("SwScaler: using C scaler for horizontal scaling\n");
1030 #endif
1031 }
1032 if(dstFormat==IMGFMT_YV12)
1033 {
1034 if(c->vLumFilterSize==1)
1035 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", cpuCaps.hasMMX ? "MMX" : "C");
1036 else
1037 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", cpuCaps.hasMMX ? "MMX" : "C");
1038 }
1039 else
1040 {
1041 if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1042 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1043 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1044 else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1045 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1046 else
1047 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1048 }
1049
1050 if(dstFormat==IMGFMT_BGR24)
1051 printf("SwScaler: using %s YV12->BGR24 Converter\n",
1052 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1053 else
1054 printf("SwScaler: using %s YV12->BGR Converter\n", cpuCaps.hasMMX ? "MMX" : "C");//FIXME print format
1055
1056 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1057 }
1058
1059 return c;
1060 }
1061
1062 /**
1063 * returns a normalized gaussian curve used to filter stuff
1064 * quality=3 is high quality, lowwer is lowwer quality
1065 */
1066 double *getGaussian(double variance, double quality){
1067 const int length= (int)(variance*quality + 0.5) | 1;
1068 int i;
1069 double *coeff= memalign(sizeof(double), length*sizeof(double));
1070 double middle= (length-1)*0.5;
1071
1072 for(i=0; i<length; i++)
1073 {
1074 double dist= i-middle;
1075 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1076 }
1077
1078 normalize(coeff, length, 1.0);
1079 return coeff;
1080 }
1081
/**
 * Scales the coefficients in place so that they sum to `height`.
 *
 * @param coeff  array of `length` coefficients, modified in place
 * @param length number of entries in coeff
 * @param height desired sum of all coefficients after scaling
 *
 * If the coefficients sum to exactly 0 no scale factor exists and the array is
 * left unchanged.
 */
void normalize(double *coeff, int length, double height){
	int i;
	double sum=0;
	double inv;

	for(i=0; i<length; i++)
		sum+= coeff[i];

	if(sum==0.0) return;

	inv= height/sum;

	// multiply by the computed factor; the old code multiplied by `height`
	// itself, which left the sum unnormalized
	for(i=0; i<length; i++)
		coeff[i]*= inv;
}
1095
/**
 * Computes the discrete convolution of two coefficient arrays.
 *
 * @param a       first input, aLength entries
 * @param aLength number of entries in a
 * @param b       second input, bLength entries
 * @param bLength number of entries in b
 * @return buffer of aLength + bLength - 1 coefficients obtained from memalign()
 *         and handed to the caller (this function never frees it), or NULL if
 *         the allocation fails
 */
double *conv(double *a, int aLength, double *b, int bLength){
	int length= aLength + bLength - 1;
	double *coeff= memalign(sizeof(double), length*sizeof(double));
	int i, j;

	// the original wrote to coeff unconditionally; bail out on allocation failure
	if(coeff==NULL) return NULL;

	for(i=0; i<length; i++) coeff[i]= 0.0;

	// every product a[i]*b[j] contributes to output sample i+j
	for(i=0; i<aLength; i++)
	{
		for(j=0; j<bLength; j++)
		{
			coeff[i+j]+= a[i]*b[j];
		}
	}

	return coeff;
}
1113
1114 /*
1115 double *sum(double *a, int aLength, double *b, int bLength){
1116 int length= MAX(aLength, bLength);
1117 double *coeff= memalign(sizeof(double), length*sizeof(double));
1118 int i;
1119
1120 for(i=0; i<length; i++) coeff[i]= 0.0;
1121
1122 for(i=0; i<aLength; i++) coeff[i]+= a[i];
1123 }
1124 */