annotate postproc/swscale.c @ 3152:54710806be56

runtime cpu detection optional (compiles faster)
author michael
date Tue, 27 Nov 2001 01:19:56 +0000
parents b196b915fdc4
children 0b172eb639f1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
1
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
2 // Software scaling and colorspace conversion routines for MPlayer
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
3
2269
95c48204bcd9 (C) fixed
arpi
parents: 2267
diff changeset
4 // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at)
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
6 // the parts written by michael are under GNU GPL
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
7
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
8 #include <inttypes.h>
2476
a6c5a537f30a a few warning fixes (missing #include's)
pl
parents: 2469
diff changeset
9 #include <string.h>
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
10 #include "../config.h"
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
11 #include "swscale.h"
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
12 #include "../cpudetect.h"
2540
f2e70944d02a fixed a warning
michael
parents: 2534
diff changeset
13 #undef MOVNTQ
2680
e8a534509557 green line fix for dstw%8!=0
michael
parents: 2671
diff changeset
14 #undef PAVGB
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
15
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
16 //#undef HAVE_MMX2
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
17 //#undef HAVE_MMX
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
18 //#undef ARCH_X86
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
19 #define DITHER1XBPP
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
20 int fullUVIpol=0;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
21 //disables the unscaled height version
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
22 int allwaysIpol=0;
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
23
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
24 #define RET 0xC3 //near return opcode
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
25 /*
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
26 NOTES
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
27
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
28 known BUGS with known cause (no bugreports please!, but patches are welcome :) )
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
29 horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
30
2326
7d3542955132 BGR24 bugfix
michael
parents: 2316
diff changeset
31 Supported output formats BGR15 BGR16 BGR24 BGR32
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
32 BGR15 & BGR16 MMX versions support dithering
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
33 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
34
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
35 TODO
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
36 more intelligent misalignment avoidance for the horizontal scaler
2566
a350d8bed636 bugfixes
michael
parents: 2540
diff changeset
37 bicubic scaler
2585
bd52b78f12dc c speedup
michael
parents: 2584
diff changeset
38 dither in C
bd52b78f12dc c speedup
michael
parents: 2584
diff changeset
39 change the distance of the u & v buffer
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
40 how to differentiate between x86 and C at runtime?! (using C for now)
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
41 */
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
42
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
43 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
2469
03abc2743ed6 downscale
michael
parents: 2326
diff changeset
44 #define MIN(a,b) ((a) > (b) ? (b) : (a))
03abc2743ed6 downscale
michael
parents: 2326
diff changeset
45 #define MAX(a,b) ((a) < (b) ? (b) : (a))
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
46
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
47 #ifdef ARCH_X86
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
48 #define CAN_COMPILE_X86_ASM
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
49 #endif
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
50
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
51 #ifdef CAN_COMPILE_X86_ASM
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
52 static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
2503
d21d8d5f2e23 yuv2rgb bugfix
michael
parents: 2476
diff changeset
53 static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
d21d8d5f2e23 yuv2rgb bugfix
michael
parents: 2476
diff changeset
54 static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
d21d8d5f2e23 yuv2rgb bugfix
michael
parents: 2476
diff changeset
55 static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
d21d8d5f2e23 yuv2rgb bugfix
michael
parents: 2476
diff changeset
56 static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
2669
476b9b3b91be faster bgr15/16
michael
parents: 2638
diff changeset
57 static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
476b9b3b91be faster bgr15/16
michael
parents: 2638
diff changeset
58 static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
59 static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
60 static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
61 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
62 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
63 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
64 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
65
2750
9ef09e232505 gcc does optimize writes to non volatile variables away if it didnt know that they were read in between
michael
parents: 2748
diff changeset
66 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
9ef09e232505 gcc does optimize writes to non volatile variables away if it didnt know that they were read in between
michael
parents: 2748
diff changeset
67 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
9ef09e232505 gcc does optimize writes to non volatile variables away if it didnt know that they were read in between
michael
parents: 2748
diff changeset
68 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
9ef09e232505 gcc does optimize writes to non volatile variables away if it didnt know that they were read in between
michael
parents: 2748
diff changeset
69 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
2748
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
70
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
71 static uint64_t __attribute__((aligned(8))) dither4[2]={
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
72 0x0103010301030103LL,
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
73 0x0200020002000200LL,};
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
74
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
75 static uint64_t __attribute__((aligned(8))) dither8[2]={
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
76 0x0602060206020602LL,
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
77 0x0004000400040004LL,};
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
78
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
79 static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
80 static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
81 static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
82 static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
83 static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
84 static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
85
2730
c483fc9bf0c4 faster bgr24 output
michael
parents: 2728
diff changeset
86 static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
c483fc9bf0c4 faster bgr24 output
michael
parents: 2728
diff changeset
87 static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
c483fc9bf0c4 faster bgr24 output
michael
parents: 2728
diff changeset
88 static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
c483fc9bf0c4 faster bgr24 output
michael
parents: 2728
diff changeset
89
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
90 static uint64_t __attribute__((aligned(8))) temp0;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
91 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
92 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
93
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
94 // temporary storage for 4 yuv lines:
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
95 // 16bit for now (mmx likes it more compact)
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
96 static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
97 static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
98 #else
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
99 static uint16_t pix_buf_y[4][2048];
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
100 static uint16_t pix_buf_uv[2][2048*2];
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
101 #endif
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
102
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
103 // clipping helper table for C implementations:
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
104 static unsigned char clip_table[768];
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
105
2584
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
106 static unsigned short clip_table16b[768];
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
107 static unsigned short clip_table16g[768];
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
108 static unsigned short clip_table16r[768];
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
109 static unsigned short clip_table15b[768];
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
110 static unsigned short clip_table15g[768];
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
111 static unsigned short clip_table15r[768];
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
112
2264
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
113 // yuv->rgb conversion tables:
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
114 static int yuvtab_2568[256];
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
115 static int yuvtab_3343[256];
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
116 static int yuvtab_0c92[256];
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
117 static int yuvtab_1a1e[256];
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
118 static int yuvtab_40cf[256];
7851375ea156 increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents: 2237
diff changeset
119
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
120 #ifdef CAN_COMPILE_X86_ASM
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
121 static uint8_t funnyYCode[10000];
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
122 static uint8_t funnyUVCode[10000];
2671
555cb027c7a7 fixed warnings
michael
parents: 2669
diff changeset
123 #endif
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
124
2469
03abc2743ed6 downscale
michael
parents: 2326
diff changeset
125 static int canMMX2BeUsed=0;
03abc2743ed6 downscale
michael
parents: 2326
diff changeset
126
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
127 #ifdef CAN_COMPILE_X86_ASM
2671
555cb027c7a7 fixed warnings
michael
parents: 2669
diff changeset
128 void in_asm_used_var_warning_killer()
555cb027c7a7 fixed warnings
michael
parents: 2669
diff changeset
129 {
555cb027c7a7 fixed warnings
michael
parents: 2669
diff changeset
130 int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
2748
01dbf100b4f8 better dithering
michael
parents: 2730
diff changeset
131 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
2730
c483fc9bf0c4 faster bgr24 output
michael
parents: 2728
diff changeset
132 M24A+M24B+M24C;
2671
555cb027c7a7 fixed warnings
michael
parents: 2669
diff changeset
133 if(i) i=0;
555cb027c7a7 fixed warnings
michael
parents: 2669
diff changeset
134 }
555cb027c7a7 fixed warnings
michael
parents: 2669
diff changeset
135 #endif
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
136
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
137 //Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
138 //Plain C versions
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
139 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
140 #define COMPILE_C
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
141 #endif
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
142
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
143 #ifdef CAN_COMPILE_X86_ASM
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
144
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
145 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
146 #define COMPILE_MMX
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
147 #endif
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
148
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
149 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
150 #define COMPILE_MMX2
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
151 #endif
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
153 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
154 #define COMPILE_3DNOW
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
155 #endif
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
156 #endif //CAN_COMPILE_X86_ASM
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
157
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
158 #undef HAVE_MMX
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
159 #undef HAVE_MMX2
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
160 #undef HAVE_3DNOW
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
161 #undef ARCH_X86
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
162
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
163 #ifdef COMPILE_C
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
164 #undef HAVE_MMX
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
165 #undef HAVE_MMX2
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
166 #undef HAVE_3DNOW
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
167 #undef ARCH_X86
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
168 #define RENAME(a) a ## _C
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
169 #include "swscale_template.c"
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
170 #endif
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
171
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
172 #ifdef CAN_COMPILE_X86_ASM
2576
437ed06579d8 c optimizations
michael
parents: 2575
diff changeset
173
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
174 //X86 versions
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
175 /*
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
176 #undef RENAME
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
177 #undef HAVE_MMX
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
178 #undef HAVE_MMX2
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
179 #undef HAVE_3DNOW
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
180 #define ARCH_X86
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
181 #define RENAME(a) a ## _X86
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
182 #include "swscale_template.c"
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
183 */
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
184 //MMX versions
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
185 #ifdef COMPILE_MMX
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
186 #undef RENAME
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
187 #define HAVE_MMX
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
188 #undef HAVE_MMX2
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
189 #undef HAVE_3DNOW
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
190 #define ARCH_X86
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
191 #define RENAME(a) a ## _MMX
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
192 #include "swscale_template.c"
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
193 #endif
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
194
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
195 //MMX2 versions
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
196 #ifdef COMPILE_MMX2
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
197 #undef RENAME
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
198 #define HAVE_MMX
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
199 #define HAVE_MMX2
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
200 #undef HAVE_3DNOW
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
201 #define ARCH_X86
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
202 #define RENAME(a) a ## _MMX2
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
203 #include "swscale_template.c"
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
204 #endif
2469
03abc2743ed6 downscale
michael
parents: 2326
diff changeset
205
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
206 //3DNOW versions
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
207 #ifdef COMPILE_3DNOW
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
208 #undef RENAME
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
209 #define HAVE_MMX
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
210 #undef HAVE_MMX2
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
211 #define HAVE_3DNOW
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
212 #define ARCH_X86
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
213 #define RENAME(a) a ## _3DNow
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
214 #include "swscale_template.c"
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
215 #endif
2469
03abc2743ed6 downscale
michael
parents: 2326
diff changeset
216
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
217 #endif //CAN_COMPILE_X86_ASM
2469
03abc2743ed6 downscale
michael
parents: 2326
diff changeset
218
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
219 // minor note: the HAVE_xyz macros are messed up after this point, so don't rely on them
2316
bcb229557e9b fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents: 2297
diff changeset
220
2232
65996b3467d7 MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents: 2230
diff changeset
221
2519
6f3fa9bc3b27 yv12 to yv12 scaler
michael
parents: 2503
diff changeset
222 // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
223 // *** Note: it's called multiple times while decoding a frame, first time y==0
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
224 // *** Designed to upscale, but may work for downscale too.
2274
9ee34c6950e0 xinc scaled by 16 instead of 8
arpi
parents: 2271
diff changeset
225 // s_xinc = (src_width << 16) / dst_width
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
226 // s_yinc = (src_height << 16) / dst_height
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
227 // switching the cpu type during a sliced drawing can have bad effects, like sig11
2519
6f3fa9bc3b27 yv12 to yv12 scaler
michael
parents: 2503
diff changeset
228 void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
6f3fa9bc3b27 yv12 to yv12 scaler
michael
parents: 2503
diff changeset
229 uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
230 unsigned int s_xinc,unsigned int s_yinc){
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
231
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
232 // scaling factors:
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
233 //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
234 //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
235 #ifdef RUNTIME_CPUDETECT
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
236 #ifdef CAN_COMPILE_X86_ASM
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
237 // ordered by speed, fastest first
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
238 if(gCpuCaps.hasMMX2)
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
239 SwScale_YV12slice_MMX2(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
240 else if(gCpuCaps.has3DNow)
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
241 SwScale_YV12slice_3DNow(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
242 else if(gCpuCaps.hasMMX)
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
243 SwScale_YV12slice_MMX(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
244 else
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
245 SwScale_YV12slice_C(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
246 #else
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
247 SwScale_YV12slice_C(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
2270
56ca174d8169 vertical lines bugfix
michael
parents: 2269
diff changeset
248 #endif
3152
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
249 #else //RUNTIME_CPUDETECT
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
250 #ifdef HAVE_MMX2
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
251 SwScale_YV12slice_MMX2(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
252 #elif defined (HAVE_3DNOW)
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
253 SwScale_YV12slice_3DNow(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
254 #elif defined (HAVE_MMX)
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
255 SwScale_YV12slice_MMX(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
256 #else
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
257 SwScale_YV12slice_C(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
258 #endif
54710806be56 runtime cpu detection optional (compiles faster)
michael
parents: 3136
diff changeset
259 #endif //!RUNTIME_CPUDETECT
2270
56ca174d8169 vertical lines bugfix
michael
parents: 2269
diff changeset
260
2216
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
261 }
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
262
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
// SwScale_Init: fills the lookup tables written below (clip_table, the yuvtab_* tables and the
// clip_table15*/clip_table16* tables). Takes no arguments and returns nothing; its only visible
// effect is storing into those tables, which are declared elsewhere in the file.
263 void SwScale_Init(){
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
264 // generate the clamp table and the per-coefficient multiplication tables:
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
265 int i;
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
266 for(i=0;i<256;i++){ // one pass fills all three 256-entry thirds of clip_table and every yuvtab_* entry
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
267 clip_table[i]=0; // entries 0..255 clamp to 0
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
268 clip_table[i+256]=i; // entries 256..511 are the identity (value = index-256)
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
269 clip_table[i+512]=255; // entries 512..767 clamp to 255
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
270 yuvtab_2568[i]=(0x2568*(i-16))+(256<<13); // 0x2568*(i-16) plus a constant bias of 256<<13; NOTE(review): the bias presumably selects the identity third of clip_table after a >>13 in the users of this table - confirm
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
271 yuvtab_3343[i]=0x3343*(i-128); // coefficient 0x3343 applied to the input re-centred around 128
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
272 yuvtab_0c92[i]=-0x0c92*(i-128); // negative coefficient: -0x0c92 times (i-128)
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
273 yuvtab_1a1e[i]=-0x1a1e*(i-128); // negative coefficient: -0x1a1e times (i-128)
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
274 yuvtab_40cf[i]=0x40cf*(i-128); // coefficient 0x40cf applied to the input re-centred around 128
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
275 }
9da2a0515184 software yv12->rgb scaler - separated from fsdga
arpi
parents:
diff changeset
276
2584
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
277 for(i=0; i<768; i++) // derive pre-shifted, pre-masked per-channel tables from every clip_table entry
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
278 {
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
279 int v= clip_table[i]; // already clamped to 0..255 by the loop above
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
280 clip_table16b[i]= v>>3; // top 5 bits of v in bits 0-4
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
281 clip_table16g[i]= (v<<3)&0x07E0; // top 6 bits of v in bits 5-10
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
282 clip_table16r[i]= (v<<8)&0xF800; // top 5 bits of v in bits 11-15
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
283 clip_table15b[i]= v>>3; // top 5 bits of v in bits 0-4
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
284 clip_table15g[i]= (v<<2)&0x03E0; // top 5 bits of v in bits 5-9
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
285 clip_table15r[i]= (v<<7)&0x7C00; // top 5 bits of v in bits 10-14
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
286 }
3126
e71ae0213431 runtime cpu detection
michael
parents: 2800
diff changeset
287 }
2584
6d20d5d5829f 15/16bit in C speedup
michael
parents: 2576
diff changeset
288