Mercurial > libavcodec.hg
annotate i386/mpegvideo_mmx.c @ 3980:5afe4253a220 libavcodec
replace a few and/sub/... by cmov
this is faster on P3, should be faster on AMD, and should be slower on P4
its disabled by default (benchmarks welcome so we know when to enable it)
author | michael |
---|---|
date | Tue, 10 Oct 2006 01:08:39 +0000 |
parents | c8c591fe26f8 |
children | 580d2c397251 |
rev | line source |
---|---|
8 | 1 /* |
2 * The simplest mpeg encoder (well, it was the simplest!) | |
429 | 3 * Copyright (c) 2000,2001 Fabrice Bellard. |
8 | 4 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
429 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
8 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
8 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 * Lesser General Public License for more details. | |
8 | 16 * |
429 | 17 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
8 | 20 * |
21 * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> | |
325 | 22 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> |
8 | 23 */ |
24 | |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
25 #include "../dsputil.h" |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
26 #include "../mpegvideo.h" |
220 | 27 #include "../avcodec.h" |
3398
e0927bc44a10
Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h
lucabe
parents:
3281
diff
changeset
|
28 #include "x86_cpu.h" |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
29 |
/* Tables defined in another translation unit. */
extern uint8_t zigzag_direct_noperm[64];
extern uint16_t inv_zigzag_direct16[64];

/* 64-bit constant with every bit set, stored 8-byte aligned. */
static const unsigned long long mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
/* Four packed 16-bit words, each holding the value 1, stored 8-byte aligned. */
static const unsigned long long mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
35 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
36 |
1689 | 37 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
38 DCTELEM *block, int n, int qscale) |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
39 { |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
40 long level, qmul, qadd, nCoeffs; |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
41 |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
42 qmul = qscale << 1; |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
43 |
1661 | 44 assert(s->block_last_index[n]>=0 || s->h263_aic); |
2967 | 45 |
1689 | 46 if (!s->h263_aic) { |
47 if (n < 4) | |
48 level = block[0] * s->y_dc_scale; | |
49 else | |
50 level = block[0] * s->c_dc_scale; | |
51 qadd = (qscale - 1) | 1; | |
52 }else{ | |
53 qadd = 0; | |
54 level= block[0]; | |
55 } | |
56 if(s->ac_pred) | |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
57 nCoeffs=63; |
1689 | 58 else |
59 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
200 | 60 //printf("%d %d ", qmul, qadd); |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
61 asm volatile( |
2979 | 62 "movd %1, %%mm6 \n\t" //qmul |
63 "packssdw %%mm6, %%mm6 \n\t" | |
64 "packssdw %%mm6, %%mm6 \n\t" | |
65 "movd %2, %%mm5 \n\t" //qadd | |
66 "pxor %%mm7, %%mm7 \n\t" | |
67 "packssdw %%mm5, %%mm5 \n\t" | |
68 "packssdw %%mm5, %%mm5 \n\t" | |
69 "psubw %%mm5, %%mm7 \n\t" | |
70 "pxor %%mm4, %%mm4 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
71 ASMALIGN(4) |
2979 | 72 "1: \n\t" |
73 "movq (%0, %3), %%mm0 \n\t" | |
74 "movq 8(%0, %3), %%mm1 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
75 |
2979 | 76 "pmullw %%mm6, %%mm0 \n\t" |
77 "pmullw %%mm6, %%mm1 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
78 |
2979 | 79 "movq (%0, %3), %%mm2 \n\t" |
80 "movq 8(%0, %3), %%mm3 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
81 |
2979 | 82 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
83 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
84 |
2979 | 85 "pxor %%mm2, %%mm0 \n\t" |
86 "pxor %%mm3, %%mm1 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
87 |
2979 | 88 "paddw %%mm7, %%mm0 \n\t" |
89 "paddw %%mm7, %%mm1 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
90 |
2979 | 91 "pxor %%mm0, %%mm2 \n\t" |
92 "pxor %%mm1, %%mm3 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
93 |
2979 | 94 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 |
95 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
96 |
2979 | 97 "pandn %%mm2, %%mm0 \n\t" |
98 "pandn %%mm3, %%mm1 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
99 |
2979 | 100 "movq %%mm0, (%0, %3) \n\t" |
101 "movq %%mm1, 8(%0, %3) \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
102 |
2979 | 103 "add $16, %3 \n\t" |
104 "jng 1b \n\t" | |
105 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) | |
106 : "memory" | |
107 ); | |
1689 | 108 block[0]= level; |
109 } | |
110 | |
111 | |
112 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, | |
113 DCTELEM *block, int n, int qscale) | |
114 { | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
115 long qmul, qadd, nCoeffs; |
1689 | 116 |
117 qmul = qscale << 1; | |
118 qadd = (qscale - 1) | 1; | |
119 | |
120 assert(s->block_last_index[n]>=0 || s->h263_aic); | |
2967 | 121 |
1689 | 122 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; |
123 //printf("%d %d ", qmul, qadd); | |
124 asm volatile( | |
2979 | 125 "movd %1, %%mm6 \n\t" //qmul |
126 "packssdw %%mm6, %%mm6 \n\t" | |
127 "packssdw %%mm6, %%mm6 \n\t" | |
128 "movd %2, %%mm5 \n\t" //qadd | |
129 "pxor %%mm7, %%mm7 \n\t" | |
130 "packssdw %%mm5, %%mm5 \n\t" | |
131 "packssdw %%mm5, %%mm5 \n\t" | |
132 "psubw %%mm5, %%mm7 \n\t" | |
133 "pxor %%mm4, %%mm4 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
134 ASMALIGN(4) |
2979 | 135 "1: \n\t" |
136 "movq (%0, %3), %%mm0 \n\t" | |
137 "movq 8(%0, %3), %%mm1 \n\t" | |
1689 | 138 |
2979 | 139 "pmullw %%mm6, %%mm0 \n\t" |
140 "pmullw %%mm6, %%mm1 \n\t" | |
1689 | 141 |
2979 | 142 "movq (%0, %3), %%mm2 \n\t" |
143 "movq 8(%0, %3), %%mm3 \n\t" | |
1689 | 144 |
2979 | 145 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
146 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
1689 | 147 |
2979 | 148 "pxor %%mm2, %%mm0 \n\t" |
149 "pxor %%mm3, %%mm1 \n\t" | |
1689 | 150 |
2979 | 151 "paddw %%mm7, %%mm0 \n\t" |
152 "paddw %%mm7, %%mm1 \n\t" | |
1689 | 153 |
2979 | 154 "pxor %%mm0, %%mm2 \n\t" |
155 "pxor %%mm1, %%mm3 \n\t" | |
1689 | 156 |
2979 | 157 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 |
158 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
1689 | 159 |
2979 | 160 "pandn %%mm2, %%mm0 \n\t" |
161 "pandn %%mm3, %%mm1 \n\t" | |
1689 | 162 |
2979 | 163 "movq %%mm0, (%0, %3) \n\t" |
164 "movq %%mm1, 8(%0, %3) \n\t" | |
1689 | 165 |
2979 | 166 "add $16, %3 \n\t" |
167 "jng 1b \n\t" | |
168 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) | |
169 : "memory" | |
170 ); | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
171 } |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
172 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
173 |
8 | 174 /* |
175 NK: | |
176 Note: looking at PARANOID: | |
177 "enable all paranoid tests for rounding, overflows, etc..." | |
178 | |
179 #ifdef PARANOID | |
180 if (level < -2048 || level > 2047) | |
181 fprintf(stderr, "unquant error %d %d\n", i, level); | |
182 #endif | |
183 We can assume that the result of the two multiplications cannot exceed 0xFFFF, | |
184 i.e. it fits in 16 bits, so only the PMULLW instruction is used here and | |
185 a full-width multiplication can be avoided. | |
186 ===================================================== | |
187 Full formula for multiplication of 2 integer numbers | |
188 which are represented as high:low words: | |
189 input: value1 = high1:low1 | |
190 value2 = high2:low2 | |
191 output: value3 = value1*value2 | |
192 value3=high3:low3 (on overflow: modulus 2^32 wrap-around) | |
193 this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4 | |
194 but this algorithm will compute only 0x66cb0ce4; | |
195 this is limited by the 16-bit size of the operands | |
196 --------------------------------- | |
197 tlow1 = high1*low2 | |
198 tlow2 = high2*low1 | |
199 tlow1 = tlow1 + tlow2 | |
200 high3:low3 = low1*low2 | |
201 high3 += tlow1 | |
202 */ | |
1689 | 203 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
204 DCTELEM *block, int n, int qscale) |
8 | 205 { |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
206 long nCoeffs; |
1064 | 207 const uint16_t *quant_matrix; |
1689 | 208 int block0; |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
209 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
210 assert(s->block_last_index[n]>=0); |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
211 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
212 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; |
200 | 213 |
2967 | 214 if (n < 4) |
1689 | 215 block0 = block[0] * s->y_dc_scale; |
216 else | |
217 block0 = block[0] * s->c_dc_scale; | |
218 /* XXX: only mpeg1 */ | |
219 quant_matrix = s->intra_matrix; | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
220 asm volatile( |
2979 | 221 "pcmpeqw %%mm7, %%mm7 \n\t" |
222 "psrlw $15, %%mm7 \n\t" | |
223 "movd %2, %%mm6 \n\t" | |
224 "packssdw %%mm6, %%mm6 \n\t" | |
225 "packssdw %%mm6, %%mm6 \n\t" | |
226 "mov %3, %%"REG_a" \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
227 ASMALIGN(4) |
2979 | 228 "1: \n\t" |
229 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
230 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
231 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
232 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
233 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
234 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
235 "pxor %%mm2, %%mm2 \n\t" | |
236 "pxor %%mm3, %%mm3 \n\t" | |
237 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
238 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
239 "pxor %%mm2, %%mm0 \n\t" | |
240 "pxor %%mm3, %%mm1 \n\t" | |
241 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
242 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
243 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
244 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
245 "pxor %%mm4, %%mm4 \n\t" | |
246 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
247 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
248 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
249 "psraw $3, %%mm0 \n\t" | |
250 "psraw $3, %%mm1 \n\t" | |
251 "psubw %%mm7, %%mm0 \n\t" | |
252 "psubw %%mm7, %%mm1 \n\t" | |
253 "por %%mm7, %%mm0 \n\t" | |
254 "por %%mm7, %%mm1 \n\t" | |
255 "pxor %%mm2, %%mm0 \n\t" | |
256 "pxor %%mm3, %%mm1 \n\t" | |
257 "psubw %%mm2, %%mm0 \n\t" | |
258 "psubw %%mm3, %%mm1 \n\t" | |
259 "pandn %%mm0, %%mm4 \n\t" | |
260 "pandn %%mm1, %%mm5 \n\t" | |
261 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
262 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
325 | 263 |
2979 | 264 "add $16, %%"REG_a" \n\t" |
265 "js 1b \n\t" | |
266 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
267 : "%"REG_a, "memory" | |
268 ); | |
1689 | 269 block[0]= block0; |
270 } | |
325 | 271 |
1689 | 272 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, |
273 DCTELEM *block, int n, int qscale) | |
274 { | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
275 long nCoeffs; |
1689 | 276 const uint16_t *quant_matrix; |
277 | |
278 assert(s->block_last_index[n]>=0); | |
279 | |
280 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
281 | |
344 | 282 quant_matrix = s->inter_matrix; |
325 | 283 asm volatile( |
2979 | 284 "pcmpeqw %%mm7, %%mm7 \n\t" |
285 "psrlw $15, %%mm7 \n\t" | |
286 "movd %2, %%mm6 \n\t" | |
287 "packssdw %%mm6, %%mm6 \n\t" | |
288 "packssdw %%mm6, %%mm6 \n\t" | |
289 "mov %3, %%"REG_a" \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
290 ASMALIGN(4) |
2979 | 291 "1: \n\t" |
292 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
293 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
294 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
295 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
296 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
297 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
298 "pxor %%mm2, %%mm2 \n\t" | |
299 "pxor %%mm3, %%mm3 \n\t" | |
300 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
301 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
302 "pxor %%mm2, %%mm0 \n\t" | |
303 "pxor %%mm3, %%mm1 \n\t" | |
304 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
305 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
306 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
307 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
308 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 | |
309 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 | |
310 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
311 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
312 "pxor %%mm4, %%mm4 \n\t" | |
313 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
314 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
315 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
316 "psraw $4, %%mm0 \n\t" | |
317 "psraw $4, %%mm1 \n\t" | |
318 "psubw %%mm7, %%mm0 \n\t" | |
319 "psubw %%mm7, %%mm1 \n\t" | |
320 "por %%mm7, %%mm0 \n\t" | |
321 "por %%mm7, %%mm1 \n\t" | |
322 "pxor %%mm2, %%mm0 \n\t" | |
323 "pxor %%mm3, %%mm1 \n\t" | |
324 "psubw %%mm2, %%mm0 \n\t" | |
325 "psubw %%mm3, %%mm1 \n\t" | |
326 "pandn %%mm0, %%mm4 \n\t" | |
327 "pandn %%mm1, %%mm5 \n\t" | |
328 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
329 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
325 | 330 |
2979 | 331 "add $16, %%"REG_a" \n\t" |
332 "js 1b \n\t" | |
333 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
334 : "%"REG_a, "memory" | |
335 ); | |
325 | 336 } |
337 | |
1689 | 338 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, |
325 | 339 DCTELEM *block, int n, int qscale) |
340 { | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
341 long nCoeffs; |
1064 | 342 const uint16_t *quant_matrix; |
1689 | 343 int block0; |
2967 | 344 |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
345 assert(s->block_last_index[n]>=0); |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
346 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
347 if(s->alternate_scan) nCoeffs= 63; //FIXME |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
348 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; |
325 | 349 |
2967 | 350 if (n < 4) |
1689 | 351 block0 = block[0] * s->y_dc_scale; |
352 else | |
353 block0 = block[0] * s->c_dc_scale; | |
354 quant_matrix = s->intra_matrix; | |
325 | 355 asm volatile( |
2979 | 356 "pcmpeqw %%mm7, %%mm7 \n\t" |
357 "psrlw $15, %%mm7 \n\t" | |
358 "movd %2, %%mm6 \n\t" | |
359 "packssdw %%mm6, %%mm6 \n\t" | |
360 "packssdw %%mm6, %%mm6 \n\t" | |
361 "mov %3, %%"REG_a" \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
362 ASMALIGN(4) |
2979 | 363 "1: \n\t" |
364 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
365 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
366 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
367 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
368 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
369 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
370 "pxor %%mm2, %%mm2 \n\t" | |
371 "pxor %%mm3, %%mm3 \n\t" | |
372 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
373 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
374 "pxor %%mm2, %%mm0 \n\t" | |
375 "pxor %%mm3, %%mm1 \n\t" | |
376 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
377 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
378 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
379 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
380 "pxor %%mm4, %%mm4 \n\t" | |
381 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
382 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
383 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
384 "psraw $3, %%mm0 \n\t" | |
385 "psraw $3, %%mm1 \n\t" | |
386 "pxor %%mm2, %%mm0 \n\t" | |
387 "pxor %%mm3, %%mm1 \n\t" | |
388 "psubw %%mm2, %%mm0 \n\t" | |
389 "psubw %%mm3, %%mm1 \n\t" | |
390 "pandn %%mm0, %%mm4 \n\t" | |
391 "pandn %%mm1, %%mm5 \n\t" | |
392 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
393 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
394 |
2979 | 395 "add $16, %%"REG_a" \n\t" |
396 "jng 1b \n\t" | |
397 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
398 : "%"REG_a, "memory" | |
399 ); | |
1689 | 400 block[0]= block0; |
325 | 401 //Note, we dont do mismatch control for intra as errors cannot accumulate |
1689 | 402 } |
325 | 403 |
1689 | 404 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, |
405 DCTELEM *block, int n, int qscale) | |
406 { | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
407 long nCoeffs; |
1689 | 408 const uint16_t *quant_matrix; |
2967 | 409 |
1689 | 410 assert(s->block_last_index[n]>=0); |
411 | |
412 if(s->alternate_scan) nCoeffs= 63; //FIXME | |
413 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
414 | |
344 | 415 quant_matrix = s->inter_matrix; |
325 | 416 asm volatile( |
2979 | 417 "pcmpeqw %%mm7, %%mm7 \n\t" |
418 "psrlq $48, %%mm7 \n\t" | |
419 "movd %2, %%mm6 \n\t" | |
420 "packssdw %%mm6, %%mm6 \n\t" | |
421 "packssdw %%mm6, %%mm6 \n\t" | |
422 "mov %3, %%"REG_a" \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
423 ASMALIGN(4) |
2979 | 424 "1: \n\t" |
425 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
426 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
427 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
428 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
429 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
430 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
431 "pxor %%mm2, %%mm2 \n\t" | |
432 "pxor %%mm3, %%mm3 \n\t" | |
433 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
434 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
435 "pxor %%mm2, %%mm0 \n\t" | |
436 "pxor %%mm3, %%mm1 \n\t" | |
437 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
438 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
439 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
440 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
441 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | |
442 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | |
443 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
444 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
445 "pxor %%mm4, %%mm4 \n\t" | |
446 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
447 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
448 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
449 "psrlw $4, %%mm0 \n\t" | |
450 "psrlw $4, %%mm1 \n\t" | |
451 "pxor %%mm2, %%mm0 \n\t" | |
452 "pxor %%mm3, %%mm1 \n\t" | |
453 "psubw %%mm2, %%mm0 \n\t" | |
454 "psubw %%mm3, %%mm1 \n\t" | |
455 "pandn %%mm0, %%mm4 \n\t" | |
456 "pandn %%mm1, %%mm5 \n\t" | |
457 "pxor %%mm4, %%mm7 \n\t" | |
458 "pxor %%mm5, %%mm7 \n\t" | |
459 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
460 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
325 | 461 |
2979 | 462 "add $16, %%"REG_a" \n\t" |
463 "jng 1b \n\t" | |
464 "movd 124(%0, %3), %%mm0 \n\t" | |
465 "movq %%mm7, %%mm6 \n\t" | |
466 "psrlq $32, %%mm7 \n\t" | |
467 "pxor %%mm6, %%mm7 \n\t" | |
468 "movq %%mm7, %%mm6 \n\t" | |
469 "psrlq $16, %%mm7 \n\t" | |
470 "pxor %%mm6, %%mm7 \n\t" | |
471 "pslld $31, %%mm7 \n\t" | |
472 "psrlq $15, %%mm7 \n\t" | |
473 "pxor %%mm7, %%mm0 \n\t" | |
474 "movd %%mm0, 124(%0, %3) \n\t" | |
2967 | 475 |
2979 | 476 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) |
477 : "%"REG_a, "memory" | |
478 ); | |
8 | 479 } |
480 | |
/* Draw the edges of width 'w' around an image of size width x height.
 *
 * buf points at the top-left pixel of the image and wrap is the distance in
 * bytes between two rows. The leftmost and rightmost pixel of every row are
 * replicated into the w bytes before and after that row; then the (already
 * extended) first and last rows are copied into the w rows above and below
 * the image, which also fills the corners.
 *
 * This MMX version can only handle w==8 || w==16: any w other than 8 takes
 * the 16-wide path. The top/bottom copy moves 8 bytes per step, so it covers
 * width + 2*w bytes rounded up to a multiple of 8.
 *
 * Every asm statement stores through pointers the compiler cannot see in the
 * operand list, hence the "memory" clobber on each of them.
 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if(w==8)
    {
        __asm__ volatile(
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t" // first pixel of the row in all 8 bytes
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t" // last pixel of the row in all 8 bytes
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t" // next row
            "cmp %3, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
            : "memory"
        );
    }
    else
    {
        __asm__ volatile(
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t" // first pixel of the row in all 8 bytes
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t" // last pixel of the row in all 8 bytes
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t" // next row
            "cmp %3, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
            : "memory"
        );
    }

    for(i=0;i<w;i+=4) {
        /* top and bottom (and hopefully also the corners) */
        /* each pass fills four rows: i+1 .. i+4 above the first image row */
        ptr= buf - (i + 1) * wrap - w;
        __asm__ volatile(
            "1: \n\t"
            "movq (%1, %0), %%mm0 \n\t" // 8 bytes of the first image row
            "movq %%mm0, (%0) \n\t"
            "movq %%mm0, (%0, %2) \n\t"
            "movq %%mm0, (%0, %2, 2) \n\t"
            "movq %%mm0, (%0, %3) \n\t"
            "add $8, %0 \n\t"
            "cmp %4, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
            : "memory"
        );
        /* and four rows: i+1 .. i+4 below the last image row */
        ptr= last_line + (i + 1) * wrap - w;
        __asm__ volatile(
            "1: \n\t"
            "movq (%1, %0), %%mm0 \n\t" // 8 bytes of the last image row
            "movq %%mm0, (%0) \n\t"
            "movq %%mm0, (%0, %2) \n\t"
            "movq %%mm0, (%0, %2, 2) \n\t"
            "movq %%mm0, (%0, %3) \n\t"
            "add $8, %0 \n\t"
            "cmp %4, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
            : "memory"
        );
    }
}
568 | |
1719 | 569 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ |
570 const int intra= s->mb_intra; | |
571 int *sum= s->dct_error_sum[intra]; | |
572 uint16_t *offset= s->dct_offset[intra]; | |
573 | |
574 s->dct_count[intra]++; | |
575 | |
576 asm volatile( | |
2979 | 577 "pxor %%mm7, %%mm7 \n\t" |
578 "1: \n\t" | |
579 "pxor %%mm0, %%mm0 \n\t" | |
580 "pxor %%mm1, %%mm1 \n\t" | |
581 "movq (%0), %%mm2 \n\t" | |
582 "movq 8(%0), %%mm3 \n\t" | |
583 "pcmpgtw %%mm2, %%mm0 \n\t" | |
584 "pcmpgtw %%mm3, %%mm1 \n\t" | |
585 "pxor %%mm0, %%mm2 \n\t" | |
586 "pxor %%mm1, %%mm3 \n\t" | |
587 "psubw %%mm0, %%mm2 \n\t" | |
588 "psubw %%mm1, %%mm3 \n\t" | |
589 "movq %%mm2, %%mm4 \n\t" | |
590 "movq %%mm3, %%mm5 \n\t" | |
591 "psubusw (%2), %%mm2 \n\t" | |
592 "psubusw 8(%2), %%mm3 \n\t" | |
593 "pxor %%mm0, %%mm2 \n\t" | |
594 "pxor %%mm1, %%mm3 \n\t" | |
595 "psubw %%mm0, %%mm2 \n\t" | |
596 "psubw %%mm1, %%mm3 \n\t" | |
597 "movq %%mm2, (%0) \n\t" | |
598 "movq %%mm3, 8(%0) \n\t" | |
599 "movq %%mm4, %%mm2 \n\t" | |
600 "movq %%mm5, %%mm3 \n\t" | |
601 "punpcklwd %%mm7, %%mm4 \n\t" | |
602 "punpckhwd %%mm7, %%mm2 \n\t" | |
603 "punpcklwd %%mm7, %%mm5 \n\t" | |
604 "punpckhwd %%mm7, %%mm3 \n\t" | |
605 "paddd (%1), %%mm4 \n\t" | |
606 "paddd 8(%1), %%mm2 \n\t" | |
607 "paddd 16(%1), %%mm5 \n\t" | |
608 "paddd 24(%1), %%mm3 \n\t" | |
609 "movq %%mm4, (%1) \n\t" | |
610 "movq %%mm2, 8(%1) \n\t" | |
611 "movq %%mm5, 16(%1) \n\t" | |
612 "movq %%mm3, 24(%1) \n\t" | |
613 "add $16, %0 \n\t" | |
614 "add $32, %1 \n\t" | |
615 "add $16, %2 \n\t" | |
616 "cmp %3, %0 \n\t" | |
617 " jb 1b \n\t" | |
1719 | 618 : "+r" (block), "+r" (sum), "+r" (offset) |
619 : "r"(block+64) | |
620 ); | |
621 } | |
622 | |
1720
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
623 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
624 const int intra= s->mb_intra; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
625 int *sum= s->dct_error_sum[intra]; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
626 uint16_t *offset= s->dct_offset[intra]; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
627 |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
628 s->dct_count[intra]++; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
629 |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
630 asm volatile( |
2979 | 631 "pxor %%xmm7, %%xmm7 \n\t" |
632 "1: \n\t" | |
633 "pxor %%xmm0, %%xmm0 \n\t" | |
634 "pxor %%xmm1, %%xmm1 \n\t" | |
635 "movdqa (%0), %%xmm2 \n\t" | |
636 "movdqa 16(%0), %%xmm3 \n\t" | |
637 "pcmpgtw %%xmm2, %%xmm0 \n\t" | |
638 "pcmpgtw %%xmm3, %%xmm1 \n\t" | |
639 "pxor %%xmm0, %%xmm2 \n\t" | |
640 "pxor %%xmm1, %%xmm3 \n\t" | |
641 "psubw %%xmm0, %%xmm2 \n\t" | |
642 "psubw %%xmm1, %%xmm3 \n\t" | |
643 "movdqa %%xmm2, %%xmm4 \n\t" | |
644 "movdqa %%xmm3, %%xmm5 \n\t" | |
645 "psubusw (%2), %%xmm2 \n\t" | |
646 "psubusw 16(%2), %%xmm3 \n\t" | |
647 "pxor %%xmm0, %%xmm2 \n\t" | |
648 "pxor %%xmm1, %%xmm3 \n\t" | |
649 "psubw %%xmm0, %%xmm2 \n\t" | |
650 "psubw %%xmm1, %%xmm3 \n\t" | |
651 "movdqa %%xmm2, (%0) \n\t" | |
652 "movdqa %%xmm3, 16(%0) \n\t" | |
653 "movdqa %%xmm4, %%xmm6 \n\t" | |
654 "movdqa %%xmm5, %%xmm0 \n\t" | |
655 "punpcklwd %%xmm7, %%xmm4 \n\t" | |
656 "punpckhwd %%xmm7, %%xmm6 \n\t" | |
657 "punpcklwd %%xmm7, %%xmm5 \n\t" | |
658 "punpckhwd %%xmm7, %%xmm0 \n\t" | |
659 "paddd (%1), %%xmm4 \n\t" | |
660 "paddd 16(%1), %%xmm6 \n\t" | |
661 "paddd 32(%1), %%xmm5 \n\t" | |
662 "paddd 48(%1), %%xmm0 \n\t" | |
663 "movdqa %%xmm4, (%1) \n\t" | |
664 "movdqa %%xmm6, 16(%1) \n\t" | |
665 "movdqa %%xmm5, 32(%1) \n\t" | |
666 "movdqa %%xmm0, 48(%1) \n\t" | |
667 "add $32, %0 \n\t" | |
668 "add $64, %1 \n\t" | |
669 "add $32, %2 \n\t" | |
670 "cmp %3, %0 \n\t" | |
671 " jb 1b \n\t" | |
1720
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
672 : "+r" (block), "+r" (sum), "+r" (offset) |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
673 : "r"(block+64) |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
674 ); |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
675 } |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
676 |
220 | 677 #undef HAVE_MMX2 |
678 #define RENAME(a) a ## _MMX | |
1565 | 679 #define RENAMEl(a) a ## _mmx |
220 | 680 #include "mpegvideo_mmx_template.c" |
681 | |
682 #define HAVE_MMX2 | |
683 #undef RENAME | |
1597 | 684 #undef RENAMEl |
220 | 685 #define RENAME(a) a ## _MMX2 |
1565 | 686 #define RENAMEl(a) a ## _mmx2 |
220 | 687 #include "mpegvideo_mmx_template.c" |
206 | 688 |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
689 #undef RENAME |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
690 #undef RENAMEl |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
691 #define RENAME(a) a ## _SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
692 #define RENAMEl(a) a ## _sse2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
693 #include "mpegvideo_mmx_template.c" |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
694 |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
695 void MPV_common_init_mmx(MpegEncContext *s) |
8 | 696 { |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
697 if (mm_flags & MM_MMX) { |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
698 const int dct_algo = s->avctx->dct_algo; |
2967 | 699 |
1689 | 700 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; |
701 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; | |
702 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; | |
703 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; | |
3281
7fac25904a8b
missmatch control for mpeg2 intra dequantization if bitexact=1
michael
parents:
3036
diff
changeset
|
704 if(!(s->flags & CODEC_FLAG_BITEXACT)) |
7fac25904a8b
missmatch control for mpeg2 intra dequantization if bitexact=1
michael
parents:
3036
diff
changeset
|
705 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; |
1689 | 706 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; |
312 | 707 |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
708 draw_edges = draw_edges_mmx; |
2967 | 709 |
1720
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
710 if (mm_flags & MM_SSE2) { |
2979 | 711 s->denoise_dct= denoise_dct_sse2; |
712 } else { | |
713 s->denoise_dct= denoise_dct_mmx; | |
714 } | |
220 | 715 |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
716 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
717 if(mm_flags & MM_SSE2){ |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
718 s->dct_quantize= dct_quantize_SSE2; |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
719 } else if(mm_flags & MM_MMXEXT){ |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
720 s->dct_quantize= dct_quantize_MMX2; |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
721 } else { |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
722 s->dct_quantize= dct_quantize_MMX; |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
723 } |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
724 } |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
725 } |
8 | 726 } |