Mercurial > libavcodec.hg
annotate x86/mpegvideo_mmx.c @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE based configuration map the channels solely based on tags.
For an indexed configuration map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | 9fef0a8ddd63 |
children |
rev | line source |
---|---|
8430 | 1 /* |
2 * The simplest mpeg encoder (well, it was the simplest!) | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8590
diff
changeset
|
3 * Copyright (c) 2000,2001 Fabrice Bellard |
8430 | 4 * |
5 * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> | |
6 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> | |
7 * | |
8 * This file is part of FFmpeg. | |
9 * | |
10 * FFmpeg is free software; you can redistribute it and/or | |
11 * modify it under the terms of the GNU Lesser General Public | |
12 * License as published by the Free Software Foundation; either | |
13 * version 2.1 of the License, or (at your option) any later version. | |
14 * | |
15 * FFmpeg is distributed in the hope that it will be useful, | |
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 * Lesser General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU Lesser General Public | |
21 * License along with FFmpeg; if not, write to the Free Software | |
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 */ | |
24 | |
12475
9fef0a8ddd63
Move mm_support() from libavcodec to libavutil, make it a public
stefano
parents:
12456
diff
changeset
|
25 #include "libavutil/cpu.h" |
8430 | 26 #include "libavutil/x86_cpu.h" |
27 #include "libavcodec/avcodec.h" | |
28 #include "libavcodec/dsputil.h" | |
29 #include "libavcodec/mpegvideo.h" | |
30 #include "dsputil_mmx.h" | |
31 | |
32 extern uint16_t inv_zigzag_direct16[64]; | |
33 | |
34 | |
35 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, | |
36 DCTELEM *block, int n, int qscale) | |
37 { | |
38 x86_reg level, qmul, qadd, nCoeffs; | |
39 | |
40 qmul = qscale << 1; | |
41 | |
42 assert(s->block_last_index[n]>=0 || s->h263_aic); | |
43 | |
44 if (!s->h263_aic) { | |
45 if (n < 4) | |
46 level = block[0] * s->y_dc_scale; | |
47 else | |
48 level = block[0] * s->c_dc_scale; | |
49 qadd = (qscale - 1) | 1; | |
50 }else{ | |
51 qadd = 0; | |
52 level= block[0]; | |
53 } | |
54 if(s->ac_pred) | |
55 nCoeffs=63; | |
56 else | |
57 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
58 //printf("%d %d ", qmul, qadd); | |
59 __asm__ volatile( | |
60 "movd %1, %%mm6 \n\t" //qmul | |
61 "packssdw %%mm6, %%mm6 \n\t" | |
62 "packssdw %%mm6, %%mm6 \n\t" | |
63 "movd %2, %%mm5 \n\t" //qadd | |
64 "pxor %%mm7, %%mm7 \n\t" | |
65 "packssdw %%mm5, %%mm5 \n\t" | |
66 "packssdw %%mm5, %%mm5 \n\t" | |
67 "psubw %%mm5, %%mm7 \n\t" | |
68 "pxor %%mm4, %%mm4 \n\t" | |
69 ASMALIGN(4) | |
70 "1: \n\t" | |
71 "movq (%0, %3), %%mm0 \n\t" | |
72 "movq 8(%0, %3), %%mm1 \n\t" | |
73 | |
74 "pmullw %%mm6, %%mm0 \n\t" | |
75 "pmullw %%mm6, %%mm1 \n\t" | |
76 | |
77 "movq (%0, %3), %%mm2 \n\t" | |
78 "movq 8(%0, %3), %%mm3 \n\t" | |
79 | |
80 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
81 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
82 | |
83 "pxor %%mm2, %%mm0 \n\t" | |
84 "pxor %%mm3, %%mm1 \n\t" | |
85 | |
86 "paddw %%mm7, %%mm0 \n\t" | |
87 "paddw %%mm7, %%mm1 \n\t" | |
88 | |
89 "pxor %%mm0, %%mm2 \n\t" | |
90 "pxor %%mm1, %%mm3 \n\t" | |
91 | |
92 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | |
93 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
94 | |
95 "pandn %%mm2, %%mm0 \n\t" | |
96 "pandn %%mm3, %%mm1 \n\t" | |
97 | |
98 "movq %%mm0, (%0, %3) \n\t" | |
99 "movq %%mm1, 8(%0, %3) \n\t" | |
100 | |
101 "add $16, %3 \n\t" | |
102 "jng 1b \n\t" | |
103 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
104 : "memory" | |
105 ); | |
106 block[0]= level; | |
107 } | |
108 | |
109 | |
110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, | |
111 DCTELEM *block, int n, int qscale) | |
112 { | |
113 x86_reg qmul, qadd, nCoeffs; | |
114 | |
115 qmul = qscale << 1; | |
116 qadd = (qscale - 1) | 1; | |
117 | |
118 assert(s->block_last_index[n]>=0 || s->h263_aic); | |
119 | |
120 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
121 //printf("%d %d ", qmul, qadd); | |
122 __asm__ volatile( | |
123 "movd %1, %%mm6 \n\t" //qmul | |
124 "packssdw %%mm6, %%mm6 \n\t" | |
125 "packssdw %%mm6, %%mm6 \n\t" | |
126 "movd %2, %%mm5 \n\t" //qadd | |
127 "pxor %%mm7, %%mm7 \n\t" | |
128 "packssdw %%mm5, %%mm5 \n\t" | |
129 "packssdw %%mm5, %%mm5 \n\t" | |
130 "psubw %%mm5, %%mm7 \n\t" | |
131 "pxor %%mm4, %%mm4 \n\t" | |
132 ASMALIGN(4) | |
133 "1: \n\t" | |
134 "movq (%0, %3), %%mm0 \n\t" | |
135 "movq 8(%0, %3), %%mm1 \n\t" | |
136 | |
137 "pmullw %%mm6, %%mm0 \n\t" | |
138 "pmullw %%mm6, %%mm1 \n\t" | |
139 | |
140 "movq (%0, %3), %%mm2 \n\t" | |
141 "movq 8(%0, %3), %%mm3 \n\t" | |
142 | |
143 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
144 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
145 | |
146 "pxor %%mm2, %%mm0 \n\t" | |
147 "pxor %%mm3, %%mm1 \n\t" | |
148 | |
149 "paddw %%mm7, %%mm0 \n\t" | |
150 "paddw %%mm7, %%mm1 \n\t" | |
151 | |
152 "pxor %%mm0, %%mm2 \n\t" | |
153 "pxor %%mm1, %%mm3 \n\t" | |
154 | |
155 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | |
156 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
157 | |
158 "pandn %%mm2, %%mm0 \n\t" | |
159 "pandn %%mm3, %%mm1 \n\t" | |
160 | |
161 "movq %%mm0, (%0, %3) \n\t" | |
162 "movq %%mm1, 8(%0, %3) \n\t" | |
163 | |
164 "add $16, %3 \n\t" | |
165 "jng 1b \n\t" | |
166 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
167 : "memory" | |
168 ); | |
169 } | |
170 | |
171 | |
/*
  NK:
  Note: looking at PARANOID:
  "enable all paranoid tests for rounding, overflows, etc..."

#ifdef PARANOID
  if (level < -2048 || level > 2047)
      fprintf(stderr, "unquant error %d %d\n", i, level);
#endif
  We can assume that the result of the two multiplications cannot be greater
  than 0xFFFF, i.e. that it fits in 16 bits, so only the PMULLW instruction
  is used here and a more complex multiplication can be avoided.
  =====================================================
  Full formula for the multiplication of two integers that are each
  represented as a high:low pair of words:
  input:  value1 = high1:low1
          value2 = high2:low2
  output: value3 = value1*value2
  value3 = high3:low3 (on overflow: wrap-around modulo 2^32)
  This means that for 0x123456 * 0x123456 the correct result is
  0x14b66cb0ce4, but this algorithm computes only 0x66cb0ce4;
  this is a limit imposed by the 16-bit size of the operands.
  ---------------------------------
  tlow1 = high1*low2
  tlow2 = high2*low1
  tlow1 = tlow1 + tlow2
  high3:low3 = low1*low2
  high3 += tlow1
*/
201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, | |
202 DCTELEM *block, int n, int qscale) | |
203 { | |
204 x86_reg nCoeffs; | |
205 const uint16_t *quant_matrix; | |
206 int block0; | |
207 | |
208 assert(s->block_last_index[n]>=0); | |
209 | |
210 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
211 | |
212 if (n < 4) | |
213 block0 = block[0] * s->y_dc_scale; | |
214 else | |
215 block0 = block[0] * s->c_dc_scale; | |
216 /* XXX: only mpeg1 */ | |
217 quant_matrix = s->intra_matrix; | |
218 __asm__ volatile( | |
219 "pcmpeqw %%mm7, %%mm7 \n\t" | |
220 "psrlw $15, %%mm7 \n\t" | |
221 "movd %2, %%mm6 \n\t" | |
222 "packssdw %%mm6, %%mm6 \n\t" | |
223 "packssdw %%mm6, %%mm6 \n\t" | |
224 "mov %3, %%"REG_a" \n\t" | |
225 ASMALIGN(4) | |
226 "1: \n\t" | |
227 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
229 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
231 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
232 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
233 "pxor %%mm2, %%mm2 \n\t" | |
234 "pxor %%mm3, %%mm3 \n\t" | |
235 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
236 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
237 "pxor %%mm2, %%mm0 \n\t" | |
238 "pxor %%mm3, %%mm1 \n\t" | |
239 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
240 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
241 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
242 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
243 "pxor %%mm4, %%mm4 \n\t" | |
244 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
247 "psraw $3, %%mm0 \n\t" | |
248 "psraw $3, %%mm1 \n\t" | |
249 "psubw %%mm7, %%mm0 \n\t" | |
250 "psubw %%mm7, %%mm1 \n\t" | |
251 "por %%mm7, %%mm0 \n\t" | |
252 "por %%mm7, %%mm1 \n\t" | |
253 "pxor %%mm2, %%mm0 \n\t" | |
254 "pxor %%mm3, %%mm1 \n\t" | |
255 "psubw %%mm2, %%mm0 \n\t" | |
256 "psubw %%mm3, %%mm1 \n\t" | |
257 "pandn %%mm0, %%mm4 \n\t" | |
258 "pandn %%mm1, %%mm5 \n\t" | |
259 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
260 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
261 | |
262 "add $16, %%"REG_a" \n\t" | |
263 "js 1b \n\t" | |
264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
265 : "%"REG_a, "memory" | |
266 ); | |
267 block[0]= block0; | |
268 } | |
269 | |
270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, | |
271 DCTELEM *block, int n, int qscale) | |
272 { | |
273 x86_reg nCoeffs; | |
274 const uint16_t *quant_matrix; | |
275 | |
276 assert(s->block_last_index[n]>=0); | |
277 | |
278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
279 | |
280 quant_matrix = s->inter_matrix; | |
281 __asm__ volatile( | |
282 "pcmpeqw %%mm7, %%mm7 \n\t" | |
283 "psrlw $15, %%mm7 \n\t" | |
284 "movd %2, %%mm6 \n\t" | |
285 "packssdw %%mm6, %%mm6 \n\t" | |
286 "packssdw %%mm6, %%mm6 \n\t" | |
287 "mov %3, %%"REG_a" \n\t" | |
288 ASMALIGN(4) | |
289 "1: \n\t" | |
290 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
292 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
294 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
295 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
296 "pxor %%mm2, %%mm2 \n\t" | |
297 "pxor %%mm3, %%mm3 \n\t" | |
298 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
299 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
300 "pxor %%mm2, %%mm0 \n\t" | |
301 "pxor %%mm3, %%mm1 \n\t" | |
302 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
303 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
304 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
305 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
306 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 | |
307 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 | |
308 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
309 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
310 "pxor %%mm4, %%mm4 \n\t" | |
311 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
314 "psraw $4, %%mm0 \n\t" | |
315 "psraw $4, %%mm1 \n\t" | |
316 "psubw %%mm7, %%mm0 \n\t" | |
317 "psubw %%mm7, %%mm1 \n\t" | |
318 "por %%mm7, %%mm0 \n\t" | |
319 "por %%mm7, %%mm1 \n\t" | |
320 "pxor %%mm2, %%mm0 \n\t" | |
321 "pxor %%mm3, %%mm1 \n\t" | |
322 "psubw %%mm2, %%mm0 \n\t" | |
323 "psubw %%mm3, %%mm1 \n\t" | |
324 "pandn %%mm0, %%mm4 \n\t" | |
325 "pandn %%mm1, %%mm5 \n\t" | |
326 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
327 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
328 | |
329 "add $16, %%"REG_a" \n\t" | |
330 "js 1b \n\t" | |
331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
332 : "%"REG_a, "memory" | |
333 ); | |
334 } | |
335 | |
336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, | |
337 DCTELEM *block, int n, int qscale) | |
338 { | |
339 x86_reg nCoeffs; | |
340 const uint16_t *quant_matrix; | |
341 int block0; | |
342 | |
343 assert(s->block_last_index[n]>=0); | |
344 | |
345 if(s->alternate_scan) nCoeffs= 63; //FIXME | |
346 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
347 | |
348 if (n < 4) | |
349 block0 = block[0] * s->y_dc_scale; | |
350 else | |
351 block0 = block[0] * s->c_dc_scale; | |
352 quant_matrix = s->intra_matrix; | |
353 __asm__ volatile( | |
354 "pcmpeqw %%mm7, %%mm7 \n\t" | |
355 "psrlw $15, %%mm7 \n\t" | |
356 "movd %2, %%mm6 \n\t" | |
357 "packssdw %%mm6, %%mm6 \n\t" | |
358 "packssdw %%mm6, %%mm6 \n\t" | |
359 "mov %3, %%"REG_a" \n\t" | |
360 ASMALIGN(4) | |
361 "1: \n\t" | |
362 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
364 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
366 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
367 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
368 "pxor %%mm2, %%mm2 \n\t" | |
369 "pxor %%mm3, %%mm3 \n\t" | |
370 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
371 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
372 "pxor %%mm2, %%mm0 \n\t" | |
373 "pxor %%mm3, %%mm1 \n\t" | |
374 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
375 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
376 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
377 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
378 "pxor %%mm4, %%mm4 \n\t" | |
379 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
382 "psraw $3, %%mm0 \n\t" | |
383 "psraw $3, %%mm1 \n\t" | |
384 "pxor %%mm2, %%mm0 \n\t" | |
385 "pxor %%mm3, %%mm1 \n\t" | |
386 "psubw %%mm2, %%mm0 \n\t" | |
387 "psubw %%mm3, %%mm1 \n\t" | |
388 "pandn %%mm0, %%mm4 \n\t" | |
389 "pandn %%mm1, %%mm5 \n\t" | |
390 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
391 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
392 | |
393 "add $16, %%"REG_a" \n\t" | |
394 "jng 1b \n\t" | |
395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
396 : "%"REG_a, "memory" | |
397 ); | |
398 block[0]= block0; | |
399 //Note, we do not do mismatch control for intra as errors cannot accumulate | |
400 } | |
401 | |
402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, | |
403 DCTELEM *block, int n, int qscale) | |
404 { | |
405 x86_reg nCoeffs; | |
406 const uint16_t *quant_matrix; | |
407 | |
408 assert(s->block_last_index[n]>=0); | |
409 | |
410 if(s->alternate_scan) nCoeffs= 63; //FIXME | |
411 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
412 | |
413 quant_matrix = s->inter_matrix; | |
414 __asm__ volatile( | |
415 "pcmpeqw %%mm7, %%mm7 \n\t" | |
416 "psrlq $48, %%mm7 \n\t" | |
417 "movd %2, %%mm6 \n\t" | |
418 "packssdw %%mm6, %%mm6 \n\t" | |
419 "packssdw %%mm6, %%mm6 \n\t" | |
420 "mov %3, %%"REG_a" \n\t" | |
421 ASMALIGN(4) | |
422 "1: \n\t" | |
423 "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
425 "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
427 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
428 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
429 "pxor %%mm2, %%mm2 \n\t" | |
430 "pxor %%mm3, %%mm3 \n\t" | |
431 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
432 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
433 "pxor %%mm2, %%mm0 \n\t" | |
434 "pxor %%mm3, %%mm1 \n\t" | |
435 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
436 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
437 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
438 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
439 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | |
440 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | |
441 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
442 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
443 "pxor %%mm4, %%mm4 \n\t" | |
444 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
447 "psrlw $4, %%mm0 \n\t" | |
448 "psrlw $4, %%mm1 \n\t" | |
449 "pxor %%mm2, %%mm0 \n\t" | |
450 "pxor %%mm3, %%mm1 \n\t" | |
451 "psubw %%mm2, %%mm0 \n\t" | |
452 "psubw %%mm3, %%mm1 \n\t" | |
453 "pandn %%mm0, %%mm4 \n\t" | |
454 "pandn %%mm1, %%mm5 \n\t" | |
455 "pxor %%mm4, %%mm7 \n\t" | |
456 "pxor %%mm5, %%mm7 \n\t" | |
457 "movq %%mm4, (%0, %%"REG_a") \n\t" | |
458 "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
459 | |
460 "add $16, %%"REG_a" \n\t" | |
461 "jng 1b \n\t" | |
462 "movd 124(%0, %3), %%mm0 \n\t" | |
463 "movq %%mm7, %%mm6 \n\t" | |
464 "psrlq $32, %%mm7 \n\t" | |
465 "pxor %%mm6, %%mm7 \n\t" | |
466 "movq %%mm7, %%mm6 \n\t" | |
467 "psrlq $16, %%mm7 \n\t" | |
468 "pxor %%mm6, %%mm7 \n\t" | |
469 "pslld $31, %%mm7 \n\t" | |
470 "psrlq $15, %%mm7 \n\t" | |
471 "pxor %%mm7, %%mm0 \n\t" | |
472 "movd %%mm0, 124(%0, %3) \n\t" | |
473 | |
474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) | |
475 : "%"REG_a, "memory" | |
476 ); | |
477 } | |
478 | |
479 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ | |
480 const int intra= s->mb_intra; | |
481 int *sum= s->dct_error_sum[intra]; | |
482 uint16_t *offset= s->dct_offset[intra]; | |
483 | |
484 s->dct_count[intra]++; | |
485 | |
486 __asm__ volatile( | |
487 "pxor %%mm7, %%mm7 \n\t" | |
488 "1: \n\t" | |
489 "pxor %%mm0, %%mm0 \n\t" | |
490 "pxor %%mm1, %%mm1 \n\t" | |
491 "movq (%0), %%mm2 \n\t" | |
492 "movq 8(%0), %%mm3 \n\t" | |
493 "pcmpgtw %%mm2, %%mm0 \n\t" | |
494 "pcmpgtw %%mm3, %%mm1 \n\t" | |
495 "pxor %%mm0, %%mm2 \n\t" | |
496 "pxor %%mm1, %%mm3 \n\t" | |
497 "psubw %%mm0, %%mm2 \n\t" | |
498 "psubw %%mm1, %%mm3 \n\t" | |
499 "movq %%mm2, %%mm4 \n\t" | |
500 "movq %%mm3, %%mm5 \n\t" | |
501 "psubusw (%2), %%mm2 \n\t" | |
502 "psubusw 8(%2), %%mm3 \n\t" | |
503 "pxor %%mm0, %%mm2 \n\t" | |
504 "pxor %%mm1, %%mm3 \n\t" | |
505 "psubw %%mm0, %%mm2 \n\t" | |
506 "psubw %%mm1, %%mm3 \n\t" | |
507 "movq %%mm2, (%0) \n\t" | |
508 "movq %%mm3, 8(%0) \n\t" | |
509 "movq %%mm4, %%mm2 \n\t" | |
510 "movq %%mm5, %%mm3 \n\t" | |
511 "punpcklwd %%mm7, %%mm4 \n\t" | |
512 "punpckhwd %%mm7, %%mm2 \n\t" | |
513 "punpcklwd %%mm7, %%mm5 \n\t" | |
514 "punpckhwd %%mm7, %%mm3 \n\t" | |
515 "paddd (%1), %%mm4 \n\t" | |
516 "paddd 8(%1), %%mm2 \n\t" | |
517 "paddd 16(%1), %%mm5 \n\t" | |
518 "paddd 24(%1), %%mm3 \n\t" | |
519 "movq %%mm4, (%1) \n\t" | |
520 "movq %%mm2, 8(%1) \n\t" | |
521 "movq %%mm5, 16(%1) \n\t" | |
522 "movq %%mm3, 24(%1) \n\t" | |
523 "add $16, %0 \n\t" | |
524 "add $32, %1 \n\t" | |
525 "add $16, %2 \n\t" | |
526 "cmp %3, %0 \n\t" | |
527 " jb 1b \n\t" | |
528 : "+r" (block), "+r" (sum), "+r" (offset) | |
529 : "r"(block+64) | |
530 ); | |
531 } | |
532 | |
533 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ | |
534 const int intra= s->mb_intra; | |
535 int *sum= s->dct_error_sum[intra]; | |
536 uint16_t *offset= s->dct_offset[intra]; | |
537 | |
538 s->dct_count[intra]++; | |
539 | |
540 __asm__ volatile( | |
541 "pxor %%xmm7, %%xmm7 \n\t" | |
542 "1: \n\t" | |
543 "pxor %%xmm0, %%xmm0 \n\t" | |
544 "pxor %%xmm1, %%xmm1 \n\t" | |
545 "movdqa (%0), %%xmm2 \n\t" | |
546 "movdqa 16(%0), %%xmm3 \n\t" | |
547 "pcmpgtw %%xmm2, %%xmm0 \n\t" | |
548 "pcmpgtw %%xmm3, %%xmm1 \n\t" | |
549 "pxor %%xmm0, %%xmm2 \n\t" | |
550 "pxor %%xmm1, %%xmm3 \n\t" | |
551 "psubw %%xmm0, %%xmm2 \n\t" | |
552 "psubw %%xmm1, %%xmm3 \n\t" | |
553 "movdqa %%xmm2, %%xmm4 \n\t" | |
554 "movdqa %%xmm3, %%xmm5 \n\t" | |
555 "psubusw (%2), %%xmm2 \n\t" | |
556 "psubusw 16(%2), %%xmm3 \n\t" | |
557 "pxor %%xmm0, %%xmm2 \n\t" | |
558 "pxor %%xmm1, %%xmm3 \n\t" | |
559 "psubw %%xmm0, %%xmm2 \n\t" | |
560 "psubw %%xmm1, %%xmm3 \n\t" | |
561 "movdqa %%xmm2, (%0) \n\t" | |
562 "movdqa %%xmm3, 16(%0) \n\t" | |
563 "movdqa %%xmm4, %%xmm6 \n\t" | |
564 "movdqa %%xmm5, %%xmm0 \n\t" | |
565 "punpcklwd %%xmm7, %%xmm4 \n\t" | |
566 "punpckhwd %%xmm7, %%xmm6 \n\t" | |
567 "punpcklwd %%xmm7, %%xmm5 \n\t" | |
568 "punpckhwd %%xmm7, %%xmm0 \n\t" | |
569 "paddd (%1), %%xmm4 \n\t" | |
570 "paddd 16(%1), %%xmm6 \n\t" | |
571 "paddd 32(%1), %%xmm5 \n\t" | |
572 "paddd 48(%1), %%xmm0 \n\t" | |
573 "movdqa %%xmm4, (%1) \n\t" | |
574 "movdqa %%xmm6, 16(%1) \n\t" | |
575 "movdqa %%xmm5, 32(%1) \n\t" | |
576 "movdqa %%xmm0, 48(%1) \n\t" | |
577 "add $32, %0 \n\t" | |
578 "add $64, %1 \n\t" | |
579 "add $32, %2 \n\t" | |
580 "cmp %3, %0 \n\t" | |
581 " jb 1b \n\t" | |
582 : "+r" (block), "+r" (sum), "+r" (offset) | |
583 : "r"(block+64) | |
584 ); | |
585 } | |
586 | |
8590 | 587 #if HAVE_SSSE3 |
8430 | 588 #define HAVE_SSSE3_BAK |
589 #endif | |
590 #undef HAVE_SSSE3 | |
8590 | 591 #define HAVE_SSSE3 0 |
8430 | 592 |
593 #undef HAVE_SSE2 | |
594 #undef HAVE_MMX2 | |
8590 | 595 #define HAVE_SSE2 0 |
596 #define HAVE_MMX2 0 | |
8430 | 597 #define RENAME(a) a ## _MMX |
598 #define RENAMEl(a) a ## _mmx | |
599 #include "mpegvideo_mmx_template.c" | |
600 | |
8590 | 601 #undef HAVE_MMX2 |
602 #define HAVE_MMX2 1 | |
8430 | 603 #undef RENAME |
604 #undef RENAMEl | |
605 #define RENAME(a) a ## _MMX2 | |
606 #define RENAMEl(a) a ## _mmx2 | |
607 #include "mpegvideo_mmx_template.c" | |
608 | |
8590 | 609 #undef HAVE_SSE2 |
610 #define HAVE_SSE2 1 | |
8430 | 611 #undef RENAME |
612 #undef RENAMEl | |
613 #define RENAME(a) a ## _SSE2 | |
614 #define RENAMEl(a) a ## _sse2 | |
615 #include "mpegvideo_mmx_template.c" | |
616 | |
617 #ifdef HAVE_SSSE3_BAK | |
8590 | 618 #undef HAVE_SSSE3 |
619 #define HAVE_SSSE3 1 | |
8430 | 620 #undef RENAME |
621 #undef RENAMEl | |
622 #define RENAME(a) a ## _SSSE3 | |
623 #define RENAMEl(a) a ## _sse2 | |
624 #include "mpegvideo_mmx_template.c" | |
625 #endif | |
626 | |
627 void MPV_common_init_mmx(MpegEncContext *s) | |
628 { | |
12475
9fef0a8ddd63
Move mm_support() from libavcodec to libavutil, make it a public
stefano
parents:
12456
diff
changeset
|
629 int mm_flags = av_get_cpu_flags(); |
12414 | 630 |
12456
a5ddb39627fd
Rename FF_MM_ symbols related to CPU features flags as AV_CPU_FLAG_
stefano
parents:
12414
diff
changeset
|
631 if (mm_flags & AV_CPU_FLAG_MMX) { |
8430 | 632 const int dct_algo = s->avctx->dct_algo; |
633 | |
634 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; | |
635 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; | |
636 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; | |
637 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; | |
638 if(!(s->flags & CODEC_FLAG_BITEXACT)) | |
639 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; | |
640 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; | |
641 | |
12456
a5ddb39627fd
Rename FF_MM_ symbols related to CPU features flags as AV_CPU_FLAG_
stefano
parents:
12414
diff
changeset
|
642 if (mm_flags & AV_CPU_FLAG_SSE2) { |
8430 | 643 s->denoise_dct= denoise_dct_sse2; |
644 } else { | |
645 s->denoise_dct= denoise_dct_mmx; | |
646 } | |
647 | |
648 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ | |
8590 | 649 #if HAVE_SSSE3 |
12456
a5ddb39627fd
Rename FF_MM_ symbols related to CPU features flags as AV_CPU_FLAG_
stefano
parents:
12414
diff
changeset
|
650 if(mm_flags & AV_CPU_FLAG_SSSE3){ |
8430 | 651 s->dct_quantize= dct_quantize_SSSE3; |
652 } else | |
653 #endif | |
12456
a5ddb39627fd
Rename FF_MM_ symbols related to CPU features flags as AV_CPU_FLAG_
stefano
parents:
12414
diff
changeset
|
654 if(mm_flags & AV_CPU_FLAG_SSE2){ |
8430 | 655 s->dct_quantize= dct_quantize_SSE2; |
12456
a5ddb39627fd
Rename FF_MM_ symbols related to CPU features flags as AV_CPU_FLAG_
stefano
parents:
12414
diff
changeset
|
656 } else if(mm_flags & AV_CPU_FLAG_MMX2){ |
8430 | 657 s->dct_quantize= dct_quantize_MMX2; |
658 } else { | |
659 s->dct_quantize= dct_quantize_MMX; | |
660 } | |
661 } | |
662 } | |
663 } |